Revert "cpuidle: Quickly notice prediction failure for repeat mode"
Revert commit69a37bea
(cpuidle: Quickly notice prediction failure for repeat mode), because it has been identified as the source of a significant performance regression in v3.8 and later as explained by Jeremy Eder: We believe we've identified a particular commit to the cpuidle code that seems to be impacting performance of variety of workloads. The simplest way to reproduce is using netperf TCP_RR test, so we're using that, on a pair of Sandy Bridge based servers. We also have data from a large database setup where performance is also measurably/positively impacted, though that test data isn't easily share-able. Included below are test results from 3 test kernels: kernel reverts ----------------------------------------------------------- 1) vanilla upstream (no reverts) 2) perfteam2 revertse11538d1f0
3) test reverts69a37beabf
e11538d1f0
In summary, netperf TCP_RR numbers improve by approximately 4% after reverting69a37beabf
. When69a37beabf
is included, C0 residency never seems to get above 40%. Taking that patch out gets C0 near 100% quite often, and performance increases. The below data are histograms representing the %c0 residency @ 1-second sample rates (using turbostat), while under netperf test. - If you look at the first 4 histograms, you can see %c0 residency almost entirely in the 30,40% bin. - The last pair, which reverts69a37beabf
, shows %c0 in the 80,90,100% bins. Below each kernel name are netperf TCP_RR trans/s numbers for the particular kernel that can be disclosed publicly, comparing the 3 test kernels. We ran a 4th test with the vanilla kernel where we've also set /dev/cpu_dma_latency=0 to show overall impact boosting single-threaded TCP_RR performance over 11% above baseline. 3.10-rc2 vanilla RX + c0 lock (/dev/cpu_dma_latency=0): TCP_RR trans/s 54323.78 ----------------------------------------------------------- 3.10-rc2 vanilla RX (no reverts) TCP_RR trans/s 48192.47 Receiver %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 59]: *********************************************************** 40.0000 - 50.0000 [ 1]: * 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: Sender %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 11]: *********** 40.0000 - 50.0000 [ 49]: ************************************************* 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: ----------------------------------------------------------- 3.10-rc2 perfteam2 RX (reverts commite11538d1f0
) TCP_RR trans/s 49698.69 Receiver %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 1]: * 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 59]: *********************************************************** 40.0000 - 50.0000 [ 0]: 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: Sender %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 2]: ** 40.0000 - 50.0000 [ 58]: ********************************************************** 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: ----------------------------------------------------------- 3.10-rc2 test RX (reverts69a37beabf
ande11538d1f0
) TCP_RR trans/s 47766.95 Receiver %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 1]: * 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 27]: *************************** 40.0000 - 50.0000 [ 2]: ** 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 2]: ** 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 28]: **************************** Sender: 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 11]: *********** 40.0000 - 50.0000 [ 0]: 50.0000 - 60.0000 [ 1]: * 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 3]: *** 80.0000 - 90.0000 [ 7]: ******* 90.0000 - 100.0000 [ 38]: ************************************** These results demonstrate gaining back the tendency of the CPU to stay in more responsive, performant C-states (and thus yield measurably better performance), by reverting commit69a37beabf
. Requested-by: Jeremy Eder <jeder@redhat.com> Tested-by: Len Brown <len.brown@intel.com> Cc: 3.8+ <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
This commit is contained in:
@@ -28,13 +28,6 @@
|
|||||||
#define MAX_INTERESTING 50000
|
#define MAX_INTERESTING 50000
|
||||||
#define STDDEV_THRESH 400
|
#define STDDEV_THRESH 400
|
||||||
|
|
||||||
/* 60 * 60 > STDDEV_THRESH * INTERVALS = 400 * 8 */
|
|
||||||
#define MAX_DEVIATION 60
|
|
||||||
|
|
||||||
static DEFINE_PER_CPU(struct hrtimer, menu_hrtimer);
|
|
||||||
static DEFINE_PER_CPU(int, hrtimer_status);
|
|
||||||
/* menu hrtimer mode */
|
|
||||||
enum {MENU_HRTIMER_STOP, MENU_HRTIMER_REPEAT};
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Concepts and ideas behind the menu governor
|
* Concepts and ideas behind the menu governor
|
||||||
@@ -198,42 +191,17 @@ static u64 div_round64(u64 dividend, u32 divisor)
|
|||||||
return div_u64(dividend + (divisor / 2), divisor);
|
return div_u64(dividend + (divisor / 2), divisor);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Cancel the hrtimer if it is not triggered yet */
|
|
||||||
void menu_hrtimer_cancel(void)
|
|
||||||
{
|
|
||||||
int cpu = smp_processor_id();
|
|
||||||
struct hrtimer *hrtmr = &per_cpu(menu_hrtimer, cpu);
|
|
||||||
|
|
||||||
/* The timer is still not time out*/
|
|
||||||
if (per_cpu(hrtimer_status, cpu)) {
|
|
||||||
hrtimer_cancel(hrtmr);
|
|
||||||
per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_STOP;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
EXPORT_SYMBOL_GPL(menu_hrtimer_cancel);
|
|
||||||
|
|
||||||
/* Call back for hrtimer is triggered */
|
|
||||||
static enum hrtimer_restart menu_hrtimer_notify(struct hrtimer *hrtimer)
|
|
||||||
{
|
|
||||||
int cpu = smp_processor_id();
|
|
||||||
|
|
||||||
per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_STOP;
|
|
||||||
|
|
||||||
return HRTIMER_NORESTART;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Try detecting repeating patterns by keeping track of the last 8
|
* Try detecting repeating patterns by keeping track of the last 8
|
||||||
* intervals, and checking if the standard deviation of that set
|
* intervals, and checking if the standard deviation of that set
|
||||||
* of points is below a threshold. If it is... then use the
|
* of points is below a threshold. If it is... then use the
|
||||||
* average of these 8 points as the estimated value.
|
* average of these 8 points as the estimated value.
|
||||||
*/
|
*/
|
||||||
static u32 get_typical_interval(struct menu_device *data)
|
static void get_typical_interval(struct menu_device *data)
|
||||||
{
|
{
|
||||||
int i = 0, divisor = 0;
|
int i = 0, divisor = 0;
|
||||||
uint64_t max = 0, avg = 0, stddev = 0;
|
uint64_t max = 0, avg = 0, stddev = 0;
|
||||||
int64_t thresh = LLONG_MAX; /* Discard outliers above this value. */
|
int64_t thresh = LLONG_MAX; /* Discard outliers above this value. */
|
||||||
unsigned int ret = 0;
|
|
||||||
|
|
||||||
again:
|
again:
|
||||||
|
|
||||||
@@ -274,16 +242,13 @@ again:
|
|||||||
if (((avg > stddev * 6) && (divisor * 4 >= INTERVALS * 3))
|
if (((avg > stddev * 6) && (divisor * 4 >= INTERVALS * 3))
|
||||||
|| stddev <= 20) {
|
|| stddev <= 20) {
|
||||||
data->predicted_us = avg;
|
data->predicted_us = avg;
|
||||||
ret = 1;
|
return;
|
||||||
return ret;
|
|
||||||
|
|
||||||
} else if ((divisor * 4) > INTERVALS * 3) {
|
} else if ((divisor * 4) > INTERVALS * 3) {
|
||||||
/* Exclude the max interval */
|
/* Exclude the max interval */
|
||||||
thresh = max - 1;
|
thresh = max - 1;
|
||||||
goto again;
|
goto again;
|
||||||
}
|
}
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -298,9 +263,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
|
|||||||
int i;
|
int i;
|
||||||
int multiplier;
|
int multiplier;
|
||||||
struct timespec t;
|
struct timespec t;
|
||||||
int repeat = 0, low_predicted = 0;
|
|
||||||
int cpu = smp_processor_id();
|
|
||||||
struct hrtimer *hrtmr = &per_cpu(menu_hrtimer, cpu);
|
|
||||||
|
|
||||||
if (data->needs_update) {
|
if (data->needs_update) {
|
||||||
menu_update(drv, dev);
|
menu_update(drv, dev);
|
||||||
@@ -335,7 +297,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
|
|||||||
data->predicted_us = div_round64(data->expected_us * data->correction_factor[data->bucket],
|
data->predicted_us = div_round64(data->expected_us * data->correction_factor[data->bucket],
|
||||||
RESOLUTION * DECAY);
|
RESOLUTION * DECAY);
|
||||||
|
|
||||||
repeat = get_typical_interval(data);
|
get_typical_interval(data);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We want to default to C1 (hlt), not to busy polling
|
* We want to default to C1 (hlt), not to busy polling
|
||||||
@@ -356,10 +318,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
|
|||||||
|
|
||||||
if (s->disabled || su->disable)
|
if (s->disabled || su->disable)
|
||||||
continue;
|
continue;
|
||||||
if (s->target_residency > data->predicted_us) {
|
if (s->target_residency > data->predicted_us)
|
||||||
low_predicted = 1;
|
|
||||||
continue;
|
continue;
|
||||||
}
|
|
||||||
if (s->exit_latency > latency_req)
|
if (s->exit_latency > latency_req)
|
||||||
continue;
|
continue;
|
||||||
if (s->exit_latency * multiplier > data->predicted_us)
|
if (s->exit_latency * multiplier > data->predicted_us)
|
||||||
@@ -369,28 +329,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
|
|||||||
data->exit_us = s->exit_latency;
|
data->exit_us = s->exit_latency;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* not deepest C-state chosen for low predicted residency */
|
|
||||||
if (low_predicted) {
|
|
||||||
unsigned int timer_us = 0;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Set a timer to detect whether this sleep is much
|
|
||||||
* longer than repeat mode predicted. If the timer
|
|
||||||
* triggers, the code will evaluate whether to put
|
|
||||||
* the CPU into a deeper C-state.
|
|
||||||
* The timer is cancelled on CPU wakeup.
|
|
||||||
*/
|
|
||||||
timer_us = 2 * (data->predicted_us + MAX_DEVIATION);
|
|
||||||
|
|
||||||
if (repeat && (4 * timer_us < data->expected_us)) {
|
|
||||||
RCU_NONIDLE(hrtimer_start(hrtmr,
|
|
||||||
ns_to_ktime(1000 * timer_us),
|
|
||||||
HRTIMER_MODE_REL_PINNED));
|
|
||||||
/* In repeat case, menu hrtimer is started */
|
|
||||||
per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_REPEAT;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return data->last_state_idx;
|
return data->last_state_idx;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -481,9 +419,6 @@ static int menu_enable_device(struct cpuidle_driver *drv,
|
|||||||
struct cpuidle_device *dev)
|
struct cpuidle_device *dev)
|
||||||
{
|
{
|
||||||
struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
|
struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
|
||||||
struct hrtimer *t = &per_cpu(menu_hrtimer, dev->cpu);
|
|
||||||
hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
|
|
||||||
t->function = menu_hrtimer_notify;
|
|
||||||
|
|
||||||
memset(data, 0, sizeof(struct menu_device));
|
memset(data, 0, sizeof(struct menu_device));
|
||||||
|
|
||||||
|
@@ -174,10 +174,4 @@ static inline void tick_nohz_task_switch(struct task_struct *tsk) { }
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
# ifdef CONFIG_CPU_IDLE_GOV_MENU
|
|
||||||
extern void menu_hrtimer_cancel(void);
|
|
||||||
# else
|
|
||||||
static inline void menu_hrtimer_cancel(void) {}
|
|
||||||
# endif /* CONFIG_CPU_IDLE_GOV_MENU */
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@@ -827,13 +827,10 @@ void tick_nohz_irq_exit(void)
|
|||||||
{
|
{
|
||||||
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
|
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
|
||||||
|
|
||||||
if (ts->inidle) {
|
if (ts->inidle)
|
||||||
/* Cancel the timer because CPU already waken up from the C-states*/
|
|
||||||
menu_hrtimer_cancel();
|
|
||||||
__tick_nohz_idle_enter(ts);
|
__tick_nohz_idle_enter(ts);
|
||||||
} else {
|
else
|
||||||
tick_nohz_full_stop_tick(ts);
|
tick_nohz_full_stop_tick(ts);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -931,8 +928,6 @@ void tick_nohz_idle_exit(void)
|
|||||||
|
|
||||||
ts->inidle = 0;
|
ts->inidle = 0;
|
||||||
|
|
||||||
/* Cancel the timer because CPU already waken up from the C-states*/
|
|
||||||
menu_hrtimer_cancel();
|
|
||||||
if (ts->idle_active || ts->tick_stopped)
|
if (ts->idle_active || ts->tick_stopped)
|
||||||
now = ktime_get();
|
now = ktime_get();
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user