xen: Account for stolen time
This patch accounts for the time stolen from our VCPUs. Stolen time is time during which a VCPU is runnable and could be running, but all available physical CPUs are being used for something else. The accounting is run on each timer interrupt, simply as a way to have it run relatively often and at moments when interesting things are going on. Stolen time is not really used by much in the kernel; it is reported in /proc/stat, and that's about it. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Acked-by: Chris Wright <chrisw@sous-sol.org> Cc: john stultz <johnstul@us.ibm.com> Cc: Rik van Riel <riel@redhat.com>
This commit is contained in:
committed by
Jeremy Fitzhardinge
parent
9a4029fd34
commit
f91a8b447b
@@ -11,6 +11,7 @@
|
|||||||
#include <linux/interrupt.h>
|
#include <linux/interrupt.h>
|
||||||
#include <linux/clocksource.h>
|
#include <linux/clocksource.h>
|
||||||
#include <linux/clockchips.h>
|
#include <linux/clockchips.h>
|
||||||
|
#include <linux/kernel_stat.h>
|
||||||
|
|
||||||
#include <asm/xen/hypervisor.h>
|
#include <asm/xen/hypervisor.h>
|
||||||
#include <asm/xen/hypercall.h>
|
#include <asm/xen/hypercall.h>
|
||||||
@@ -25,6 +26,7 @@
|
|||||||
|
|
||||||
/* Xen may fire a timer up to this many ns early */
|
/* Xen may fire a timer up to this many ns early */
|
||||||
#define TIMER_SLOP 100000
|
#define TIMER_SLOP 100000
|
||||||
|
#define NS_PER_TICK (1000000000LL / HZ)
|
||||||
|
|
||||||
/* These are periodically updated in shared_info, and then copied here. */
|
/* These are periodically updated in shared_info, and then copied here. */
|
||||||
struct shadow_time_info {
|
struct shadow_time_info {
|
||||||
@@ -37,6 +39,139 @@ struct shadow_time_info {
|
|||||||
|
|
||||||
static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
|
static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
|
||||||
|
|
||||||
|
/* runstate info updated by Xen */
|
||||||
|
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
|
||||||
|
|
||||||
|
/* snapshots of runstate info */
|
||||||
|
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
|
||||||
|
|
||||||
|
/* unused ns of stolen and blocked time */
|
||||||
|
static DEFINE_PER_CPU(u64, residual_stolen);
|
||||||
|
static DEFINE_PER_CPU(u64, residual_blocked);
|
||||||
|
|
||||||
|
/* return an consistent snapshot of 64-bit time/counter value */
|
||||||
|
static u64 get64(const u64 *p)
|
||||||
|
{
|
||||||
|
u64 ret;
|
||||||
|
|
||||||
|
if (BITS_PER_LONG < 64) {
|
||||||
|
u32 *p32 = (u32 *)p;
|
||||||
|
u32 h, l;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Read high then low, and then make sure high is
|
||||||
|
* still the same; this will only loop if low wraps
|
||||||
|
* and carries into high.
|
||||||
|
* XXX some clean way to make this endian-proof?
|
||||||
|
*/
|
||||||
|
do {
|
||||||
|
h = p32[1];
|
||||||
|
barrier();
|
||||||
|
l = p32[0];
|
||||||
|
barrier();
|
||||||
|
} while (p32[1] != h);
|
||||||
|
|
||||||
|
ret = (((u64)h) << 32) | l;
|
||||||
|
} else
|
||||||
|
ret = *p;
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Runstate accounting
|
||||||
|
*/
|
||||||
|
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
|
||||||
|
{
|
||||||
|
u64 state_time;
|
||||||
|
struct vcpu_runstate_info *state;
|
||||||
|
|
||||||
|
preempt_disable();
|
||||||
|
|
||||||
|
state = &__get_cpu_var(runstate);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The runstate info is always updated by the hypervisor on
|
||||||
|
* the current CPU, so there's no need to use anything
|
||||||
|
* stronger than a compiler barrier when fetching it.
|
||||||
|
*/
|
||||||
|
do {
|
||||||
|
state_time = get64(&state->state_entry_time);
|
||||||
|
barrier();
|
||||||
|
*res = *state;
|
||||||
|
barrier();
|
||||||
|
} while (get64(&state->state_entry_time) != state_time);
|
||||||
|
|
||||||
|
preempt_enable();
|
||||||
|
}
|
||||||
|
|
||||||
|
static void setup_runstate_info(int cpu)
|
||||||
|
{
|
||||||
|
struct vcpu_register_runstate_memory_area area;
|
||||||
|
|
||||||
|
area.addr.v = &per_cpu(runstate, cpu);
|
||||||
|
|
||||||
|
if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
|
||||||
|
cpu, &area))
|
||||||
|
BUG();
|
||||||
|
}
|
||||||
|
|
||||||
|
static void do_stolen_accounting(void)
|
||||||
|
{
|
||||||
|
struct vcpu_runstate_info state;
|
||||||
|
struct vcpu_runstate_info *snap;
|
||||||
|
s64 blocked, runnable, offline, stolen;
|
||||||
|
cputime_t ticks;
|
||||||
|
|
||||||
|
get_runstate_snapshot(&state);
|
||||||
|
|
||||||
|
WARN_ON(state.state != RUNSTATE_running);
|
||||||
|
|
||||||
|
snap = &__get_cpu_var(runstate_snapshot);
|
||||||
|
|
||||||
|
/* work out how much time the VCPU has not been runn*ing* */
|
||||||
|
blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
|
||||||
|
runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
|
||||||
|
offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
|
||||||
|
|
||||||
|
*snap = state;
|
||||||
|
|
||||||
|
/* Add the appropriate number of ticks of stolen time,
|
||||||
|
including any left-overs from last time. Passing NULL to
|
||||||
|
account_steal_time accounts the time as stolen. */
|
||||||
|
stolen = runnable + offline + __get_cpu_var(residual_stolen);
|
||||||
|
|
||||||
|
if (stolen < 0)
|
||||||
|
stolen = 0;
|
||||||
|
|
||||||
|
ticks = 0;
|
||||||
|
while (stolen >= NS_PER_TICK) {
|
||||||
|
ticks++;
|
||||||
|
stolen -= NS_PER_TICK;
|
||||||
|
}
|
||||||
|
__get_cpu_var(residual_stolen) = stolen;
|
||||||
|
account_steal_time(NULL, ticks);
|
||||||
|
|
||||||
|
/* Add the appropriate number of ticks of blocked time,
|
||||||
|
including any left-overs from last time. Passing idle to
|
||||||
|
account_steal_time accounts the time as idle/wait. */
|
||||||
|
blocked += __get_cpu_var(residual_blocked);
|
||||||
|
|
||||||
|
if (blocked < 0)
|
||||||
|
blocked = 0;
|
||||||
|
|
||||||
|
ticks = 0;
|
||||||
|
while (blocked >= NS_PER_TICK) {
|
||||||
|
ticks++;
|
||||||
|
blocked -= NS_PER_TICK;
|
||||||
|
}
|
||||||
|
__get_cpu_var(residual_blocked) = blocked;
|
||||||
|
account_steal_time(idle_task(smp_processor_id()), ticks);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* Get the CPU speed from Xen */
|
||||||
unsigned long xen_cpu_khz(void)
|
unsigned long xen_cpu_khz(void)
|
||||||
{
|
{
|
||||||
u64 cpu_khz = 1000000ULL << 32;
|
u64 cpu_khz = 1000000ULL << 32;
|
||||||
@@ -56,13 +191,11 @@ unsigned long xen_cpu_khz(void)
|
|||||||
* Reads a consistent set of time-base values from Xen, into a shadow data
|
* Reads a consistent set of time-base values from Xen, into a shadow data
|
||||||
* area.
|
* area.
|
||||||
*/
|
*/
|
||||||
static void get_time_values_from_xen(void)
|
static unsigned get_time_values_from_xen(void)
|
||||||
{
|
{
|
||||||
struct vcpu_time_info *src;
|
struct vcpu_time_info *src;
|
||||||
struct shadow_time_info *dst;
|
struct shadow_time_info *dst;
|
||||||
|
|
||||||
preempt_disable();
|
|
||||||
|
|
||||||
/* src is shared memory with the hypervisor, so we need to
|
/* src is shared memory with the hypervisor, so we need to
|
||||||
make sure we get a consistent snapshot, even in the face of
|
make sure we get a consistent snapshot, even in the face of
|
||||||
being preempted. */
|
being preempted. */
|
||||||
@@ -79,7 +212,7 @@ static void get_time_values_from_xen(void)
|
|||||||
rmb(); /* test version after fetching data */
|
rmb(); /* test version after fetching data */
|
||||||
} while ((src->version & 1) | (dst->version ^ src->version));
|
} while ((src->version & 1) | (dst->version ^ src->version));
|
||||||
|
|
||||||
preempt_enable();
|
return dst->version;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -123,7 +256,7 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
|
|||||||
static u64 get_nsec_offset(struct shadow_time_info *shadow)
|
static u64 get_nsec_offset(struct shadow_time_info *shadow)
|
||||||
{
|
{
|
||||||
u64 now, delta;
|
u64 now, delta;
|
||||||
rdtscll(now);
|
now = native_read_tsc();
|
||||||
delta = now - shadow->tsc_timestamp;
|
delta = now - shadow->tsc_timestamp;
|
||||||
return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
|
return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
|
||||||
}
|
}
|
||||||
@@ -132,10 +265,14 @@ cycle_t xen_clocksource_read(void)
|
|||||||
{
|
{
|
||||||
struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
|
struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
|
||||||
cycle_t ret;
|
cycle_t ret;
|
||||||
|
unsigned version;
|
||||||
|
|
||||||
get_time_values_from_xen();
|
do {
|
||||||
|
version = get_time_values_from_xen();
|
||||||
ret = shadow->system_timestamp + get_nsec_offset(shadow);
|
barrier();
|
||||||
|
ret = shadow->system_timestamp + get_nsec_offset(shadow);
|
||||||
|
barrier();
|
||||||
|
} while (version != __get_cpu_var(xen_vcpu)->time.version);
|
||||||
|
|
||||||
put_cpu_var(shadow_time);
|
put_cpu_var(shadow_time);
|
||||||
|
|
||||||
@@ -352,6 +489,8 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
|
|||||||
ret = IRQ_HANDLED;
|
ret = IRQ_HANDLED;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
do_stolen_accounting();
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -378,6 +517,8 @@ static void xen_setup_timer(int cpu)
|
|||||||
evt->irq = irq;
|
evt->irq = irq;
|
||||||
clockevents_register_device(evt);
|
clockevents_register_device(evt);
|
||||||
|
|
||||||
|
setup_runstate_info(cpu);
|
||||||
|
|
||||||
put_cpu_var(xen_clock_events);
|
put_cpu_var(xen_clock_events);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -390,7 +531,7 @@ __init void xen_time_init(void)
|
|||||||
clocksource_register(&xen_clocksource);
|
clocksource_register(&xen_clocksource);
|
||||||
|
|
||||||
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
|
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
|
||||||
/* Successfully turned off 100hz tick, so we have the
|
/* Successfully turned off 100Hz tick, so we have the
|
||||||
vcpuop-based timer interface */
|
vcpuop-based timer interface */
|
||||||
printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
|
printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
|
||||||
xen_clockevent = &xen_vcpuop_clockevent;
|
xen_clockevent = &xen_vcpuop_clockevent;
|
||||||
|
Reference in New Issue
Block a user