diff --git a/mm/vmstat.c b/mm/vmstat.c index cce7c766da7a..1b12d390dc68 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -7,6 +7,7 @@ * zoned VM statistics * Copyright (C) 2006 Silicon Graphics, Inc., * Christoph Lameter + * Copyright (C) 2008-2014 Christoph Lameter */ #include #include @@ -14,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -419,13 +421,22 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) EXPORT_SYMBOL(dec_zone_page_state); #endif -static inline void fold_diff(int *diff) + +/* + * Fold a differential into the global counters. + * Returns the number of counters updated. + */ +static int fold_diff(int *diff) { int i; + int changes = 0; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (diff[i]) + if (diff[i]) { atomic_long_add(diff[i], &vm_stat[i]); + changes++; + } + return changes; } /* @@ -441,12 +452,15 @@ static inline void fold_diff(int *diff) * statistics in the remote zone struct as well as the global cachelines * with the global counters. These could cause remote node cache line * bouncing and will have to be only done when necessary. + * + * The function returns the number of global counters updated. */ -static void refresh_cpu_vm_stats(void) +static int refresh_cpu_vm_stats(void) { struct zone *zone; int i; int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + int changes = 0; for_each_populated_zone(zone) { struct per_cpu_pageset __percpu *p = zone->pageset; @@ -486,15 +500,17 @@ static void refresh_cpu_vm_stats(void) continue; } - if (__this_cpu_dec_return(p->expire)) continue; - if (__this_cpu_read(p->pcp.count)) + if (__this_cpu_read(p->pcp.count)) { drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); + changes++; + } #endif } - fold_diff(global_diff); + changes += fold_diff(global_diff); + return changes; } /* @@ -1239,20 +1255,108 @@ static const struct file_operations proc_vmstat_file_operations = { #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; +static cpumask_var_t cpu_stat_off; static void vmstat_update(struct work_struct *w) { - refresh_cpu_vm_stats(); - schedule_delayed_work(this_cpu_ptr(&vmstat_work), - round_jiffies_relative(sysctl_stat_interval)); + if (refresh_cpu_vm_stats()) + /* + * Counters were updated so we expect more updates + * to occur in the future. Keep on running the + * update worker thread. + */ + schedule_delayed_work(this_cpu_ptr(&vmstat_work), + round_jiffies_relative(sysctl_stat_interval)); + else { + /* + * We did not update any counters so the app may be in + * a mode where it does not cause counter updates. + * We may be uselessly running vmstat_update. + * Defer the checking for differentials to the + * shepherd thread on a different processor. + */ + int r; + /* + * Shepherd work thread does not race since it never + * changes the bit if its zero but the cpu + * online / off line code may race if + * worker threads are still allowed during + * shutdown / startup. + */ + r = cpumask_test_and_set_cpu(smp_processor_id(), + cpu_stat_off); + VM_BUG_ON(r); + } } -static void start_cpu_timer(int cpu) +/* + * Check if the diffs for a certain cpu indicate that + * an update is needed. + */ +static bool need_update(int cpu) { - struct delayed_work *work = &per_cpu(vmstat_work, cpu); + struct zone *zone; - INIT_DEFERRABLE_WORK(work, vmstat_update); - schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); + for_each_populated_zone(zone) { + struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); + + BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); + /* + * The fast way of checking if there are any vmstat diffs. + * This works because the diffs are byte sized items. + */ + if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS)) + return true; + + } + return false; +} + + +/* + * Shepherd worker thread that checks the + * differentials of processors that have their worker + * threads for vm statistics updates disabled because of + * inactivity. + */ +static void vmstat_shepherd(struct work_struct *w); + +static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd); + +static void vmstat_shepherd(struct work_struct *w) +{ + int cpu; + + get_online_cpus(); + /* Check processors whose vmstat worker threads have been disabled */ + for_each_cpu(cpu, cpu_stat_off) + if (need_update(cpu) && + cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) + + schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu), + __round_jiffies_relative(sysctl_stat_interval, cpu)); + + put_online_cpus(); + + schedule_delayed_work(&shepherd, + round_jiffies_relative(sysctl_stat_interval)); + +} + +static void __init start_shepherd_timer(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), + vmstat_update); + + if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) + BUG(); + cpumask_copy(cpu_stat_off, cpu_online_mask); + + schedule_delayed_work(&shepherd, + round_jiffies_relative(sysctl_stat_interval)); } static void vmstat_cpu_dead(int node) @@ -1283,17 +1387,17 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb, case CPU_ONLINE: case CPU_ONLINE_FROZEN: refresh_zone_stat_thresholds(); - start_cpu_timer(cpu); node_set_state(cpu_to_node(cpu), N_CPU); + cpumask_set_cpu(cpu, cpu_stat_off); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); - per_cpu(vmstat_work, cpu).work.func = NULL; + cpumask_clear_cpu(cpu, cpu_stat_off); break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - start_cpu_timer(cpu); + cpumask_set_cpu(cpu, cpu_stat_off); break; case CPU_DEAD: case CPU_DEAD_FROZEN: @@ -1313,15 +1417,10 @@ static struct notifier_block vmstat_notifier = static int __init setup_vmstat(void) { #ifdef CONFIG_SMP - int cpu; - cpu_notifier_register_begin(); __register_cpu_notifier(&vmstat_notifier); - for_each_online_cpu(cpu) { - start_cpu_timer(cpu); - node_set_state(cpu_to_node(cpu), N_CPU); - } + start_shepherd_timer(); cpu_notifier_register_done(); #endif #ifdef CONFIG_PROC_FS