[PATCH] NUMA slab locking fixes: fix cpu down and up locking
This fixes locking and bugs in the cpu_down and cpu_up paths of the NUMA slab allocator. Sonny Rao <sonny@burdell.org> reported problems some time back on POWER5 boxes, when the last cpu on a node was being offlined. We could not reproduce the same on x86_64 because the cpumask (node_to_cpumask) was not being updated on cpu down. Since that issue is now fixed, we can reproduce Sonny's problems on x86_64 NUMA, and here is the fix.

The problem earlier was that on CPU_DOWN, if the cpu going down was the last one on its node, the array_caches (shared, alien) and the kmem_list3 of that node were being freed (kfree) with the kmem_list3 lock held. If the l3 or the array_caches came from the same cache being cleared, we hit badness.

This patch cleans up the locking in the cpu_up and cpu_down paths. We cannot really free the l3 on cpu down because there is no node offlining yet, and even though a cpu is not yet up, node-local memory can be allocated for it. So l3s are usually allocated at kmem_cache_create() and destroyed at kmem_cache_destroy(). Hence, we don't need cachep->spinlock protection to get to cachep->nodelists[nodeid] either.

The patch survived onlining and offlining on a 4-core, 2-node Tyan box with a 4-process dbench run going the whole time.

Signed-off-by: Alok N Kataria <alokk@calsoftinc.com>
Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
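For illustration only (not part of the patch): a minimal userspace sketch of the locking shape the patch moves to. The struct and function names below are invented, and a pthread mutex stands in for l3->list_lock; the point is simply that allocation happens with no locks held, pointers are published or detached under the lock, and kfree()-style freeing happens only after the lock is dropped.

#include <pthread.h>
#include <stdlib.h>

/* Invented, simplified stand-ins for the slab structures. */
struct shared_cache {
	int avail;
	void *entry[16];
};

struct node_list {
	pthread_mutex_t list_lock;	/* plays the role of l3->list_lock */
	struct shared_cache *shared;	/* plays the role of l3->shared */
};

/*
 * cpu-up shape: allocate with no locks held, publish under the lock only
 * if the slot is still empty, free the unused copy after unlocking.
 */
static int node_up(struct node_list *l3)
{
	struct shared_cache *shared = calloc(1, sizeof(*shared));

	if (!shared)
		return -1;

	pthread_mutex_lock(&l3->list_lock);
	if (!l3->shared) {
		l3->shared = shared;
		shared = NULL;
	}
	pthread_mutex_unlock(&l3->list_lock);

	free(shared);			/* NULL if it was installed above */
	return 0;
}

/*
 * cpu-down shape: detach the pointer under the lock so no other CPU can
 * reach it, then free it only after the lock has been dropped.  In slab,
 * freeing under the lock is what blew up, since kfree() may need the very
 * list whose lock is still held.
 */
static void node_down(struct node_list *l3)
{
	struct shared_cache *shared;

	pthread_mutex_lock(&l3->list_lock);
	shared = l3->shared;
	l3->shared = NULL;
	pthread_mutex_unlock(&l3->list_lock);

	free(shared);
}

int main(void)
{
	struct node_list l3 = { .shared = NULL };

	pthread_mutex_init(&l3.list_lock, NULL);
	node_up(&l3);
	node_down(&l3);
	pthread_mutex_destroy(&l3.list_lock);
	return 0;
}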
commit 4484ebf12b (parent ca3b9b9173)
committed by Linus Torvalds

 mm/slab.c | 123
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -884,14 +884,14 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
 	}
 }
 
-static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
+static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
 {
 	int i = 0;
 	struct array_cache *ac;
 	unsigned long flags;
 
 	for_each_online_node(i) {
-		ac = l3->alien[i];
+		ac = alien[i];
 		if (ac) {
 			spin_lock_irqsave(&ac->lock, flags);
 			__drain_alien_cache(cachep, ac, i);
@@ -901,8 +901,11 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
 }
 #else
 #define alloc_alien_cache(node, limit) do { } while (0)
-#define free_alien_cache(ac_ptr) do { } while (0)
-#define drain_alien_cache(cachep, l3) do { } while (0)
+#define drain_alien_cache(cachep, alien) do { } while (0)
+
+static inline void free_alien_cache(struct array_cache **ac_ptr)
+{
+}
 #endif
 
 static int __devinit cpuup_callback(struct notifier_block *nfb,
@@ -936,6 +939,11 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
 				  ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
 
+				/*
+				 * The l3s don't come and go as CPUs come and
+				 * go. cache_chain_mutex is sufficient
+				 * protection here.
+				 */
 				cachep->nodelists[node] = l3;
 			}
 
@@ -950,26 +958,47 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 		  & array cache's */
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
+			struct array_cache *shared;
+			struct array_cache **alien;
 
 			nc = alloc_arraycache(node, cachep->limit,
 						cachep->batchcount);
 			if (!nc)
 				goto bad;
+			shared = alloc_arraycache(node,
+					cachep->shared * cachep->batchcount,
+					0xbaadf00d);
+			if (!shared)
+				goto bad;
+#ifdef CONFIG_NUMA
+			alien = alloc_alien_cache(node, cachep->limit);
+			if (!alien)
+				goto bad;
+#endif
 			cachep->array[cpu] = nc;
 
 			l3 = cachep->nodelists[node];
 			BUG_ON(!l3);
-			if (!l3->shared) {
-				if (!(nc = alloc_arraycache(node,
-					cachep->shared *
-					cachep->batchcount,
-					0xbaadf00d)))
-					goto bad;
 
-				/* we are serialised from CPU_DEAD or
-				  CPU_UP_CANCELLED by the cpucontrol lock */
-				l3->shared = nc;
+			spin_lock_irq(&l3->list_lock);
+			if (!l3->shared) {
+				/*
+				 * We are serialised from CPU_DEAD or
+				 * CPU_UP_CANCELLED by the cpucontrol lock
+				 */
+				l3->shared = shared;
+				shared = NULL;
 			}
+#ifdef CONFIG_NUMA
+			if (!l3->alien) {
+				l3->alien = alien;
+				alien = NULL;
+			}
+#endif
+			spin_unlock_irq(&l3->list_lock);
+
+			kfree(shared);
+			free_alien_cache(alien);
 		}
 		mutex_unlock(&cache_chain_mutex);
 		break;
@@ -978,23 +1007,32 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DEAD:
+		/*
+		 * Even if all the cpus of a node are down, we don't free the
+		 * kmem_list3 of any cache. This to avoid a race between
+		 * cpu_down, and a kmalloc allocation from another cpu for
+		 * memory from the node of the cpu going down. The list3
+		 * structure is usually allocated from kmem_cache_create() and
+		 * gets destroyed at kmem_cache_destroy().
+		 */
 		/* fall thru */
 	case CPU_UP_CANCELED:
 		mutex_lock(&cache_chain_mutex);
 
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
+			struct array_cache *shared;
+			struct array_cache **alien;
 			cpumask_t mask;
 
 			mask = node_to_cpumask(node);
-			spin_lock(&cachep->spinlock);
 			/* cpu is dead; no one can alloc from it. */
 			nc = cachep->array[cpu];
 			cachep->array[cpu] = NULL;
 			l3 = cachep->nodelists[node];
 
 			if (!l3)
-				goto unlock_cache;
+				goto free_array_cache;
 
 			spin_lock_irq(&l3->list_lock);
 
@@ -1005,33 +1043,43 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 
 			if (!cpus_empty(mask)) {
 				spin_unlock_irq(&l3->list_lock);
-				goto unlock_cache;
+				goto free_array_cache;
 			}
 
-			if (l3->shared) {
+			shared = l3->shared;
+			if (shared) {
 				free_block(cachep, l3->shared->entry,
 					   l3->shared->avail, node);
-				kfree(l3->shared);
 				l3->shared = NULL;
 			}
-			if (l3->alien) {
-				drain_alien_cache(cachep, l3);
-				free_alien_cache(l3->alien);
-				l3->alien = NULL;
-			}
 
-			/* free slabs belonging to this node */
-			if (__node_shrink(cachep, node)) {
-				cachep->nodelists[node] = NULL;
-				spin_unlock_irq(&l3->list_lock);
-				kfree(l3);
-			} else {
-				spin_unlock_irq(&l3->list_lock);
+			alien = l3->alien;
+			l3->alien = NULL;
+
+			spin_unlock_irq(&l3->list_lock);
+
+			kfree(shared);
+			if (alien) {
+				drain_alien_cache(cachep, alien);
+				free_alien_cache(alien);
 			}
-unlock_cache:
-			spin_unlock(&cachep->spinlock);
+free_array_cache:
 			kfree(nc);
 		}
+		/*
+		 * In the previous loop, all the objects were freed to
+		 * the respective cache's slabs, now we can go ahead and
+		 * shrink each nodelist to its limit.
+		 */
+		list_for_each_entry(cachep, &cache_chain, next) {
+			l3 = cachep->nodelists[node];
+			if (!l3)
+				continue;
+			spin_lock_irq(&l3->list_lock);
+			/* free slabs belonging to this node */
+			__node_shrink(cachep, node);
+			spin_unlock_irq(&l3->list_lock);
+		}
 		mutex_unlock(&cache_chain_mutex);
 		break;
 #endif
@@ -2011,7 +2059,6 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
 
 	smp_call_function_all_cpus(do_drain, cachep);
 	check_irq_on();
-	spin_lock(&cachep->spinlock);
 	for_each_online_node(node) {
 		l3 = cachep->nodelists[node];
 		if (l3) {
@@ -2019,10 +2066,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
 			drain_array_locked(cachep, l3->shared, 1, node);
 			spin_unlock_irq(&l3->list_lock);
 			if (l3->alien)
-				drain_alien_cache(cachep, l3);
+				drain_alien_cache(cachep, l3->alien);
 		}
 	}
-	spin_unlock(&cachep->spinlock);
 }
 
 static int __node_shrink(struct kmem_cache *cachep, int node)
@@ -3440,7 +3486,7 @@ static void cache_reap(void *unused)
 
 		l3 = searchp->nodelists[numa_node_id()];
 		if (l3->alien)
-			drain_alien_cache(searchp, l3);
+			drain_alien_cache(searchp, l3->alien);
 		spin_lock_irq(&l3->list_lock);
 
 		drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3598,7 +3644,8 @@ static int s_show(struct seq_file *m, void *p)
 			num_slabs++;
 		}
 		free_objects += l3->free_objects;
-		shared_avail += l3->shared->avail;
+		if (l3->shared)
+			shared_avail += l3->shared->avail;
 
 		spin_unlock_irq(&l3->list_lock);
 	}