netfilter: revised locking for x_tables

The x_tables are organized with a table structure and a per-cpu copies
of the counters and rules. On older kernels there was a reader/writer 
lock per table which was a performance bottleneck. In 2.6.30-rc, this
was converted to use RCU and the counters/rules which solved the performance
problems for do_table but made replacing rules much slower because of
the necessary RCU grace period.

This version uses a per-cpu set of spinlocks and counters to allow to
table processing to proceed without the cache thrashing of a global
reader lock and keeps the same performance for table updates.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Stephen Hemminger
2009-04-28 22:36:33 -07:00
committed by David S. Miller
parent bf368e4e70
commit 942e4a2bd6
5 changed files with 204 additions and 296 deletions

View File

@@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buff *skb,
indev = in ? in->name : nulldevname;
outdev = out ? out->name : nulldevname;
rcu_read_lock_bh();
private = rcu_dereference(table->private);
table_base = rcu_dereference(private->entries[smp_processor_id()]);
xt_info_rdlock_bh();
private = table->private;
table_base = private->entries[smp_processor_id()];
e = get_entry(table_base, private->hook_entry[hook]);
back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
(2 * skb->dev->addr_len);
ADD_COUNTER(e->counters, hdr_len, 1);
t = arpt_get_target(e);
@@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
e = (void *)e + e->next_offset;
}
} while (!hotdrop);
rcu_read_unlock_bh();
xt_info_rdunlock_bh();
if (hotdrop)
return NF_DROP;
@@ -711,9 +711,12 @@ static void get_counters(const struct xt_table_info *t,
/* Instead of clearing (by a previous call to memset())
* the counters and using adds, we set the counters
* with data used by 'current' CPU
* We dont care about preemption here.
*
* Bottom half has to be disabled to prevent deadlock
* if new softirq were to run and call ipt_do_table
*/
curcpu = raw_smp_processor_id();
local_bh_disable();
curcpu = smp_processor_id();
i = 0;
ARPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -726,73 +729,22 @@ static void get_counters(const struct xt_table_info *t,
if (cpu == curcpu)
continue;
i = 0;
xt_info_wrlock(cpu);
ARPT_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_entry_to_counter,
counters,
&i);
xt_info_wrunlock(cpu);
}
}
/* We're lazy, and add to the first CPU; overflow works its fey magic
* and everything is OK. */
static int
add_counter_to_entry(struct arpt_entry *e,
const struct xt_counters addme[],
unsigned int *i)
{
ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
(*i)++;
return 0;
}
/* Take values from counters and add them back onto the current cpu */
static void put_counters(struct xt_table_info *t,
const struct xt_counters counters[])
{
unsigned int i, cpu;
local_bh_disable();
cpu = smp_processor_id();
i = 0;
ARPT_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_counter_to_entry,
counters,
&i);
local_bh_enable();
}
static inline int
zero_entry_counter(struct arpt_entry *e, void *arg)
{
e->counters.bcnt = 0;
e->counters.pcnt = 0;
return 0;
}
static void
clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
{
unsigned int cpu;
const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
for_each_possible_cpu(cpu) {
memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
zero_entry_counter, NULL);
}
}
static struct xt_counters *alloc_counters(struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
struct xt_table_info *private = table->private;
struct xt_table_info *info;
/* We need atomic snapshot of counters: rest doesn't change
* (other than comefrom, which userspace doesn't care
@@ -802,30 +754,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
counters = vmalloc_node(countersize, numa_node_id());
if (counters == NULL)
goto nomem;
return ERR_PTR(-ENOMEM);
info = xt_alloc_table_info(private->size);
if (!info)
goto free_counters;
clone_counters(info, private);
mutex_lock(&table->lock);
xt_table_entry_swap_rcu(private, info);
synchronize_net(); /* Wait until smoke has cleared */
get_counters(info, counters);
put_counters(private, counters);
mutex_unlock(&table->lock);
xt_free_table_info(info);
get_counters(private, counters);
return counters;
free_counters:
vfree(counters);
nomem:
return ERR_PTR(-ENOMEM);
}
static int copy_entries_to_user(unsigned int total_size,
@@ -1094,8 +1027,9 @@ static int __do_replace(struct net *net, const char *name,
(newinfo->number <= oldinfo->initial_entries))
module_put(t->me);
/* Get the old counters. */
/* Get the old counters, and synchronize with replace */
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1165,10 +1099,23 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
return ret;
}
/* We're lazy, and add to the first CPU; overflow works its fey magic
* and everything is OK. */
static int
add_counter_to_entry(struct arpt_entry *e,
const struct xt_counters addme[],
unsigned int *i)
{
ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
(*i)++;
return 0;
}
static int do_add_counters(struct net *net, void __user *user, unsigned int len,
int compat)
{
unsigned int i;
unsigned int i, curcpu;
struct xt_counters_info tmp;
struct xt_counters *paddc;
unsigned int num_counters;
@@ -1224,26 +1171,26 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
goto free;
}
mutex_lock(&t->lock);
local_bh_disable();
private = t->private;
if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
}
preempt_disable();
i = 0;
/* Choose the copy that is on our node */
loc_cpu_entry = private->entries[smp_processor_id()];
curcpu = smp_processor_id();
loc_cpu_entry = private->entries[curcpu];
xt_info_wrlock(curcpu);
ARPT_ENTRY_ITERATE(loc_cpu_entry,
private->size,
add_counter_to_entry,
paddc,
&i);
preempt_enable();
xt_info_wrunlock(curcpu);
unlock_up_free:
mutex_unlock(&t->lock);
local_bh_enable();
xt_table_unlock(t);
module_put(t->me);
free:

View File

@@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
tgpar.hooknum = hook;
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
rcu_read_lock_bh();
private = rcu_dereference(table->private);
table_base = rcu_dereference(private->entries[smp_processor_id()]);
xt_info_rdlock_bh();
private = table->private;
table_base = private->entries[smp_processor_id()];
e = get_entry(table_base, private->hook_entry[hook]);
@@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
e = (void *)e + e->next_offset;
}
} while (!hotdrop);
rcu_read_unlock_bh();
xt_info_rdunlock_bh();
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
@@ -896,10 +894,13 @@ get_counters(const struct xt_table_info *t,
/* Instead of clearing (by a previous call to memset())
* the counters and using adds, we set the counters
* with data used by 'current' CPU
* We dont care about preemption here.
* with data used by 'current' CPU.
*
* Bottom half has to be disabled to prevent deadlock
* if new softirq were to run and call ipt_do_table
*/
curcpu = raw_smp_processor_id();
local_bh_disable();
curcpu = smp_processor_id();
i = 0;
IPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -912,74 +913,22 @@ get_counters(const struct xt_table_info *t,
if (cpu == curcpu)
continue;
i = 0;
xt_info_wrlock(cpu);
IPT_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_entry_to_counter,
counters,
&i);
xt_info_wrunlock(cpu);
}
}
/* We're lazy, and add to the first CPU; overflow works its fey magic
* and everything is OK. */
static int
add_counter_to_entry(struct ipt_entry *e,
const struct xt_counters addme[],
unsigned int *i)
{
ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
(*i)++;
return 0;
}
/* Take values from counters and add them back onto the current cpu */
static void put_counters(struct xt_table_info *t,
const struct xt_counters counters[])
{
unsigned int i, cpu;
local_bh_disable();
cpu = smp_processor_id();
i = 0;
IPT_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_counter_to_entry,
counters,
&i);
local_bh_enable();
}
static inline int
zero_entry_counter(struct ipt_entry *e, void *arg)
{
e->counters.bcnt = 0;
e->counters.pcnt = 0;
return 0;
}
static void
clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
{
unsigned int cpu;
const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
for_each_possible_cpu(cpu) {
memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
zero_entry_counter, NULL);
}
}
static struct xt_counters * alloc_counters(struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
struct xt_table_info *private = table->private;
struct xt_table_info *info;
/* We need atomic snapshot of counters: rest doesn't change
(other than comefrom, which userspace doesn't care
@@ -988,30 +937,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
counters = vmalloc_node(countersize, numa_node_id());
if (counters == NULL)
goto nomem;
return ERR_PTR(-ENOMEM);
info = xt_alloc_table_info(private->size);
if (!info)
goto free_counters;
clone_counters(info, private);
mutex_lock(&table->lock);
xt_table_entry_swap_rcu(private, info);
synchronize_net(); /* Wait until smoke has cleared */
get_counters(info, counters);
put_counters(private, counters);
mutex_unlock(&table->lock);
xt_free_table_info(info);
get_counters(private, counters);
return counters;
free_counters:
vfree(counters);
nomem:
return ERR_PTR(-ENOMEM);
}
static int
@@ -1306,8 +1236,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
(newinfo->number <= oldinfo->initial_entries))
module_put(t->me);
/* Get the old counters. */
/* Get the old counters, and synchronize with replace */
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1377,11 +1308,23 @@ do_replace(struct net *net, void __user *user, unsigned int len)
return ret;
}
/* We're lazy, and add to the first CPU; overflow works its fey magic
* and everything is OK. */
static int
add_counter_to_entry(struct ipt_entry *e,
const struct xt_counters addme[],
unsigned int *i)
{
ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
(*i)++;
return 0;
}
static int
do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
{
unsigned int i;
unsigned int i, curcpu;
struct xt_counters_info tmp;
struct xt_counters *paddc;
unsigned int num_counters;
@@ -1437,25 +1380,26 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
goto free;
}
mutex_lock(&t->lock);
local_bh_disable();
private = t->private;
if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
}
preempt_disable();
i = 0;
/* Choose the copy that is on our node */
loc_cpu_entry = private->entries[raw_smp_processor_id()];
curcpu = smp_processor_id();
loc_cpu_entry = private->entries[curcpu];
xt_info_wrlock(curcpu);
IPT_ENTRY_ITERATE(loc_cpu_entry,
private->size,
add_counter_to_entry,
paddc,
&i);
preempt_enable();
xt_info_wrunlock(curcpu);
unlock_up_free:
mutex_unlock(&t->lock);
local_bh_enable();
xt_table_unlock(t);
module_put(t->me);
free: