From cd67cd5eb25ae9a7bafbfd3d52d4c05e1d80af3b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 6 Feb 2015 09:44:44 +0200 Subject: [PATCH 1/5] ipvs: use 64-bit rates in stats IPVS stats are limited to 2^(32-10) conns/s and packets/s, 2^(32-5) bytes/s. It is time to use 64 bits: * Change all conn/packet kernel counters to 64-bit and update them in u64_stats_update_{begin,end} section * In kernel use struct ip_vs_kstats instead of the user-space struct ip_vs_stats_user and use new func ip_vs_export_stats_user to export it to sockopt users to preserve compatibility with 32-bit values * Rename cpu counters "ustats" to "cnt" * To netlink users provide additionally 64-bit stats: IPVS_SVC_ATTR_STATS64 and IPVS_DEST_ATTR_STATS64. Old stats remain for old binaries. * We can use ip_vs_copy_stats in ip_vs_stats_percpu_show Thanks to Chris Caputo for providing initial patch for ip_vs_est.c Signed-off-by: Chris Caputo Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 48 ++++++--- include/uapi/linux/ip_vs.h | 7 +- net/netfilter/ipvs/ip_vs_core.c | 36 ++++--- net/netfilter/ipvs/ip_vs_ctl.c | 170 +++++++++++++++++++++----------- net/netfilter/ipvs/ip_vs_est.c | 94 +++++++++--------- 5 files changed, 219 insertions(+), 136 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 615b20b58545..a627fe690c19 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -365,15 +365,15 @@ struct ip_vs_seq { /* counters per cpu */ struct ip_vs_counters { - __u32 conns; /* connections scheduled */ - __u32 inpkts; /* incoming packets */ - __u32 outpkts; /* outgoing packets */ + __u64 conns; /* connections scheduled */ + __u64 inpkts; /* incoming packets */ + __u64 outpkts; /* outgoing packets */ __u64 inbytes; /* incoming bytes */ __u64 outbytes; /* outgoing bytes */ }; /* Stats per cpu */ struct ip_vs_cpu_stats { - struct ip_vs_counters ustats; + struct ip_vs_counters cnt; struct u64_stats_sync syncp; }; @@ -383,23 +383,40 @@ struct ip_vs_estimator { u64 last_inbytes; u64 last_outbytes; - u32 last_conns; - u32 last_inpkts; - u32 last_outpkts; + u64 last_conns; + u64 last_inpkts; + u64 last_outpkts; - u32 cps; - u32 inpps; - u32 outpps; - u32 inbps; - u32 outbps; + u64 cps; + u64 inpps; + u64 outpps; + u64 inbps; + u64 outbps; +}; + +/* + * IPVS statistics object, 64-bit kernel version of struct ip_vs_stats_user + */ +struct ip_vs_kstats { + u64 conns; /* connections scheduled */ + u64 inpkts; /* incoming packets */ + u64 outpkts; /* outgoing packets */ + u64 inbytes; /* incoming bytes */ + u64 outbytes; /* outgoing bytes */ + + u64 cps; /* current connection rate */ + u64 inpps; /* current in packet rate */ + u64 outpps; /* current out packet rate */ + u64 inbps; /* current in byte rate */ + u64 outbps; /* current out byte rate */ }; struct ip_vs_stats { - struct ip_vs_stats_user ustats; /* statistics */ + struct ip_vs_kstats kstats; /* kernel statistics */ struct ip_vs_estimator est; /* estimator */ struct ip_vs_cpu_stats __percpu *cpustats; /* per cpu counters */ spinlock_t lock; /* spin lock */ - struct ip_vs_stats_user ustats0; /* reset values */ + struct ip_vs_kstats kstats0; /* reset values */ }; struct dst_entry; @@ -1388,8 +1405,7 @@ void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts); void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats); void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats); void ip_vs_zero_estimator(struct ip_vs_stats *stats); -void ip_vs_read_estimator(struct ip_vs_stats_user *dst, - struct ip_vs_stats *stats); +void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats); /* Various IPVS packet transmitters (from ip_vs_xmit.c) */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index cabe95d5b461..3199243f2028 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -358,6 +358,8 @@ enum { IPVS_SVC_ATTR_PE_NAME, /* name of ct retriever */ + IPVS_SVC_ATTR_STATS64, /* nested attribute for service stats */ + __IPVS_SVC_ATTR_MAX, }; @@ -387,6 +389,8 @@ enum { IPVS_DEST_ATTR_ADDR_FAMILY, /* Address family of address */ + IPVS_DEST_ATTR_STATS64, /* nested attribute for dest stats */ + __IPVS_DEST_ATTR_MAX, }; @@ -410,7 +414,8 @@ enum { /* * Attributes used to describe service or destination entry statistics * - * Used inside nested attributes IPVS_SVC_ATTR_STATS and IPVS_DEST_ATTR_STATS + * Used inside nested attributes IPVS_SVC_ATTR_STATS, IPVS_DEST_ATTR_STATS, + * IPVS_SVC_ATTR_STATS64 and IPVS_DEST_ATTR_STATS64. */ enum { IPVS_STATS_ATTR_UNSPEC = 0, diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 990decba1fe4..c9470c86308f 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -119,24 +119,24 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) struct ip_vs_service *svc; s = this_cpu_ptr(dest->stats.cpustats); - s->ustats.inpkts++; u64_stats_update_begin(&s->syncp); - s->ustats.inbytes += skb->len; + s->cnt.inpkts++; + s->cnt.inbytes += skb->len; u64_stats_update_end(&s->syncp); rcu_read_lock(); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); - s->ustats.inpkts++; u64_stats_update_begin(&s->syncp); - s->ustats.inbytes += skb->len; + s->cnt.inpkts++; + s->cnt.inbytes += skb->len; u64_stats_update_end(&s->syncp); rcu_read_unlock(); s = this_cpu_ptr(ipvs->tot_stats.cpustats); - s->ustats.inpkts++; u64_stats_update_begin(&s->syncp); - s->ustats.inbytes += skb->len; + s->cnt.inpkts++; + s->cnt.inbytes += skb->len; u64_stats_update_end(&s->syncp); } } @@ -153,24 +153,24 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) struct ip_vs_service *svc; s = this_cpu_ptr(dest->stats.cpustats); - s->ustats.outpkts++; u64_stats_update_begin(&s->syncp); - s->ustats.outbytes += skb->len; + s->cnt.outpkts++; + s->cnt.outbytes += skb->len; u64_stats_update_end(&s->syncp); rcu_read_lock(); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); - s->ustats.outpkts++; u64_stats_update_begin(&s->syncp); - s->ustats.outbytes += skb->len; + s->cnt.outpkts++; + s->cnt.outbytes += skb->len; u64_stats_update_end(&s->syncp); rcu_read_unlock(); s = this_cpu_ptr(ipvs->tot_stats.cpustats); - s->ustats.outpkts++; u64_stats_update_begin(&s->syncp); - s->ustats.outbytes += skb->len; + s->cnt.outpkts++; + s->cnt.outbytes += skb->len; u64_stats_update_end(&s->syncp); } } @@ -183,13 +183,19 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) struct ip_vs_cpu_stats *s; s = this_cpu_ptr(cp->dest->stats.cpustats); - s->ustats.conns++; + u64_stats_update_begin(&s->syncp); + s->cnt.conns++; + u64_stats_update_end(&s->syncp); s = this_cpu_ptr(svc->stats.cpustats); - s->ustats.conns++; + u64_stats_update_begin(&s->syncp); + s->cnt.conns++; + u64_stats_update_end(&s->syncp); s = this_cpu_ptr(ipvs->tot_stats.cpustats); - s->ustats.conns++; + u64_stats_update_begin(&s->syncp); + s->cnt.conns++; + u64_stats_update_end(&s->syncp); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index e55759056361..6fd60059faf0 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -729,9 +729,9 @@ static void ip_vs_trash_cleanup(struct net *net) } static void -ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) +ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) { -#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c +#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c spin_lock_bh(&src->lock); @@ -746,6 +746,21 @@ ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) spin_unlock_bh(&src->lock); } +static void +ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src) +{ + dst->conns = (u32)src->conns; + dst->inpkts = (u32)src->inpkts; + dst->outpkts = (u32)src->outpkts; + dst->inbytes = src->inbytes; + dst->outbytes = src->outbytes; + dst->cps = (u32)src->cps; + dst->inpps = (u32)src->inpps; + dst->outpps = (u32)src->outpps; + dst->inbps = (u32)src->inbps; + dst->outbps = (u32)src->outbps; +} + static void ip_vs_zero_stats(struct ip_vs_stats *stats) { @@ -753,7 +768,7 @@ ip_vs_zero_stats(struct ip_vs_stats *stats) /* get current counters as zero point, rates are zeroed */ -#define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c +#define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c IP_VS_ZERO_STATS_COUNTER(conns); IP_VS_ZERO_STATS_COUNTER(inpkts); @@ -2044,7 +2059,7 @@ static const struct file_operations ip_vs_info_fops = { static int ip_vs_stats_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); - struct ip_vs_stats_user show; + struct ip_vs_kstats show; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, @@ -2053,17 +2068,22 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) " Conns Packets Packets Bytes Bytes\n"); ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats); - seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns, - show.inpkts, show.outpkts, - (unsigned long long) show.inbytes, - (unsigned long long) show.outbytes); + seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", + (unsigned long long)show.conns, + (unsigned long long)show.inpkts, + (unsigned long long)show.outpkts, + (unsigned long long)show.inbytes, + (unsigned long long)show.outbytes); -/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ +/* 01234567 01234567 01234567 0123456701234567 0123456701234567*/ seq_puts(seq, - " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); - seq_printf(seq, "%8X %8X %8X %16X %16X\n", - show.cps, show.inpps, show.outpps, - show.inbps, show.outbps); + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n", + (unsigned long long)show.cps, + (unsigned long long)show.inpps, + (unsigned long long)show.outpps, + (unsigned long long)show.inbps, + (unsigned long long)show.outbps); return 0; } @@ -2086,7 +2106,7 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) struct net *net = seq_file_single_net(seq); struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; - struct ip_vs_stats_user rates; + struct ip_vs_kstats kstats; int i; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ @@ -2098,41 +2118,41 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) for_each_possible_cpu(i) { struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); unsigned int start; - __u64 inbytes, outbytes; + u64 conns, inpkts, outpkts, inbytes, outbytes; do { start = u64_stats_fetch_begin_irq(&u->syncp); - inbytes = u->ustats.inbytes; - outbytes = u->ustats.outbytes; + conns = u->cnt.conns; + inpkts = u->cnt.inpkts; + outpkts = u->cnt.outpkts; + inbytes = u->cnt.inbytes; + outbytes = u->cnt.outbytes; } while (u64_stats_fetch_retry_irq(&u->syncp, start)); - seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n", - i, u->ustats.conns, u->ustats.inpkts, - u->ustats.outpkts, (__u64)inbytes, - (__u64)outbytes); + seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", + i, (u64)conns, (u64)inpkts, + (u64)outpkts, (u64)inbytes, + (u64)outbytes); } - spin_lock_bh(&tot_stats->lock); + ip_vs_copy_stats(&kstats, tot_stats); - seq_printf(seq, " ~ %8X %8X %8X %16LX %16LX\n\n", - tot_stats->ustats.conns, tot_stats->ustats.inpkts, - tot_stats->ustats.outpkts, - (unsigned long long) tot_stats->ustats.inbytes, - (unsigned long long) tot_stats->ustats.outbytes); + seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n", + (unsigned long long)kstats.conns, + (unsigned long long)kstats.inpkts, + (unsigned long long)kstats.outpkts, + (unsigned long long)kstats.inbytes, + (unsigned long long)kstats.outbytes); - ip_vs_read_estimator(&rates, tot_stats); - - spin_unlock_bh(&tot_stats->lock); - -/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ +/* ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, - " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); - seq_printf(seq, " %8X %8X %8X %16X %16X\n", - rates.cps, - rates.inpps, - rates.outpps, - rates.inbps, - rates.outbps); + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n", + kstats.cps, + kstats.inpps, + kstats.outpps, + kstats.inbps, + kstats.outbps); return 0; } @@ -2400,6 +2420,7 @@ static void ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { struct ip_vs_scheduler *sched; + struct ip_vs_kstats kstats; sched = rcu_dereference_protected(src->scheduler, 1); dst->protocol = src->protocol; @@ -2411,7 +2432,8 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) dst->timeout = src->timeout / HZ; dst->netmask = src->netmask; dst->num_dests = src->num_dests; - ip_vs_copy_stats(&dst->stats, &src->stats); + ip_vs_copy_stats(&kstats, &src->stats); + ip_vs_export_stats_user(&dst->stats, &kstats); } static inline int @@ -2485,6 +2507,7 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, int count = 0; struct ip_vs_dest *dest; struct ip_vs_dest_entry entry; + struct ip_vs_kstats kstats; memset(&entry, 0, sizeof(entry)); list_for_each_entry(dest, &svc->destinations, n_list) { @@ -2506,7 +2529,8 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, entry.activeconns = atomic_read(&dest->activeconns); entry.inactconns = atomic_read(&dest->inactconns); entry.persistconns = atomic_read(&dest->persistconns); - ip_vs_copy_stats(&entry.stats, &dest->stats); + ip_vs_copy_stats(&kstats, &dest->stats); + ip_vs_export_stats_user(&entry.stats, &kstats); if (copy_to_user(&uptr->entrytable[count], &entry, sizeof(entry))) { ret = -EFAULT; @@ -2798,25 +2822,51 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, - struct ip_vs_stats *stats) + struct ip_vs_kstats *kstats) { - struct ip_vs_stats_user ustats; struct nlattr *nl_stats = nla_nest_start(skb, container_type); + if (!nl_stats) return -EMSGSIZE; - ip_vs_copy_stats(&ustats, stats); + if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) || + nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) || + nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) || + nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) || + nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) || + nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) || + nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps)) + goto nla_put_failure; + nla_nest_end(skb, nl_stats); - if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) || - nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) || - nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) || - nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) || - nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) || - nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) || - nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) || - nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) || - nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) || - nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps)) + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_stats); + return -EMSGSIZE; +} + +static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type, + struct ip_vs_kstats *kstats) +{ + struct nlattr *nl_stats = nla_nest_start(skb, container_type); + + if (!nl_stats) + return -EMSGSIZE; + + if (nla_put_u64(skb, IPVS_STATS_ATTR_CONNS, kstats->conns) || + nla_put_u64(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts) || + nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) || + nla_put_u64(skb, IPVS_STATS_ATTR_CPS, kstats->cps) || + nla_put_u64(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps) || + nla_put_u64(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps) || + nla_put_u64(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps)) goto nla_put_failure; nla_nest_end(skb, nl_stats); @@ -2835,6 +2885,7 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, struct nlattr *nl_service; struct ip_vs_flags flags = { .flags = svc->flags, .mask = ~0 }; + struct ip_vs_kstats kstats; nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); if (!nl_service) @@ -2860,7 +2911,10 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) goto nla_put_failure; - if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) + ip_vs_copy_stats(&kstats, &svc->stats); + if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats)) + goto nla_put_failure; + if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats)) goto nla_put_failure; nla_nest_end(skb, nl_service); @@ -3032,6 +3086,7 @@ static struct ip_vs_service *ip_vs_genl_find_service(struct net *net, static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) { struct nlattr *nl_dest; + struct ip_vs_kstats kstats; nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST); if (!nl_dest) @@ -3054,7 +3109,10 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) atomic_read(&dest->persistconns)) || nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af)) goto nla_put_failure; - if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) + ip_vs_copy_stats(&kstats, &dest->stats); + if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats)) + goto nla_put_failure; + if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats)) goto nla_put_failure; nla_nest_end(skb, nl_dest); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 1425e9a924c4..ef0eb0a8d552 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -45,17 +45,19 @@ NOTES. - * The stored value for average bps is scaled by 2^5, so that maximal - rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. + * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10. - * A lot code is taken from net/sched/estimator.c + * Netlink users can see 64-bit values but sockopt users are restricted + to 32-bit values for conns, packets, bps, cps and pps. + + * A lot of code is taken from net/core/gen_estimator.c */ /* * Make a summary from each cpu */ -static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, +static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum, struct ip_vs_cpu_stats __percpu *stats) { int i; @@ -64,27 +66,31 @@ static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, for_each_possible_cpu(i) { struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); unsigned int start; - __u64 inbytes, outbytes; + u64 conns, inpkts, outpkts, inbytes, outbytes; + if (add) { - sum->conns += s->ustats.conns; - sum->inpkts += s->ustats.inpkts; - sum->outpkts += s->ustats.outpkts; do { start = u64_stats_fetch_begin(&s->syncp); - inbytes = s->ustats.inbytes; - outbytes = s->ustats.outbytes; + conns = s->cnt.conns; + inpkts = s->cnt.inpkts; + outpkts = s->cnt.outpkts; + inbytes = s->cnt.inbytes; + outbytes = s->cnt.outbytes; } while (u64_stats_fetch_retry(&s->syncp, start)); + sum->conns += conns; + sum->inpkts += inpkts; + sum->outpkts += outpkts; sum->inbytes += inbytes; sum->outbytes += outbytes; } else { add = true; - sum->conns = s->ustats.conns; - sum->inpkts = s->ustats.inpkts; - sum->outpkts = s->ustats.outpkts; do { start = u64_stats_fetch_begin(&s->syncp); - sum->inbytes = s->ustats.inbytes; - sum->outbytes = s->ustats.outbytes; + sum->conns = s->cnt.conns; + sum->inpkts = s->cnt.inpkts; + sum->outpkts = s->cnt.outpkts; + sum->inbytes = s->cnt.inbytes; + sum->outbytes = s->cnt.outbytes; } while (u64_stats_fetch_retry(&s->syncp, start)); } } @@ -95,10 +101,7 @@ static void estimation_timer(unsigned long arg) { struct ip_vs_estimator *e; struct ip_vs_stats *s; - u32 n_conns; - u32 n_inpkts, n_outpkts; - u64 n_inbytes, n_outbytes; - u32 rate; + u64 rate; struct net *net = (struct net *)arg; struct netns_ipvs *ipvs; @@ -108,33 +111,29 @@ static void estimation_timer(unsigned long arg) s = container_of(e, struct ip_vs_stats, est); spin_lock(&s->lock); - ip_vs_read_cpu_stats(&s->ustats, s->cpustats); - n_conns = s->ustats.conns; - n_inpkts = s->ustats.inpkts; - n_outpkts = s->ustats.outpkts; - n_inbytes = s->ustats.inbytes; - n_outbytes = s->ustats.outbytes; + ip_vs_read_cpu_stats(&s->kstats, s->cpustats); /* scaled by 2^10, but divided 2 seconds */ - rate = (n_conns - e->last_conns) << 9; - e->last_conns = n_conns; - e->cps += ((long)rate - (long)e->cps) >> 2; + rate = (s->kstats.conns - e->last_conns) << 9; + e->last_conns = s->kstats.conns; + e->cps += ((s64)rate - (s64)e->cps) >> 2; - rate = (n_inpkts - e->last_inpkts) << 9; - e->last_inpkts = n_inpkts; - e->inpps += ((long)rate - (long)e->inpps) >> 2; + rate = (s->kstats.inpkts - e->last_inpkts) << 9; + e->last_inpkts = s->kstats.inpkts; + e->inpps += ((s64)rate - (s64)e->inpps) >> 2; - rate = (n_outpkts - e->last_outpkts) << 9; - e->last_outpkts = n_outpkts; - e->outpps += ((long)rate - (long)e->outpps) >> 2; + rate = (s->kstats.outpkts - e->last_outpkts) << 9; + e->last_outpkts = s->kstats.outpkts; + e->outpps += ((s64)rate - (s64)e->outpps) >> 2; - rate = (n_inbytes - e->last_inbytes) << 4; - e->last_inbytes = n_inbytes; - e->inbps += ((long)rate - (long)e->inbps) >> 2; + /* scaled by 2^5, but divided 2 seconds */ + rate = (s->kstats.inbytes - e->last_inbytes) << 4; + e->last_inbytes = s->kstats.inbytes; + e->inbps += ((s64)rate - (s64)e->inbps) >> 2; - rate = (n_outbytes - e->last_outbytes) << 4; - e->last_outbytes = n_outbytes; - e->outbps += ((long)rate - (long)e->outbps) >> 2; + rate = (s->kstats.outbytes - e->last_outbytes) << 4; + e->last_outbytes = s->kstats.outbytes; + e->outbps += ((s64)rate - (s64)e->outbps) >> 2; spin_unlock(&s->lock); } spin_unlock(&ipvs->est_lock); @@ -166,14 +165,14 @@ void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats) void ip_vs_zero_estimator(struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; - struct ip_vs_stats_user *u = &stats->ustats; + struct ip_vs_kstats *k = &stats->kstats; /* reset counters, caller must hold the stats->lock lock */ - est->last_inbytes = u->inbytes; - est->last_outbytes = u->outbytes; - est->last_conns = u->conns; - est->last_inpkts = u->inpkts; - est->last_outpkts = u->outpkts; + est->last_inbytes = k->inbytes; + est->last_outbytes = k->outbytes; + est->last_conns = k->conns; + est->last_inpkts = k->inpkts; + est->last_outpkts = k->outpkts; est->cps = 0; est->inpps = 0; est->outpps = 0; @@ -182,8 +181,7 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats) } /* Get decoded rates */ -void ip_vs_read_estimator(struct ip_vs_stats_user *dst, - struct ip_vs_stats *stats) +void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats) { struct ip_vs_estimator *e = &stats->est; From 5cca4ace0fac4474edb02120bf1284ed6eb059aa Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 21 Jan 2015 10:53:50 +1100 Subject: [PATCH 2/5] netfilter: Don't hide NETFILTER_XT_MATCH_ADDRTYPE behind NETFILTER_ADVANCED Docker needs NETFILTER_XT_MATCH_ADDRTYPE, so move it out from behind NETFILTER_ADVANCED and make it default to a module. Signed-off-by: Anton Blanchard Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index b02660fa9eb0..c68c3b441381 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -951,7 +951,7 @@ comment "Xtables matches" config NETFILTER_XT_MATCH_ADDRTYPE tristate '"addrtype" address type match support' - depends on NETFILTER_ADVANCED + default m if NETFILTER_ADVANCED=n ---help--- This option allows you to match what routing thinks of an address, eg. UNICAST, LOCAL, BROADCAST, ... From 7f73b9f1ca7334eec0ff9a40e37ece92dd3e420f Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 11 Feb 2015 20:33:05 +0800 Subject: [PATCH 3/5] netfilter: ipset: fix boolreturn.cocci warnings net/netfilter/xt_set.c:196:9-10: WARNING: return of 0/1 in function 'set_match_v3' with return type bool net/netfilter/xt_set.c:242:9-10: WARNING: return of 0/1 in function 'set_match_v4' with return type bool Return statements in functions returning bool should use true/false instead of 1/0. Generated by: scripts/coccinelle/misc/boolreturn.cocci CC: Jozsef Kadlecsik Signed-off-by: Fengguang Wu Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_set.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 0d47afea9682..89045982ec94 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -193,7 +193,7 @@ set_match_v3(const struct sk_buff *skb, struct xt_action_param *par) return ret; if (!match_counter0(opt.ext.packets, &info->packets)) - return 0; + return false; return match_counter0(opt.ext.bytes, &info->bytes); } @@ -239,7 +239,7 @@ set_match_v4(const struct sk_buff *skb, struct xt_action_param *par) return ret; if (!match_counter(opt.ext.packets, &info->packets)) - return 0; + return false; return match_counter(opt.ext.bytes, &info->bytes); } From d752c364571743d696c2a54a449ce77550c35ac5 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 23 Feb 2015 15:02:34 -0300 Subject: [PATCH 4/5] ipvs: allow rescheduling of new connections when port reuse is detected Currently, when TCP/SCTP port reusing happens, IPVS will find the old entry and use it for the new one, behaving like a forced persistence. But if you consider a cluster with a heavy load of small connections, such reuse will happen often and may lead to a not optimal load balancing and might prevent a new node from getting a fair load. This patch introduces a new sysctl, conn_reuse_mode, that allows controlling how to proceed when port reuse is detected. The default value will allow rescheduling of new connections only if the old entry was in TIME_WAIT state for TCP or CLOSED for SCTP. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- Documentation/networking/ipvs-sysctl.txt | 21 +++++++++++++++ include/net/ip_vs.h | 11 ++++++++ net/netfilter/ipvs/ip_vs_core.c | 33 +++++++++++++++++++++--- net/netfilter/ipvs/ip_vs_ctl.c | 8 ++++++ net/netfilter/ipvs/ip_vs_sync.c | 21 +++++++++++++-- 5 files changed, 88 insertions(+), 6 deletions(-) diff --git a/Documentation/networking/ipvs-sysctl.txt b/Documentation/networking/ipvs-sysctl.txt index 7a3c04729591..3ba709531adb 100644 --- a/Documentation/networking/ipvs-sysctl.txt +++ b/Documentation/networking/ipvs-sysctl.txt @@ -22,6 +22,27 @@ backup_only - BOOLEAN If set, disable the director function while the server is in backup mode to avoid packet loops for DR/TUN methods. +conn_reuse_mode - INTEGER + 1 - default + + Controls how ipvs will deal with connections that are detected + port reuse. It is a bitmap, with the values being: + + 0: disable any special handling on port reuse. The new + connection will be delivered to the same real server that was + servicing the previous connection. This will effectively + disable expire_nodest_conn. + + bit 1: enable rescheduling of new connections when it is safe. + That is, whenever expire_nodest_conn and for TCP sockets, when + the connection is in TIME_WAIT state (which is only possible if + you use NAT mode). + + bit 2: it is bit 1 plus, for TCP connections, when connections + are in FIN_WAIT state, as this is the last state seen by load + balancer in Direct Routing mode. This bit helps on adding new + real servers to a very busy cluster. + conntrack - BOOLEAN 0 - disabled (default) not 0 - enabled diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index a627fe690c19..20fd23398537 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -941,6 +941,7 @@ struct netns_ipvs { int sysctl_nat_icmp_send; int sysctl_pmtu_disc; int sysctl_backup_only; + int sysctl_conn_reuse_mode; /* ip_vs_lblc */ int sysctl_lblc_expiration; @@ -1059,6 +1060,11 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs) ipvs->sysctl_backup_only; } +static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_conn_reuse_mode; +} + #else static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) @@ -1126,6 +1132,11 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs) return 0; } +static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs) +{ + return 1; +} + #endif /* IPVS core functions diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index c9470c86308f..6103ab933c5b 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1042,6 +1042,26 @@ static inline bool is_new_conn(const struct sk_buff *skb, } } +static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, + int conn_reuse_mode) +{ + /* Controlled (FTP DATA or persistence)? */ + if (cp->control) + return false; + + switch (cp->protocol) { + case IPPROTO_TCP: + return (cp->state == IP_VS_TCP_S_TIME_WAIT) || + ((conn_reuse_mode & 2) && + (cp->state == IP_VS_TCP_S_FIN_WAIT) && + (cp->flags & IP_VS_CONN_F_NOOUTPUT)); + case IPPROTO_SCTP: + return cp->state == IP_VS_SCTP_S_CLOSED; + default: + return false; + } +} + /* Handle response packets: rewrite addresses and send away... */ static unsigned int @@ -1580,6 +1600,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) struct ip_vs_conn *cp; int ret, pkts; struct netns_ipvs *ipvs; + int conn_reuse_mode; /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) @@ -1648,10 +1669,14 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) */ cp = pp->conn_in_get(af, skb, &iph, 0); - if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest && - unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs && - is_new_conn(skb, &iph)) { - ip_vs_conn_expire_now(cp); + conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); + if (conn_reuse_mode && !iph.fragoffs && + is_new_conn(skb, &iph) && cp && + ((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && + unlikely(!atomic_read(&cp->dest->weight))) || + unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) { + if (!atomic_read(&cp->n_control)) + ip_vs_conn_expire_now(cp); __ip_vs_conn_put(cp); cp = NULL; } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 6fd60059faf0..76cc9ffd87fa 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1823,6 +1823,12 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "conn_reuse_mode", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ -3790,6 +3796,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) ipvs->sysctl_pmtu_disc = 1; tbl[idx++].data = &ipvs->sysctl_pmtu_disc; tbl[idx++].data = &ipvs->sysctl_backup_only; + ipvs->sysctl_conn_reuse_mode = 1; + tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index c47ffd7a0a70..f96229cdb6e1 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -845,10 +845,27 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, struct ip_vs_conn *cp; struct netns_ipvs *ipvs = net_ipvs(net); - if (!(flags & IP_VS_CONN_F_TEMPLATE)) + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { cp = ip_vs_conn_in_get(param); - else + if (cp && ((cp->dport != dport) || + !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) { + if (!(flags & IP_VS_CONN_F_INACTIVE)) { + ip_vs_conn_expire_now(cp); + __ip_vs_conn_put(cp); + cp = NULL; + } else { + /* This is the expiration message for the + * connection that was already replaced, so we + * just ignore it. + */ + __ip_vs_conn_put(cp); + kfree(param->pe_data); + return; + } + } + } else { cp = ip_vs_ct_in_get(param); + } if (cp) { /* Free pe_data */ From 5f15893943bfdc804e8703c5aa2c8dd8bf7ddf3f Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Mon, 16 Feb 2015 11:32:28 +0100 Subject: [PATCH 5/5] netfilter: nft_compat: add support for arptables extensions This patch adds support to arptables extensions from nft_compat. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index c598f74063a1..a990df2f3f71 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -20,6 +20,7 @@ #include #include #include +#include #include static int nft_compat_chain_validate_dependency(const char *tablename, @@ -42,6 +43,7 @@ union nft_entry { struct ipt_entry e4; struct ip6t_entry e6; struct ebt_entry ebt; + struct arpt_entry arp; }; static inline void @@ -140,6 +142,8 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, entry->ebt.ethproto = proto; entry->ebt.invflags = inv ? EBT_IPROTO : 0; break; + case NFPROTO_ARP: + break; } par->entryinfo = entry; par->target = target; @@ -351,6 +355,8 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, entry->ebt.ethproto = proto; entry->ebt.invflags = inv ? EBT_IPROTO : 0; break; + case NFPROTO_ARP: + break; } par->entryinfo = entry; par->match = match; @@ -537,6 +543,9 @@ nfnl_compat_get(struct sock *nfnl, struct sk_buff *skb, case NFPROTO_BRIDGE: fmt = "ebt_%s"; break; + case NFPROTO_ARP: + fmt = "arpt_%s"; + break; default: pr_err("nft_compat: unsupported protocol %d\n", nfmsg->nfgen_family);