From 5cf3d46192fccf68b4a4759e4d7346e41c669a76 Mon Sep 17 00:00:00 2001 From: David Held Date: Tue, 15 Jul 2014 23:28:31 -0400 Subject: [PATCH 1/2] udp: Simplify __udp*_lib_mcast_deliver. Switch to using sk_nulls_for_each which shortens the code and makes it easier to update. Signed-off-by: David Held Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 48 ++++++++-------------------- net/ipv6/udp.c | 86 ++++++++++++++++++++------------------------------ 2 files changed, 48 insertions(+), 86 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 668af516f094..bbcc33737ef1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -594,26 +594,6 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, return true; } -static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk, - __be16 loc_port, __be32 loc_addr, - __be16 rmt_port, __be32 rmt_addr, - int dif) -{ - struct hlist_nulls_node *node; - unsigned short hnum = ntohs(loc_port); - - sk_nulls_for_each_from(sk, node) { - if (__udp_is_mcast_sock(net, sk, - loc_port, loc_addr, - rmt_port, rmt_addr, - dif, hnum)) - goto found; - } - sk = NULL; -found: - return sk; -} - /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -1667,23 +1647,23 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udp_table *udptable) { struct sock *sk, *stack[256 / sizeof(struct sock *)]; - struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); - int dif; + struct hlist_nulls_node *node; + unsigned short hnum = ntohs(uh->dest); + struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); + int dif = skb->dev->ifindex; unsigned int i, count = 0; spin_lock(&hslot->lock); - sk = sk_nulls_head(&hslot->head); - dif = skb->dev->ifindex; - sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); - while (sk) { - stack[count++] = sk; - sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, - daddr, uh->source, saddr, dif); - if (unlikely(count == ARRAY_SIZE(stack))) { - if (!sk) - break; - flush_stack(stack, count, skb, ~0); - count = 0; + sk_nulls_for_each(sk, node, &hslot->head) { + if (__udp_is_mcast_sock(net, sk, + uh->dest, daddr, + uh->source, saddr, + dif, hnum)) { + if (unlikely(count == ARRAY_SIZE(stack))) { + flush_stack(stack, count, skb, ~0); + count = 0; + } + stack[count++] = sk; } } /* diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b4481df3d5fa..7d3bd80085be 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -702,43 +702,26 @@ drop: return -1; } -static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk, - __be16 loc_port, const struct in6_addr *loc_addr, - __be16 rmt_port, const struct in6_addr *rmt_addr, - int dif) +static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, + __be16 loc_port, const struct in6_addr *loc_addr, + __be16 rmt_port, const struct in6_addr *rmt_addr, + int dif, unsigned short hnum) { - struct hlist_nulls_node *node; - unsigned short num = ntohs(loc_port); + struct inet_sock *inet = inet_sk(sk); - sk_nulls_for_each_from(sk, node) { - struct inet_sock *inet = inet_sk(sk); + if (!net_eq(sock_net(sk), net)) + return false; - if (!net_eq(sock_net(sk), net)) - continue; - - if (udp_sk(sk)->udp_port_hash == num && - sk->sk_family == PF_INET6) { - if (inet->inet_dport) { - if (inet->inet_dport != rmt_port) - continue; - } - if (!ipv6_addr_any(&sk->sk_v6_daddr) && - !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) - continue; - - if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) - continue; - - if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { - if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)) - continue; - } - if (!inet6_mc_check(sk, loc_addr, rmt_addr)) - continue; - return sk; - } - } - return NULL; + if (udp_sk(sk)->udp_port_hash != hnum || + sk->sk_family != PF_INET6 || + (inet->inet_dport && inet->inet_dport != rmt_port) || + (!ipv6_addr_any(&sk->sk_v6_daddr) && + !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) || + (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + return false; + if (!inet6_mc_check(sk, loc_addr, rmt_addr)) + return false; + return true; } static void flush_stack(struct sock **stack, unsigned int count, @@ -787,28 +770,27 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, { struct sock *sk, *stack[256 / sizeof(struct sock *)]; const struct udphdr *uh = udp_hdr(skb); - struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); - int dif; + struct hlist_nulls_node *node; + unsigned short hnum = ntohs(uh->dest); + struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); + int dif = inet6_iif(skb); unsigned int i, count = 0; spin_lock(&hslot->lock); - sk = sk_nulls_head(&hslot->head); - dif = inet6_iif(skb); - sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); - while (sk) { - /* If zero checksum and no_check is not on for - * the socket then skip it. - */ - if (uh->check || udp_sk(sk)->no_check6_rx) + sk_nulls_for_each(sk, node, &hslot->head) { + if (__udp_v6_is_mcast_sock(net, sk, + uh->dest, daddr, + uh->source, saddr, + dif, hnum) && + /* If zero checksum and no_check is not on for + * the socket then skip it. + */ + (uh->check || udp_sk(sk)->no_check6_rx)) { + if (unlikely(count == ARRAY_SIZE(stack))) { + flush_stack(stack, count, skb, ~0); + count = 0; + } stack[count++] = sk; - - sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr, - uh->source, saddr, dif); - if (unlikely(count == ARRAY_SIZE(stack))) { - if (!sk) - break; - flush_stack(stack, count, skb, ~0); - count = 0; } } /* From 2dc41cff7545d55c6294525c811594576f8e119c Mon Sep 17 00:00:00 2001 From: David Held Date: Tue, 15 Jul 2014 23:28:32 -0400 Subject: [PATCH 2/2] udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver. Many multicast sources can have the same port which can result in a very large list when hashing by port only. Hash by address and port instead if this is the case. This makes multicast more similar to unicast. On a 24-core machine receiving from 500 multicast sockets on the same port, before this patch 80% of system CPU was used up by spin locking and only ~25% of packets were successfully delivered. With this patch, all packets are delivered and kernel overhead is ~8% system CPU on spinlocks. Signed-off-by: David Held Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 14 ++++++++++++++ net/ipv4/udp.c | 31 +++++++++++++++++++++---------- net/ipv6/udp.c | 30 ++++++++++++++++++++---------- 3 files changed, 55 insertions(+), 20 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 29e48a6d1ded..28f734601b50 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -660,6 +660,20 @@ static inline void sk_add_bind_node(struct sock *sk, #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) +/** + * sk_nulls_for_each_entry_offset - iterate over a list at a given struct offset + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_node to use as a loop cursor. + * @head: the head for your list. + * @offset: offset of hlist_node within the struct. + * + */ +#define sk_nulls_for_each_entry_offset(tpos, pos, head, offset) \ + for (pos = (head)->first; \ + (!is_a_nulls(pos)) && \ + ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;}); \ + pos = pos->next) + static inline struct user_namespace *sk_user_ns(struct sock *sk) { /* Careful only use this in a context where these parameters diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index bbcc33737ef1..f31053b90ee0 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1619,6 +1619,8 @@ static void flush_stack(struct sock **stack, unsigned int count, if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0) skb1 = NULL; + + sock_put(sk); } if (unlikely(skb1)) kfree_skb(skb1); @@ -1651,10 +1653,20 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, unsigned short hnum = ntohs(uh->dest); struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); int dif = skb->dev->ifindex; - unsigned int i, count = 0; + unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); + unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); + + if (use_hash2) { + hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & + udp_table.mask; + hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask; +start_lookup: + hslot = &udp_table.hash2[hash2]; + offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); + } spin_lock(&hslot->lock); - sk_nulls_for_each(sk, node, &hslot->head) { + sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) { if (__udp_is_mcast_sock(net, sk, uh->dest, daddr, uh->source, saddr, @@ -1664,24 +1676,23 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, count = 0; } stack[count++] = sk; + sock_hold(sk); } } - /* - * before releasing chain lock, we must take a reference on sockets - */ - for (i = 0; i < count; i++) - sock_hold(stack[i]); spin_unlock(&hslot->lock); + /* Also lookup *:port if we are using hash2 and haven't done so yet. */ + if (use_hash2 && hash2 != hash2_any) { + hash2 = hash2_any; + goto start_lookup; + } + /* * do the slow work with no lock held */ if (count) { flush_stack(stack, count, skb, count - 1); - - for (i = 0; i < count; i++) - sock_put(stack[i]); } else { kfree_skb(skb); } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7d3bd80085be..f9d8800bb72f 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -745,6 +745,7 @@ static void flush_stack(struct sock **stack, unsigned int count, if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0) skb1 = NULL; + sock_put(sk); } if (unlikely(skb1)) kfree_skb(skb1); @@ -774,10 +775,20 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, unsigned short hnum = ntohs(uh->dest); struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); int dif = inet6_iif(skb); - unsigned int i, count = 0; + unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); + unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); + + if (use_hash2) { + hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) & + udp_table.mask; + hash2 = udp6_portaddr_hash(net, daddr, hnum) & udp_table.mask; +start_lookup: + hslot = &udp_table.hash2[hash2]; + offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); + } spin_lock(&hslot->lock); - sk_nulls_for_each(sk, node, &hslot->head) { + sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) { if (__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr, uh->source, saddr, @@ -791,21 +802,20 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, count = 0; } stack[count++] = sk; + sock_hold(sk); } } - /* - * before releasing the lock, we must take reference on sockets - */ - for (i = 0; i < count; i++) - sock_hold(stack[i]); spin_unlock(&hslot->lock); + /* Also lookup *:port if we are using hash2 and haven't done so yet. */ + if (use_hash2 && hash2 != hash2_any) { + hash2 = hash2_any; + goto start_lookup; + } + if (count) { flush_stack(stack, count, skb, count - 1); - - for (i = 0; i < count; i++) - sock_put(stack[i]); } else { kfree_skb(skb); }