Merge branch 'udp_hash'
David Held says:
====================
udp: Fix multicast performance issues.
Fix performance issues when listening on many different multicast
sockets bound to different addresses but the same port. Instead of always
using hash1, fall back to a hash2 lookup when the hash1 chain is long.
Patch 1 is a general cleanup and simplification which also makes the
main implementation in Patch 2 simpler.
Eric's recent change 63c6f81cdd
avoided this being an issue in early
demux. This change makes it work for regular delivery as well.
v1->v2
- updated hash collision detection
v2->v3
- avoid flushing under lock unnecessarily at ARRAY_SIZE boundary
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
@@ -660,6 +660,20 @@ static inline void sk_add_bind_node(struct sock *sk,
|
|||||||
#define sk_for_each_bound(__sk, list) \
|
#define sk_for_each_bound(__sk, list) \
|
||||||
hlist_for_each_entry(__sk, list, sk_bind_node)
|
hlist_for_each_entry(__sk, list, sk_bind_node)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* sk_nulls_for_each_entry_offset - iterate over a list at a given struct offset
|
||||||
|
* @tpos: the type * to use as a loop cursor.
|
||||||
|
* @pos: the &struct hlist_node to use as a loop cursor.
|
||||||
|
* @head: the head for your list.
|
||||||
|
* @offset: offset of hlist_node within the struct.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
#define sk_nulls_for_each_entry_offset(tpos, pos, head, offset) \
|
||||||
|
for (pos = (head)->first; \
|
||||||
|
(!is_a_nulls(pos)) && \
|
||||||
|
({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;}); \
|
||||||
|
pos = pos->next)
|
||||||
|
|
||||||
static inline struct user_namespace *sk_user_ns(struct sock *sk)
|
static inline struct user_namespace *sk_user_ns(struct sock *sk)
|
||||||
{
|
{
|
||||||
/* Careful only use this in a context where these parameters
|
/* Careful only use this in a context where these parameters
|
||||||
|
@@ -594,26 +594,6 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
|
|
||||||
__be16 loc_port, __be32 loc_addr,
|
|
||||||
__be16 rmt_port, __be32 rmt_addr,
|
|
||||||
int dif)
|
|
||||||
{
|
|
||||||
struct hlist_nulls_node *node;
|
|
||||||
unsigned short hnum = ntohs(loc_port);
|
|
||||||
|
|
||||||
sk_nulls_for_each_from(sk, node) {
|
|
||||||
if (__udp_is_mcast_sock(net, sk,
|
|
||||||
loc_port, loc_addr,
|
|
||||||
rmt_port, rmt_addr,
|
|
||||||
dif, hnum))
|
|
||||||
goto found;
|
|
||||||
}
|
|
||||||
sk = NULL;
|
|
||||||
found:
|
|
||||||
return sk;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This routine is called by the ICMP module when it gets some
|
* This routine is called by the ICMP module when it gets some
|
||||||
* sort of error condition. If err < 0 then the socket should
|
* sort of error condition. If err < 0 then the socket should
|
||||||
@@ -1639,6 +1619,8 @@ static void flush_stack(struct sock **stack, unsigned int count,
|
|||||||
|
|
||||||
if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
|
if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
|
||||||
skb1 = NULL;
|
skb1 = NULL;
|
||||||
|
|
||||||
|
sock_put(sk);
|
||||||
}
|
}
|
||||||
if (unlikely(skb1))
|
if (unlikely(skb1))
|
||||||
kfree_skb(skb1);
|
kfree_skb(skb1);
|
||||||
@@ -1667,41 +1649,50 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
|
|||||||
struct udp_table *udptable)
|
struct udp_table *udptable)
|
||||||
{
|
{
|
||||||
struct sock *sk, *stack[256 / sizeof(struct sock *)];
|
struct sock *sk, *stack[256 / sizeof(struct sock *)];
|
||||||
struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
|
struct hlist_nulls_node *node;
|
||||||
int dif;
|
unsigned short hnum = ntohs(uh->dest);
|
||||||
unsigned int i, count = 0;
|
struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
|
||||||
|
int dif = skb->dev->ifindex;
|
||||||
|
unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
|
||||||
|
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
|
||||||
|
|
||||||
|
if (use_hash2) {
|
||||||
|
hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
|
||||||
|
udp_table.mask;
|
||||||
|
hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;
|
||||||
|
start_lookup:
|
||||||
|
hslot = &udp_table.hash2[hash2];
|
||||||
|
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
|
||||||
|
}
|
||||||
|
|
||||||
spin_lock(&hslot->lock);
|
spin_lock(&hslot->lock);
|
||||||
sk = sk_nulls_head(&hslot->head);
|
sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
|
||||||
dif = skb->dev->ifindex;
|
if (__udp_is_mcast_sock(net, sk,
|
||||||
sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
|
uh->dest, daddr,
|
||||||
while (sk) {
|
uh->source, saddr,
|
||||||
stack[count++] = sk;
|
dif, hnum)) {
|
||||||
sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
|
if (unlikely(count == ARRAY_SIZE(stack))) {
|
||||||
daddr, uh->source, saddr, dif);
|
flush_stack(stack, count, skb, ~0);
|
||||||
if (unlikely(count == ARRAY_SIZE(stack))) {
|
count = 0;
|
||||||
if (!sk)
|
}
|
||||||
break;
|
stack[count++] = sk;
|
||||||
flush_stack(stack, count, skb, ~0);
|
sock_hold(sk);
|
||||||
count = 0;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*
|
|
||||||
* before releasing chain lock, we must take a reference on sockets
|
|
||||||
*/
|
|
||||||
for (i = 0; i < count; i++)
|
|
||||||
sock_hold(stack[i]);
|
|
||||||
|
|
||||||
spin_unlock(&hslot->lock);
|
spin_unlock(&hslot->lock);
|
||||||
|
|
||||||
|
/* Also lookup *:port if we are using hash2 and haven't done so yet. */
|
||||||
|
if (use_hash2 && hash2 != hash2_any) {
|
||||||
|
hash2 = hash2_any;
|
||||||
|
goto start_lookup;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* do the slow work with no lock held
|
* do the slow work with no lock held
|
||||||
*/
|
*/
|
||||||
if (count) {
|
if (count) {
|
||||||
flush_stack(stack, count, skb, count - 1);
|
flush_stack(stack, count, skb, count - 1);
|
||||||
|
|
||||||
for (i = 0; i < count; i++)
|
|
||||||
sock_put(stack[i]);
|
|
||||||
} else {
|
} else {
|
||||||
kfree_skb(skb);
|
kfree_skb(skb);
|
||||||
}
|
}
|
||||||
|
114
net/ipv6/udp.c
114
net/ipv6/udp.c
@@ -702,43 +702,26 @@ drop:
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk,
|
static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
|
||||||
__be16 loc_port, const struct in6_addr *loc_addr,
|
__be16 loc_port, const struct in6_addr *loc_addr,
|
||||||
__be16 rmt_port, const struct in6_addr *rmt_addr,
|
__be16 rmt_port, const struct in6_addr *rmt_addr,
|
||||||
int dif)
|
int dif, unsigned short hnum)
|
||||||
{
|
{
|
||||||
struct hlist_nulls_node *node;
|
struct inet_sock *inet = inet_sk(sk);
|
||||||
unsigned short num = ntohs(loc_port);
|
|
||||||
|
|
||||||
sk_nulls_for_each_from(sk, node) {
|
if (!net_eq(sock_net(sk), net))
|
||||||
struct inet_sock *inet = inet_sk(sk);
|
return false;
|
||||||
|
|
||||||
if (!net_eq(sock_net(sk), net))
|
if (udp_sk(sk)->udp_port_hash != hnum ||
|
||||||
continue;
|
sk->sk_family != PF_INET6 ||
|
||||||
|
(inet->inet_dport && inet->inet_dport != rmt_port) ||
|
||||||
if (udp_sk(sk)->udp_port_hash == num &&
|
(!ipv6_addr_any(&sk->sk_v6_daddr) &&
|
||||||
sk->sk_family == PF_INET6) {
|
!ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
|
||||||
if (inet->inet_dport) {
|
(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
|
||||||
if (inet->inet_dport != rmt_port)
|
return false;
|
||||||
continue;
|
if (!inet6_mc_check(sk, loc_addr, rmt_addr))
|
||||||
}
|
return false;
|
||||||
if (!ipv6_addr_any(&sk->sk_v6_daddr) &&
|
return true;
|
||||||
!ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
|
|
||||||
if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (!inet6_mc_check(sk, loc_addr, rmt_addr))
|
|
||||||
continue;
|
|
||||||
return sk;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void flush_stack(struct sock **stack, unsigned int count,
|
static void flush_stack(struct sock **stack, unsigned int count,
|
||||||
@@ -762,6 +745,7 @@ static void flush_stack(struct sock **stack, unsigned int count,
|
|||||||
|
|
||||||
if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0)
|
if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0)
|
||||||
skb1 = NULL;
|
skb1 = NULL;
|
||||||
|
sock_put(sk);
|
||||||
}
|
}
|
||||||
if (unlikely(skb1))
|
if (unlikely(skb1))
|
||||||
kfree_skb(skb1);
|
kfree_skb(skb1);
|
||||||
@@ -787,43 +771,51 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
|
|||||||
{
|
{
|
||||||
struct sock *sk, *stack[256 / sizeof(struct sock *)];
|
struct sock *sk, *stack[256 / sizeof(struct sock *)];
|
||||||
const struct udphdr *uh = udp_hdr(skb);
|
const struct udphdr *uh = udp_hdr(skb);
|
||||||
struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
|
struct hlist_nulls_node *node;
|
||||||
int dif;
|
unsigned short hnum = ntohs(uh->dest);
|
||||||
unsigned int i, count = 0;
|
struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
|
||||||
|
int dif = inet6_iif(skb);
|
||||||
|
unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
|
||||||
|
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
|
||||||
|
|
||||||
|
if (use_hash2) {
|
||||||
|
hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
|
||||||
|
udp_table.mask;
|
||||||
|
hash2 = udp6_portaddr_hash(net, daddr, hnum) & udp_table.mask;
|
||||||
|
start_lookup:
|
||||||
|
hslot = &udp_table.hash2[hash2];
|
||||||
|
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
|
||||||
|
}
|
||||||
|
|
||||||
spin_lock(&hslot->lock);
|
spin_lock(&hslot->lock);
|
||||||
sk = sk_nulls_head(&hslot->head);
|
sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
|
||||||
dif = inet6_iif(skb);
|
if (__udp_v6_is_mcast_sock(net, sk,
|
||||||
sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
|
uh->dest, daddr,
|
||||||
while (sk) {
|
uh->source, saddr,
|
||||||
/* If zero checksum and no_check is not on for
|
dif, hnum) &&
|
||||||
* the socket then skip it.
|
/* If zero checksum and no_check is not on for
|
||||||
*/
|
* the socket then skip it.
|
||||||
if (uh->check || udp_sk(sk)->no_check6_rx)
|
*/
|
||||||
|
(uh->check || udp_sk(sk)->no_check6_rx)) {
|
||||||
|
if (unlikely(count == ARRAY_SIZE(stack))) {
|
||||||
|
flush_stack(stack, count, skb, ~0);
|
||||||
|
count = 0;
|
||||||
|
}
|
||||||
stack[count++] = sk;
|
stack[count++] = sk;
|
||||||
|
sock_hold(sk);
|
||||||
sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr,
|
|
||||||
uh->source, saddr, dif);
|
|
||||||
if (unlikely(count == ARRAY_SIZE(stack))) {
|
|
||||||
if (!sk)
|
|
||||||
break;
|
|
||||||
flush_stack(stack, count, skb, ~0);
|
|
||||||
count = 0;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*
|
|
||||||
* before releasing the lock, we must take reference on sockets
|
|
||||||
*/
|
|
||||||
for (i = 0; i < count; i++)
|
|
||||||
sock_hold(stack[i]);
|
|
||||||
|
|
||||||
spin_unlock(&hslot->lock);
|
spin_unlock(&hslot->lock);
|
||||||
|
|
||||||
|
/* Also lookup *:port if we are using hash2 and haven't done so yet. */
|
||||||
|
if (use_hash2 && hash2 != hash2_any) {
|
||||||
|
hash2 = hash2_any;
|
||||||
|
goto start_lookup;
|
||||||
|
}
|
||||||
|
|
||||||
if (count) {
|
if (count) {
|
||||||
flush_stack(stack, count, skb, count - 1);
|
flush_stack(stack, count, skb, count - 1);
|
||||||
|
|
||||||
for (i = 0; i < count; i++)
|
|
||||||
sock_put(stack[i]);
|
|
||||||
} else {
|
} else {
|
||||||
kfree_skb(skb);
|
kfree_skb(skb);
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user