packet: support extensible, 64 bit clean mmaped ring structure
The tpacket_hdr is not 64 bit clean due to use of an unsigned long and can't be extended because the following struct sockaddr_ll needs to be at a fixed offset.

Add support for a version 2 tpacket protocol that removes these limitations. Userspace can query the header size through a new getsockopt option and change the protocol version through a setsockopt option.

The changes needed to switch to the new protocol version are:

1. replace struct tpacket_hdr by struct tpacket2_hdr
2. query header len and save
3. set protocol version to 2
   - set up ring as usual
4. for getting the sockaddr_ll, use (void *)hdr + TPACKET_ALIGN(hdrlen) instead of (void *)hdr + TPACKET_ALIGN(sizeof(struct tpacket_hdr))

Steps 2 and 4 can be omitted if the struct sockaddr_ll isn't needed.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
committed by
David S. Miller
parent
bc1d0411b8
commit
bbd6ef87c5
@@ -43,6 +43,8 @@ struct sockaddr_ll
|
|||||||
#define PACKET_COPY_THRESH 7
|
#define PACKET_COPY_THRESH 7
|
||||||
#define PACKET_AUXDATA 8
|
#define PACKET_AUXDATA 8
|
||||||
#define PACKET_ORIGDEV 9
|
#define PACKET_ORIGDEV 9
|
||||||
|
#define PACKET_VERSION 10
|
||||||
|
#define PACKET_HDRLEN 11
|
||||||
|
|
||||||
struct tpacket_stats
|
struct tpacket_stats
|
||||||
{
|
{
|
||||||
@@ -79,6 +81,25 @@ struct tpacket_hdr
|
|||||||
#define TPACKET_ALIGN(x) (((x)+TPACKET_ALIGNMENT-1)&~(TPACKET_ALIGNMENT-1))
|
#define TPACKET_ALIGN(x) (((x)+TPACKET_ALIGNMENT-1)&~(TPACKET_ALIGNMENT-1))
|
||||||
#define TPACKET_HDRLEN (TPACKET_ALIGN(sizeof(struct tpacket_hdr)) + sizeof(struct sockaddr_ll))
|
#define TPACKET_HDRLEN (TPACKET_ALIGN(sizeof(struct tpacket_hdr)) + sizeof(struct sockaddr_ll))
|
||||||
|
|
||||||
|
struct tpacket2_hdr
|
||||||
|
{
|
||||||
|
__u32 tp_status;
|
||||||
|
__u32 tp_len;
|
||||||
|
__u32 tp_snaplen;
|
||||||
|
__u16 tp_mac;
|
||||||
|
__u16 tp_net;
|
||||||
|
__u32 tp_sec;
|
||||||
|
__u32 tp_nsec;
|
||||||
|
};
|
||||||
|
|
||||||
|
#define TPACKET2_HDRLEN (TPACKET_ALIGN(sizeof(struct tpacket2_hdr)) + sizeof(struct sockaddr_ll))
|
||||||
|
|
||||||
|
enum tpacket_versions
|
||||||
|
{
|
||||||
|
TPACKET_V1,
|
||||||
|
TPACKET_V2,
|
||||||
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Frame structure:
|
Frame structure:
|
||||||
|
|
||||||
|
@@ -186,6 +186,8 @@ struct packet_sock {
|
|||||||
unsigned int pg_vec_order;
|
unsigned int pg_vec_order;
|
||||||
unsigned int pg_vec_pages;
|
unsigned int pg_vec_pages;
|
||||||
unsigned int pg_vec_len;
|
unsigned int pg_vec_len;
|
||||||
|
enum tpacket_versions tp_version;
|
||||||
|
unsigned int tp_hdrlen;
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -201,14 +203,52 @@ struct packet_skb_cb {
|
|||||||
|
|
||||||
#ifdef CONFIG_PACKET_MMAP
|
#ifdef CONFIG_PACKET_MMAP
|
||||||
|
|
||||||
static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position)
|
static void *packet_lookup_frame(struct packet_sock *po, unsigned int position,
|
||||||
|
int status)
|
||||||
{
|
{
|
||||||
unsigned int pg_vec_pos, frame_offset;
|
unsigned int pg_vec_pos, frame_offset;
|
||||||
|
union {
|
||||||
|
struct tpacket_hdr *h1;
|
||||||
|
struct tpacket2_hdr *h2;
|
||||||
|
void *raw;
|
||||||
|
} h;
|
||||||
|
|
||||||
pg_vec_pos = position / po->frames_per_block;
|
pg_vec_pos = position / po->frames_per_block;
|
||||||
frame_offset = position % po->frames_per_block;
|
frame_offset = position % po->frames_per_block;
|
||||||
|
|
||||||
return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size));
|
h.raw = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
|
||||||
|
switch (po->tp_version) {
|
||||||
|
case TPACKET_V1:
|
||||||
|
if (status != h.h1->tp_status ? TP_STATUS_USER :
|
||||||
|
TP_STATUS_KERNEL)
|
||||||
|
return NULL;
|
||||||
|
break;
|
||||||
|
case TPACKET_V2:
|
||||||
|
if (status != h.h2->tp_status ? TP_STATUS_USER :
|
||||||
|
TP_STATUS_KERNEL)
|
||||||
|
return NULL;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return h.raw;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
|
||||||
|
{
|
||||||
|
union {
|
||||||
|
struct tpacket_hdr *h1;
|
||||||
|
struct tpacket2_hdr *h2;
|
||||||
|
void *raw;
|
||||||
|
} h;
|
||||||
|
|
||||||
|
h.raw = frame;
|
||||||
|
switch (po->tp_version) {
|
||||||
|
case TPACKET_V1:
|
||||||
|
h.h1->tp_status = status;
|
||||||
|
break;
|
||||||
|
case TPACKET_V2:
|
||||||
|
h.h2->tp_status = status;
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -551,14 +591,19 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
|
|||||||
struct sock *sk;
|
struct sock *sk;
|
||||||
struct packet_sock *po;
|
struct packet_sock *po;
|
||||||
struct sockaddr_ll *sll;
|
struct sockaddr_ll *sll;
|
||||||
struct tpacket_hdr *h;
|
union {
|
||||||
|
struct tpacket_hdr *h1;
|
||||||
|
struct tpacket2_hdr *h2;
|
||||||
|
void *raw;
|
||||||
|
} h;
|
||||||
u8 * skb_head = skb->data;
|
u8 * skb_head = skb->data;
|
||||||
int skb_len = skb->len;
|
int skb_len = skb->len;
|
||||||
unsigned int snaplen, res;
|
unsigned int snaplen, res;
|
||||||
unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
|
unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
|
||||||
unsigned short macoff, netoff;
|
unsigned short macoff, netoff, hdrlen;
|
||||||
struct sk_buff *copy_skb = NULL;
|
struct sk_buff *copy_skb = NULL;
|
||||||
struct timeval tv;
|
struct timeval tv;
|
||||||
|
struct timespec ts;
|
||||||
|
|
||||||
if (skb->pkt_type == PACKET_LOOPBACK)
|
if (skb->pkt_type == PACKET_LOOPBACK)
|
||||||
goto drop;
|
goto drop;
|
||||||
@@ -590,10 +635,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
|
|||||||
snaplen = res;
|
snaplen = res;
|
||||||
|
|
||||||
if (sk->sk_type == SOCK_DGRAM) {
|
if (sk->sk_type == SOCK_DGRAM) {
|
||||||
macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
|
macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16;
|
||||||
} else {
|
} else {
|
||||||
unsigned maclen = skb_network_offset(skb);
|
unsigned maclen = skb_network_offset(skb);
|
||||||
netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
|
netoff = TPACKET_ALIGN(po->tp_hdrlen +
|
||||||
|
(maclen < 16 ? 16 : maclen));
|
||||||
macoff = netoff - maclen;
|
macoff = netoff - maclen;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -616,9 +662,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
|
|||||||
}
|
}
|
||||||
|
|
||||||
spin_lock(&sk->sk_receive_queue.lock);
|
spin_lock(&sk->sk_receive_queue.lock);
|
||||||
h = packet_lookup_frame(po, po->head);
|
h.raw = packet_lookup_frame(po, po->head, TP_STATUS_KERNEL);
|
||||||
|
if (!h.raw)
|
||||||
if (h->tp_status)
|
|
||||||
goto ring_is_full;
|
goto ring_is_full;
|
||||||
po->head = po->head != po->frame_max ? po->head+1 : 0;
|
po->head = po->head != po->frame_max ? po->head+1 : 0;
|
||||||
po->stats.tp_packets++;
|
po->stats.tp_packets++;
|
||||||
@@ -630,20 +675,40 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
|
|||||||
status &= ~TP_STATUS_LOSING;
|
status &= ~TP_STATUS_LOSING;
|
||||||
spin_unlock(&sk->sk_receive_queue.lock);
|
spin_unlock(&sk->sk_receive_queue.lock);
|
||||||
|
|
||||||
skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);
|
skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
|
||||||
|
|
||||||
h->tp_len = skb->len;
|
switch (po->tp_version) {
|
||||||
h->tp_snaplen = snaplen;
|
case TPACKET_V1:
|
||||||
h->tp_mac = macoff;
|
h.h1->tp_len = skb->len;
|
||||||
h->tp_net = netoff;
|
h.h1->tp_snaplen = snaplen;
|
||||||
|
h.h1->tp_mac = macoff;
|
||||||
|
h.h1->tp_net = netoff;
|
||||||
if (skb->tstamp.tv64)
|
if (skb->tstamp.tv64)
|
||||||
tv = ktime_to_timeval(skb->tstamp);
|
tv = ktime_to_timeval(skb->tstamp);
|
||||||
else
|
else
|
||||||
do_gettimeofday(&tv);
|
do_gettimeofday(&tv);
|
||||||
h->tp_sec = tv.tv_sec;
|
h.h1->tp_sec = tv.tv_sec;
|
||||||
h->tp_usec = tv.tv_usec;
|
h.h1->tp_usec = tv.tv_usec;
|
||||||
|
hdrlen = sizeof(*h.h1);
|
||||||
|
break;
|
||||||
|
case TPACKET_V2:
|
||||||
|
h.h2->tp_len = skb->len;
|
||||||
|
h.h2->tp_snaplen = snaplen;
|
||||||
|
h.h2->tp_mac = macoff;
|
||||||
|
h.h2->tp_net = netoff;
|
||||||
|
if (skb->tstamp.tv64)
|
||||||
|
ts = ktime_to_timespec(skb->tstamp);
|
||||||
|
else
|
||||||
|
getnstimeofday(&ts);
|
||||||
|
h.h2->tp_sec = ts.tv_sec;
|
||||||
|
h.h2->tp_nsec = ts.tv_nsec;
|
||||||
|
hdrlen = sizeof(*h.h2);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
BUG();
|
||||||
|
}
|
||||||
|
|
||||||
sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
|
sll = h.raw + TPACKET_ALIGN(hdrlen);
|
||||||
sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
|
sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
|
||||||
sll->sll_family = AF_PACKET;
|
sll->sll_family = AF_PACKET;
|
||||||
sll->sll_hatype = dev->type;
|
sll->sll_hatype = dev->type;
|
||||||
@@ -654,14 +719,14 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
|
|||||||
else
|
else
|
||||||
sll->sll_ifindex = dev->ifindex;
|
sll->sll_ifindex = dev->ifindex;
|
||||||
|
|
||||||
h->tp_status = status;
|
__packet_set_status(po, h.raw, status);
|
||||||
smp_mb();
|
smp_mb();
|
||||||
|
|
||||||
{
|
{
|
||||||
struct page *p_start, *p_end;
|
struct page *p_start, *p_end;
|
||||||
u8 *h_end = (u8 *)h + macoff + snaplen - 1;
|
u8 *h_end = h.raw + macoff + snaplen - 1;
|
||||||
|
|
||||||
p_start = virt_to_page(h);
|
p_start = virt_to_page(h.raw);
|
||||||
p_end = virt_to_page(h_end);
|
p_end = virt_to_page(h_end);
|
||||||
while (p_start <= p_end) {
|
while (p_start <= p_end) {
|
||||||
flush_dcache_page(p_start);
|
flush_dcache_page(p_start);
|
||||||
@@ -1362,6 +1427,25 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
|
|||||||
pkt_sk(sk)->copy_thresh = val;
|
pkt_sk(sk)->copy_thresh = val;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
case PACKET_VERSION:
|
||||||
|
{
|
||||||
|
int val;
|
||||||
|
|
||||||
|
if (optlen != sizeof(val))
|
||||||
|
return -EINVAL;
|
||||||
|
if (po->pg_vec)
|
||||||
|
return -EBUSY;
|
||||||
|
if (copy_from_user(&val, optval, sizeof(val)))
|
||||||
|
return -EFAULT;
|
||||||
|
switch (val) {
|
||||||
|
case TPACKET_V1:
|
||||||
|
case TPACKET_V2:
|
||||||
|
po->tp_version = val;
|
||||||
|
return 0;
|
||||||
|
default:
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
case PACKET_AUXDATA:
|
case PACKET_AUXDATA:
|
||||||
{
|
{
|
||||||
@@ -1437,6 +1521,31 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
|
|||||||
|
|
||||||
data = &val;
|
data = &val;
|
||||||
break;
|
break;
|
||||||
|
#ifdef CONFIG_PACKET_MMAP
|
||||||
|
case PACKET_VERSION:
|
||||||
|
if (len > sizeof(int))
|
||||||
|
len = sizeof(int);
|
||||||
|
val = po->tp_version;
|
||||||
|
data = &val;
|
||||||
|
break;
|
||||||
|
case PACKET_HDRLEN:
|
||||||
|
if (len > sizeof(int))
|
||||||
|
len = sizeof(int);
|
||||||
|
if (copy_from_user(&val, optval, len))
|
||||||
|
return -EFAULT;
|
||||||
|
switch (val) {
|
||||||
|
case TPACKET_V1:
|
||||||
|
val = sizeof(struct tpacket_hdr);
|
||||||
|
break;
|
||||||
|
case TPACKET_V2:
|
||||||
|
val = sizeof(struct tpacket2_hdr);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
data = &val;
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
default:
|
default:
|
||||||
return -ENOPROTOOPT;
|
return -ENOPROTOOPT;
|
||||||
}
|
}
|
||||||
@@ -1570,11 +1679,8 @@ static unsigned int packet_poll(struct file * file, struct socket *sock,
|
|||||||
spin_lock_bh(&sk->sk_receive_queue.lock);
|
spin_lock_bh(&sk->sk_receive_queue.lock);
|
||||||
if (po->pg_vec) {
|
if (po->pg_vec) {
|
||||||
unsigned last = po->head ? po->head-1 : po->frame_max;
|
unsigned last = po->head ? po->head-1 : po->frame_max;
|
||||||
struct tpacket_hdr *h;
|
|
||||||
|
|
||||||
h = packet_lookup_frame(po, last);
|
if (packet_lookup_frame(po, last, TP_STATUS_USER))
|
||||||
|
|
||||||
if (h->tp_status)
|
|
||||||
mask |= POLLIN | POLLRDNORM;
|
mask |= POLLIN | POLLRDNORM;
|
||||||
}
|
}
|
||||||
spin_unlock_bh(&sk->sk_receive_queue.lock);
|
spin_unlock_bh(&sk->sk_receive_queue.lock);
|
||||||
@@ -1669,11 +1775,20 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
|
|||||||
if (unlikely(po->pg_vec))
|
if (unlikely(po->pg_vec))
|
||||||
return -EBUSY;
|
return -EBUSY;
|
||||||
|
|
||||||
|
switch (po->tp_version) {
|
||||||
|
case TPACKET_V1:
|
||||||
|
po->tp_hdrlen = TPACKET_HDRLEN;
|
||||||
|
break;
|
||||||
|
case TPACKET_V2:
|
||||||
|
po->tp_hdrlen = TPACKET2_HDRLEN;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
if (unlikely((int)req->tp_block_size <= 0))
|
if (unlikely((int)req->tp_block_size <= 0))
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
|
if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
|
if (unlikely(req->tp_frame_size < po->tp_hdrlen))
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
|
if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
@@ -1692,13 +1807,11 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
|
|||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
for (i = 0; i < req->tp_block_nr; i++) {
|
for (i = 0; i < req->tp_block_nr; i++) {
|
||||||
char *ptr = pg_vec[i];
|
void *ptr = pg_vec[i];
|
||||||
struct tpacket_hdr *header;
|
|
||||||
int k;
|
int k;
|
||||||
|
|
||||||
for (k = 0; k < po->frames_per_block; k++) {
|
for (k = 0; k < po->frames_per_block; k++) {
|
||||||
header = (struct tpacket_hdr *) ptr;
|
__packet_set_status(po, ptr, TP_STATUS_KERNEL);
|
||||||
header->tp_status = TP_STATUS_KERNEL;
|
|
||||||
ptr += req->tp_frame_size;
|
ptr += req->tp_frame_size;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user