Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Daniel Borkmann says:

====================
The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Add the ability to use unaligned chunks in the AF_XDP umem. By
   relaxing where the chunks can be placed, it allows to use an
   arbitrary buffer size and place whenever there is a free
   address in the umem. Helps more seamless DPDK AF_XDP driver
   integration. Support for i40e, ixgbe and mlx5e, from Kevin and
   Maxim.

2) Addition of a wakeup flag for AF_XDP tx and fill rings so the
   application can wake up the kernel for rx/tx processing which
   avoids busy-spinning of the latter, useful when app and driver
   is located on the same core. Support for i40e, ixgbe and mlx5e,
   from Magnus and Maxim.

3) bpftool fixes for printf()-like functions so compiler can actually
   enforce checks, bpftool build system improvements for custom output
   directories, and addition of 'bpftool map freeze' command, from Quentin.

4) Support attaching/detaching XDP programs from 'bpftool net' command,
   from Daniel.

5) Automatic xskmap cleanup when AF_XDP socket is released, and several
   barrier/{read,write}_once fixes in AF_XDP code, from Björn.

6) Relicense of bpf_helpers.h/bpf_endian.h for future libbpf
   inclusion as well as libbpf versioning improvements, from Andrii.

7) Several new BPF kselftests for verifier precision tracking, from Alexei.

8) Several BPF kselftest fixes wrt endianess to run on s390x, from Ilya.

9) And more BPF kselftest improvements all over the place, from Stanislav.

10) Add simple BPF map op cache for nfp driver to batch dumps, from Jakub.

11) AF_XDP socket umem mapping improvements for 32bit archs, from Ivan.

12) Add BPF-to-BPF call and BTF line info support for s390x JIT, from Yauheni.

13) Small optimization in arm64 JIT to spare 1 insns for BPF_MOD, from Jerin.

14) Fix an error check in bpf_tcp_gen_syncookie() helper, from Petar.

15) Various minor fixes and cleanups, from Nathan, Masahiro, Masanari,
    Peter, Wei, Yue.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller
2019-09-06 16:49:17 +02:00
124 changed files with 3492 additions and 701 deletions

View File

@@ -9,5 +9,11 @@ void syscall_defines(void)
COMMENT("Linux system call numbers.");
SYSNR(__NR_write);
SYSNR(__NR_read);
#ifdef __NR_mmap2
SYSNR(__NR_mmap2);
#endif
#ifdef __NR_mmap
SYSNR(__NR_mmap);
#endif
}

View File

@@ -68,12 +68,25 @@ PROG(SYS__NR_read)(struct pt_regs *ctx)
return 0;
}
PROG(SYS__NR_mmap)(struct pt_regs *ctx)
#ifdef __NR_mmap2
PROG(SYS__NR_mmap2)(struct pt_regs *ctx)
{
char fmt[] = "mmap\n";
char fmt[] = "mmap2\n";
bpf_trace_printk(fmt, sizeof(fmt));
return 0;
}
#endif
#ifdef __NR_mmap
PROG(SYS__NR_mmap)(struct pt_regs *ctx)
{
char fmt[] = "mmap\n";
bpf_trace_printk(fmt, sizeof(fmt));
return 0;
}
#endif
char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;

View File

@@ -67,8 +67,14 @@ static int opt_ifindex;
static int opt_queue;
static int opt_poll;
static int opt_interval = 1;
static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP;
static u32 opt_umem_flags;
static int opt_unaligned_chunks;
static int opt_mmap_flags;
static u32 opt_xdp_bind_flags;
static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
static int opt_timeout = 1000;
static bool opt_need_wakeup = true;
static __u32 prog_id;
struct xsk_umem_info {
@@ -282,7 +288,9 @@ static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
.frame_size = opt_xsk_frame_size,
.frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM,
.flags = opt_umem_flags
};
int ret;
umem = calloc(1, sizeof(*umem));
@@ -291,6 +299,7 @@ static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq,
&cfg);
if (ret)
exit_with_error(-ret);
@@ -352,6 +361,8 @@ static struct option long_options[] = {
{"zero-copy", no_argument, 0, 'z'},
{"copy", no_argument, 0, 'c'},
{"frame-size", required_argument, 0, 'f'},
{"no-need-wakeup", no_argument, 0, 'm'},
{"unaligned", no_argument, 0, 'u'},
{0, 0, 0, 0}
};
@@ -372,6 +383,9 @@ static void usage(const char *prog)
" -z, --zero-copy Force zero-copy mode.\n"
" -c, --copy Force copy mode.\n"
" -f, --frame-size=n Set the frame size (must be a power of two, default is %d).\n"
" -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n"
" -f, --frame-size=n Set the frame size (must be a power of two in aligned mode, default is %d).\n"
" -u, --unaligned Enable unaligned chunk placement\n"
"\n";
fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE);
exit(EXIT_FAILURE);
@@ -384,8 +398,8 @@ static void parse_command_line(int argc, char **argv)
opterr = 0;
for (;;) {
c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:", long_options,
&option_index);
c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:mu",
long_options, &option_index);
if (c == -1)
break;
@@ -424,12 +438,21 @@ static void parse_command_line(int argc, char **argv)
case 'c':
opt_xdp_bind_flags |= XDP_COPY;
break;
case 'u':
opt_umem_flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG;
opt_unaligned_chunks = 1;
opt_mmap_flags = MAP_HUGETLB;
break;
case 'F':
opt_xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
break;
case 'f':
opt_xsk_frame_size = atoi(optarg);
case 'm':
opt_need_wakeup = false;
opt_xdp_bind_flags &= ~XDP_USE_NEED_WAKEUP;
break;
default:
usage(basename(argv[0]));
}
@@ -442,7 +465,8 @@ static void parse_command_line(int argc, char **argv)
usage(basename(argv[0]));
}
if (opt_xsk_frame_size & (opt_xsk_frame_size - 1)) {
if ((opt_xsk_frame_size & (opt_xsk_frame_size - 1)) &&
!opt_unaligned_chunks) {
fprintf(stderr, "--frame-size=%d is not a power of two\n",
opt_xsk_frame_size);
usage(basename(argv[0]));
@@ -459,8 +483,10 @@ static void kick_tx(struct xsk_socket_info *xsk)
exit_with_error(errno);
}
static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk)
static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
struct pollfd *fds)
{
struct xsk_umem_info *umem = xsk->umem;
u32 idx_cq = 0, idx_fq = 0;
unsigned int rcvd;
size_t ndescs;
@@ -468,27 +494,30 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk)
if (!xsk->outstanding_tx)
return;
kick_tx(xsk);
if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
kick_tx(xsk);
ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? BATCH_SIZE :
xsk->outstanding_tx;
/* re-add completed Tx buffers */
rcvd = xsk_ring_cons__peek(&xsk->umem->cq, ndescs, &idx_cq);
rcvd = xsk_ring_cons__peek(&umem->cq, ndescs, &idx_cq);
if (rcvd > 0) {
unsigned int i;
int ret;
ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
while (ret != rcvd) {
if (ret < 0)
exit_with_error(-ret);
ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd,
&idx_fq);
if (xsk_ring_prod__needs_wakeup(&umem->fq))
ret = poll(fds, num_socks, opt_timeout);
ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
}
for (i = 0; i < rcvd; i++)
*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) =
*xsk_ring_cons__comp_addr(&xsk->umem->cq,
idx_cq++);
*xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) =
*xsk_ring_cons__comp_addr(&umem->cq, idx_cq++);
xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
xsk_ring_cons__release(&xsk->umem->cq, rcvd);
@@ -505,7 +534,8 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk)
if (!xsk->outstanding_tx)
return;
kick_tx(xsk);
if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
kick_tx(xsk);
rcvd = xsk_ring_cons__peek(&xsk->umem->cq, BATCH_SIZE, &idx);
if (rcvd > 0) {
@@ -515,30 +545,38 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk)
}
}
static void rx_drop(struct xsk_socket_info *xsk)
static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
{
unsigned int rcvd, i;
u32 idx_rx = 0, idx_fq = 0;
int ret;
rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
if (!rcvd)
if (!rcvd) {
if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
ret = poll(fds, num_socks, opt_timeout);
return;
}
ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
while (ret != rcvd) {
if (ret < 0)
exit_with_error(-ret);
if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
ret = poll(fds, num_socks, opt_timeout);
ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
}
for (i = 0; i < rcvd; i++) {
u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
u64 orig = xsk_umem__extract_addr(addr);
addr = xsk_umem__add_offset_to_addr(addr);
char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
hex_dump(pkt, len, addr);
*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = addr;
*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig;
}
xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
@@ -549,42 +587,65 @@ static void rx_drop(struct xsk_socket_info *xsk)
static void rx_drop_all(void)
{
struct pollfd fds[MAX_SOCKS + 1];
int i, ret, timeout, nfds = 1;
int i, ret;
memset(fds, 0, sizeof(fds));
for (i = 0; i < num_socks; i++) {
fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
fds[i].events = POLLIN;
timeout = 1000; /* 1sn */
}
for (;;) {
if (opt_poll) {
ret = poll(fds, nfds, timeout);
ret = poll(fds, num_socks, opt_timeout);
if (ret <= 0)
continue;
}
for (i = 0; i < num_socks; i++)
rx_drop(xsks[i]);
rx_drop(xsks[i], fds);
}
}
static void tx_only(struct xsk_socket_info *xsk)
static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb)
{
int timeout, ret, nfds = 1;
struct pollfd fds[nfds + 1];
u32 idx, frame_nb = 0;
u32 idx;
if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) == BATCH_SIZE) {
unsigned int i;
for (i = 0; i < BATCH_SIZE; i++) {
xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr =
(frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT;
xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
sizeof(pkt_data) - 1;
}
xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE);
xsk->outstanding_tx += BATCH_SIZE;
frame_nb += BATCH_SIZE;
frame_nb %= NUM_FRAMES;
}
complete_tx_only(xsk);
}
static void tx_only_all(void)
{
struct pollfd fds[MAX_SOCKS];
u32 frame_nb[MAX_SOCKS] = {};
int i, ret;
memset(fds, 0, sizeof(fds));
fds[0].fd = xsk_socket__fd(xsk->xsk);
fds[0].events = POLLOUT;
timeout = 1000; /* 1sn */
for (i = 0; i < num_socks; i++) {
fds[0].fd = xsk_socket__fd(xsks[i]->xsk);
fds[0].events = POLLOUT;
}
for (;;) {
if (opt_poll) {
ret = poll(fds, nfds, timeout);
ret = poll(fds, num_socks, opt_timeout);
if (ret <= 0)
continue;
@@ -592,69 +653,78 @@ static void tx_only(struct xsk_socket_info *xsk)
continue;
}
if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) ==
BATCH_SIZE) {
unsigned int i;
for (i = 0; i < BATCH_SIZE; i++) {
xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr
= (frame_nb + i) * opt_xsk_frame_size;
xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
sizeof(pkt_data) - 1;
}
xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE);
xsk->outstanding_tx += BATCH_SIZE;
frame_nb += BATCH_SIZE;
frame_nb %= NUM_FRAMES;
}
complete_tx_only(xsk);
for (i = 0; i < num_socks; i++)
tx_only(xsks[i], frame_nb[i]);
}
}
static void l2fwd(struct xsk_socket_info *xsk)
static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
{
for (;;) {
unsigned int rcvd, i;
u32 idx_rx = 0, idx_tx = 0;
int ret;
unsigned int rcvd, i;
u32 idx_rx = 0, idx_tx = 0;
int ret;
for (;;) {
complete_tx_l2fwd(xsk);
complete_tx_l2fwd(xsk, fds);
rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE,
&idx_rx);
if (rcvd > 0)
break;
}
rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
if (!rcvd) {
if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
ret = poll(fds, num_socks, opt_timeout);
return;
}
ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
while (ret != rcvd) {
if (ret < 0)
exit_with_error(-ret);
if (xsk_ring_prod__needs_wakeup(&xsk->tx))
kick_tx(xsk);
ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
while (ret != rcvd) {
if (ret < 0)
exit_with_error(-ret);
ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
}
for (i = 0; i < rcvd; i++) {
u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
u64 orig = xsk_umem__extract_addr(addr);
addr = xsk_umem__add_offset_to_addr(addr);
char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
swap_mac_addresses(pkt);
hex_dump(pkt, len, addr);
xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = orig;
xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
}
xsk_ring_prod__submit(&xsk->tx, rcvd);
xsk_ring_cons__release(&xsk->rx, rcvd);
xsk->rx_npkts += rcvd;
xsk->outstanding_tx += rcvd;
}
static void l2fwd_all(void)
{
struct pollfd fds[MAX_SOCKS];
int i, ret;
memset(fds, 0, sizeof(fds));
for (i = 0; i < num_socks; i++) {
fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
fds[i].events = POLLOUT | POLLIN;
}
for (;;) {
if (opt_poll) {
ret = poll(fds, num_socks, opt_timeout);
if (ret <= 0)
continue;
}
for (i = 0; i < rcvd; i++) {
u64 addr = xsk_ring_cons__rx_desc(&xsk->rx,
idx_rx)->addr;
u32 len = xsk_ring_cons__rx_desc(&xsk->rx,
idx_rx++)->len;
char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
swap_mac_addresses(pkt);
hex_dump(pkt, len, addr);
xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = addr;
xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
}
xsk_ring_prod__submit(&xsk->tx, rcvd);
xsk_ring_cons__release(&xsk->rx, rcvd);
xsk->rx_npkts += rcvd;
xsk->outstanding_tx += rcvd;
for (i = 0; i < num_socks; i++)
l2fwd(xsks[i], fds);
}
}
@@ -674,11 +744,14 @@ int main(int argc, char **argv)
exit(EXIT_FAILURE);
}
ret = posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */
NUM_FRAMES * opt_xsk_frame_size);
if (ret)
exit_with_error(ret);
/* Reserve memory for the umem. Use hugepages if unaligned chunk mode */
bufs = mmap(NULL, NUM_FRAMES * opt_xsk_frame_size,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | opt_mmap_flags, -1, 0);
if (bufs == MAP_FAILED) {
printf("ERROR: mmap failed\n");
exit(EXIT_FAILURE);
}
/* Create sockets... */
umem = xsk_configure_umem(bufs, NUM_FRAMES * opt_xsk_frame_size);
xsks[num_socks++] = xsk_configure_socket(umem);
@@ -705,9 +778,9 @@ int main(int argc, char **argv)
if (opt_bench == BENCH_RXDROP)
rx_drop_all();
else if (opt_bench == BENCH_TXONLY)
tx_only(xsks[0]);
tx_only_all();
else
l2fwd(xsks[0]);
l2fwd_all();
return 0;
}