diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 19f4f253b4f90..d53756914b2f6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -143,8 +143,16 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); -int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, - int len, int flags, int *copied_from_self); +typedef int (*sk_msg_read_actor_t)(void *arg, struct page *page, + unsigned int offset, size_t len); +/* Core function for reading ingress_msg, dispatches to the given actor */ +int sk_msg_read_core(struct sock *sk, struct sk_psock *psock, + size_t len, int flags, + sk_msg_read_actor_t actor, void *actor_arg, + int *copied_from_self); +int sk_msg_recvmsg_actor(void *arg, struct page *page, + unsigned int offset, size_t len); + bool sk_msg_is_readable(struct sock *sk); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 5dd2bf24449ef..84f2744d57f8a 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -34,6 +34,9 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk); int inet_send_prepare(struct sock *sk); int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); +ssize_t inet_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); void inet_splice_eof(struct socket *sock); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); diff --git a/include/net/sock.h b/include/net/sock.h index 6c9a83016e955..de28af168ec45 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1317,6 +1317,9 @@ struct proto { size_t len); int (*recvmsg)(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); + ssize_t (*splice_read)(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); void (*splice_eof)(struct socket *sock); int (*bind)(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 3261793abe833..3a2b27a12b635 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -409,10 +409,12 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); -int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, - int len, int flags, int *copied_from_self) +/* Core function for reading ingress_msg, dispatches to the given actor */ +int sk_msg_read_core(struct sock *sk, struct sk_psock *psock, + size_t len, int flags, + sk_msg_read_actor_t actor, void *actor_arg, + int *copied_from_self) { - struct iov_iter *iter = &msg->msg_iter; int peek = flags & MSG_PEEK; struct sk_msg *msg_rx; int i, copied = 0; @@ -440,8 +442,19 @@ int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg if (copied + copy > len) copy = len - copied; if (copy) - copy = copy_page_to_iter(page, sge->offset, copy, iter); + copy = actor(actor_arg, page, + sge->offset, copy); if (!copy) { + /* + * The loop processes msg_rx->sg entries + * sequentially and prior entries may + * already be consumed. Advance sg.start + * so the next call resumes at the correct + * entry, otherwise it would revisit + * zero-length entries and return -EFAULT. + */ + if (!peek) + msg_rx->sg.start = i; copied = copied ? copied : -EFAULT; goto out; } @@ -495,12 +508,23 @@ int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg out: return copied; } +EXPORT_SYMBOL_GPL(sk_msg_read_core); + +int sk_msg_recvmsg_actor(void *arg, struct page *page, + unsigned int offset, size_t len) +{ + struct msghdr *msg = arg; + + return copy_page_to_iter(page, offset, len, &msg->msg_iter); +} +EXPORT_SYMBOL_GPL(sk_msg_recvmsg_actor); /* Receive sk_msg from psock->ingress_msg to @msg. */ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags) { - return __sk_msg_recvmsg(sk, psock, msg, len, flags, NULL); + return sk_msg_read_core(sk, psock, len, flags, + sk_msg_recvmsg_actor, msg, NULL); } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8036e76aa1e47..2c7b35d9c62da 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -876,6 +876,19 @@ void inet_splice_eof(struct socket *sock) } EXPORT_SYMBOL_GPL(inet_splice_eof); +ssize_t inet_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct sock *sk = sock->sk; + const struct proto *prot; + + prot = READ_ONCE(sk->sk_prot); + return INDIRECT_CALL_1(prot->splice_read, tcp_splice_read, sock, + ppos, pipe, len, flags); +} +EXPORT_SYMBOL_GPL(inet_splice_read); + INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *, size_t, int, int *)); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, @@ -1079,7 +1092,7 @@ const struct proto_ops inet_stream_ops = { .mmap = tcp_mmap, #endif .splice_eof = inet_splice_eof, - .splice_read = tcp_splice_read, + .splice_read = inet_splice_read, .set_peek_off = sk_set_peek_off, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 813d2e498c93a..4497f32a87b32 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -218,31 +219,26 @@ static bool is_next_msg_fin(struct sk_psock *psock) return false; } -static int tcp_bpf_recvmsg_parser(struct sock *sk, - struct msghdr *msg, - size_t len, - int flags, - int *addr_len) +/* + * __tcp_bpf_recvmsg_parser - inner recvmsg for strparser path + * + * Handles TCP seq tracking, pre-accept receive_queue draining, FIN detection, + * and receive window updates. The actual data read is delegated to @actor. + * + * Caller must hold a psock ref. Socket lock is acquired/released internally. + * Returns bytes read, or negative error. + */ +static int __tcp_bpf_recvmsg_parser(struct sock *sk, struct sk_psock *psock, + sk_msg_read_actor_t actor, void *actor_arg, + size_t len, int flags) { int peek = flags & MSG_PEEK; - struct sk_psock *psock; - struct tcp_sock *tcp; + struct tcp_sock *tcp = tcp_sk(sk); int copied_from_self = 0; int copied = 0; u32 seq; - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - - if (!len) - return 0; - - psock = sk_psock_get(sk); - if (unlikely(!psock)) - return tcp_recvmsg(sk, msg, len, flags, addr_len); - lock_sock(sk); - tcp = tcp_sk(sk); seq = tcp->copied_seq; /* We may have received data on the sk_receive_queue pre-accept and * then we can not use read_skb in this context because we haven't @@ -264,7 +260,8 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, } msg_bytes_ready: - copied = __sk_msg_recvmsg(sk, psock, msg, len, flags, &copied_from_self); + copied = sk_msg_read_core(sk, psock, len, flags, + actor, actor_arg, &copied_from_self); /* The typical case for EFAULT is the socket was gracefully * shutdown with a FIN pkt. So check here the other case is * some error on copy_page_to_iter which would be unexpected. @@ -329,10 +326,34 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, unlock: release_sock(sk); - sk_psock_put(sk, psock); return copied; } +static int tcp_bpf_recvmsg_parser(struct sock *sk, + struct msghdr *msg, + size_t len, + int flags, + int *addr_len) +{ + struct sk_psock *psock; + int ret; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + if (!len) + return 0; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_recvmsg(sk, msg, len, flags, addr_len); + + ret = __tcp_bpf_recvmsg_parser(sk, psock, + sk_msg_recvmsg_actor, msg, len, flags); + sk_psock_put(sk, psock); + return ret; +} + static int tcp_bpf_ioctl(struct sock *sk, int cmd, int *karg) { bool slow; @@ -351,29 +372,25 @@ static int tcp_bpf_ioctl(struct sock *sk, int cmd, int *karg) return 0; } -static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) +/* + * __tcp_bpf_recvmsg - inner recvmsg for non-parser (verdict only) path + * + * No TCP seq tracking needed (tcp_eat_skb handles it at verdict time). + * Returns bytes read, 0 if caller should fall back to the normal TCP + * read path (data on receive_queue but not in psock), or negative error. + * + * Caller must hold a psock ref. Socket lock is acquired/released internally. + */ +static int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, + sk_msg_read_actor_t actor, void *actor_arg, + size_t len, int flags) { - struct sk_psock *psock; int copied, ret; - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - - if (!len) - return 0; - - psock = sk_psock_get(sk); - if (unlikely(!psock)) - return tcp_recvmsg(sk, msg, len, flags, addr_len); - if (!skb_queue_empty(&sk->sk_receive_queue) && - sk_psock_queue_empty(psock)) { - sk_psock_put(sk, psock); - return tcp_recvmsg(sk, msg, len, flags, addr_len); - } lock_sock(sk); msg_bytes_ready: - copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + copied = sk_msg_read_core(sk, psock, len, flags, + actor, actor_arg, NULL); if (!copied) { long timeo; int data; @@ -388,8 +405,7 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; release_sock(sk); - sk_psock_put(sk, psock); - return tcp_recvmsg(sk, msg, len, flags, addr_len); + return 0; } copied = -EAGAIN; } @@ -397,6 +413,134 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, unlock: release_sock(sk); + return ret; +} + +static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int flags, int *addr_len) +{ + struct sk_psock *psock; + int ret; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + if (!len) + return 0; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_recvmsg(sk, msg, len, flags, addr_len); + if (!skb_queue_empty(&sk->sk_receive_queue) && + sk_psock_queue_empty(psock)) { + sk_psock_put(sk, psock); + return tcp_recvmsg(sk, msg, len, flags, addr_len); + } + + ret = __tcp_bpf_recvmsg(sk, psock, sk_msg_recvmsg_actor, msg, + len, flags); + sk_psock_put(sk, psock); + if (!ret) + return tcp_recvmsg(sk, msg, len, flags, addr_len); + return ret; +} + +struct tcp_bpf_splice_ctx { + struct pipe_inode_info *pipe; + struct sock *sk; +}; + +static int sk_msg_splice_actor(void *arg, struct page *page, + unsigned int offset, size_t len) +{ + struct tcp_bpf_splice_ctx *ctx = arg; + struct pipe_buffer buf = { + .ops = &nosteal_pipe_buf_ops, + }; + ssize_t ret; + + if (PageSlab(page)) { + /* + * skb linear data is backed by slab memory where + * get_page() is invalid. Copy to a page fragment from + * the socket's page allocator, matching what + * linear_to_page() does in the standard TCP splice + * path (skb_splice_bits). + */ + struct page_frag *pfrag = sk_page_frag(ctx->sk); + + if (!sk_page_frag_refill(ctx->sk, pfrag)) + return 0; + + len = min_t(size_t, len, pfrag->size - pfrag->offset); + memcpy(page_address(pfrag->page) + pfrag->offset, + page_address(page) + offset, len); + buf.page = pfrag->page; + buf.offset = pfrag->offset; + buf.len = len; + pfrag->offset += len; + } else { + buf.page = page; + buf.offset = offset; + buf.len = len; + } + + get_page(buf.page); + + /* + * add_to_pipe() calls pipe_buf_release() on failure, which + * handles put_page() via nosteal_pipe_buf_ops, so no explicit + * cleanup is needed here. + */ + ret = add_to_pipe(ctx->pipe, &buf); + if (ret <= 0) + return 0; + return ret; +} + +static ssize_t tcp_bpf_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct sock *sk = sock->sk; + struct tcp_bpf_splice_ctx ctx = { .pipe = pipe, .sk = sk }; + int bpf_flags = flags & SPLICE_F_NONBLOCK ? MSG_DONTWAIT : 0; + struct sk_psock *psock; + int ret; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_splice_read(sock, ppos, pipe, len, flags); + if (!skb_queue_empty(&sk->sk_receive_queue) && + sk_psock_queue_empty(psock)) { + sk_psock_put(sk, psock); + return tcp_splice_read(sock, ppos, pipe, len, flags); + } + + ret = __tcp_bpf_recvmsg(sk, psock, sk_msg_splice_actor, &ctx, + len, bpf_flags); + sk_psock_put(sk, psock); + if (!ret) + return tcp_splice_read(sock, ppos, pipe, len, flags); + return ret; +} + +static ssize_t tcp_bpf_splice_read_parser(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct sock *sk = sock->sk; + struct tcp_bpf_splice_ctx ctx = { .pipe = pipe, .sk = sk }; + int bpf_flags = flags & SPLICE_F_NONBLOCK ? MSG_DONTWAIT : 0; + struct sk_psock *psock; + int ret; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_splice_read(sock, ppos, pipe, len, flags); + + ret = __tcp_bpf_recvmsg_parser(sk, psock, sk_msg_splice_actor, &ctx, + len, bpf_flags); sk_psock_put(sk, psock); return ret; } @@ -628,6 +772,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_BASE].destroy = sock_map_destroy; prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; + prot[TCP_BPF_BASE].splice_read = tcp_bpf_splice_read; prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; prot[TCP_BPF_BASE].ioctl = tcp_bpf_ioctl; @@ -636,9 +781,11 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_RX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_RX].recvmsg = tcp_bpf_recvmsg_parser; + prot[TCP_BPF_RX].splice_read = tcp_bpf_splice_read_parser; prot[TCP_BPF_TXRX] = prot[TCP_BPF_TX]; prot[TCP_BPF_TXRX].recvmsg = tcp_bpf_recvmsg_parser; + prot[TCP_BPF_TXRX].splice_read = tcp_bpf_splice_read_parser; } static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c7b2463c2e254..fc24b75e8431a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3431,6 +3431,7 @@ struct proto tcp_prot = { .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, + .splice_read = tcp_splice_read, .splice_eof = tcp_splice_eof, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 23cc9b4cb2f12..12256b0234ffd 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -705,7 +705,7 @@ const struct proto_ops inet6_stream_ops = { #endif .splice_eof = inet_splice_eof, .sendmsg_locked = tcp_sendmsg_locked, - .splice_read = tcp_splice_read, + .splice_read = inet_splice_read, .set_peek_off = sk_set_peek_off, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index bb09d5ccf5990..1eab63d3be2e5 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2293,6 +2293,7 @@ struct proto tcpv6_prot = { .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, + .splice_read = tcp_splice_read, .splice_eof = tcp_splice_eof, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, diff --git a/tools/testing/selftests/bpf/benchs/bench_sockmap.c b/tools/testing/selftests/bpf/benchs/bench_sockmap.c index cfc072aa7fff7..ffcf5ad8cafa0 100644 --- a/tools/testing/selftests/bpf/benchs/bench_sockmap.c +++ b/tools/testing/selftests/bpf/benchs/bench_sockmap.c @@ -7,6 +7,9 @@ #include #include #include +#include +#include +#include #include #include "bench.h" #include "bench_sockmap_prog.skel.h" @@ -46,6 +49,8 @@ enum SOCKMAP_ARG_FLAG { ARG_CTL_RX_STRP, ARG_CONSUMER_DELAY_TIME, ARG_PRODUCER_DURATION, + ARG_CTL_SPLICE, + ARG_CTL_VERIFY, }; #define TXMODE_NORMAL() \ @@ -110,6 +115,9 @@ static struct socmap_ctx { int delay_consumer; int prod_run_time; int strp_size; + bool use_splice; + bool verify; + int pipefd[2]; } ctx = { .prod_send = 0, .user_read = 0, @@ -119,6 +127,9 @@ static struct socmap_ctx { .delay_consumer = 0, .prod_run_time = 0, .strp_size = 0, + .use_splice = false, + .verify = false, + .pipefd = {-1, -1}, }; static void bench_sockmap_prog_destroy(void) @@ -130,6 +141,11 @@ static void bench_sockmap_prog_destroy(void) close(ctx.fds[i]); } + if (ctx.pipefd[0] >= 0) + close(ctx.pipefd[0]); + if (ctx.pipefd[1] >= 0) + close(ctx.pipefd[1]); + bench_sockmap_prog__destroy(ctx.skel); } @@ -320,6 +336,7 @@ static int setup_tx_sockmap(void) static void setup(void) { + int rcvbuf = 16 * 1024 * 1024; int err; ctx.skel = bench_sockmap_prog__open_and_load(); @@ -350,6 +367,18 @@ static void setup(void) goto err; } + if (ctx.use_splice) { + if (pipe(ctx.pipefd)) { + fprintf(stderr, "pipe error:%d\n", errno); + goto err; + } + } + + setsockopt(ctx.c2, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)); + + if (ctx.use_splice) + set_non_block(ctx.c2, true); + return; err: @@ -368,6 +397,8 @@ static void measure(struct bench_res *res) static void verify_data(int *check_pos, char *buf, int rcv) { + if (!ctx.verify) + return; for (int i = 0 ; i < rcv; i++) { if (buf[i] != snd_data[(*check_pos) % DATA_REPEAT_SIZE]) { fprintf(stderr, "verify data fail"); @@ -388,6 +419,9 @@ static void *consumer(void *input) char *buf = malloc(recv_buf_size); int delay_read = ctx.delay_consumer; + printf("cons[%d] started, tid=%ld cpu=%d\n", + tid, syscall(SYS_gettid), sched_getcpu()); + if (!buf) { fprintf(stderr, "fail to init read buffer"); return NULL; @@ -419,7 +453,15 @@ static void *consumer(void *input) } /* read real endpoint by consumer 0 */ atomic_inc(&ctx.read_calls); - rcv = read(ctx.c2, buf, recv_buf_size); + if (ctx.use_splice) { + rcv = splice(ctx.c2, NULL, ctx.pipefd[1], + NULL, recv_buf_size, + SPLICE_F_NONBLOCK); + if (rcv > 0) + rcv = read(ctx.pipefd[0], buf, rcv); + } else { + rcv = read(ctx.c2, buf, recv_buf_size); + } if (rcv < 0 && errno != EAGAIN) { fprintf(stderr, "%s fail to read c2 %d\n", __func__, errno); return NULL; @@ -440,6 +482,9 @@ static void *producer(void *input) int target; FILE *file; + printf("prod started, tid=%ld cpu=%d\n", + syscall(SYS_gettid), sched_getcpu()); + file = tmpfile(); if (!file) { fprintf(stderr, "create file for sendfile"); @@ -554,6 +599,10 @@ static const struct argp_option opts[] = { "delay consumer start"}, { "producer-duration", ARG_PRODUCER_DURATION, "SEC", 0, "producer duration"}, + { "splice", ARG_CTL_SPLICE, NULL, 0, + "use splice instead of read for consumer"}, + { "verify", ARG_CTL_VERIFY, NULL, 0, + "verify received data correctness"}, {}, }; @@ -572,6 +621,12 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) case ARG_CTL_RX_STRP: ctx.strp_size = strtol(arg, NULL, 10); break; + case ARG_CTL_SPLICE: + ctx.use_splice = true; + break; + case ARG_CTL_VERIFY: + ctx.verify = true; + break; default: return ARGP_ERR_UNKNOWN; } diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index dd3c757859f6b..ea0b49ec9a936 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -18,6 +18,23 @@ #include "sockmap_helpers.h" +static bool use_splice; + +static bool __start_subtest(const char *name) +{ + if (!use_splice) + return (test__start_subtest)(name); + + char buf[MAX_TEST_NAME]; + + snprintf(buf, sizeof(buf), "%s splice", name); + return (test__start_subtest)(buf); +} + +#define test__start_subtest(name) __start_subtest(name) +#define recv_timeout(fd, buf, len, flags, timeout) \ + recv_timeout_with_splice(fd, buf, len, flags, timeout, use_splice) + #define TCP_REPAIR 19 /* TCP sock is under repair right now */ #define TCP_REPAIR_ON 1 @@ -1314,7 +1331,7 @@ static void test_sockmap_multi_channels(int sotype) test_sockmap_pass_prog__destroy(skel); } -void test_sockmap_basic(void) +static void __test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKMAP); @@ -1391,3 +1408,12 @@ void test_sockmap_basic(void) if (test__start_subtest("sockmap udp multi channels")) test_sockmap_multi_channels(SOCK_DGRAM); } + +void test_sockmap_basic(void) +{ + use_splice = false; + __test_sockmap_basic(); + + use_splice = true; + __test_sockmap_basic(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h index d815efac52fda..1f0da657243f4 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h @@ -80,4 +80,66 @@ static inline int add_to_sockmap(int mapfd, int fd1, int fd2) return xbpf_map_update_elem(mapfd, &u32(1), &u64(fd2), BPF_NOEXIST); } +static inline ssize_t recv_timeout_with_splice(int fd, void *buf, size_t len, + int flags, + unsigned int timeout_sec, + bool do_splice) +{ + ssize_t total = 0; + int pipefd[2]; + int fl; + + int sotype, protocol; + socklen_t optlen = sizeof(sotype); + + if (!do_splice || (flags & MSG_PEEK) || + getsockopt(fd, SOL_SOCKET, SO_TYPE, &sotype, &optlen) || + sotype != SOCK_STREAM || + getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &protocol, &optlen) || + protocol != IPPROTO_TCP) + return recv_timeout(fd, buf, len, flags, timeout_sec); + + if (poll_read(fd, timeout_sec)) + return -1; + + if (pipe(pipefd) < 0) + return -1; + + /* + * tcp_splice_read() only checks sock->file->f_flags for + * O_NONBLOCK, ignoring SPLICE_F_NONBLOCK for the socket + * side timeout. Set O_NONBLOCK on the fd so the loop won't + * block forever when no more data is available. + */ + fl = fcntl(fd, F_GETFL); + fcntl(fd, F_SETFL, fl | O_NONBLOCK); + + /* + * Pipe has limited buffer slots (default 16), so a single + * splice may not transfer all requested bytes. Loop until + * we've read enough or no more data is available. + */ + while (total < (ssize_t)len) { + ssize_t spliced, n; + + spliced = splice(fd, NULL, pipefd[1], NULL, len - total, + SPLICE_F_NONBLOCK); + if (spliced <= 0) + break; + + n = read(pipefd[0], buf + total, spliced); + if (n <= 0) + break; + + total += n; + } + + fcntl(fd, F_SETFL, fl); + + close(pipefd[0]); + close(pipefd[1]); + + return total > 0 ? total : -1; +} + #endif // __SOCKMAP_HELPERS__ diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_strp.c b/tools/testing/selftests/bpf/prog_tests/sockmap_strp.c index 621b3b71888ef..2226399eee0d6 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_strp.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_strp.c @@ -6,6 +6,23 @@ #include "test_skmsg_load_helpers.skel.h" #include "test_sockmap_strp.skel.h" +static bool use_splice; + +static bool __start_subtest(const char *name) +{ + if (!use_splice) + return (test__start_subtest)(name); + + char buf[MAX_TEST_NAME]; + + snprintf(buf, sizeof(buf), "%s splice", name); + return (test__start_subtest)(buf); +} + +#define test__start_subtest(name) __start_subtest(name) +#define recv_timeout(fd, buf, len, flags, timeout) \ + recv_timeout_with_splice(fd, buf, len, flags, timeout, use_splice) + #define STRP_PKT_HEAD_LEN 4 #define STRP_PKT_BODY_LEN 6 #define STRP_PKT_FULL_LEN (STRP_PKT_HEAD_LEN + STRP_PKT_BODY_LEN) @@ -431,7 +448,7 @@ static void test_sockmap_strp_verdict(int family, int sotype) test_sockmap_strp__destroy(strp); } -void test_sockmap_strp(void) +static void __test_sockmap_strp(void) { if (test__start_subtest("sockmap strp tcp pass")) test_sockmap_strp_pass(AF_INET, SOCK_STREAM, false); @@ -452,3 +469,12 @@ void test_sockmap_strp(void) if (test__start_subtest("sockmap strp tcp dispatch")) test_sockmap_strp_dispatch_pkt(AF_INET, SOCK_STREAM); } + +void test_sockmap_strp(void) +{ + use_splice = false; + __test_sockmap_strp(); + + use_splice = true; + __test_sockmap_strp(); +}