From 33f0a3030fcf0189b788d340caa6a5e33f5e134b Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 4 Mar 2026 14:33:52 +0800 Subject: [PATCH 1/7] net: add splice_read to struct proto and set it in tcp_prot/tcpv6_prot Add a splice_read function pointer to struct proto between recvmsg and splice_eof. Set it to tcp_splice_read in both tcp_prot and tcpv6_prot. Signed-off-by: Jiayuan Chen --- include/net/sock.h | 3 +++ net/ipv4/tcp_ipv4.c | 1 + net/ipv6/tcp_ipv6.c | 1 + 3 files changed, 5 insertions(+) diff --git a/include/net/sock.h b/include/net/sock.h index 6c9a83016e955..de28af168ec45 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1317,6 +1317,9 @@ struct proto { size_t len); int (*recvmsg)(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); + ssize_t (*splice_read)(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); void (*splice_eof)(struct socket *sock); int (*bind)(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c7b2463c2e254..fc24b75e8431a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3431,6 +3431,7 @@ struct proto tcp_prot = { .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, + .splice_read = tcp_splice_read, .splice_eof = tcp_splice_eof, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index bb09d5ccf5990..1eab63d3be2e5 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2293,6 +2293,7 @@ struct proto tcpv6_prot = { .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, + .splice_read = tcp_splice_read, .splice_eof = tcp_splice_eof, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, From 3d9b644e8e7b60a44f277e06ab4e328b0c79f982 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 4 Mar 2026 14:33:53 +0800 Subject: [PATCH 2/7] inet: add inet_splice_read() and use it in inet_stream_ops/inet6_stream_ops Add inet_splice_read() which dispatches to sk->sk_prot->splice_read via INDIRECT_CALL_1. Replace the direct tcp_splice_read reference in inet_stream_ops and inet6_stream_ops with inet_splice_read. Signed-off-by: Jiayuan Chen --- include/net/inet_common.h | 3 +++ net/ipv4/af_inet.c | 15 ++++++++++++++- net/ipv6/af_inet6.c | 2 +- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 5dd2bf24449ef..84f2744d57f8a 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -34,6 +34,9 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk); int inet_send_prepare(struct sock *sk); int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); +ssize_t inet_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); void inet_splice_eof(struct socket *sock); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8036e76aa1e47..2c7b35d9c62da 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -876,6 +876,19 @@ void inet_splice_eof(struct socket *sock) } EXPORT_SYMBOL_GPL(inet_splice_eof); +ssize_t inet_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct sock *sk = sock->sk; + const struct proto *prot; + + prot = READ_ONCE(sk->sk_prot); + return INDIRECT_CALL_1(prot->splice_read, tcp_splice_read, sock, + ppos, pipe, len, flags); +} +EXPORT_SYMBOL_GPL(inet_splice_read); + INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *, size_t, int, int *)); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, @@ -1079,7 +1092,7 @@ const struct proto_ops inet_stream_ops = { .mmap = tcp_mmap, #endif .splice_eof = inet_splice_eof, - .splice_read = tcp_splice_read, + .splice_read = inet_splice_read, .set_peek_off = sk_set_peek_off, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 23cc9b4cb2f12..12256b0234ffd 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -705,7 +705,7 @@ const struct proto_ops inet6_stream_ops = { #endif .splice_eof = inet_splice_eof, .sendmsg_locked = tcp_sendmsg_locked, - .splice_read = tcp_splice_read, + .splice_read = inet_splice_read, .set_peek_off = sk_set_peek_off, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, From cbaaed1581ea8089eff3ed3b91154ed1bfa84b1d Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 4 Mar 2026 14:33:54 +0800 Subject: [PATCH 3/7] tcp_bpf: refactor recvmsg with read actor abstraction Refactor the read operation with no functional changes. tcp_bpf has two read paths: strparser and non-strparser. Currently the differences are implemented directly in their respective recvmsg functions, which works fine. However, upcoming splice support would require duplicating the same logic for both paths. To avoid this, extract the strparser-specific differences into an independent abstraction that can be reused by splice. For ingress_msg data processing, introduce a function pointer callback approach. The current implementation passes sk_msg_recvmsg_actor(), which performs copy_page_to_iter() - the same copy logic previously embedded in sk_msg_recvmsg(). This provides the extension point for future splice support, where a different actor can be plugged in. Signed-off-by: Jiayuan Chen --- include/linux/skmsg.h | 12 ++++- net/core/skmsg.c | 24 +++++++-- net/ipv4/tcp_bpf.c | 123 ++++++++++++++++++++++++++++-------------- 3 files changed, 112 insertions(+), 47 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 19f4f253b4f90..d53756914b2f6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -143,8 +143,16 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); -int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, - int len, int flags, int *copied_from_self); +typedef int (*sk_msg_read_actor_t)(void *arg, struct page *page, + unsigned int offset, size_t len); +/* Core function for reading ingress_msg, dispatches to the given actor */ +int sk_msg_read_core(struct sock *sk, struct sk_psock *psock, + size_t len, int flags, + sk_msg_read_actor_t actor, void *actor_arg, + int *copied_from_self); +int sk_msg_recvmsg_actor(void *arg, struct page *page, + unsigned int offset, size_t len); + bool sk_msg_is_readable(struct sock *sk); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 3261793abe833..1253e1ed63b15 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -409,10 +409,12 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); -int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, - int len, int flags, int *copied_from_self) +/* Core function for reading ingress_msg, dispatches to the given actor */ +int sk_msg_read_core(struct sock *sk, struct sk_psock *psock, + size_t len, int flags, + sk_msg_read_actor_t actor, void *actor_arg, + int *copied_from_self) { - struct iov_iter *iter = &msg->msg_iter; int peek = flags & MSG_PEEK; struct sk_msg *msg_rx; int i, copied = 0; @@ -440,7 +442,8 @@ int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg if (copied + copy > len) copy = len - copied; if (copy) - copy = copy_page_to_iter(page, sge->offset, copy, iter); + copy = actor(actor_arg, page, + sge->offset, copy); if (!copy) { copied = copied ? copied : -EFAULT; goto out; @@ -495,12 +498,23 @@ int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg out: return copied; } +EXPORT_SYMBOL_GPL(sk_msg_read_core); + +int sk_msg_recvmsg_actor(void *arg, struct page *page, + unsigned int offset, size_t len) +{ + struct msghdr *msg = arg; + + return copy_page_to_iter(page, offset, len, &msg->msg_iter); +} +EXPORT_SYMBOL_GPL(sk_msg_recvmsg_actor); /* Receive sk_msg from psock->ingress_msg to @msg. */ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags) { - return __sk_msg_recvmsg(sk, psock, msg, len, flags, NULL); + return sk_msg_read_core(sk, psock, len, flags, + sk_msg_recvmsg_actor, msg, NULL); } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 813d2e498c93a..d3ec77dce6a70 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -218,31 +218,26 @@ static bool is_next_msg_fin(struct sk_psock *psock) return false; } -static int tcp_bpf_recvmsg_parser(struct sock *sk, - struct msghdr *msg, - size_t len, - int flags, - int *addr_len) +/* + * __tcp_bpf_recvmsg_parser - inner recvmsg for strparser path + * + * Handles TCP seq tracking, pre-accept receive_queue draining, FIN detection, + * and receive window updates. The actual data read is delegated to @actor. + * + * Caller must hold a psock ref. Socket lock is acquired/released internally. + * Returns bytes read, or negative error. + */ +static int __tcp_bpf_recvmsg_parser(struct sock *sk, struct sk_psock *psock, + sk_msg_read_actor_t actor, void *actor_arg, + size_t len, int flags) { int peek = flags & MSG_PEEK; - struct sk_psock *psock; - struct tcp_sock *tcp; + struct tcp_sock *tcp = tcp_sk(sk); int copied_from_self = 0; int copied = 0; u32 seq; - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - - if (!len) - return 0; - - psock = sk_psock_get(sk); - if (unlikely(!psock)) - return tcp_recvmsg(sk, msg, len, flags, addr_len); - lock_sock(sk); - tcp = tcp_sk(sk); seq = tcp->copied_seq; /* We may have received data on the sk_receive_queue pre-accept and * then we can not use read_skb in this context because we haven't @@ -264,7 +259,8 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, } msg_bytes_ready: - copied = __sk_msg_recvmsg(sk, psock, msg, len, flags, &copied_from_self); + copied = sk_msg_read_core(sk, psock, len, flags, + actor, actor_arg, &copied_from_self); /* The typical case for EFAULT is the socket was gracefully * shutdown with a FIN pkt. So check here the other case is * some error on copy_page_to_iter which would be unexpected. @@ -329,10 +325,34 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, unlock: release_sock(sk); - sk_psock_put(sk, psock); return copied; } +static int tcp_bpf_recvmsg_parser(struct sock *sk, + struct msghdr *msg, + size_t len, + int flags, + int *addr_len) +{ + struct sk_psock *psock; + int ret; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + if (!len) + return 0; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_recvmsg(sk, msg, len, flags, addr_len); + + ret = __tcp_bpf_recvmsg_parser(sk, psock, + sk_msg_recvmsg_actor, msg, len, flags); + sk_psock_put(sk, psock); + return ret; +} + static int tcp_bpf_ioctl(struct sock *sk, int cmd, int *karg) { bool slow; @@ -351,29 +371,25 @@ static int tcp_bpf_ioctl(struct sock *sk, int cmd, int *karg) return 0; } -static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len) +/* + * __tcp_bpf_recvmsg - inner recvmsg for non-parser (verdict only) path + * + * No TCP seq tracking needed (tcp_eat_skb handles it at verdict time). + * Returns bytes read, 0 if caller should fall back to the normal TCP + * read path (data on receive_queue but not in psock), or negative error. + * + * Caller must hold a psock ref. Socket lock is acquired/released internally. + */ +static int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, + sk_msg_read_actor_t actor, void *actor_arg, + size_t len, int flags) { - struct sk_psock *psock; int copied, ret; - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - - if (!len) - return 0; - - psock = sk_psock_get(sk); - if (unlikely(!psock)) - return tcp_recvmsg(sk, msg, len, flags, addr_len); - if (!skb_queue_empty(&sk->sk_receive_queue) && - sk_psock_queue_empty(psock)) { - sk_psock_put(sk, psock); - return tcp_recvmsg(sk, msg, len, flags, addr_len); - } lock_sock(sk); msg_bytes_ready: - copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + copied = sk_msg_read_core(sk, psock, len, flags, + actor, actor_arg, NULL); if (!copied) { long timeo; int data; @@ -388,8 +404,7 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; release_sock(sk); - sk_psock_put(sk, psock); - return tcp_recvmsg(sk, msg, len, flags, addr_len); + return 0; } copied = -EAGAIN; } @@ -397,7 +412,35 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, unlock: release_sock(sk); + return ret; +} + +static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int flags, int *addr_len) +{ + struct sk_psock *psock; + int ret; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + if (!len) + return 0; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_recvmsg(sk, msg, len, flags, addr_len); + if (!skb_queue_empty(&sk->sk_receive_queue) && + sk_psock_queue_empty(psock)) { + sk_psock_put(sk, psock); + return tcp_recvmsg(sk, msg, len, flags, addr_len); + } + + ret = __tcp_bpf_recvmsg(sk, psock, sk_msg_recvmsg_actor, msg, + len, flags); sk_psock_put(sk, psock); + if (!ret) + return tcp_recvmsg(sk, msg, len, flags, addr_len); return ret; } From d0349e191d69a1db0effc37c5f26bc22e6e0faca Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 4 Mar 2026 14:33:55 +0800 Subject: [PATCH 4/7] tcp_bpf: add splice_read support for sockmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement splice_read for sockmap using an always-copy approach. Each page from the psock ingress scatterlist is copied to a newly allocated page before being added to the pipe, avoiding lifetime and slab-page issues. Add sk_msg_splice_actor() which allocates a fresh page via alloc_page(), copies the data with memcpy(), then passes it to add_to_pipe(). The newly allocated page already has a refcount of 1, so no additional get_page() is needed. On add_to_pipe() failure, no explicit cleanup is needed since add_to_pipe() internally calls pipe_buf_release(). Also fix sk_msg_read_core() to update msg_rx->sg.start when the actor returns 0 mid-way through processing. The loop processes msg_rx->sg entries sequentially — if the actor fails (e.g. pipe full for splice, or user buffer fault for recvmsg), prior entries may already be consumed with sge->length set to 0. Without advancing sg.start, subsequent calls would revisit these zero-length entries and return -EFAULT. This is especially common with the splice actor since the pipe has a small fixed capacity (16 slots), but theoretically affects recvmsg as well. Signed-off-by: Jiayuan Chen --- net/core/skmsg.c | 10 ++++++ net/ipv4/tcp_bpf.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 1253e1ed63b15..3a2b27a12b635 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -445,6 +445,16 @@ int sk_msg_read_core(struct sock *sk, struct sk_psock *psock, copy = actor(actor_arg, page, sge->offset, copy); if (!copy) { + /* + * The loop processes msg_rx->sg entries + * sequentially and prior entries may + * already be consumed. Advance sg.start + * so the next call resumes at the correct + * entry, otherwise it would revisit + * zero-length entries and return -EFAULT. + */ + if (!peek) + msg_rx->sg.start = i; copied = copied ? copied : -EFAULT; goto out; } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index d3ec77dce6a70..6aeb9ddfb098e 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -444,6 +445,85 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return ret; } +struct tcp_bpf_splice_ctx { + struct pipe_inode_info *pipe; +}; + +static int sk_msg_splice_actor(void *arg, struct page *page, + unsigned int offset, size_t len) +{ + struct tcp_bpf_splice_ctx *ctx = arg; + struct pipe_buffer buf = { + .ops = &nosteal_pipe_buf_ops, + }; + ssize_t ret; + + buf.page = alloc_page(GFP_KERNEL); + if (!buf.page) + return 0; + + memcpy(page_address(buf.page), page_address(page) + offset, len); + buf.offset = 0; + buf.len = len; + + /* + * add_to_pipe() calls pipe_buf_release() on failure, which + * handles put_page() via nosteal_pipe_buf_ops, so no explicit + * cleanup is needed here. + */ + ret = add_to_pipe(ctx->pipe, &buf); + if (ret <= 0) + return 0; + return ret; +} + +static ssize_t tcp_bpf_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct tcp_bpf_splice_ctx ctx = { .pipe = pipe }; + int bpf_flags = flags & SPLICE_F_NONBLOCK ? MSG_DONTWAIT : 0; + struct sock *sk = sock->sk; + struct sk_psock *psock; + int ret; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_splice_read(sock, ppos, pipe, len, flags); + if (!skb_queue_empty(&sk->sk_receive_queue) && + sk_psock_queue_empty(psock)) { + sk_psock_put(sk, psock); + return tcp_splice_read(sock, ppos, pipe, len, flags); + } + + ret = __tcp_bpf_recvmsg(sk, psock, sk_msg_splice_actor, &ctx, + len, bpf_flags); + sk_psock_put(sk, psock); + if (!ret) + return tcp_splice_read(sock, ppos, pipe, len, flags); + return ret; +} + +static ssize_t tcp_bpf_splice_read_parser(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct tcp_bpf_splice_ctx ctx = { .pipe = pipe }; + int bpf_flags = flags & SPLICE_F_NONBLOCK ? MSG_DONTWAIT : 0; + struct sock *sk = sock->sk; + struct sk_psock *psock; + int ret; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_splice_read(sock, ppos, pipe, len, flags); + + ret = __tcp_bpf_recvmsg_parser(sk, psock, sk_msg_splice_actor, &ctx, + len, bpf_flags); + sk_psock_put(sk, psock); + return ret; +} + static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { @@ -671,6 +751,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_BASE].destroy = sock_map_destroy; prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; + prot[TCP_BPF_BASE].splice_read = tcp_bpf_splice_read; prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; prot[TCP_BPF_BASE].ioctl = tcp_bpf_ioctl; @@ -679,9 +760,11 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_RX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_RX].recvmsg = tcp_bpf_recvmsg_parser; + prot[TCP_BPF_RX].splice_read = tcp_bpf_splice_read_parser; prot[TCP_BPF_TXRX] = prot[TCP_BPF_TX]; prot[TCP_BPF_TXRX].recvmsg = tcp_bpf_recvmsg_parser; + prot[TCP_BPF_TXRX].splice_read = tcp_bpf_splice_read_parser; } static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops) From 0cbdda31b1f4bbceed92d9437facd5f593e85a2e Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 4 Mar 2026 14:33:56 +0800 Subject: [PATCH 5/7] tcp_bpf: optimize splice_read with zero-copy for non-slab pages The previous splice_read implementation copies all data through intermediate pages (alloc_page + memcpy). This is wasteful for skb fragment pages which are allocated from the page allocator and can be safely referenced via get_page(). Optimize by checking PageSlab() to distinguish between linear skb data (slab-backed) and fragment pages (page allocator-backed): - For slab pages (skb linear data): copy to a page fragment via sk_page_frag, matching what linear_to_page() does in the standard TCP splice path (skb_splice_bits). get_page() is invalid on slab pages so a copy is unavoidable here. - For non-slab pages (skb frags): use get_page() directly for true zero-copy, same as skb_splice_bits does for fragments. Both paths use nosteal_pipe_buf_ops. The sk_page_frag approach is more memory-efficient than alloc_page for small linear copies, as multiple copies can share a single page fragment. Benchmark results with rx-verdict-ingress mode (loopback, 8 CPUs): splice(2) + always-copy: ~2770 MB/s (before this patch) splice(2) + zero-copy: ~4270 MB/s (after this patch, +54%) read(2): ~4292 MB/s (baseline for reference) Signed-off-by: Jiayuan Chen --- net/ipv4/tcp_bpf.c | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 6aeb9ddfb098e..4497f32a87b32 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -447,6 +447,7 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, struct tcp_bpf_splice_ctx { struct pipe_inode_info *pipe; + struct sock *sk; }; static int sk_msg_splice_actor(void *arg, struct page *page, @@ -458,13 +459,33 @@ static int sk_msg_splice_actor(void *arg, struct page *page, }; ssize_t ret; - buf.page = alloc_page(GFP_KERNEL); - if (!buf.page) - return 0; + if (PageSlab(page)) { + /* + * skb linear data is backed by slab memory where + * get_page() is invalid. Copy to a page fragment from + * the socket's page allocator, matching what + * linear_to_page() does in the standard TCP splice + * path (skb_splice_bits). + */ + struct page_frag *pfrag = sk_page_frag(ctx->sk); + + if (!sk_page_frag_refill(ctx->sk, pfrag)) + return 0; - memcpy(page_address(buf.page), page_address(page) + offset, len); - buf.offset = 0; - buf.len = len; + len = min_t(size_t, len, pfrag->size - pfrag->offset); + memcpy(page_address(pfrag->page) + pfrag->offset, + page_address(page) + offset, len); + buf.page = pfrag->page; + buf.offset = pfrag->offset; + buf.len = len; + pfrag->offset += len; + } else { + buf.page = page; + buf.offset = offset; + buf.len = len; + } + + get_page(buf.page); /* * add_to_pipe() calls pipe_buf_release() on failure, which @@ -481,9 +502,9 @@ static ssize_t tcp_bpf_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - struct tcp_bpf_splice_ctx ctx = { .pipe = pipe }; - int bpf_flags = flags & SPLICE_F_NONBLOCK ? MSG_DONTWAIT : 0; struct sock *sk = sock->sk; + struct tcp_bpf_splice_ctx ctx = { .pipe = pipe, .sk = sk }; + int bpf_flags = flags & SPLICE_F_NONBLOCK ? MSG_DONTWAIT : 0; struct sk_psock *psock; int ret; @@ -508,9 +529,9 @@ static ssize_t tcp_bpf_splice_read_parser(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - struct tcp_bpf_splice_ctx ctx = { .pipe = pipe }; - int bpf_flags = flags & SPLICE_F_NONBLOCK ? MSG_DONTWAIT : 0; struct sock *sk = sock->sk; + struct tcp_bpf_splice_ctx ctx = { .pipe = pipe, .sk = sk }; + int bpf_flags = flags & SPLICE_F_NONBLOCK ? MSG_DONTWAIT : 0; struct sk_psock *psock; int ret; From d24b0f254baac473b54a58b9e41db59c41f74533 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 4 Mar 2026 14:33:57 +0800 Subject: [PATCH 6/7] selftests/bpf: add splice_read tests for sockmap Add splice_read coverage to sockmap_basic and sockmap_strp selftests. Each test suite now runs twice: once with normal recv_timeout() and once with splice-based reads, verifying that data read via splice(2) through a pipe produces identical results. A recv_timeout_with_splice() helper is added to sockmap_helpers.h that creates a temporary pipe, splices data from the socket into the pipe, then reads from the pipe into the user buffer. MSG_PEEK calls fall back to native recv since splice does not support peek. Non-TCP sockets also fall back to native recv. The splice subtests are distinguished by appending " splice" to each subtest name via a test__start_subtest macro override. ./test_progs -a sockmap_* ... Summary: 5/830 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Jiayuan Chen --- .../selftests/bpf/prog_tests/sockmap_basic.c | 28 ++++++++- .../bpf/prog_tests/sockmap_helpers.h | 62 +++++++++++++++++++ .../selftests/bpf/prog_tests/sockmap_strp.c | 28 ++++++++- 3 files changed, 116 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index dd3c757859f6b..ea0b49ec9a936 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -18,6 +18,23 @@ #include "sockmap_helpers.h" +static bool use_splice; + +static bool __start_subtest(const char *name) +{ + if (!use_splice) + return (test__start_subtest)(name); + + char buf[MAX_TEST_NAME]; + + snprintf(buf, sizeof(buf), "%s splice", name); + return (test__start_subtest)(buf); +} + +#define test__start_subtest(name) __start_subtest(name) +#define recv_timeout(fd, buf, len, flags, timeout) \ + recv_timeout_with_splice(fd, buf, len, flags, timeout, use_splice) + #define TCP_REPAIR 19 /* TCP sock is under repair right now */ #define TCP_REPAIR_ON 1 @@ -1314,7 +1331,7 @@ static void test_sockmap_multi_channels(int sotype) test_sockmap_pass_prog__destroy(skel); } -void test_sockmap_basic(void) +static void __test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKMAP); @@ -1391,3 +1408,12 @@ void test_sockmap_basic(void) if (test__start_subtest("sockmap udp multi channels")) test_sockmap_multi_channels(SOCK_DGRAM); } + +void test_sockmap_basic(void) +{ + use_splice = false; + __test_sockmap_basic(); + + use_splice = true; + __test_sockmap_basic(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h index d815efac52fda..1f0da657243f4 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h @@ -80,4 +80,66 @@ static inline int add_to_sockmap(int mapfd, int fd1, int fd2) return xbpf_map_update_elem(mapfd, &u32(1), &u64(fd2), BPF_NOEXIST); } +static inline ssize_t recv_timeout_with_splice(int fd, void *buf, size_t len, + int flags, + unsigned int timeout_sec, + bool do_splice) +{ + ssize_t total = 0; + int pipefd[2]; + int fl; + + int sotype, protocol; + socklen_t optlen = sizeof(sotype); + + if (!do_splice || (flags & MSG_PEEK) || + getsockopt(fd, SOL_SOCKET, SO_TYPE, &sotype, &optlen) || + sotype != SOCK_STREAM || + getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &protocol, &optlen) || + protocol != IPPROTO_TCP) + return recv_timeout(fd, buf, len, flags, timeout_sec); + + if (poll_read(fd, timeout_sec)) + return -1; + + if (pipe(pipefd) < 0) + return -1; + + /* + * tcp_splice_read() only checks sock->file->f_flags for + * O_NONBLOCK, ignoring SPLICE_F_NONBLOCK for the socket + * side timeout. Set O_NONBLOCK on the fd so the loop won't + * block forever when no more data is available. + */ + fl = fcntl(fd, F_GETFL); + fcntl(fd, F_SETFL, fl | O_NONBLOCK); + + /* + * Pipe has limited buffer slots (default 16), so a single + * splice may not transfer all requested bytes. Loop until + * we've read enough or no more data is available. + */ + while (total < (ssize_t)len) { + ssize_t spliced, n; + + spliced = splice(fd, NULL, pipefd[1], NULL, len - total, + SPLICE_F_NONBLOCK); + if (spliced <= 0) + break; + + n = read(pipefd[0], buf + total, spliced); + if (n <= 0) + break; + + total += n; + } + + fcntl(fd, F_SETFL, fl); + + close(pipefd[0]); + close(pipefd[1]); + + return total > 0 ? total : -1; +} + #endif // __SOCKMAP_HELPERS__ diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_strp.c b/tools/testing/selftests/bpf/prog_tests/sockmap_strp.c index 621b3b71888ef..2226399eee0d6 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_strp.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_strp.c @@ -6,6 +6,23 @@ #include "test_skmsg_load_helpers.skel.h" #include "test_sockmap_strp.skel.h" +static bool use_splice; + +static bool __start_subtest(const char *name) +{ + if (!use_splice) + return (test__start_subtest)(name); + + char buf[MAX_TEST_NAME]; + + snprintf(buf, sizeof(buf), "%s splice", name); + return (test__start_subtest)(buf); +} + +#define test__start_subtest(name) __start_subtest(name) +#define recv_timeout(fd, buf, len, flags, timeout) \ + recv_timeout_with_splice(fd, buf, len, flags, timeout, use_splice) + #define STRP_PKT_HEAD_LEN 4 #define STRP_PKT_BODY_LEN 6 #define STRP_PKT_FULL_LEN (STRP_PKT_HEAD_LEN + STRP_PKT_BODY_LEN) @@ -431,7 +448,7 @@ static void test_sockmap_strp_verdict(int family, int sotype) test_sockmap_strp__destroy(strp); } -void test_sockmap_strp(void) +static void __test_sockmap_strp(void) { if (test__start_subtest("sockmap strp tcp pass")) test_sockmap_strp_pass(AF_INET, SOCK_STREAM, false); @@ -452,3 +469,12 @@ void test_sockmap_strp(void) if (test__start_subtest("sockmap strp tcp dispatch")) test_sockmap_strp_dispatch_pkt(AF_INET, SOCK_STREAM); } + +void test_sockmap_strp(void) +{ + use_splice = false; + __test_sockmap_strp(); + + use_splice = true; + __test_sockmap_strp(); +} From 17ac338dc75ef0ddb4a55ec044f9dbc27d591c6d Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 4 Mar 2026 14:33:58 +0800 Subject: [PATCH 7/7] selftests/bpf: add splice option to sockmap benchmark Add --splice option to bench_sockmap that uses splice(2) instead of read(2) in the consumer path. A global pipe is created once during setup and reused across iterations to avoid per-call pipe creation overhead. When --splice is enabled, the consumer splices data from the socket into the pipe, then reads from the pipe into the user buffer. The socket is set to O_NONBLOCK to prevent tcp_splice_read() from blocking indefinitely, as it only checks sock->file->f_flags for non-blocking mode, ignoring SPLICE_F_NONBLOCK. Also increase SO_RCVBUF to 16MB to avoid sk_psock_backlog being throttled by the default sk_rcvbuf limit, and add --verify option to optionally enable data correctness checking (disabled by default for benchmark accuracy). Benchmark results with rx-verdict-ingress mode (loopback, 8 CPUs): read(2): ~4292 MB/s splice(2) + zero-copy: ~4270 MB/s splice(2) + always-copy: ~2770 MB/s Zero-copy splice achieves near-parity with read(2), while the always-copy fallback is ~35% slower. Usage: # Steer softirqs to CPU 7 to avoid contending with the producer CPU echo 80 > /sys/class/net/lo/queues/rx-0/rps_cpus # Raise the receive buffer ceiling so the benchmark can set 16MB rcvbuf sysctl -w net.core.rmem_max=16777216 # Run the benchmark ./bench sockmap --rx-verdict-ingress --splice -c 2 -p 1 -a -d 30 Signed-off-by: Jiayuan Chen --- .../selftests/bpf/benchs/bench_sockmap.c | 57 ++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/benchs/bench_sockmap.c b/tools/testing/selftests/bpf/benchs/bench_sockmap.c index cfc072aa7fff7..ffcf5ad8cafa0 100644 --- a/tools/testing/selftests/bpf/benchs/bench_sockmap.c +++ b/tools/testing/selftests/bpf/benchs/bench_sockmap.c @@ -7,6 +7,9 @@ #include #include #include +#include +#include +#include #include #include "bench.h" #include "bench_sockmap_prog.skel.h" @@ -46,6 +49,8 @@ enum SOCKMAP_ARG_FLAG { ARG_CTL_RX_STRP, ARG_CONSUMER_DELAY_TIME, ARG_PRODUCER_DURATION, + ARG_CTL_SPLICE, + ARG_CTL_VERIFY, }; #define TXMODE_NORMAL() \ @@ -110,6 +115,9 @@ static struct socmap_ctx { int delay_consumer; int prod_run_time; int strp_size; + bool use_splice; + bool verify; + int pipefd[2]; } ctx = { .prod_send = 0, .user_read = 0, @@ -119,6 +127,9 @@ static struct socmap_ctx { .delay_consumer = 0, .prod_run_time = 0, .strp_size = 0, + .use_splice = false, + .verify = false, + .pipefd = {-1, -1}, }; static void bench_sockmap_prog_destroy(void) @@ -130,6 +141,11 @@ static void bench_sockmap_prog_destroy(void) close(ctx.fds[i]); } + if (ctx.pipefd[0] >= 0) + close(ctx.pipefd[0]); + if (ctx.pipefd[1] >= 0) + close(ctx.pipefd[1]); + bench_sockmap_prog__destroy(ctx.skel); } @@ -320,6 +336,7 @@ static int setup_tx_sockmap(void) static void setup(void) { + int rcvbuf = 16 * 1024 * 1024; int err; ctx.skel = bench_sockmap_prog__open_and_load(); @@ -350,6 +367,18 @@ static void setup(void) goto err; } + if (ctx.use_splice) { + if (pipe(ctx.pipefd)) { + fprintf(stderr, "pipe error:%d\n", errno); + goto err; + } + } + + setsockopt(ctx.c2, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)); + + if (ctx.use_splice) + set_non_block(ctx.c2, true); + return; err: @@ -368,6 +397,8 @@ static void measure(struct bench_res *res) static void verify_data(int *check_pos, char *buf, int rcv) { + if (!ctx.verify) + return; for (int i = 0 ; i < rcv; i++) { if (buf[i] != snd_data[(*check_pos) % DATA_REPEAT_SIZE]) { fprintf(stderr, "verify data fail"); @@ -388,6 +419,9 @@ static void *consumer(void *input) char *buf = malloc(recv_buf_size); int delay_read = ctx.delay_consumer; + printf("cons[%d] started, tid=%ld cpu=%d\n", + tid, syscall(SYS_gettid), sched_getcpu()); + if (!buf) { fprintf(stderr, "fail to init read buffer"); return NULL; @@ -419,7 +453,15 @@ static void *consumer(void *input) } /* read real endpoint by consumer 0 */ atomic_inc(&ctx.read_calls); - rcv = read(ctx.c2, buf, recv_buf_size); + if (ctx.use_splice) { + rcv = splice(ctx.c2, NULL, ctx.pipefd[1], + NULL, recv_buf_size, + SPLICE_F_NONBLOCK); + if (rcv > 0) + rcv = read(ctx.pipefd[0], buf, rcv); + } else { + rcv = read(ctx.c2, buf, recv_buf_size); + } if (rcv < 0 && errno != EAGAIN) { fprintf(stderr, "%s fail to read c2 %d\n", __func__, errno); return NULL; @@ -440,6 +482,9 @@ static void *producer(void *input) int target; FILE *file; + printf("prod started, tid=%ld cpu=%d\n", + syscall(SYS_gettid), sched_getcpu()); + file = tmpfile(); if (!file) { fprintf(stderr, "create file for sendfile"); @@ -554,6 +599,10 @@ static const struct argp_option opts[] = { "delay consumer start"}, { "producer-duration", ARG_PRODUCER_DURATION, "SEC", 0, "producer duration"}, + { "splice", ARG_CTL_SPLICE, NULL, 0, + "use splice instead of read for consumer"}, + { "verify", ARG_CTL_VERIFY, NULL, 0, + "verify received data correctness"}, {}, }; @@ -572,6 +621,12 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) case ARG_CTL_RX_STRP: ctx.strp_size = strtol(arg, NULL, 10); break; + case ARG_CTL_SPLICE: + ctx.use_splice = true; + break; + case ARG_CTL_VERIFY: + ctx.verify = true; + break; default: return ARGP_ERR_UNKNOWN; }