Skip to content

Commit 7da5c11

Browse files
mrpre authored and Kernel Patches Daemon committed
bpf, sockmap: Fix FIONREAD for sockmap
A socket using sockmap has its own independent receive queue: ingress_msg. This queue may contain data from its own protocol stack or from other sockets. Therefore, for sockmap, relying solely on copied_seq and rcv_nxt to calculate FIONREAD is not enough. This patch adds a new ingress_size field in the psock structure to record the data length in ingress_msg. Additionally, we implement new ioctl interfaces for TCP and UDP to intercept FIONREAD operations. While Unix and VSOCK also support sockmap and have similar FIONREAD calculation issues, fixing them would require more extensive changes (please let me know if modifications are needed). I believe it's not appropriate to include those changes under this fix patch. Previous work by John Fastabend made some efforts towards FIONREAD support: commit e5c6de5 ("bpf, sockmap: Incorrectly handling copied_seq") Although the current patch is based on the previous work by John Fastabend, it is acceptable for our Fixes tag to point to the same commit. FD1:read() -- FD1->copied_seq++ | [read data] | [enqueue data] v [sockmap] -> ingress to self -> ingress_msg queue FD1 native stack ------> ^ -- FD1->rcv_nxt++ -> redirect to other | [enqueue data] | | | ingress to FD1 v ^ ... | [sockmap] FD2 native stack Fixes: 04919be ("tcp: Introduce tcp_read_skb()") Signed-off-by: Jiayuan Chen <[email protected]>
1 parent 292fdb3 commit 7da5c11

File tree

4 files changed

+90
-5
lines changed

4 files changed

+90
-5
lines changed

include/linux/skmsg.h

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ struct sk_psock {
9797
struct sk_buff_head ingress_skb;
9898
struct list_head ingress_msg;
9999
spinlock_t ingress_lock;
100+
ssize_t ingress_size;
100101
unsigned long state;
101102
struct list_head link;
102103
spinlock_t link_lock;
@@ -321,6 +322,16 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
321322
kfree_skb(skb);
322323
}
323324

325+
static inline ssize_t sk_psock_get_msg_size(struct sk_psock *psock)
326+
{
327+
return psock->ingress_size;
328+
}
329+
330+
/* Adjust the ingress_msg byte counter by @diff (negative to decrement).
 * NOTE(review): updates appear to be serialized by ingress_lock or the
 * socket lock at the call sites — confirm before relying on this.
 */
static inline void sk_psock_inc_msg_size(struct sk_psock *psock, ssize_t diff)
{
	psock->ingress_size += diff;
}
334+
324335
static inline bool sk_psock_queue_msg(struct sk_psock *psock,
325336
struct sk_msg *msg)
326337
{
@@ -329,6 +340,7 @@ static inline bool sk_psock_queue_msg(struct sk_psock *psock,
329340
spin_lock_bh(&psock->ingress_lock);
330341
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
331342
list_add_tail(&msg->list, &psock->ingress_msg);
343+
sk_psock_inc_msg_size(psock, msg->sg.size);
332344
ret = true;
333345
} else {
334346
sk_msg_free(psock->sk, msg);
@@ -345,8 +357,10 @@ static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
345357

346358
spin_lock_bh(&psock->ingress_lock);
347359
msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
348-
if (msg)
360+
if (msg) {
349361
list_del(&msg->list);
362+
sk_psock_inc_msg_size(psock, -msg->sg.size);
363+
}
350364
spin_unlock_bh(&psock->ingress_lock);
351365
return msg;
352366
}
@@ -523,6 +537,36 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
523537
return !!psock->saved_data_ready;
524538
}
525539

540+
/* Total bytes queued on sk's psock ingress_msg list, or 0 when no
 * psock is attached.  Used to answer FIONREAD for sockmap TCP sockets.
 */
static inline ssize_t sk_psock_msg_inq(struct sock *sk)
{
	struct sk_psock *psock;
	ssize_t inq;

	psock = sk_psock_get(sk);
	if (unlikely(!psock))
		return 0;

	inq = sk_psock_get_msg_size(psock);
	sk_psock_put(sk, psock);
	return inq;
}
552+
553+
/* for udp */
554+
static inline ssize_t sk_msg_first_length(struct sock *sk)
555+
{
556+
struct sk_psock *psock;
557+
struct sk_msg *msg;
558+
ssize_t inq = 0;
559+
560+
psock = sk_psock_get(sk);
561+
if (likely(psock)) {
562+
msg = sk_psock_peek_msg(psock);
563+
if (msg)
564+
inq = msg->sg.size;
565+
sk_psock_put(sk, psock);
566+
}
567+
return inq;
568+
}
569+
526570
#if IS_ENABLED(CONFIG_NET_SOCK_MSG)
527571

528572
#define BPF_F_STRPARSER (1UL << 1)

net/core/skmsg.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,7 @@ int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg
455455
atomic_sub(copy, &sk->sk_rmem_alloc);
456456
}
457457
msg_rx->sg.size -= copy;
458+
sk_psock_inc_msg_size(psock, -copy);
458459

459460
if (!sge->length) {
460461
sk_msg_iter_var_next(i);
@@ -819,9 +820,11 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
819820
list_del(&msg->list);
820821
if (!msg->skb)
821822
atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
823+
sk_psock_inc_msg_size(psock, -((ssize_t)msg->sg.size));
822824
sk_msg_free(psock->sk, msg);
823825
kfree(msg);
824826
}
827+
WARN_ON_ONCE(psock->ingress_size);
825828
}
826829

827830
static void __sk_psock_zap_ingress(struct sk_psock *psock)

net/ipv4/tcp_bpf.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include <net/inet_common.h>
1212
#include <net/tls.h>
13+
#include <asm/ioctls.h>
1314

1415
void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
1516
{
@@ -332,6 +333,25 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
332333
return copied;
333334
}
334335

336+
static int tcp_bpf_ioctl(struct sock *sk, int cmd, int *karg)
337+
{
338+
bool slow;
339+
340+
/* we only care about FIONREAD */
341+
if (cmd != SIOCINQ)
342+
return tcp_ioctl(sk, cmd, karg);
343+
344+
/* works similar as tcp_ioctl */
345+
if (sk->sk_state == TCP_LISTEN)
346+
return -EINVAL;
347+
348+
slow = lock_sock_fast(sk);
349+
*karg = sk_psock_msg_inq(sk);
350+
unlock_sock_fast(sk, slow);
351+
352+
return 0;
353+
}
354+
335355
static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
336356
int flags, int *addr_len)
337357
{
@@ -610,6 +630,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
610630
prot[TCP_BPF_BASE].close = sock_map_close;
611631
prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
612632
prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable;
633+
prot[TCP_BPF_BASE].ioctl = tcp_bpf_ioctl;
613634

614635
prot[TCP_BPF_TX] = prot[TCP_BPF_BASE];
615636
prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg;

net/ipv4/udp_bpf.c

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <net/sock.h>
66
#include <net/udp.h>
77
#include <net/inet_common.h>
8+
#include <asm/ioctls.h>
89

910
#include "udp_impl.h"
1011

@@ -111,12 +112,28 @@ enum {
111112
static DEFINE_SPINLOCK(udpv6_prot_lock);
112113
static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS];
113114

115+
static int udp_bpf_ioctl(struct sock *sk, int cmd, int *karg)
116+
{
117+
/* we only care about FIONREAD */
118+
if (cmd != SIOCINQ)
119+
return udp_ioctl(sk, cmd, karg);
120+
121+
/* works similar as udp_ioctl.
122+
* man udp(7): "FIONREAD (SIOCINQ): Returns the size of the next
123+
* pending datagram in the integer in bytes, or 0 when no datagram
124+
* is pending."
125+
*/
126+
*karg = sk_msg_first_length(sk);
127+
return 0;
128+
}
129+
114130
static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
115131
{
116-
*prot = *base;
117-
prot->close = sock_map_close;
118-
prot->recvmsg = udp_bpf_recvmsg;
119-
prot->sock_is_readable = sk_msg_is_readable;
132+
*prot = *base;
133+
prot->close = sock_map_close;
134+
prot->recvmsg = udp_bpf_recvmsg;
135+
prot->sock_is_readable = sk_msg_is_readable;
136+
prot->ioctl = udp_bpf_ioctl;
120137
}
121138

122139
static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)

0 commit comments

Comments
 (0)