Skip to content

Commit 09322d9

Browse files
mrpre authored and Kernel Patches Daemon committed
bpf, sockmap: Fix FIONREAD for sockmap
A socket using sockmap has its own independent receive queue: ingress_msg. This queue may contain data from its own protocol stack or from other sockets. Therefore, for sockmap, relying solely on copied_seq and rcv_nxt to calculate FIONREAD is not enough. This patch adds a new ingress_size field in the psock structure to record the data length in ingress_msg. Additionally, we implement new ioctl interfaces for TCP and UDP to intercept FIONREAD operations. While Unix and VSOCK also support sockmap and have similar FIONREAD calculation issues, fixing them would require more extensive changes (please let me know if modifications are needed). I believe it's not appropriate to include those changes under this fix patch. Previous work by John Fastabend made some efforts towards FIONREAD support: commit e5c6de5 ("bpf, sockmap: Incorrectly handling copied_seq") Although the current patch is based on the previous work by John Fastabend, it is acceptable for our Fixes tag to point to the same commit. FD1:read() -- FD1->copied_seq++ | [read data] | [enqueue data] v [sockmap] -> ingress to self -> ingress_msg queue FD1 native stack ------> ^ -- FD1->rcv_nxt++ -> redirect to other | [enqueue data] | | | ingress to FD1 v ^ ... | [sockmap] FD2 native stack Fixes: 04919be ("tcp: Introduce tcp_read_skb()") Signed-off-by: Jiayuan Chen <[email protected]>
1 parent 623bab9 commit 09322d9

File tree

4 files changed

+90
-5
lines changed

4 files changed

+90
-5
lines changed

include/linux/skmsg.h

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ struct sk_psock {
9797
struct sk_buff_head ingress_skb;
9898
struct list_head ingress_msg;
9999
spinlock_t ingress_lock;
100+
ssize_t ingress_size;
100101
unsigned long state;
101102
struct list_head link;
102103
spinlock_t link_lock;
@@ -319,6 +320,16 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
319320
kfree_skb(skb);
320321
}
321322

323+
static inline ssize_t sk_psock_get_msg_size(struct sk_psock *psock)
324+
{
325+
return psock->ingress_size;
326+
}
327+
328+
/* Adjust the cached ingress_msg byte count; @diff may be negative when
 * data is consumed or dequeued.
 */
static inline void sk_psock_inc_msg_size(struct sk_psock *psock, ssize_t diff)
{
	psock->ingress_size += diff;
}
332+
322333
static inline bool sk_psock_queue_msg(struct sk_psock *psock,
323334
struct sk_msg *msg)
324335
{
@@ -327,6 +338,7 @@ static inline bool sk_psock_queue_msg(struct sk_psock *psock,
327338
spin_lock_bh(&psock->ingress_lock);
328339
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
329340
list_add_tail(&msg->list, &psock->ingress_msg);
341+
sk_psock_inc_msg_size(psock, msg->sg.size);
330342
ret = true;
331343
} else {
332344
sk_msg_free(psock->sk, msg);
@@ -343,8 +355,10 @@ static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
343355

344356
spin_lock_bh(&psock->ingress_lock);
345357
msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
346-
if (msg)
358+
if (msg) {
347359
list_del(&msg->list);
360+
sk_psock_inc_msg_size(psock, -msg->sg.size);
361+
}
348362
spin_unlock_bh(&psock->ingress_lock);
349363
return msg;
350364
}
@@ -521,6 +535,36 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
521535
return !!psock->saved_data_ready;
522536
}
523537

538+
/* Number of readable bytes sitting in the psock ingress_msg queue, or 0
 * when the socket has no psock attached (it was removed from the map).
 */
static inline ssize_t sk_psock_msg_inq(struct sock *sk)
{
	struct sk_psock *psock = sk_psock_get(sk);
	ssize_t bytes = 0;

	if (likely(psock)) {
		bytes = sk_psock_get_msg_size(psock);
		sk_psock_put(sk, psock);
	}
	return bytes;
}
550+
551+
/* for udp */
552+
static inline ssize_t sk_msg_first_length(struct sock *sk)
553+
{
554+
struct sk_psock *psock;
555+
struct sk_msg *msg;
556+
ssize_t inq = 0;
557+
558+
psock = sk_psock_get(sk);
559+
if (likely(psock)) {
560+
msg = sk_psock_peek_msg(psock);
561+
if (msg)
562+
inq = msg->sg.size;
563+
sk_psock_put(sk, psock);
564+
}
565+
return inq;
566+
}
567+
524568
#if IS_ENABLED(CONFIG_NET_SOCK_MSG)
525569

526570
#define BPF_F_STRPARSER (1UL << 1)

net/core/skmsg.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -451,6 +451,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
451451
atomic_sub(copy, &sk->sk_rmem_alloc);
452452
}
453453
msg_rx->sg.size -= copy;
454+
sk_psock_inc_msg_size(psock, -copy);
454455

455456
if (!sge->length) {
456457
sk_msg_iter_var_next(i);
@@ -801,9 +802,11 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
801802
list_del(&msg->list);
802803
if (!msg->skb)
803804
atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
805+
sk_psock_inc_msg_size(psock, -((ssize_t)msg->sg.size));
804806
sk_msg_free(psock->sk, msg);
805807
kfree(msg);
806808
}
809+
WARN_ON_ONCE(psock->ingress_size);
807810
}
808811

809812
static void __sk_psock_zap_ingress(struct sk_psock *psock)

net/ipv4/tcp_bpf.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include <net/inet_common.h>
1212
#include <net/tls.h>
13+
#include <asm/ioctls.h>
1314

1415
void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
1516
{
@@ -331,6 +332,25 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
331332
return copied;
332333
}
333334

335+
static int tcp_bpf_ioctl(struct sock *sk, int cmd, int *karg)
336+
{
337+
bool slow;
338+
339+
/* we only care about FIONREAD */
340+
if (cmd != SIOCINQ)
341+
return tcp_ioctl(sk, cmd, karg);
342+
343+
/* works similar as tcp_ioctl */
344+
if (sk->sk_state == TCP_LISTEN)
345+
return -EINVAL;
346+
347+
slow = lock_sock_fast(sk);
348+
*karg = sk_psock_msg_inq(sk);
349+
unlock_sock_fast(sk, slow);
350+
351+
return 0;
352+
}
353+
334354
static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
335355
int flags, int *addr_len)
336356
{
@@ -609,6 +629,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
609629
prot[TCP_BPF_BASE].close = sock_map_close;
610630
prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
611631
prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable;
632+
prot[TCP_BPF_BASE].ioctl = tcp_bpf_ioctl;
612633

613634
prot[TCP_BPF_TX] = prot[TCP_BPF_BASE];
614635
prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg;

net/ipv4/udp_bpf.c

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <net/sock.h>
66
#include <net/udp.h>
77
#include <net/inet_common.h>
8+
#include <asm/ioctls.h>
89

910
#include "udp_impl.h"
1011

@@ -111,12 +112,28 @@ enum {
111112
static DEFINE_SPINLOCK(udpv6_prot_lock);
112113
static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS];
113114

115+
static int udp_bpf_ioctl(struct sock *sk, int cmd, int *karg)
116+
{
117+
/* we only care about FIONREAD */
118+
if (cmd != SIOCINQ)
119+
return udp_ioctl(sk, cmd, karg);
120+
121+
/* works similar as udp_ioctl.
122+
* man udp(7): "FIONREAD (SIOCINQ): Returns the size of the next
123+
* pending datagram in the integer in bytes, or 0 when no datagram
124+
* is pending."
125+
*/
126+
*karg = sk_msg_first_length(sk);
127+
return 0;
128+
}
129+
114130
static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
115131
{
116-
*prot = *base;
117-
prot->close = sock_map_close;
118-
prot->recvmsg = udp_bpf_recvmsg;
119-
prot->sock_is_readable = sk_msg_is_readable;
132+
*prot = *base;
133+
prot->close = sock_map_close;
134+
prot->recvmsg = udp_bpf_recvmsg;
135+
prot->sock_is_readable = sk_msg_is_readable;
136+
prot->ioctl = udp_bpf_ioctl;
120137
}
121138

122139
static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)

0 commit comments

Comments
 (0)