Skip to content

Commit e5c6de5

Browse files
jrfastab
authored and borkmann committed
bpf, sockmap: Incorrectly handling copied_seq
The read_skb() logic is incrementing the tcp->copied_seq which is used for among other things calculating how many outstanding bytes can be read by the application. This results in application errors, if the application does an ioctl(FIONREAD) we return zero because this is calculated from the copied_seq value. To fix this we move tcp->copied_seq accounting into the recv handler so that we update these when the recvmsg() hook is called and data is in fact copied into user buffers. This gives an accurate FIONREAD value as expected and improves ACK handling. Before we were calling the tcp_rcv_space_adjust() which would update 'number of bytes copied to user in last RTT' which is wrong for programs returning SK_PASS. The bytes are only copied to the user when recvmsg is handled. Doing the fix for recvmsg is straightforward, but fixing redirect and SK_DROP pkts is a bit tricker. Build a tcp_psock_eat() helper and then call this from skmsg handlers. This fixes another issue where a broken socket with a BPF program doing a resubmit could hang the receiver. This happened because although read_skb() consumed the skb through sock_drop() it did not update the copied_seq. Now if a single reccv socket is redirecting to many sockets (for example for lb) the receiver sk will be hung even though we might expect it to continue. The hang comes from not updating the copied_seq numbers and memory pressure resulting from that. We have a slight layer problem of calling tcp_eat_skb even if its not a TCP socket. To fix we could refactor and create per type receiver handlers. I decided this is more work than we want in the fix and we already have some small tweaks depending on caller that use the helper skb_bpf_strparser(). So we extend that a bit and always set the strparser bit when it is in use and then we can gate the seq_copied updates on this. 
Fixes: 04919be ("tcp: Introduce tcp_read_skb()") Signed-off-by: John Fastabend <[email protected]> Signed-off-by: Daniel Borkmann <[email protected]> Reviewed-by: Jakub Sitnicki <[email protected]> Link: https://lore.kernel.org/bpf/[email protected]
1 parent 6df7f76 commit e5c6de5

File tree

4 files changed

+45
-18
lines changed

4 files changed

+45
-18
lines changed

include/net/tcp.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1470,6 +1470,8 @@ static inline void tcp_adjust_rcv_ssthresh(struct sock *sk)
14701470
}
14711471

14721472
void tcp_cleanup_rbuf(struct sock *sk, int copied);
1473+
void __tcp_cleanup_rbuf(struct sock *sk, int copied);
1474+
14731475

14741476
/* We provision sk_rcvbuf around 200% of sk_rcvlowat.
14751477
* If 87.5 % (7/8) of the space has been consumed, we want to override
@@ -2326,6 +2328,14 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
23262328
void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
23272329
#endif /* CONFIG_BPF_SYSCALL */
23282330

2331+
#ifdef CONFIG_INET
/* Account for an skb consumed outside of recvmsg() (sockmap redirect or
 * drop): advances tp->copied_seq and performs rcv-buffer cleanup so
 * FIONREAD and ACK generation stay accurate.  Defined in tcp_bpf.c.
 */
void tcp_eat_skb(struct sock *sk, struct sk_buff *skb);
#else
/* No-op stub when INET (and thus TCP) support is compiled out. */
static inline void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
{
}
#endif
2338+
23292339
int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress,
23302340
struct sk_msg *msg, u32 bytes, int flags);
23312341
#endif /* CONFIG_NET_SOCK_MSG */

net/core/skmsg.c

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -979,10 +979,8 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
979979
err = -EIO;
980980
sk_other = psock->sk;
981981
if (sock_flag(sk_other, SOCK_DEAD) ||
982-
!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
983-
skb_bpf_redirect_clear(skb);
982+
!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
984983
goto out_free;
985-
}
986984

987985
skb_bpf_set_ingress(skb);
988986

@@ -1011,18 +1009,19 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
10111009
err = 0;
10121010
}
10131011
spin_unlock_bh(&psock->ingress_lock);
1014-
if (err < 0) {
1015-
skb_bpf_redirect_clear(skb);
1012+
if (err < 0)
10161013
goto out_free;
1017-
}
10181014
}
10191015
break;
10201016
case __SK_REDIRECT:
1017+
tcp_eat_skb(psock->sk, skb);
10211018
err = sk_psock_skb_redirect(psock, skb);
10221019
break;
10231020
case __SK_DROP:
10241021
default:
10251022
out_free:
1023+
skb_bpf_redirect_clear(skb);
1024+
tcp_eat_skb(psock->sk, skb);
10261025
sock_drop(psock->sk, skb);
10271026
}
10281027

@@ -1067,8 +1066,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
10671066
skb_dst_drop(skb);
10681067
skb_bpf_redirect_clear(skb);
10691068
ret = bpf_prog_run_pin_on_cpu(prog, skb);
1070-
if (ret == SK_PASS)
1071-
skb_bpf_set_strparser(skb);
1069+
skb_bpf_set_strparser(skb);
10721070
ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
10731071
skb->sk = NULL;
10741072
}
@@ -1176,6 +1174,7 @@ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb)
11761174
psock = sk_psock(sk);
11771175
if (unlikely(!psock)) {
11781176
len = 0;
1177+
tcp_eat_skb(sk, skb);
11791178
sock_drop(sk, skb);
11801179
goto out;
11811180
}

net/ipv4/tcp.c

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1571,7 +1571,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
15711571
* calculation of whether or not we must ACK for the sake of
15721572
* a window update.
15731573
*/
1574-
static void __tcp_cleanup_rbuf(struct sock *sk, int copied)
1574+
void __tcp_cleanup_rbuf(struct sock *sk, int copied)
15751575
{
15761576
struct tcp_sock *tp = tcp_sk(sk);
15771577
bool time_to_ack = false;
@@ -1786,14 +1786,6 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
17861786
break;
17871787
}
17881788
}
1789-
WRITE_ONCE(tp->copied_seq, seq);
1790-
1791-
tcp_rcv_space_adjust(sk);
1792-
1793-
/* Clean up data we have read: This will do ACK frames. */
1794-
if (copied > 0)
1795-
__tcp_cleanup_rbuf(sk, copied);
1796-
17971789
return copied;
17981790
}
17991791
EXPORT_SYMBOL(tcp_read_skb);

net/ipv4/tcp_bpf.c

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,24 @@
1111
#include <net/inet_common.h>
1212
#include <net/tls.h>
1313

14+
void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
15+
{
16+
struct tcp_sock *tcp;
17+
int copied;
18+
19+
if (!skb || !skb->len || !sk_is_tcp(sk))
20+
return;
21+
22+
if (skb_bpf_strparser(skb))
23+
return;
24+
25+
tcp = tcp_sk(sk);
26+
copied = tcp->copied_seq + skb->len;
27+
WRITE_ONCE(tcp->copied_seq, copied);
28+
tcp_rcv_space_adjust(sk);
29+
__tcp_cleanup_rbuf(sk, skb->len);
30+
}
31+
1432
static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
1533
struct sk_msg *msg, u32 apply_bytes, int flags)
1634
{
@@ -198,8 +216,10 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
198216
int flags,
199217
int *addr_len)
200218
{
219+
struct tcp_sock *tcp = tcp_sk(sk);
220+
u32 seq = tcp->copied_seq;
201221
struct sk_psock *psock;
202-
int copied;
222+
int copied = 0;
203223

204224
if (unlikely(flags & MSG_ERRQUEUE))
205225
return inet_recv_error(sk, msg, len, addr_len);
@@ -244,9 +264,11 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
244264

245265
if (is_fin) {
246266
copied = 0;
267+
seq++;
247268
goto out;
248269
}
249270
}
271+
seq += copied;
250272
if (!copied) {
251273
long timeo;
252274
int data;
@@ -284,6 +306,10 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
284306
copied = -EAGAIN;
285307
}
286308
out:
309+
WRITE_ONCE(tcp->copied_seq, seq);
310+
tcp_rcv_space_adjust(sk);
311+
if (copied > 0)
312+
__tcp_cleanup_rbuf(sk, copied);
287313
release_sock(sk);
288314
sk_psock_put(sk, psock);
289315
return copied;

0 commit comments

Comments
 (0)