Skip to content

Commit 0c615f1

Browse files
committed
Daniel Borkmann says: ==================== pull-request: bpf 2023-05-24 We've added 19 non-merge commits during the last 10 day(s) which contain a total of 20 files changed, 738 insertions(+), 448 deletions(-). The main changes are: 1) Batch of BPF sockmap fixes found when running against NGINX TCP tests, from John Fastabend. 2) Fix a memleak in the LRU{,_PERCPU} hash map when bucket locking fails, from Anton Protopopov. 3) Init the BPF offload table earlier than just late_initcall, from Jakub Kicinski. 4) Fix ctx access mask generation for 32-bit narrow loads of 64-bit fields, from Will Deacon. 5) Remove a now unsupported __fallthrough in BPF samples, from Andrii Nakryiko. 6) Fix a typo in pkg-config call for building sign-file, from Jeremy Sowden. * tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf: bpf, sockmap: Test progs verifier error with latest clang bpf, sockmap: Test FIONREAD returns correct bytes in rx buffer with drops bpf, sockmap: Test FIONREAD returns correct bytes in rx buffer bpf, sockmap: Test shutdown() correctly exits epoll and recv()=0 bpf, sockmap: Build helper to create connected socket pair bpf, sockmap: Pull socket helpers out of listen test for general use bpf, sockmap: Incorrectly handling copied_seq bpf, sockmap: Wake up polling after data copy bpf, sockmap: TCP data stall on recv before accept bpf, sockmap: Handle fin correctly bpf, sockmap: Improved check for empty queue bpf, sockmap: Reschedule is now done through backlog bpf, sockmap: Convert schedule_work into delayed_work bpf, sockmap: Pass skb ownership through read_skb bpf: fix a memory leak in the LRU and LRU_PERCPU hash maps bpf: Fix mask generation for 32-bit narrow loads of 64-bit fields samples/bpf: Drop unnecessary fallthrough bpf: netdev: init the offload table earlier selftests/bpf: Fix pkg-config call building sign-file ==================== Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Jakub Kicinski <[email protected]>
2 parents 878ecb0 + f726e03 commit 0c615f1

File tree

20 files changed

+738
-448
lines changed

20 files changed

+738
-448
lines changed

include/linux/skmsg.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@ struct sk_psock_link {
7171
};
7272

7373
struct sk_psock_work_state {
74-
struct sk_buff *skb;
7574
u32 len;
7675
u32 off;
7776
};
@@ -105,7 +104,7 @@ struct sk_psock {
105104
struct proto *sk_proto;
106105
struct mutex work_mutex;
107106
struct sk_psock_work_state work_state;
108-
struct work_struct work;
107+
struct delayed_work work;
109108
struct rcu_work rwork;
110109
};
111110

include/net/tcp.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1470,6 +1470,8 @@ static inline void tcp_adjust_rcv_ssthresh(struct sock *sk)
14701470
}
14711471

14721472
void tcp_cleanup_rbuf(struct sock *sk, int copied);
1473+
void __tcp_cleanup_rbuf(struct sock *sk, int copied);
1474+
14731475

14741476
/* We provision sk_rcvbuf around 200% of sk_rcvlowat.
14751477
* If 87.5 % (7/8) of the space has been consumed, we want to override
@@ -2326,6 +2328,14 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
23262328
void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
23272329
#endif /* CONFIG_BPF_SYSCALL */
23282330

2331+
#ifdef CONFIG_INET
2332+
void tcp_eat_skb(struct sock *sk, struct sk_buff *skb);
2333+
#else
2334+
static inline void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
2335+
{
2336+
}
2337+
#endif
2338+
23292339
int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress,
23302340
struct sk_msg *msg, u32 bytes, int flags);
23312341
#endif /* CONFIG_NET_SOCK_MSG */

kernel/bpf/hashtab.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1215,7 +1215,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
12151215

12161216
ret = htab_lock_bucket(htab, b, hash, &flags);
12171217
if (ret)
1218-
return ret;
1218+
goto err_lock_bucket;
12191219

12201220
l_old = lookup_elem_raw(head, hash, key, key_size);
12211221

@@ -1236,6 +1236,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
12361236
err:
12371237
htab_unlock_bucket(htab, b, hash, flags);
12381238

1239+
err_lock_bucket:
12391240
if (ret)
12401241
htab_lru_push_free(htab, l_new);
12411242
else if (l_old)
@@ -1338,7 +1339,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
13381339

13391340
ret = htab_lock_bucket(htab, b, hash, &flags);
13401341
if (ret)
1341-
return ret;
1342+
goto err_lock_bucket;
13421343

13431344
l_old = lookup_elem_raw(head, hash, key, key_size);
13441345

@@ -1361,6 +1362,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
13611362
ret = 0;
13621363
err:
13631364
htab_unlock_bucket(htab, b, hash, flags);
1365+
err_lock_bucket:
13641366
if (l_new)
13651367
bpf_lru_push_free(&htab->lru, &l_new->lru_node);
13661368
return ret;

kernel/bpf/offload.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -859,4 +859,4 @@ static int __init bpf_offload_init(void)
859859
return rhashtable_init(&offdevs, &offdevs_params);
860860
}
861861

862-
late_initcall(bpf_offload_init);
862+
core_initcall(bpf_offload_init);

kernel/bpf/verifier.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17033,7 +17033,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
1703317033
insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
1703417034
insn->dst_reg,
1703517035
shift);
17036-
insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
17036+
insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
1703717037
(1ULL << size * 8) - 1);
1703817038
}
1703917039
}

net/core/skmsg.c

Lines changed: 38 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -481,8 +481,6 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
481481
msg_rx = sk_psock_peek_msg(psock);
482482
}
483483
out:
484-
if (psock->work_state.skb && copied > 0)
485-
schedule_work(&psock->work);
486484
return copied;
487485
}
488486
EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
@@ -624,42 +622,33 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
624622

625623
static void sk_psock_skb_state(struct sk_psock *psock,
626624
struct sk_psock_work_state *state,
627-
struct sk_buff *skb,
628625
int len, int off)
629626
{
630627
spin_lock_bh(&psock->ingress_lock);
631628
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
632-
state->skb = skb;
633629
state->len = len;
634630
state->off = off;
635-
} else {
636-
sock_drop(psock->sk, skb);
637631
}
638632
spin_unlock_bh(&psock->ingress_lock);
639633
}
640634

641635
static void sk_psock_backlog(struct work_struct *work)
642636
{
643-
struct sk_psock *psock = container_of(work, struct sk_psock, work);
637+
struct delayed_work *dwork = to_delayed_work(work);
638+
struct sk_psock *psock = container_of(dwork, struct sk_psock, work);
644639
struct sk_psock_work_state *state = &psock->work_state;
645640
struct sk_buff *skb = NULL;
641+
u32 len = 0, off = 0;
646642
bool ingress;
647-
u32 len, off;
648643
int ret;
649644

650645
mutex_lock(&psock->work_mutex);
651-
if (unlikely(state->skb)) {
652-
spin_lock_bh(&psock->ingress_lock);
653-
skb = state->skb;
646+
if (unlikely(state->len)) {
654647
len = state->len;
655648
off = state->off;
656-
state->skb = NULL;
657-
spin_unlock_bh(&psock->ingress_lock);
658649
}
659-
if (skb)
660-
goto start;
661650

662-
while ((skb = skb_dequeue(&psock->ingress_skb))) {
651+
while ((skb = skb_peek(&psock->ingress_skb))) {
663652
len = skb->len;
664653
off = 0;
665654
if (skb_bpf_strparser(skb)) {
@@ -668,7 +657,6 @@ static void sk_psock_backlog(struct work_struct *work)
668657
off = stm->offset;
669658
len = stm->full_len;
670659
}
671-
start:
672660
ingress = skb_bpf_ingress(skb);
673661
skb_bpf_redirect_clear(skb);
674662
do {
@@ -678,22 +666,28 @@ static void sk_psock_backlog(struct work_struct *work)
678666
len, ingress);
679667
if (ret <= 0) {
680668
if (ret == -EAGAIN) {
681-
sk_psock_skb_state(psock, state, skb,
682-
len, off);
669+
sk_psock_skb_state(psock, state, len, off);
670+
671+
/* Delay slightly to prioritize any
672+
* other work that might be here.
673+
*/
674+
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
675+
schedule_delayed_work(&psock->work, 1);
683676
goto end;
684677
}
685678
/* Hard errors break pipe and stop xmit. */
686679
sk_psock_report_error(psock, ret ? -ret : EPIPE);
687680
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
688-
sock_drop(psock->sk, skb);
689681
goto end;
690682
}
691683
off += ret;
692684
len -= ret;
693685
} while (len);
694686

695-
if (!ingress)
687+
skb = skb_dequeue(&psock->ingress_skb);
688+
if (!ingress) {
696689
kfree_skb(skb);
690+
}
697691
}
698692
end:
699693
mutex_unlock(&psock->work_mutex);
@@ -734,7 +728,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
734728
INIT_LIST_HEAD(&psock->link);
735729
spin_lock_init(&psock->link_lock);
736730

737-
INIT_WORK(&psock->work, sk_psock_backlog);
731+
INIT_DELAYED_WORK(&psock->work, sk_psock_backlog);
738732
mutex_init(&psock->work_mutex);
739733
INIT_LIST_HEAD(&psock->ingress_msg);
740734
spin_lock_init(&psock->ingress_lock);
@@ -786,11 +780,6 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock)
786780
skb_bpf_redirect_clear(skb);
787781
sock_drop(psock->sk, skb);
788782
}
789-
kfree_skb(psock->work_state.skb);
790-
/* We null the skb here to ensure that calls to sk_psock_backlog
791-
* do not pick up the free'd skb.
792-
*/
793-
psock->work_state.skb = NULL;
794783
__sk_psock_purge_ingress_msg(psock);
795784
}
796785

@@ -809,7 +798,6 @@ void sk_psock_stop(struct sk_psock *psock)
809798
spin_lock_bh(&psock->ingress_lock);
810799
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
811800
sk_psock_cork_free(psock);
812-
__sk_psock_zap_ingress(psock);
813801
spin_unlock_bh(&psock->ingress_lock);
814802
}
815803

@@ -823,7 +811,8 @@ static void sk_psock_destroy(struct work_struct *work)
823811

824812
sk_psock_done_strp(psock);
825813

826-
cancel_work_sync(&psock->work);
814+
cancel_delayed_work_sync(&psock->work);
815+
__sk_psock_zap_ingress(psock);
827816
mutex_destroy(&psock->work_mutex);
828817

829818
psock_progs_drop(&psock->progs);
@@ -938,7 +927,7 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb)
938927
}
939928

940929
skb_queue_tail(&psock_other->ingress_skb, skb);
941-
schedule_work(&psock_other->work);
930+
schedule_delayed_work(&psock_other->work, 0);
942931
spin_unlock_bh(&psock_other->ingress_lock);
943932
return 0;
944933
}
@@ -990,10 +979,8 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
990979
err = -EIO;
991980
sk_other = psock->sk;
992981
if (sock_flag(sk_other, SOCK_DEAD) ||
993-
!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
994-
skb_bpf_redirect_clear(skb);
982+
!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
995983
goto out_free;
996-
}
997984

998985
skb_bpf_set_ingress(skb);
999986

@@ -1018,22 +1005,23 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
10181005
spin_lock_bh(&psock->ingress_lock);
10191006
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
10201007
skb_queue_tail(&psock->ingress_skb, skb);
1021-
schedule_work(&psock->work);
1008+
schedule_delayed_work(&psock->work, 0);
10221009
err = 0;
10231010
}
10241011
spin_unlock_bh(&psock->ingress_lock);
1025-
if (err < 0) {
1026-
skb_bpf_redirect_clear(skb);
1012+
if (err < 0)
10271013
goto out_free;
1028-
}
10291014
}
10301015
break;
10311016
case __SK_REDIRECT:
1017+
tcp_eat_skb(psock->sk, skb);
10321018
err = sk_psock_skb_redirect(psock, skb);
10331019
break;
10341020
case __SK_DROP:
10351021
default:
10361022
out_free:
1023+
skb_bpf_redirect_clear(skb);
1024+
tcp_eat_skb(psock->sk, skb);
10371025
sock_drop(psock->sk, skb);
10381026
}
10391027

@@ -1049,7 +1037,7 @@ static void sk_psock_write_space(struct sock *sk)
10491037
psock = sk_psock(sk);
10501038
if (likely(psock)) {
10511039
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
1052-
schedule_work(&psock->work);
1040+
schedule_delayed_work(&psock->work, 0);
10531041
write_space = psock->saved_write_space;
10541042
}
10551043
rcu_read_unlock();
@@ -1078,8 +1066,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
10781066
skb_dst_drop(skb);
10791067
skb_bpf_redirect_clear(skb);
10801068
ret = bpf_prog_run_pin_on_cpu(prog, skb);
1081-
if (ret == SK_PASS)
1082-
skb_bpf_set_strparser(skb);
1069+
skb_bpf_set_strparser(skb);
10831070
ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
10841071
skb->sk = NULL;
10851072
}
@@ -1183,12 +1170,11 @@ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb)
11831170
int ret = __SK_DROP;
11841171
int len = skb->len;
11851172

1186-
skb_get(skb);
1187-
11881173
rcu_read_lock();
11891174
psock = sk_psock(sk);
11901175
if (unlikely(!psock)) {
11911176
len = 0;
1177+
tcp_eat_skb(sk, skb);
11921178
sock_drop(sk, skb);
11931179
goto out;
11941180
}
@@ -1212,12 +1198,21 @@ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb)
12121198
static void sk_psock_verdict_data_ready(struct sock *sk)
12131199
{
12141200
struct socket *sock = sk->sk_socket;
1201+
int copied;
12151202

12161203
trace_sk_data_ready(sk);
12171204

12181205
if (unlikely(!sock || !sock->ops || !sock->ops->read_skb))
12191206
return;
1220-
sock->ops->read_skb(sk, sk_psock_verdict_recv);
1207+
copied = sock->ops->read_skb(sk, sk_psock_verdict_recv);
1208+
if (copied >= 0) {
1209+
struct sk_psock *psock;
1210+
1211+
rcu_read_lock();
1212+
psock = sk_psock(sk);
1213+
psock->saved_data_ready(sk);
1214+
rcu_read_unlock();
1215+
}
12211216
}
12221217

12231218
void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)

net/core/sock_map.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1644,9 +1644,10 @@ void sock_map_close(struct sock *sk, long timeout)
16441644
rcu_read_unlock();
16451645
sk_psock_stop(psock);
16461646
release_sock(sk);
1647-
cancel_work_sync(&psock->work);
1647+
cancel_delayed_work_sync(&psock->work);
16481648
sk_psock_put(sk, psock);
16491649
}
1650+
16501651
/* Make sure we do not recurse. This is a bug.
16511652
* Leak the socket instead of crashing on a stack overflow.
16521653
*/

net/ipv4/tcp.c

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1571,7 +1571,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
15711571
* calculation of whether or not we must ACK for the sake of
15721572
* a window update.
15731573
*/
1574-
static void __tcp_cleanup_rbuf(struct sock *sk, int copied)
1574+
void __tcp_cleanup_rbuf(struct sock *sk, int copied)
15751575
{
15761576
struct tcp_sock *tp = tcp_sk(sk);
15771577
bool time_to_ack = false;
@@ -1773,7 +1773,6 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
17731773
WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
17741774
tcp_flags = TCP_SKB_CB(skb)->tcp_flags;
17751775
used = recv_actor(sk, skb);
1776-
consume_skb(skb);
17771776
if (used < 0) {
17781777
if (!copied)
17791778
copied = used;
@@ -1787,14 +1786,6 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
17871786
break;
17881787
}
17891788
}
1790-
WRITE_ONCE(tp->copied_seq, seq);
1791-
1792-
tcp_rcv_space_adjust(sk);
1793-
1794-
/* Clean up data we have read: This will do ACK frames. */
1795-
if (copied > 0)
1796-
__tcp_cleanup_rbuf(sk, copied);
1797-
17981789
return copied;
17991790
}
18001791
EXPORT_SYMBOL(tcp_read_skb);

0 commit comments

Comments
 (0)