
Commit 29173d0

jrfastab authored and borkmann committed
bpf, sockmap: Convert schedule_work into delayed_work
Sk_buffs are fed into sockmap verdict programs either from a strparser (when the user might want to decide how framing of the skb is done by attaching another parser program) or directly through tcp_read_sock. The tcp_read_sock path is the preferred method for performance when the BPF logic is a stream parser.

The flow for Cilium's common use case with a stream parser is:

   tcp_read_sock()
     sk_psock_verdict_recv
       ret = bpf_prog_run_pin_on_cpu()
       sk_psock_verdict_apply(sock, skb, ret)
          // if the system is under memory pressure or the app is slow we
          // may need to queue the skb. Do this queuing through ingress_skb
          // and then kick the timer to wake up the handler
          skb_queue_tail(ingress_skb, skb)
          schedule_work(work);

The work queue is wired up to sk_psock_backlog(). This will then walk the ingress_skb list that holds our sk_buffs that could not be handled, but should be OK to run at some later point.

However, it is possible that the workqueue doing this work still hits an error when sending the skb. When this happens the skbuff is requeued on a temporary 'state' struct kept with the workqueue. This is necessary because it is possible to partially send an skbuff before hitting an error, and we need to know how and where to restart when the workqueue runs next.

Now for the trouble: we don't re-kick the workqueue. This can cause a stall where the skbuff we just cached on the state variable might never be sent. This happens when it is the last packet in a flow and no further packets come along that would cause the system to kick the workqueue from that side.

To fix this we could do a simple schedule_work(), but while under memory pressure it makes sense to back off some instead of continuing to retry repeatedly. So instead, convert schedule_work to schedule_delayed_work and add backoff logic to reschedule from the backlog queue on errors. It is not obvious what a good backoff is, so use '1'.

To test, we observed some flakes while running the NGINX compliance test with sockmap; we attributed these failed tests to this bug and the subsequent issue.

From on-list discussion: commit bec2171 ("skmsg: Schedule psock work if the cached skb exists on the psock") was intended to address a similar race, but had a couple of cases it missed. Most obviously, it only accounted for receiving traffic on the local socket, so if redirecting into another socket we could still get an sk_buff stuck there. Next, it missed the case where copied=0 in the recv() handler, in which case we wouldn't kick the scheduler. Also, it is sub-optimal to require userspace to kick the internal mechanisms of sockmap to wake it up and copy data to user; that results in an extra syscall and requires the app to actually handle the EAGAIN correctly.

Fixes: 04919be ("tcp: Introduce tcp_read_skb()")
Signed-off-by: John Fastabend <[email protected]>
Signed-off-by: Daniel Borkmann <[email protected]>
Tested-by: William Findlay <[email protected]>
Reviewed-by: Jakub Sitnicki <[email protected]>
Link: https://lore.kernel.org/bpf/[email protected]
1 parent 78fa0d6 commit 29173d0
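
For readers less familiar with the workqueue API, the sketch below shows the same conversion in isolation. It is only an illustration: struct my_psock, my_backlog(), my_psock_kick() and the other names here are hypothetical, while the workqueue and skb helpers (INIT_DELAYED_WORK(), to_delayed_work(), schedule_delayed_work(), cancel_delayed_work_sync(), skb_queue_tail() and friends) are the kernel APIs the patch actually uses.

#include <linux/workqueue.h>
#include <linux/skbuff.h>

/* Hypothetical stand-in for sk_psock: the work item becomes a
 * delayed_work so the backlog handler can be re-kicked with a delay.
 */
struct my_psock {
    struct sk_buff_head ingress_skb;
    struct delayed_work work;           /* was: struct work_struct work */
};

/* Workers still receive a plain work_struct pointer; recover the
 * delayed_work first, then the containing object.
 */
static void my_backlog(struct work_struct *work)
{
    struct delayed_work *dwork = to_delayed_work(work);
    struct my_psock *psock = container_of(dwork, struct my_psock, work);
    struct sk_buff *skb;

    while ((skb = skb_dequeue(&psock->ingress_skb)))
        kfree_skb(skb);                 /* real code would transmit here */
}

static void my_psock_init(struct my_psock *psock)
{
    skb_queue_head_init(&psock->ingress_skb);
    INIT_DELAYED_WORK(&psock->work, my_backlog);    /* was: INIT_WORK() */
}

/* Producers that previously called schedule_work() now pass an explicit
 * delay in jiffies; 0 keeps the old "run as soon as possible" behaviour.
 */
static void my_psock_kick(struct my_psock *psock, struct sk_buff *skb)
{
    skb_queue_tail(&psock->ingress_skb, skb);
    schedule_delayed_work(&psock->work, 0);         /* was: schedule_work() */
}

static void my_psock_destroy(struct my_psock *psock)
{
    cancel_delayed_work_sync(&psock->work);         /* was: cancel_work_sync() */
    skb_queue_purge(&psock->ingress_skb);
}

The only behavioural change from a plain work_struct is the extra delay argument; the diff below passes 0 everywhere except the error path in sk_psock_backlog(), which uses 1 jiffy as a small backoff.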

3 files changed: 17 additions & 9 deletions

include/linux/skmsg.h

Lines changed: 1 addition & 1 deletion
@@ -105,7 +105,7 @@ struct sk_psock {
     struct proto *sk_proto;
     struct mutex work_mutex;
     struct sk_psock_work_state work_state;
-    struct work_struct work;
+    struct delayed_work work;
     struct rcu_work rwork;
 };

net/core/skmsg.c

Lines changed: 14 additions & 7 deletions
@@ -482,7 +482,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
     }
 out:
     if (psock->work_state.skb && copied > 0)
-        schedule_work(&psock->work);
+        schedule_delayed_work(&psock->work, 0);
     return copied;
 }
 EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
@@ -640,7 +640,8 @@ static void sk_psock_skb_state(struct sk_psock *psock,
 
 static void sk_psock_backlog(struct work_struct *work)
 {
-    struct sk_psock *psock = container_of(work, struct sk_psock, work);
+    struct delayed_work *dwork = to_delayed_work(work);
+    struct sk_psock *psock = container_of(dwork, struct sk_psock, work);
     struct sk_psock_work_state *state = &psock->work_state;
     struct sk_buff *skb = NULL;
     bool ingress;
@@ -680,6 +681,12 @@ static void sk_psock_backlog(struct work_struct *work)
                 if (ret == -EAGAIN) {
                     sk_psock_skb_state(psock, state, skb,
                                        len, off);
+
+                    /* Delay slightly to prioritize any
+                     * other work that might be here.
+                     */
+                    if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+                        schedule_delayed_work(&psock->work, 1);
                     goto end;
                 }
                 /* Hard errors break pipe and stop xmit. */
@@ -734,7 +741,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
     INIT_LIST_HEAD(&psock->link);
     spin_lock_init(&psock->link_lock);
 
-    INIT_WORK(&psock->work, sk_psock_backlog);
+    INIT_DELAYED_WORK(&psock->work, sk_psock_backlog);
     mutex_init(&psock->work_mutex);
     INIT_LIST_HEAD(&psock->ingress_msg);
     spin_lock_init(&psock->ingress_lock);
@@ -823,7 +830,7 @@ static void sk_psock_destroy(struct work_struct *work)
 
     sk_psock_done_strp(psock);
 
-    cancel_work_sync(&psock->work);
+    cancel_delayed_work_sync(&psock->work);
     mutex_destroy(&psock->work_mutex);
 
     psock_progs_drop(&psock->progs);
@@ -938,7 +945,7 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb)
     }
 
     skb_queue_tail(&psock_other->ingress_skb, skb);
-    schedule_work(&psock_other->work);
+    schedule_delayed_work(&psock_other->work, 0);
     spin_unlock_bh(&psock_other->ingress_lock);
     return 0;
 }
@@ -1018,7 +1025,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
             spin_lock_bh(&psock->ingress_lock);
             if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
                 skb_queue_tail(&psock->ingress_skb, skb);
-                schedule_work(&psock->work);
+                schedule_delayed_work(&psock->work, 0);
                 err = 0;
             }
             spin_unlock_bh(&psock->ingress_lock);
@@ -1049,7 +1056,7 @@ static void sk_psock_write_space(struct sock *sk)
     psock = sk_psock(sk);
     if (likely(psock)) {
         if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
-            schedule_work(&psock->work);
+            schedule_delayed_work(&psock->work, 0);
         write_space = psock->saved_write_space;
     }
     rcu_read_unlock();
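
The sk_psock_backlog() hunk above carries the actual fix: when the send path returns -EAGAIN the skb has already been stashed in work_state, and the backlog now reschedules itself with a one-jiffy delay rather than waiting for more traffic to re-kick it, and only while SK_PSOCK_TX_ENABLED is still set, so a socket being torn down is not requeued forever. A rough sketch of that retry shape, with hypothetical names (struct retry_psock, retry_try_send() and its always-EAGAIN behaviour are illustrative only, not kernel API):

#include <linux/workqueue.h>
#include <linux/skbuff.h>

/* Hypothetical container: saved_skb stands in for sk_psock_work_state,
 * tx_enabled for the SK_PSOCK_TX_ENABLED bit.
 */
struct retry_psock {
    struct sk_buff_head ingress_skb;
    struct delayed_work work;
    struct sk_buff *saved_skb;
    bool tx_enabled;
};

/* Illustrative transmit stub: pretend the peer's buffer is full. */
static int retry_try_send(struct sk_buff *skb)
{
    return -EAGAIN;
}

static void retry_backlog(struct work_struct *work)
{
    struct delayed_work *dwork = to_delayed_work(work);
    struct retry_psock *psock = container_of(dwork, struct retry_psock, work);
    struct sk_buff *skb;

    /* Retry any skb left over from a previous failed attempt first. */
    skb = psock->saved_skb;
    if (skb) {
        psock->saved_skb = NULL;
        skb_queue_head(&psock->ingress_skb, skb);
    }

    while ((skb = skb_dequeue(&psock->ingress_skb))) {
        if (retry_try_send(skb) == -EAGAIN) {
            /* Keep the skb for the next run and re-kick ourselves
             * with a small backoff, rather than hoping a future
             * packet restarts the queue.
             */
            psock->saved_skb = skb;
            if (psock->tx_enabled)
                schedule_delayed_work(&psock->work, 1);
            return;
        }
        consume_skb(skb);               /* handled successfully */
    }
}

In the real code the stashed state also records the partial-send offset and length (sk_psock_skb_state(psock, state, skb, len, off)), so the next run can resume mid-skb.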

net/core/sock_map.c

Lines changed: 2 additions & 1 deletion
@@ -1644,9 +1644,10 @@ void sock_map_close(struct sock *sk, long timeout)
     rcu_read_unlock();
     sk_psock_stop(psock);
     release_sock(sk);
-    cancel_work_sync(&psock->work);
+    cancel_delayed_work_sync(&psock->work);
     sk_psock_put(sk, psock);
 }
+
 /* Make sure we do not recurse. This is a bug.
  * Leak the socket instead of crashing on a stack overflow.
  */
