Commit 405df89 (parent bce2255)

bpf, sockmap: Improved check for empty queue

jrfastab authored and borkmann committed
We noticed some rare sk_buffs were stepping past the queue when the system was under memory pressure. The general idea is to skip enqueueing sk_buffs when it is not necessary, which is the normal case on a system that is properly provisioned for the task: no memory pressure and enough CPU assigned. But if we cannot allocate memory (ENOMEM) when enqueueing the sk_buff onto the sockmap receive queue, we push it onto a delayed workqueue to retry later. When a new sk_buff is received, we then check whether that queue is empty.

However, there is a problem with simply checking the queue length. While an sk_buff is being processed from the ingress queue but is not yet on the sockmap msg receive queue, it is possible to also receive an sk_buff through the normal path. That path sees an ingress queue length of zero and skips ahead of the packet still being processed. Previously we held the sock lock in both contexts, which made the problem harder to hit, but not impossible.

To fix this, instead of popping the skb off the queue entirely, we peek it from the queue and do the copy there. This keeps the queue length non-zero while the skb is being processed. Only once the entire skb has been copied to the user space queue or to another socket do we pop it off the queue. This way the queue-length check allows bypassing the queue only after the list has been completely processed.

To reproduce the issue, we ran an NGINX compliance test with sockmap enabled and observed occasional flakes in our testing that we attributed to this issue.

Fixes: 04919be ("tcp: Introduce tcp_read_skb()")
Suggested-by: Jakub Sitnicki <[email protected]>
Signed-off-by: John Fastabend <[email protected]>
Signed-off-by: Daniel Borkmann <[email protected]>
Tested-by: William Findlay <[email protected]>
Reviewed-by: Jakub Sitnicki <[email protected]>
Link: https://lore.kernel.org/bpf/[email protected]
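To make the ordering above concrete, here is a minimal userspace model of the peek-then-dequeue idea. This is not kernel code: struct pkt, struct queue, enqueue()/peek()/dequeue(), can_bypass() and backlog() are invented stand-ins for the sk_buff, psock->ingress_skb, the empty-queue fast-path check, and sk_psock_backlog(). The only point it demonstrates is that the queue length stays non-zero while the head element is being consumed, so the fast path cannot overtake it.

/*
 * Minimal userspace model of the peek-then-dequeue scheme described in the
 * commit message. NOT kernel code: every name below is a made-up stand-in.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct pkt {
	struct pkt *next;
	int id;
};

struct queue {
	struct pkt *head, *tail;
	unsigned int len;
};

static void enqueue(struct queue *q, struct pkt *p)
{
	p->next = NULL;
	if (q->tail)
		q->tail->next = p;
	else
		q->head = p;
	q->tail = p;
	q->len++;
}

static struct pkt *peek(struct queue *q)
{
	return q->head;
}

static struct pkt *dequeue(struct queue *q)
{
	struct pkt *p = q->head;

	if (!p)
		return NULL;
	q->head = p->next;
	if (!q->head)
		q->tail = NULL;
	q->len--;
	return p;
}

/* Fast path: only bypass the backlog queue when it is truly empty. */
static bool can_bypass(const struct queue *q)
{
	return q->len == 0;
}

/*
 * Backlog worker: peek, consume, and only then pop the element. While a
 * packet is in flight here, q->len stays non-zero, so can_bypass() cannot
 * let a newer packet overtake it.
 */
static void backlog(struct queue *q)
{
	struct pkt *p;

	while ((p = peek(q))) {
		printf("delivering pkt %d (queue len while busy: %u)\n",
		       p->id, q->len);
		/* ... copying to the receive queue would happen here ... */
		p = dequeue(q);	/* pop only after the packet is consumed */
		free(p);
	}
}

int main(void)
{
	struct queue q = { NULL, NULL, 0 };
	struct pkt *p = malloc(sizeof(*p));

	if (!p)
		return 1;
	p->id = 1;
	enqueue(&q, p);
	printf("bypass allowed before backlog runs? %s\n",
	       can_bypass(&q) ? "yes" : "no");
	backlog(&q);
	printf("bypass allowed after backlog drains? %s\n",
	       can_bypass(&q) ? "yes" : "no");
	return 0;
}

With the old dequeue-first scheme, the model's queue length would drop to zero as soon as backlog() picked up the packet, and can_bypass() would return true while the packet was still in flight.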

2 files changed: 8 additions, 25 deletions

include/linux/skmsg.h

Lines changed: 0 additions & 1 deletion

@@ -71,7 +71,6 @@ struct sk_psock_link {
 };
 
 struct sk_psock_work_state {
-	struct sk_buff		*skb;
 	u32			len;
 	u32			off;
 };
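After this deletion, the work state no longer owns an skb of its own; it only records how much of the skb still sitting at the head of ingress_skb remains to be sent. As the struct reads with the hunk applied:

struct sk_psock_work_state {
	u32			len;	/* bytes of the head skb left to send */
	u32			off;	/* offset to resume from */
};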

net/core/skmsg.c

Lines changed: 8 additions & 24 deletions

@@ -622,16 +622,12 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 
 static void sk_psock_skb_state(struct sk_psock *psock,
 			       struct sk_psock_work_state *state,
-			       struct sk_buff *skb,
 			       int len, int off)
 {
 	spin_lock_bh(&psock->ingress_lock);
 	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
-		state->skb = skb;
 		state->len = len;
 		state->off = off;
-	} else {
-		sock_drop(psock->sk, skb);
 	}
 	spin_unlock_bh(&psock->ingress_lock);
 }
@@ -642,23 +638,17 @@ static void sk_psock_backlog(struct work_struct *work)
 	struct sk_psock *psock = container_of(dwork, struct sk_psock, work);
 	struct sk_psock_work_state *state = &psock->work_state;
 	struct sk_buff *skb = NULL;
+	u32 len = 0, off = 0;
 	bool ingress;
-	u32 len, off;
 	int ret;
 
 	mutex_lock(&psock->work_mutex);
-	if (unlikely(state->skb)) {
-		spin_lock_bh(&psock->ingress_lock);
-		skb = state->skb;
+	if (unlikely(state->len)) {
 		len = state->len;
 		off = state->off;
-		state->skb = NULL;
-		spin_unlock_bh(&psock->ingress_lock);
 	}
-	if (skb)
-		goto start;
 
-	while ((skb = skb_dequeue(&psock->ingress_skb))) {
+	while ((skb = skb_peek(&psock->ingress_skb))) {
 		len = skb->len;
 		off = 0;
 		if (skb_bpf_strparser(skb)) {
@@ -667,7 +657,6 @@ static void sk_psock_backlog(struct work_struct *work)
 			off = stm->offset;
 			len = stm->full_len;
 		}
-start:
 		ingress = skb_bpf_ingress(skb);
 		skb_bpf_redirect_clear(skb);
 		do {
@@ -677,8 +666,7 @@ static void sk_psock_backlog(struct work_struct *work)
 						  len, ingress);
 			if (ret <= 0) {
 				if (ret == -EAGAIN) {
-					sk_psock_skb_state(psock, state, skb,
-							   len, off);
+					sk_psock_skb_state(psock, state, len, off);
 
 					/* Delay slightly to prioritize any
 					 * other work that might be here.
@@ -690,15 +678,16 @@ static void sk_psock_backlog(struct work_struct *work)
 				/* Hard errors break pipe and stop xmit. */
 				sk_psock_report_error(psock, ret ? -ret : EPIPE);
 				sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
-				sock_drop(psock->sk, skb);
 				goto end;
 			}
 			off += ret;
 			len -= ret;
 		} while (len);
 
-		if (!ingress)
+		skb = skb_dequeue(&psock->ingress_skb);
+		if (!ingress) {
 			kfree_skb(skb);
+		}
 	}
 end:
 	mutex_unlock(&psock->work_mutex);
@@ -791,11 +780,6 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock)
 		skb_bpf_redirect_clear(skb);
 		sock_drop(psock->sk, skb);
 	}
-	kfree_skb(psock->work_state.skb);
-	/* We null the skb here to ensure that calls to sk_psock_backlog
-	 * do not pick up the free'd skb.
-	 */
-	psock->work_state.skb = NULL;
 	__sk_psock_purge_ingress_msg(psock);
 }
 
@@ -814,7 +798,6 @@ void sk_psock_stop(struct sk_psock *psock)
 	spin_lock_bh(&psock->ingress_lock);
 	sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
 	sk_psock_cork_free(psock);
-	__sk_psock_zap_ingress(psock);
 	spin_unlock_bh(&psock->ingress_lock);
 }
 
@@ -829,6 +812,7 @@ static void sk_psock_destroy(struct work_struct *work)
 	sk_psock_done_strp(psock);
 
 	cancel_delayed_work_sync(&psock->work);
+	__sk_psock_zap_ingress(psock);
 	mutex_destroy(&psock->work_mutex);
 
 	psock_progs_drop(&psock->progs);
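One consequence of the last two hunks, worth spelling out: sk_psock_stop() no longer purges the ingress queue under ingress_lock; the purge moves to sk_psock_destroy(), after cancel_delayed_work_sync(). The commit message does not state the rationale, so this is a reading of the hunks rather than a quote: since the backlog worker now works on a peeked skb that is still on ingress_skb, zapping the queue while the worker could still be running would free an skb it is holding, and deferring the zap until the delayed work has been cancelled avoids that. The excerpt below is the destroy path as it reads after the final hunk; the comment is added here and is not part of the kernel source.

	cancel_delayed_work_sync(&psock->work);
	/* Backlog worker is guaranteed stopped at this point, so no
	 * peeked-but-not-yet-dequeued skb can be freed out from under it.
	 * (Comment added for this write-up, not present in the file.)
	 */
	__sk_psock_zap_ingress(psock);
	mutex_destroy(&psock->work_mutex);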
