
Commit edf6b4b

Paolo Abeni authored and intel-lab-lkp committed
mptcp: move the whole rx path under msk socket lock protection
After commit c2e6048 ("mptcp: fix race in release_cb") we can move the whole MPTCP rx path under the msk socket lock, leveraging the release_cb.

We can drop a bunch of spin_lock pairs in the receive functions, use a single receive queue and invoke __mptcp_move_skbs() only when subflows ask for it.

This will allow more cleanup in the next patch.

Some changes are worth a specific mention:

The msk rcvbuf update now always happens under both the msk and the subflow socket lock: we can drop a bunch of ONCE annotations and consolidate the checks.

When the skb move is deferred to msk release callback time, the msk rcvbuf update is deferred as well; __mptcp_move_skbs() additionally takes care of that.

Signed-off-by: Paolo Abeni <[email protected]>
1 parent 420b898 commit edf6b4b
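
For readers who want the gist of the locking change, below is a minimal, self-contained user-space sketch of the deferral pattern the patch relies on: when the msk socket is owned by a user context, the data-ready path only records a "dequeue" flag, and the pending skbs are moved later from the release callback. All names in the sketch (toy_sock, toy_data_ready, toy_release, TOY_DEQUEUE) are illustrative stand-ins, not kernel APIs.

/*
 * Toy model of the release_cb deferral used by this patch. Not kernel
 * code: plain user-space C, with made-up names.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_DEQUEUE (1u << 0)

struct toy_sock {
	bool owned_by_user;	/* models sock_owned_by_user() */
	unsigned int cb_flags;	/* models msk->cb_flags */
	int pending_skbs;	/* data still sitting on the subflows */
	int msk_queue;		/* data already moved to the msk queue */
};

/* models __mptcp_move_skbs(): pull pending data onto the msk queue */
static bool toy_move_skbs(struct toy_sock *sk)
{
	if (!sk->pending_skbs)
		return false;
	sk->msk_queue += sk->pending_skbs;
	sk->pending_skbs = 0;
	return true;
}

/* models mptcp_data_ready(): runs with the msk data lock held */
static void toy_data_ready(struct toy_sock *sk)
{
	if (!sk->owned_by_user)
		toy_move_skbs(sk);		/* fast path: move now */
	else
		sk->cb_flags |= TOY_DEQUEUE;	/* defer to release time */
}

/* models mptcp_release_cb(): replay deferred work when the owner leaves */
static void toy_release(struct toy_sock *sk)
{
	if ((sk->cb_flags & TOY_DEQUEUE) && toy_move_skbs(sk))
		printf("release_cb: moved deferred data, queue=%d\n",
		       sk->msk_queue);
	sk->cb_flags &= ~TOY_DEQUEUE;
	sk->owned_by_user = false;
}

int main(void)
{
	struct toy_sock sk = { .owned_by_user = true, .pending_skbs = 3 };

	toy_data_ready(&sk);	/* owner busy: only the flag is set */
	toy_release(&sk);	/* the deferred move happens here */
	return 0;
}

In the actual patch the same decision is taken in mptcp_data_ready() under mptcp_data_lock(), and mptcp_release_cb() handles the MPTCP_DEQUEUE bit by calling __mptcp_move_skbs() followed by mptcp_cleanup_rbuf() and sk->sk_data_ready().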

File tree: 3 files changed (+61 −66 lines)


net/mptcp/fastopen.c

Lines changed: 2 additions & 0 deletions
@@ -49,6 +49,7 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf
 	MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
 
 	mptcp_data_lock(sk);
+	DEBUG_NET_WARN_ON_ONCE(sock_owned_by_user_nocheck(sk));
 
 	mptcp_set_owner_r(skb, sk);
 	__skb_queue_tail(&sk->sk_receive_queue, skb);
@@ -65,6 +66,7 @@ void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflo
 	struct sock *sk = (struct sock *)msk;
 	struct sk_buff *skb;
 
+	DEBUG_NET_WARN_ON_ONCE(sock_owned_by_user_nocheck(sk));
 	skb = skb_peek_tail(&sk->sk_receive_queue);
 	if (skb) {
 		WARN_ON_ONCE(MPTCP_SKB_CB(skb)->end_seq);

net/mptcp/protocol.c

Lines changed: 58 additions & 65 deletions
@@ -643,18 +643,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 	bool more_data_avail;
 	struct tcp_sock *tp;
 	bool done = false;
-	int sk_rbuf;
-
-	sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
-
-	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-		int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
-
-		if (unlikely(ssk_rbuf > sk_rbuf)) {
-			WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf);
-			sk_rbuf = ssk_rbuf;
-		}
-	}
 
 	pr_debug("msk=%p ssk=%p\n", msk, ssk);
 	tp = tcp_sk(ssk);
@@ -722,7 +710,7 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 		WRITE_ONCE(tp->copied_seq, seq);
 		more_data_avail = mptcp_subflow_data_available(ssk);
 
-		if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) {
+		if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
 			done = true;
 			break;
 		}
@@ -846,11 +834,30 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
 	return moved > 0;
 }
 
+static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk)
+{
+	if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf))
+		WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf);
+}
+
+static void __mptcp_data_ready(struct sock *sk, struct sock *ssk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	__mptcp_rcvbuf_update(sk, ssk);
+
+	/* over limit? can't append more skbs to msk, Also, no need to wake-up*/
+	if (__mptcp_rmem(sk) > sk->sk_rcvbuf)
+		return;
+
+	/* Wake-up the reader only for in-sequence data */
+	if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
+		sk->sk_data_ready(sk);
+}
+
 void mptcp_data_ready(struct sock *sk, struct sock *ssk)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
-	struct mptcp_sock *msk = mptcp_sk(sk);
-	int sk_rbuf, ssk_rbuf;
 
 	/* The peer can send data while we are shutting down this
 	 * subflow at msk destruction time, but we must avoid enqueuing
@@ -859,19 +866,11 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
 	if (unlikely(subflow->disposable))
 		return;
 
-	ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
-	sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
-	if (unlikely(ssk_rbuf > sk_rbuf))
-		sk_rbuf = ssk_rbuf;
-
-	/* over limit? can't append more skbs to msk, Also, no need to wake-up*/
-	if (__mptcp_rmem(sk) > sk_rbuf)
-		return;
-
-	/* Wake-up the reader only for in-sequence data */
 	mptcp_data_lock(sk);
-	if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
-		sk->sk_data_ready(sk);
+	if (!sock_owned_by_user(sk))
+		__mptcp_data_ready(sk, ssk);
+	else
+		__set_bit(MPTCP_DEQUEUE, &mptcp_sk(sk)->cb_flags);
 	mptcp_data_unlock(sk);
 }
 
@@ -1942,16 +1941,17 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied);
 
-static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
+static int __mptcp_recvmsg_mskq(struct sock *sk,
 				struct msghdr *msg,
 				size_t len, int flags,
 				struct scm_timestamping_internal *tss,
 				int *cmsg_flags)
 {
+	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct sk_buff *skb, *tmp;
 	int copied = 0;
 
-	skb_queue_walk_safe(&msk->receive_queue, skb, tmp) {
+	skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) {
 		u32 offset = MPTCP_SKB_CB(skb)->offset;
 		u32 data_len = skb->len - offset;
 		u32 count = min_t(size_t, len - copied, data_len);
@@ -1986,7 +1986,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
 		/* we will bulk release the skb memory later */
 		skb->destructor = NULL;
 		WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize);
-		__skb_unlink(skb, &msk->receive_queue);
+		__skb_unlink(skb, &sk->sk_receive_queue);
 		__kfree_skb(skb);
 		msk->bytes_consumed += count;
 	}
@@ -2111,62 +2111,54 @@ static void __mptcp_update_rmem(struct sock *sk)
 	WRITE_ONCE(msk->rmem_released, 0);
 }
 
-static void __mptcp_splice_receive_queue(struct sock *sk)
+static bool __mptcp_move_skbs(struct sock *sk)
 {
+	struct mptcp_subflow_context *subflow;
 	struct mptcp_sock *msk = mptcp_sk(sk);
-
-	skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue);
-}
-
-static bool __mptcp_move_skbs(struct mptcp_sock *msk)
-{
-	struct sock *sk = (struct sock *)msk;
 	unsigned int moved = 0;
 	bool ret, done;
 
+	/* verify we can move any data from the subflow, eventually updating */
+	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+		mptcp_for_each_subflow(msk, subflow)
+			__mptcp_rcvbuf_update(sk, subflow->tcp_sock);
+
+	if (__mptcp_rmem(sk) > sk->sk_rcvbuf)
+		return false;
+
 	do {
 		struct sock *ssk = mptcp_subflow_recv_lookup(msk);
 		bool slowpath;
 
-		/* we can have data pending in the subflows only if the msk
-		 * receive buffer was full at subflow_data_ready() time,
-		 * that is an unlikely slow path.
-		 */
-		if (likely(!ssk))
+		if (unlikely(!ssk))
 			break;
 
 		slowpath = lock_sock_fast(ssk);
-		mptcp_data_lock(sk);
 		__mptcp_update_rmem(sk);
 		done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
-		mptcp_data_unlock(sk);
 
 		if (unlikely(ssk->sk_err))
 			__mptcp_error_report(sk);
 		unlock_sock_fast(ssk, slowpath);
 	} while (!done);
 
-	/* acquire the data lock only if some input data is pending */
 	ret = moved > 0;
 	if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) ||
-	    !skb_queue_empty_lockless(&sk->sk_receive_queue)) {
-		mptcp_data_lock(sk);
+	    !skb_queue_empty(&sk->sk_receive_queue)) {
 		__mptcp_update_rmem(sk);
 		ret |= __mptcp_ofo_queue(msk);
-		__mptcp_splice_receive_queue(sk);
-		mptcp_data_unlock(sk);
 	}
 	if (ret)
 		mptcp_check_data_fin((struct sock *)msk);
-	return !skb_queue_empty(&msk->receive_queue);
+	return ret;
 }
 
 static unsigned int mptcp_inq_hint(const struct sock *sk)
 {
 	const struct mptcp_sock *msk = mptcp_sk(sk);
 	const struct sk_buff *skb;
 
-	skb = skb_peek(&msk->receive_queue);
+	skb = skb_peek(&sk->sk_receive_queue);
 	if (skb) {
 		u64 hint_val = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq;
 
@@ -2212,7 +2204,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	while (copied < len) {
 		int err, bytes_read;
 
-		bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags);
+		bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, &tss, &cmsg_flags);
 		if (unlikely(bytes_read < 0)) {
 			if (!copied)
 				copied = bytes_read;
@@ -2221,7 +2213,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
 		copied += bytes_read;
 
-		if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk))
+		if (skb_queue_empty(&sk->sk_receive_queue) && __mptcp_move_skbs(sk))
 			continue;
 
 		/* only the MPTCP socket status is relevant here. The exit
@@ -2247,7 +2239,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			/* race breaker: the shutdown could be after the
 			 * previous receive queue check
 			 */
-			if (__mptcp_move_skbs(msk))
+			if (__mptcp_move_skbs(sk))
 				continue;
 			break;
 		}
@@ -2291,9 +2283,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		}
 	}
 
-	pr_debug("msk=%p rx queue empty=%d:%d copied=%d\n",
-		 msk, skb_queue_empty_lockless(&sk->sk_receive_queue),
-		 skb_queue_empty(&msk->receive_queue), copied);
+	pr_debug("msk=%p rx queue empty=%d copied=%d\n",
+		 msk, skb_queue_empty(&sk->sk_receive_queue), copied);
 
 	release_sock(sk);
 	return copied;
@@ -2820,7 +2811,6 @@ static void __mptcp_init_sock(struct sock *sk)
 	INIT_LIST_HEAD(&msk->join_list);
 	INIT_LIST_HEAD(&msk->rtx_queue);
 	INIT_WORK(&msk->work, mptcp_worker);
-	__skb_queue_head_init(&msk->receive_queue);
 	msk->out_of_order_queue = RB_ROOT;
 	msk->first_pending = NULL;
 	WRITE_ONCE(msk->rmem_fwd_alloc, 0);
@@ -3403,12 +3393,8 @@ void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags)
 	mptcp_for_each_subflow_safe(msk, subflow, tmp)
 		__mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags);
 
-	/* move to sk_receive_queue, sk_stream_kill_queues will purge it */
-	mptcp_data_lock(sk);
-	skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
 	__skb_queue_purge(&sk->sk_receive_queue);
 	skb_rbtree_purge(&msk->out_of_order_queue);
-	mptcp_data_unlock(sk);
 
 	/* move all the rx fwd alloc into the sk_mem_reclaim_final in
 	 * inet_sock_destruct() will dispose it
@@ -3451,7 +3437,8 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
 
 #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \
 				      BIT(MPTCP_RETRANSMIT) | \
-				      BIT(MPTCP_FLUSH_JOIN_LIST))
+				      BIT(MPTCP_FLUSH_JOIN_LIST) | \
+				      BIT(MPTCP_DEQUEUE))
 
 /* processes deferred events and flush wmem */
 static void mptcp_release_cb(struct sock *sk)
@@ -3485,6 +3472,11 @@ static void mptcp_release_cb(struct sock *sk)
 			__mptcp_push_pending(sk, 0);
 		if (flags & BIT(MPTCP_RETRANSMIT))
 			__mptcp_retrans(sk);
+		if ((flags & BIT(MPTCP_DEQUEUE)) && __mptcp_move_skbs(sk)) {
+			/* notify ack seq update */
+			mptcp_cleanup_rbuf(msk, 0);
+			sk->sk_data_ready(sk);
+		}
 
 		cond_resched();
 		spin_lock_bh(&sk->sk_lock.slock);
@@ -3722,7 +3714,8 @@ static int mptcp_ioctl(struct sock *sk, int cmd, int *karg)
 			return -EINVAL;
 
 		lock_sock(sk);
-		__mptcp_move_skbs(msk);
+		if (__mptcp_move_skbs(sk))
+			mptcp_cleanup_rbuf(msk, 0);
 		*karg = mptcp_inq_hint(sk);
 		release_sock(sk);
 		break;

net/mptcp/protocol.h

Lines changed: 1 addition & 1 deletion
@@ -124,6 +124,7 @@
 #define MPTCP_FLUSH_JOIN_LIST	5
 #define MPTCP_SYNC_STATE	6
 #define MPTCP_SYNC_SNDBUF	7
+#define MPTCP_DEQUEUE		8
 
 struct mptcp_skb_cb {
 	u64 map_seq;
@@ -322,7 +323,6 @@ struct mptcp_sock {
 	struct work_struct work;
 	struct sk_buff *ooo_last_skb;
 	struct rb_root out_of_order_queue;
-	struct sk_buff_head receive_queue;
 	struct list_head conn_list;
 	struct list_head rtx_queue;
 	struct mptcp_data_frag *first_pending;
