Skip to content

Commit 972983f

Browse files
committed
Merge branch 'mptcp-stalled-connections-fix'
Matthieu Baerts says: ==================== mptcp: fix stalled connections Daire reported a few issues with MPTCP where some connections were stalled in different states. Paolo did a great job fixing them. Patch 1 fixes bogus receive window shrinkage with multiple subflows. Due to a race condition and unlucky circumstances, that may lead to TCP-level window shrinkage, and the connection being stalled on the sender end. Patch 2 is a preparation for patch 3 which processes pending subflow errors on close. Without that and under specific circumstances, the MPTCP-level socket might not switch to the CLOSE state and stall. Patch 4 is also a preparation patch for the next one. Patch 5 fixes MPTCP connections not switching to the CLOSE state when all subflows have been closed but no DATA_FIN have been exchanged to explicitly close the MPTCP connection. Now connections in such state will switch to the CLOSE state after a timeout, still allowing the "make-after-break" feature but making sure connections don't stall forever. It will be possible to modify this timeout -- currently matching TCP TIMEWAIT value (60 seconds) -- in a future version. ==================== Signed-off-by: Matthieu Baerts <[email protected]>
2 parents 8a47558 + 27e5ccc commit 972983f

File tree

4 files changed

+130
-103
lines changed

4 files changed

+130
-103
lines changed

net/mptcp/options.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1269,12 +1269,13 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
12691269

12701270
if (rcv_wnd == rcv_wnd_old)
12711271
break;
1272-
if (before64(rcv_wnd_new, rcv_wnd)) {
1272+
1273+
rcv_wnd_old = rcv_wnd;
1274+
if (before64(rcv_wnd_new, rcv_wnd_old)) {
12731275
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE);
12741276
goto raise_win;
12751277
}
12761278
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT);
1277-
rcv_wnd_old = rcv_wnd;
12781279
}
12791280
return;
12801281
}

net/mptcp/protocol.c

Lines changed: 102 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -405,7 +405,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
405405
return false;
406406
}
407407

408-
static void mptcp_stop_timer(struct sock *sk)
408+
static void mptcp_stop_rtx_timer(struct sock *sk)
409409
{
410410
struct inet_connection_sock *icsk = inet_csk(sk);
411411

@@ -770,6 +770,46 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
770770
return moved;
771771
}
772772

773+
static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk)
774+
{
775+
int err = sock_error(ssk);
776+
int ssk_state;
777+
778+
if (!err)
779+
return false;
780+
781+
/* only propagate errors on fallen-back sockets or
782+
* on MPC connect
783+
*/
784+
if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk)))
785+
return false;
786+
787+
/* We need to propagate only transition to CLOSE state.
788+
* Orphaned socket will see such state change via
789+
* subflow_sched_work_if_closed() and that path will properly
790+
* destroy the msk as needed.
791+
*/
792+
ssk_state = inet_sk_state_load(ssk);
793+
if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
794+
inet_sk_state_store(sk, ssk_state);
795+
WRITE_ONCE(sk->sk_err, -err);
796+
797+
/* This barrier is coupled with smp_rmb() in mptcp_poll() */
798+
smp_wmb();
799+
sk_error_report(sk);
800+
return true;
801+
}
802+
803+
void __mptcp_error_report(struct sock *sk)
804+
{
805+
struct mptcp_subflow_context *subflow;
806+
struct mptcp_sock *msk = mptcp_sk(sk);
807+
808+
mptcp_for_each_subflow(msk, subflow)
809+
if (__mptcp_subflow_error_report(sk, mptcp_subflow_tcp_sock(subflow)))
810+
break;
811+
}
812+
773813
/* In most cases we will be able to lock the mptcp socket. If its already
774814
* owned, we need to defer to the work queue to avoid ABBA deadlock.
775815
*/
@@ -852,6 +892,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
852892
mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++;
853893
mptcp_sockopt_sync_locked(msk, ssk);
854894
mptcp_subflow_joined(msk, ssk);
895+
mptcp_stop_tout_timer(sk);
855896
return true;
856897
}
857898

@@ -871,12 +912,12 @@ static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list
871912
}
872913
}
873914

874-
static bool mptcp_timer_pending(struct sock *sk)
915+
static bool mptcp_rtx_timer_pending(struct sock *sk)
875916
{
876917
return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
877918
}
878919

879-
static void mptcp_reset_timer(struct sock *sk)
920+
static void mptcp_reset_rtx_timer(struct sock *sk)
880921
{
881922
struct inet_connection_sock *icsk = inet_csk(sk);
882923
unsigned long tout;
@@ -1010,10 +1051,10 @@ static void __mptcp_clean_una(struct sock *sk)
10101051
out:
10111052
if (snd_una == READ_ONCE(msk->snd_nxt) &&
10121053
snd_una == READ_ONCE(msk->write_seq)) {
1013-
if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk))
1014-
mptcp_stop_timer(sk);
1054+
if (mptcp_rtx_timer_pending(sk) && !mptcp_data_fin_enabled(msk))
1055+
mptcp_stop_rtx_timer(sk);
10151056
} else {
1016-
mptcp_reset_timer(sk);
1057+
mptcp_reset_rtx_timer(sk);
10171058
}
10181059
}
10191060

@@ -1586,8 +1627,8 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
15861627
mptcp_push_release(ssk, &info);
15871628

15881629
/* ensure the rtx timer is running */
1589-
if (!mptcp_timer_pending(sk))
1590-
mptcp_reset_timer(sk);
1630+
if (!mptcp_rtx_timer_pending(sk))
1631+
mptcp_reset_rtx_timer(sk);
15911632
if (do_check_data_fin)
15921633
mptcp_check_send_data_fin(sk);
15931634
}
@@ -1650,8 +1691,8 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool
16501691
if (copied) {
16511692
tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
16521693
info.size_goal);
1653-
if (!mptcp_timer_pending(sk))
1654-
mptcp_reset_timer(sk);
1694+
if (!mptcp_rtx_timer_pending(sk))
1695+
mptcp_reset_rtx_timer(sk);
16551696

16561697
if (msk->snd_data_fin_enable &&
16571698
msk->snd_nxt + 1 == msk->write_seq)
@@ -2220,7 +2261,7 @@ static void mptcp_retransmit_timer(struct timer_list *t)
22202261
sock_put(sk);
22212262
}
22222263

2223-
static void mptcp_timeout_timer(struct timer_list *t)
2264+
static void mptcp_tout_timer(struct timer_list *t)
22242265
{
22252266
struct sock *sk = from_timer(sk, t, sk_timer);
22262267

@@ -2329,18 +2370,14 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
23292370
bool dispose_it, need_push = false;
23302371

23312372
/* If the first subflow moved to a close state before accept, e.g. due
2332-
* to an incoming reset, mptcp either:
2333-
* - if either the subflow or the msk are dead, destroy the context
2334-
* (the subflow socket is deleted by inet_child_forget) and the msk
2335-
* - otherwise do nothing at the moment and take action at accept and/or
2336-
* listener shutdown - user-space must be able to accept() the closed
2337-
* socket.
2373+
* to an incoming reset or listener shutdown, the subflow socket is
2374+
* already deleted by inet_child_forget() and the mptcp socket can't
2375+
* survive too.
23382376
*/
2339-
if (msk->in_accept_queue && msk->first == ssk) {
2340-
if (!sock_flag(sk, SOCK_DEAD) && !sock_flag(ssk, SOCK_DEAD))
2341-
return;
2342-
2377+
if (msk->in_accept_queue && msk->first == ssk &&
2378+
(sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) {
23432379
/* ensure later check in mptcp_worker() will dispose the msk */
2380+
mptcp_set_close_tout(sk, tcp_jiffies32 - (TCP_TIMEWAIT_LEN + 1));
23442381
sock_set_flag(sk, SOCK_DEAD);
23452382
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
23462383
mptcp_subflow_drop_ctx(ssk);
@@ -2392,6 +2429,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
23922429
}
23932430

23942431
out_release:
2432+
__mptcp_subflow_error_report(sk, ssk);
23952433
release_sock(ssk);
23962434

23972435
sock_put(ssk);
@@ -2402,6 +2440,22 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
24022440
out:
24032441
if (need_push)
24042442
__mptcp_push_pending(sk, 0);
2443+
2444+
/* Catch every 'all subflows closed' scenario, including peers silently
2445+
* closing them, e.g. due to timeout.
2446+
* For established sockets, allow an additional timeout before closing,
2447+
* as the protocol can still create more subflows.
2448+
*/
2449+
if (list_is_singular(&msk->conn_list) && msk->first &&
2450+
inet_sk_state_load(msk->first) == TCP_CLOSE) {
2451+
if (sk->sk_state != TCP_ESTABLISHED ||
2452+
msk->in_accept_queue || sock_flag(sk, SOCK_DEAD)) {
2453+
inet_sk_state_store(sk, TCP_CLOSE);
2454+
mptcp_close_wake_up(sk);
2455+
} else {
2456+
mptcp_start_tout_timer(sk);
2457+
}
2458+
}
24052459
}
24062460

24072461
void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
@@ -2445,23 +2499,14 @@ static void __mptcp_close_subflow(struct sock *sk)
24452499

24462500
}
24472501

2448-
static bool mptcp_should_close(const struct sock *sk)
2502+
static bool mptcp_close_tout_expired(const struct sock *sk)
24492503
{
2450-
s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp;
2451-
struct mptcp_subflow_context *subflow;
2452-
2453-
if (delta >= TCP_TIMEWAIT_LEN || mptcp_sk(sk)->in_accept_queue)
2454-
return true;
2504+
if (!inet_csk(sk)->icsk_mtup.probe_timestamp ||
2505+
sk->sk_state == TCP_CLOSE)
2506+
return false;
24552507

2456-
/* if all subflows are in closed status don't bother with additional
2457-
* timeout
2458-
*/
2459-
mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
2460-
if (inet_sk_state_load(mptcp_subflow_tcp_sock(subflow)) !=
2461-
TCP_CLOSE)
2462-
return false;
2463-
}
2464-
return true;
2508+
return time_after32(tcp_jiffies32,
2509+
inet_csk(sk)->icsk_mtup.probe_timestamp + TCP_TIMEWAIT_LEN);
24652510
}
24662511

24672512
static void mptcp_check_fastclose(struct mptcp_sock *msk)
@@ -2588,27 +2633,28 @@ static void __mptcp_retrans(struct sock *sk)
25882633
reset_timer:
25892634
mptcp_check_and_set_pending(sk);
25902635

2591-
if (!mptcp_timer_pending(sk))
2592-
mptcp_reset_timer(sk);
2636+
if (!mptcp_rtx_timer_pending(sk))
2637+
mptcp_reset_rtx_timer(sk);
25932638
}
25942639

25952640
/* schedule the timeout timer for the relevant event: either close timeout
25962641
* or mp_fail timeout. The close timeout takes precedence on the mp_fail one
25972642
*/
2598-
void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout)
2643+
void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout)
25992644
{
26002645
struct sock *sk = (struct sock *)msk;
26012646
unsigned long timeout, close_timeout;
26022647

2603-
if (!fail_tout && !sock_flag(sk, SOCK_DEAD))
2648+
if (!fail_tout && !inet_csk(sk)->icsk_mtup.probe_timestamp)
26042649
return;
26052650

2606-
close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + TCP_TIMEWAIT_LEN;
2651+
close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies +
2652+
TCP_TIMEWAIT_LEN;
26072653

26082654
/* the close timeout takes precedence on the fail one, and here at least one of
26092655
* them is active
26102656
*/
2611-
timeout = sock_flag(sk, SOCK_DEAD) ? close_timeout : fail_tout;
2657+
timeout = inet_csk(sk)->icsk_mtup.probe_timestamp ? close_timeout : fail_tout;
26122658

26132659
sk_reset_timer(sk, &sk->sk_timer, timeout);
26142660
}
@@ -2627,8 +2673,6 @@ static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
26272673
mptcp_subflow_reset(ssk);
26282674
WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0);
26292675
unlock_sock_fast(ssk, slow);
2630-
2631-
mptcp_reset_timeout(msk, 0);
26322676
}
26332677

26342678
static void mptcp_do_fastclose(struct sock *sk)
@@ -2665,18 +2709,14 @@ static void mptcp_worker(struct work_struct *work)
26652709
if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
26662710
__mptcp_close_subflow(sk);
26672711

2668-
/* There is no point in keeping around an orphaned sk timedout or
2669-
* closed, but we need the msk around to reply to incoming DATA_FIN,
2670-
* even if it is orphaned and in FIN_WAIT2 state
2671-
*/
2672-
if (sock_flag(sk, SOCK_DEAD)) {
2673-
if (mptcp_should_close(sk))
2674-
mptcp_do_fastclose(sk);
2712+
if (mptcp_close_tout_expired(sk)) {
2713+
mptcp_do_fastclose(sk);
2714+
mptcp_close_wake_up(sk);
2715+
}
26752716

2676-
if (sk->sk_state == TCP_CLOSE) {
2677-
__mptcp_destroy_sock(sk);
2678-
goto unlock;
2679-
}
2717+
if (sock_flag(sk, SOCK_DEAD) && sk->sk_state == TCP_CLOSE) {
2718+
__mptcp_destroy_sock(sk);
2719+
goto unlock;
26802720
}
26812721

26822722
if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
@@ -2717,7 +2757,7 @@ static void __mptcp_init_sock(struct sock *sk)
27172757

27182758
/* re-use the csk retrans timer for MPTCP-level retrans */
27192759
timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
2720-
timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0);
2760+
timer_setup(&sk->sk_timer, mptcp_tout_timer, 0);
27212761
}
27222762

27232763
static void mptcp_ca_reset(struct sock *sk)
@@ -2808,8 +2848,8 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
28082848
} else {
28092849
pr_debug("Sending DATA_FIN on subflow %p", ssk);
28102850
tcp_send_ack(ssk);
2811-
if (!mptcp_timer_pending(sk))
2812-
mptcp_reset_timer(sk);
2851+
if (!mptcp_rtx_timer_pending(sk))
2852+
mptcp_reset_rtx_timer(sk);
28132853
}
28142854
break;
28152855
}
@@ -2892,7 +2932,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
28922932

28932933
might_sleep();
28942934

2895-
mptcp_stop_timer(sk);
2935+
mptcp_stop_rtx_timer(sk);
28962936
sk_stop_timer(sk, &sk->sk_timer);
28972937
msk->pm.status = 0;
28982938
mptcp_release_sched(msk);
@@ -2975,7 +3015,6 @@ bool __mptcp_close(struct sock *sk, long timeout)
29753015

29763016
cleanup:
29773017
/* orphan all the subflows */
2978-
inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
29793018
mptcp_for_each_subflow(msk, subflow) {
29803019
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
29813020
bool slow = lock_sock_fast_nested(ssk);
@@ -3012,7 +3051,7 @@ bool __mptcp_close(struct sock *sk, long timeout)
30123051
__mptcp_destroy_sock(sk);
30133052
do_cancel_work = true;
30143053
} else {
3015-
mptcp_reset_timeout(msk, 0);
3054+
mptcp_start_tout_timer(sk);
30163055
}
30173056

30183057
return do_cancel_work;
@@ -3075,8 +3114,8 @@ static int mptcp_disconnect(struct sock *sk, int flags)
30753114
mptcp_check_listen_stop(sk);
30763115
inet_sk_state_store(sk, TCP_CLOSE);
30773116

3078-
mptcp_stop_timer(sk);
3079-
sk_stop_timer(sk, &sk->sk_timer);
3117+
mptcp_stop_rtx_timer(sk);
3118+
mptcp_stop_tout_timer(sk);
30803119

30813120
if (msk->token)
30823121
mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);

net/mptcp/protocol.h

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -718,7 +718,29 @@ void mptcp_get_options(const struct sk_buff *skb,
718718

719719
void mptcp_finish_connect(struct sock *sk);
720720
void __mptcp_set_connected(struct sock *sk);
721-
void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout);
721+
void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout);
722+
723+
static inline void mptcp_stop_tout_timer(struct sock *sk)
724+
{
725+
if (!inet_csk(sk)->icsk_mtup.probe_timestamp)
726+
return;
727+
728+
sk_stop_timer(sk, &sk->sk_timer);
729+
inet_csk(sk)->icsk_mtup.probe_timestamp = 0;
730+
}
731+
732+
static inline void mptcp_set_close_tout(struct sock *sk, unsigned long tout)
733+
{
734+
/* avoid 0 timestamp, as that means no close timeout */
735+
inet_csk(sk)->icsk_mtup.probe_timestamp = tout ? : 1;
736+
}
737+
738+
static inline void mptcp_start_tout_timer(struct sock *sk)
739+
{
740+
mptcp_set_close_tout(sk, tcp_jiffies32);
741+
mptcp_reset_tout_timer(mptcp_sk(sk), 0);
742+
}
743+
722744
static inline bool mptcp_is_fully_established(struct sock *sk)
723745
{
724746
return inet_sk_state_load(sk) == TCP_ESTABLISHED &&

0 commit comments

Comments (0)