Skip to content

Commit 571f3dd

Browse files
committed
Merge tag 'rxrpc-fixes-20230107' of git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs
David Howells says: ==================== rxrpc: Fix race between call connection, data transmit and call disconnect Here are patches to fix an oops[1] caused by a race between call connection, initial packet transmission and call disconnection which results in something like: kernel BUG at net/rxrpc/peer_object.c:413! when the syzbot test is run. The problem is that the connection procedure is effectively split across two threads and can get expanded by taking an interrupt, thereby adding the call to the peer error distribution list *after* it has been disconnected (say by the rxrpc socket shutting down). The easiest solution is to look at the fourth set of I/O thread conversion/SACK table expansion patches that didn't get applied[2] and take from it those patches that move call connection and disconnection into the I/O thread. Moving these things into the I/O thread means that the sequencing is managed by all being done in the same thread - and the race can no longer happen. This is preferable to introducing an extra lock as adding an extra lock would make the I/O thread have to wait for the app thread in yet another place. The changes can be considered as a number of logical parts: (1) Move all of the call state changes into the I/O thread. (2) Make client connection ID space per-local endpoint so that the I/O thread doesn't need locks to access it. (3) Move actual abort generation into the I/O thread and clean it up. If sendmsg or recvmsg want to cause an abort, they have to delegate it. (4) Offload the setting up of the security context on a connection to the thread of one of the apps that's starting a call. We don't want to be doing any sort of crypto in the I/O thread. (5) Connect calls (ie. assign them to channel slots on connections) in the I/O thread. Calls are set up by sendmsg/kafs and passed to the I/O thread to connect. Connections are allocated in the I/O thread after this. (6) Disconnect calls in the I/O thread. I've also added a patch for an unrelated bug that cropped up during testing, whereby a race can occur between an incoming call and socket shutdown. Note that whilst this fixes the original syzbot bug, another bug may get triggered if this one is fixed: INFO: rcu detected stall in corrupted rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { P5792 } 2657 jiffies s: 2825 root: 0x0/T rcu: blocking rcu_node structures (internal RCU debug): It doesn't look this should be anything to do with rxrpc, though, as I've tested an additional patch[3] that removes practically all the RCU usage from rxrpc and it still occurs. It seems likely that it is being caused by something in the tunnelling setup that the syzbot test does, but there's not enough info to go on. It also seems unlikely to be anything to do with the afs driver as the test doesn't use that. ==================== Signed-off-by: David S. Miller <[email protected]>
2 parents b4e9b87 + 42f229c commit 571f3dd

29 files changed

+1592
-1760
lines changed

Documentation/networking/rxrpc.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -880,8 +880,8 @@ The kernel interface functions are as follows:
880880

881881
notify_end_rx can be NULL or it can be used to specify a function to be
882882
called when the call changes state to end the Tx phase. This function is
883-
called with the call-state spinlock held to prevent any reply or final ACK
884-
from being delivered first.
883+
called with a spinlock held to prevent the last DATA packet from being
884+
transmitted until the function returns.
885885

886886
(#) Receive data from a call::
887887

fs/afs/cmservice.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
#include "internal.h"
1414
#include "afs_cm.h"
1515
#include "protocol_yfs.h"
16+
#define RXRPC_TRACE_ONLY_DEFINE_ENUMS
17+
#include <trace/events/rxrpc.h>
1618

1719
static int afs_deliver_cb_init_call_back_state(struct afs_call *);
1820
static int afs_deliver_cb_init_call_back_state3(struct afs_call *);
@@ -191,7 +193,7 @@ static void afs_cm_destructor(struct afs_call *call)
191193
* Abort a service call from within an action function.
192194
*/
193195
static void afs_abort_service_call(struct afs_call *call, u32 abort_code, int error,
194-
const char *why)
196+
enum rxrpc_abort_reason why)
195197
{
196198
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
197199
abort_code, error, why);
@@ -469,7 +471,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work)
469471
if (memcmp(r, &call->net->uuid, sizeof(call->net->uuid)) == 0)
470472
afs_send_empty_reply(call);
471473
else
472-
afs_abort_service_call(call, 1, 1, "K-1");
474+
afs_abort_service_call(call, 1, 1, afs_abort_probeuuid_negative);
473475

474476
afs_put_call(call);
475477
_leave("");

fs/afs/rxrpc.c

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
#include "internal.h"
1414
#include "afs_cm.h"
1515
#include "protocol_yfs.h"
16+
#define RXRPC_TRACE_ONLY_DEFINE_ENUMS
17+
#include <trace/events/rxrpc.h>
1618

1719
struct workqueue_struct *afs_async_calls;
1820

@@ -397,7 +399,8 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
397399
error_do_abort:
398400
if (ret != -ECONNABORTED) {
399401
rxrpc_kernel_abort_call(call->net->socket, rxcall,
400-
RX_USER_ABORT, ret, "KSD");
402+
RX_USER_ABORT, ret,
403+
afs_abort_send_data_error);
401404
} else {
402405
len = 0;
403406
iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0);
@@ -527,7 +530,8 @@ static void afs_deliver_to_call(struct afs_call *call)
527530
case -ENOTSUPP:
528531
abort_code = RXGEN_OPCODE;
529532
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
530-
abort_code, ret, "KIV");
533+
abort_code, ret,
534+
afs_abort_op_not_supported);
531535
goto local_abort;
532536
case -EIO:
533537
pr_err("kAFS: Call %u in bad state %u\n",
@@ -542,12 +546,14 @@ static void afs_deliver_to_call(struct afs_call *call)
542546
if (state != AFS_CALL_CL_AWAIT_REPLY)
543547
abort_code = RXGEN_SS_UNMARSHAL;
544548
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
545-
abort_code, ret, "KUM");
549+
abort_code, ret,
550+
afs_abort_unmarshal_error);
546551
goto local_abort;
547552
default:
548553
abort_code = RX_CALL_DEAD;
549554
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
550-
abort_code, ret, "KER");
555+
abort_code, ret,
556+
afs_abort_general_error);
551557
goto local_abort;
552558
}
553559
}
@@ -619,7 +625,8 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
619625
/* Kill off the call if it's still live. */
620626
_debug("call interrupted");
621627
if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
622-
RX_USER_ABORT, -EINTR, "KWI"))
628+
RX_USER_ABORT, -EINTR,
629+
afs_abort_interrupted))
623630
afs_set_call_complete(call, -EINTR, 0);
624631
}
625632
}
@@ -836,7 +843,8 @@ void afs_send_empty_reply(struct afs_call *call)
836843
case -ENOMEM:
837844
_debug("oom");
838845
rxrpc_kernel_abort_call(net->socket, call->rxcall,
839-
RXGEN_SS_MARSHAL, -ENOMEM, "KOO");
846+
RXGEN_SS_MARSHAL, -ENOMEM,
847+
afs_abort_oom);
840848
fallthrough;
841849
default:
842850
_leave(" [error]");
@@ -878,7 +886,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
878886
if (n == -ENOMEM) {
879887
_debug("oom");
880888
rxrpc_kernel_abort_call(net->socket, call->rxcall,
881-
RXGEN_SS_MARSHAL, -ENOMEM, "KOO");
889+
RXGEN_SS_MARSHAL, -ENOMEM,
890+
afs_abort_oom);
882891
}
883892
_leave(" [error]");
884893
}
@@ -900,6 +909,7 @@ int afs_extract_data(struct afs_call *call, bool want_more)
900909
ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, iter,
901910
&call->iov_len, want_more, &remote_abort,
902911
&call->service_id);
912+
trace_afs_receive_data(call, call->iter, want_more, ret);
903913
if (ret == 0 || ret == -EAGAIN)
904914
return ret;
905915

include/net/af_rxrpc.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ struct key;
1515
struct sock;
1616
struct socket;
1717
struct rxrpc_call;
18+
enum rxrpc_abort_reason;
1819

1920
enum rxrpc_interruptibility {
2021
RXRPC_INTERRUPTIBLE, /* Call is interruptible */
@@ -55,7 +56,7 @@ int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *,
5556
int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *,
5657
struct iov_iter *, size_t *, bool, u32 *, u16 *);
5758
bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *,
58-
u32, int, const char *);
59+
u32, int, enum rxrpc_abort_reason);
5960
void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *);
6061
void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *,
6162
struct sockaddr_rxrpc *);

0 commit comments

Comments
 (0)