Skip to content

Commit 918fb1a

Browse files
committed
Merge branch 'stop-corrupting-socket-s-task_frag'
Benjamin Coddington says: ==================== Stop corrupting socket's task_frag The networking code uses flags in sk_allocation to determine if it can use current->task_frag, however in-kernel users of sockets may stop setting sk_allocation when they convert to the preferred memalloc_nofs_save/restore, as SUNRPC has done in commit a1231fd ("SUNRPC: Set memalloc_nofs_save() on all rpciod/xprtiod jobs"). This will cause corruption in current->task_frag when recursing into the network layer for those subsystems during page fault or reclaim. The corruption is difficult to diagnose because stack traces may not contain the offending subsystem at all. The corruption is unlikely to show up in testing because it requires memory pressure, and so subsystems that convert to memalloc_nofs_save/restore are likely to continue to run into this issue. Previous reports and proposed fixes: https://lore.kernel.org/netdev/96a18bd00cbc6cb554603cc0d6ef1c551965b078.1663762494.git.gnault@redhat.com/ https://lore.kernel.org/netdev/b4d8cb09c913d3e34f853736f3f5628abfd7f4b6.1656699567.git.gnault@redhat.com/ https://lore.kernel.org/linux-nfs/de6d99321d1dcaa2ad456b92b3680aa77c07a747.1665401788.git.gnault@redhat.com/ Guillaume Nault has done all of the hard work tracking this problem down and finding the best fix for this issue. I'm just taking a turn posting another fix. ==================== Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Jakub Kicinski <[email protected]>
2 parents b389a90 + 08f6589 commit 918fb1a

File tree

14 files changed

+24
-4
lines changed

14 files changed

+24
-4
lines changed

drivers/block/drbd/drbd_receiver.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,6 +1030,9 @@ static int conn_connect(struct drbd_connection *connection)
10301030
sock.socket->sk->sk_allocation = GFP_NOIO;
10311031
msock.socket->sk->sk_allocation = GFP_NOIO;
10321032

1033+
sock.socket->sk->sk_use_task_frag = false;
1034+
msock.socket->sk->sk_use_task_frag = false;
1035+
10331036
sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
10341037
msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
10351038

drivers/block/nbd.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
512512
noreclaim_flag = memalloc_noreclaim_save();
513513
do {
514514
sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
515+
sock->sk->sk_use_task_frag = false;
515516
msg.msg_name = NULL;
516517
msg.msg_namelen = 0;
517518
msg.msg_control = NULL;

drivers/nvme/host/tcp.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1537,6 +1537,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
15371537
queue->sock->sk->sk_rcvtimeo = 10 * HZ;
15381538

15391539
queue->sock->sk->sk_allocation = GFP_ATOMIC;
1540+
queue->sock->sk->sk_use_task_frag = false;
15401541
nvme_tcp_set_queue_io_cpu(queue);
15411542
queue->request = NULL;
15421543
queue->data_remaining = 0;

drivers/scsi/iscsi_tcp.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -738,6 +738,7 @@ iscsi_sw_tcp_conn_bind(struct iscsi_cls_session *cls_session,
738738
sk->sk_reuse = SK_CAN_REUSE;
739739
sk->sk_sndtimeo = 15 * HZ; /* FIXME: make it configurable */
740740
sk->sk_allocation = GFP_ATOMIC;
741+
sk->sk_use_task_frag = false;
741742
sk_set_memalloc(sk);
742743
sock_no_linger(sk);
743744

drivers/usb/usbip/usbip_common.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,7 @@ int usbip_recv(struct socket *sock, void *buf, int size)
315315

316316
do {
317317
sock->sk->sk_allocation = GFP_NOIO;
318+
sock->sk->sk_use_task_frag = false;
318319

319320
result = sock_recvmsg(sock, &msg, MSG_WAITALL);
320321
if (result <= 0)

fs/cifs/connect.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2944,6 +2944,7 @@ generic_ip_connect(struct TCP_Server_Info *server)
29442944
cifs_dbg(FYI, "Socket created\n");
29452945
server->ssocket = socket;
29462946
socket->sk->sk_allocation = GFP_NOFS;
2947+
socket->sk->sk_use_task_frag = false;
29472948
if (sfamily == AF_INET6)
29482949
cifs_reclassify_socket6(socket);
29492950
else

fs/dlm/lowcomms.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -645,6 +645,7 @@ static void add_sock(struct socket *sock, struct connection *con)
645645
if (dlm_config.ci_protocol == DLM_PROTO_SCTP)
646646
sk->sk_state_change = lowcomms_state_change;
647647
sk->sk_allocation = GFP_NOFS;
648+
sk->sk_use_task_frag = false;
648649
sk->sk_error_report = lowcomms_error_report;
649650
release_sock(sk);
650651
}
@@ -1769,6 +1770,7 @@ static int dlm_listen_for_all(void)
17691770
listen_con.sock = sock;
17701771

17711772
sock->sk->sk_allocation = GFP_NOFS;
1773+
sock->sk->sk_use_task_frag = false;
17721774
sock->sk->sk_data_ready = lowcomms_listen_data_ready;
17731775
release_sock(sock->sk);
17741776

fs/ocfs2/cluster/tcp.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1602,6 +1602,7 @@ static void o2net_start_connect(struct work_struct *work)
16021602
sc->sc_sock = sock; /* freed by sc_kref_release */
16031603

16041604
sock->sk->sk_allocation = GFP_ATOMIC;
1605+
sock->sk->sk_use_task_frag = false;
16051606

16061607
myaddr.sin_family = AF_INET;
16071608
myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;

include/net/sock.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,9 @@ struct sk_filter;
318318
* @sk_stamp: time stamp of last packet received
319319
* @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
320320
* @sk_tsflags: SO_TIMESTAMPING flags
321+
* @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
322+
* Sockets that can be used under memory reclaim should
323+
* set this to false.
321324
* @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
322325
* for timestamping
323326
* @sk_tskey: counter to disambiguate concurrent tstamp requests
@@ -512,6 +515,7 @@ struct sock {
512515
u8 sk_txtime_deadline_mode : 1,
513516
sk_txtime_report_errors : 1,
514517
sk_txtime_unused : 6;
518+
bool sk_use_task_frag;
515519

516520
struct socket *sk_socket;
517521
void *sk_user_data;
@@ -2560,16 +2564,14 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
25602564
* Both direct reclaim and page faults can nest inside other
25612565
* socket operations and end up recursing into sk_page_frag()
25622566
* while it's already in use: explicitly avoid task page_frag
2563-
* usage if the caller is potentially doing any of them.
2564-
* This assumes that page fault handlers use the GFP_NOFS flags.
2567+
* when users disable sk_use_task_frag.
25652568
*
25662569
* Return: a per task page_frag if context allows that,
25672570
* otherwise a per socket one.
25682571
*/
25692572
static inline struct page_frag *sk_page_frag(struct sock *sk)
25702573
{
2571-
if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) ==
2572-
(__GFP_DIRECT_RECLAIM | __GFP_FS))
2574+
if (sk->sk_use_task_frag)
25732575
return &current->task_frag;
25742576

25752577
return &sk->sk_frag;

net/9p/trans_fd.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -868,6 +868,7 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket)
868868
}
869869

870870
csocket->sk->sk_allocation = GFP_NOIO;
871+
csocket->sk->sk_use_task_frag = false;
871872
file = sock_alloc_file(csocket, 0, NULL);
872873
if (IS_ERR(file)) {
873874
pr_err("%s (%d): failed to map fd\n",

0 commit comments

Comments
 (0)