|
| 1 | +From 7eed89daad561d34ad3b65b2f6e73c99eed0f090 Mon Sep 17 00:00:00 2001 |
| 2 | +From: Eric Dumazet <edumazet@google.com> |
| 3 | +Date: Thu, 20 Feb 2025 13:18:54 +0000 |
| 4 | +Subject: [PATCH 2/2] net: better track kernel sockets lifetime |
| 5 | + |
| 6 | +While kernel sockets are dismantled during pernet_operations->exit(), |
| 7 | +their freeing can be delayed by any tx packets still held in qdisc |
| 8 | +or device queues, due to skb_set_owner_w() prior calls. |
| 9 | + |
| 10 | +This then trigger the following warning from ref_tracker_dir_exit() [1] |
| 11 | + |
| 12 | +To fix this, make sure that kernel sockets own a reference on net->passive. |
| 13 | + |
| 14 | +Add sk_net_refcnt_upgrade() helper, used whenever a kernel socket |
| 15 | +is converted to a refcounted one. |
| 16 | + |
| 17 | +[1] |
| 18 | + |
| 19 | +[ 136.263918][ T35] ref_tracker: net notrefcnt@ffff8880638f01e0 has 1/2 users at |
| 20 | +[ 136.263918][ T35] sk_alloc+0x2b3/0x370 |
| 21 | +[ 136.263918][ T35] inet6_create+0x6ce/0x10f0 |
| 22 | +[ 136.263918][ T35] __sock_create+0x4c0/0xa30 |
| 23 | +[ 136.263918][ T35] inet_ctl_sock_create+0xc2/0x250 |
| 24 | +[ 136.263918][ T35] igmp6_net_init+0x39/0x390 |
| 25 | +[ 136.263918][ T35] ops_init+0x31e/0x590 |
| 26 | +[ 136.263918][ T35] setup_net+0x287/0x9e0 |
| 27 | +[ 136.263918][ T35] copy_net_ns+0x33f/0x570 |
| 28 | +[ 136.263918][ T35] create_new_namespaces+0x425/0x7b0 |
| 29 | +[ 136.263918][ T35] unshare_nsproxy_namespaces+0x124/0x180 |
| 30 | +[ 136.263918][ T35] ksys_unshare+0x57d/0xa70 |
| 31 | +[ 136.263918][ T35] __x64_sys_unshare+0x38/0x40 |
| 32 | +[ 136.263918][ T35] do_syscall_64+0xf3/0x230 |
| 33 | +[ 136.263918][ T35] entry_SYSCALL_64_after_hwframe+0x77/0x7f |
| 34 | +[ 136.263918][ T35] |
| 35 | +[ 136.343488][ T35] ref_tracker: net notrefcnt@ffff8880638f01e0 has 1/2 users at |
| 36 | +[ 136.343488][ T35] sk_alloc+0x2b3/0x370 |
| 37 | +[ 136.343488][ T35] inet6_create+0x6ce/0x10f0 |
| 38 | +[ 136.343488][ T35] __sock_create+0x4c0/0xa30 |
| 39 | +[ 136.343488][ T35] inet_ctl_sock_create+0xc2/0x250 |
| 40 | +[ 136.343488][ T35] ndisc_net_init+0xa7/0x2b0 |
| 41 | +[ 136.343488][ T35] ops_init+0x31e/0x590 |
| 42 | +[ 136.343488][ T35] setup_net+0x287/0x9e0 |
| 43 | +[ 136.343488][ T35] copy_net_ns+0x33f/0x570 |
| 44 | +[ 136.343488][ T35] create_new_namespaces+0x425/0x7b0 |
| 45 | +[ 136.343488][ T35] unshare_nsproxy_namespaces+0x124/0x180 |
| 46 | +[ 136.343488][ T35] ksys_unshare+0x57d/0xa70 |
| 47 | +[ 136.343488][ T35] __x64_sys_unshare+0x38/0x40 |
| 48 | +[ 136.343488][ T35] do_syscall_64+0xf3/0x230 |
| 49 | +[ 136.343488][ T35] entry_SYSCALL_64_after_hwframe+0x77/0x7f |
| 50 | + |
| 51 | +Fixes: 0cafd77dcd03 ("net: add a refcount tracker for kernel sockets") |
| 52 | +Reported-by: syzbot+30a19e01a97420719891@syzkaller.appspotmail.com |
| 53 | +Closes: https://lore.kernel.org/netdev/67b72aeb.050a0220.14d86d.0283.GAE@google.com/T/#u |
| 54 | +Signed-off-by: Eric Dumazet <edumazet@google.com> |
| 55 | +Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com> |
| 56 | +Link: https://patch.msgid.link/20250220131854.4048077-1-edumazet@google.com |
| 57 | +Signed-off-by: Jakub Kicinski <kuba@kernel.org> |
| 58 | +--- |
| 59 | + include/net/sock.h | 1 + |
| 60 | + net/core/sock.c | 27 ++++++++++++++++++++++----- |
| 61 | + net/mptcp/subflow.c | 5 +---- |
| 62 | + net/netlink/af_netlink.c | 10 ---------- |
| 63 | + net/rds/tcp.c | 8 ++------ |
| 64 | + net/smc/af_smc.c | 5 +---- |
| 65 | + net/sunrpc/svcsock.c | 5 +---- |
| 66 | + net/sunrpc/xprtsock.c | 8 ++------ |
| 67 | + 8 files changed, 30 insertions(+), 39 deletions(-) |
| 68 | + |
| 69 | +diff --git a/include/net/sock.h b/include/net/sock.h |
| 70 | +index fa055cf1785e..0c8dc587a8ff 100644 |
| 71 | +--- a/include/net/sock.h |
| 72 | ++++ b/include/net/sock.h |
| 73 | +@@ -1744,6 +1744,7 @@ static inline bool sock_allow_reclassification(const struct sock *csk) |
| 74 | + struct sock *sk_alloc(struct net *net, int family, gfp_t priority, |
| 75 | + struct proto *prot, int kern); |
| 76 | + void sk_free(struct sock *sk); |
| 77 | ++void sk_net_refcnt_upgrade(struct sock *sk); |
| 78 | + void sk_destruct(struct sock *sk); |
| 79 | + struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); |
| 80 | + void sk_free_unlock_clone(struct sock *sk); |
| 81 | +diff --git a/net/core/sock.c b/net/core/sock.c |
| 82 | +index a83f64a1d96a..822642bcb81a 100644 |
| 83 | +--- a/net/core/sock.c |
| 84 | ++++ b/net/core/sock.c |
| 85 | +@@ -2238,6 +2238,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, |
| 86 | + get_net_track(net, &sk->ns_tracker, priority); |
| 87 | + sock_inuse_add(net, 1); |
| 88 | + } else { |
| 89 | ++ net_passive_inc(net); |
| 90 | + __netns_tracker_alloc(net, &sk->ns_tracker, |
| 91 | + false, priority); |
| 92 | + } |
| 93 | +@@ -2262,6 +2263,7 @@ EXPORT_SYMBOL(sk_alloc); |
| 94 | + static void __sk_destruct(struct rcu_head *head) |
| 95 | + { |
| 96 | + struct sock *sk = container_of(head, struct sock, sk_rcu); |
| 97 | ++ struct net *net = sock_net(sk); |
| 98 | + struct sk_filter *filter; |
| 99 | + |
| 100 | + if (sk->sk_destruct) |
| 101 | +@@ -2293,14 +2295,28 @@ static void __sk_destruct(struct rcu_head *head) |
| 102 | + put_cred(sk->sk_peer_cred); |
| 103 | + put_pid(sk->sk_peer_pid); |
| 104 | + |
| 105 | +- if (likely(sk->sk_net_refcnt)) |
| 106 | +- put_net_track(sock_net(sk), &sk->ns_tracker); |
| 107 | +- else |
| 108 | +- __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); |
| 109 | +- |
| 110 | ++ if (likely(sk->sk_net_refcnt)) { |
| 111 | ++ put_net_track(net, &sk->ns_tracker); |
| 112 | ++ } else { |
| 113 | ++ __netns_tracker_free(net, &sk->ns_tracker, false); |
| 114 | ++ net_passive_dec(net); |
| 115 | ++ } |
| 116 | + sk_prot_free(sk->sk_prot_creator, sk); |
| 117 | + } |
| 118 | + |
| 119 | ++void sk_net_refcnt_upgrade(struct sock *sk) |
| 120 | ++{ |
| 121 | ++ struct net *net = sock_net(sk); |
| 122 | ++ |
| 123 | ++ WARN_ON_ONCE(sk->sk_net_refcnt); |
| 124 | ++ __netns_tracker_free(net, &sk->ns_tracker, false); |
| 125 | ++ net_passive_dec(net); |
| 126 | ++ sk->sk_net_refcnt = 1; |
| 127 | ++ get_net_track(net, &sk->ns_tracker, GFP_KERNEL); |
| 128 | ++ sock_inuse_add(net, 1); |
| 129 | ++} |
| 130 | ++EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade); |
| 131 | ++ |
| 132 | + void sk_destruct(struct sock *sk) |
| 133 | + { |
| 134 | + bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); |
| 135 | +@@ -2397,6 +2413,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) |
| 136 | + * is not properly dismantling its kernel sockets at netns |
| 137 | + * destroy time. |
| 138 | + */ |
| 139 | ++ net_passive_inc(sock_net(newsk)); |
| 140 | + __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, |
| 141 | + false, priority); |
| 142 | + } |
| 143 | +diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c |
| 144 | +index b56bbee7312c..4a49b7749b07 100644 |
| 145 | +--- a/net/mptcp/subflow.c |
| 146 | ++++ b/net/mptcp/subflow.c |
| 147 | +@@ -1755,10 +1755,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, |
| 148 | + * needs it. |
| 149 | + * Update ns_tracker to current stack trace and refcounted tracker. |
| 150 | + */ |
| 151 | +- __netns_tracker_free(net, &sf->sk->ns_tracker, false); |
| 152 | +- sf->sk->sk_net_refcnt = 1; |
| 153 | +- get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL); |
| 154 | +- sock_inuse_add(net, 1); |
| 155 | ++ sk_net_refcnt_upgrade(sf->sk); |
| 156 | + err = tcp_set_ulp(sf->sk, "mptcp"); |
| 157 | + if (err) |
| 158 | + goto err_free; |
| 159 | +diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c |
| 160 | +index 775d707ec708..f26e6be94d84 100644 |
| 161 | +--- a/net/netlink/af_netlink.c |
| 162 | ++++ b/net/netlink/af_netlink.c |
| 163 | +@@ -795,16 +795,6 @@ static int netlink_release(struct socket *sock) |
| 164 | + |
| 165 | + sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); |
| 166 | + |
| 167 | +- /* Because struct net might disappear soon, do not keep a pointer. */ |
| 168 | +- if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) { |
| 169 | +- __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); |
| 170 | +- /* Because of deferred_put_nlk_sk and use of work queue, |
| 171 | +- * it is possible netns will be freed before this socket. |
| 172 | +- */ |
| 173 | +- sock_net_set(sk, &init_net); |
| 174 | +- __netns_tracker_alloc(&init_net, &sk->ns_tracker, |
| 175 | +- false, GFP_KERNEL); |
| 176 | +- } |
| 177 | + call_rcu(&nlk->rcu, deferred_put_nlk_sk); |
| 178 | + return 0; |
| 179 | + } |
| 180 | +diff --git a/net/rds/tcp.c b/net/rds/tcp.c |
| 181 | +index 0581c53e6517..3cc2f303bf78 100644 |
| 182 | +--- a/net/rds/tcp.c |
| 183 | ++++ b/net/rds/tcp.c |
| 184 | +@@ -504,12 +504,8 @@ bool rds_tcp_tune(struct socket *sock) |
| 185 | + release_sock(sk); |
| 186 | + return false; |
| 187 | + } |
| 188 | +- /* Update ns_tracker to current stack trace and refcounted tracker */ |
| 189 | +- __netns_tracker_free(net, &sk->ns_tracker, false); |
| 190 | +- |
| 191 | +- sk->sk_net_refcnt = 1; |
| 192 | +- netns_tracker_alloc(net, &sk->ns_tracker, GFP_KERNEL); |
| 193 | +- sock_inuse_add(net, 1); |
| 194 | ++ sk_net_refcnt_upgrade(sk); |
| 195 | ++ put_net(net); |
| 196 | + } |
| 197 | + rtn = net_generic(net, rds_tcp_netid); |
| 198 | + if (rtn->sndbuf_size > 0) { |
| 199 | +diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c |
| 200 | +index ebc41a7b13db..ba834cefb177 100644 |
| 201 | +--- a/net/smc/af_smc.c |
| 202 | ++++ b/net/smc/af_smc.c |
| 203 | +@@ -3334,10 +3334,7 @@ int smc_create_clcsk(struct net *net, struct sock *sk, int family) |
| 204 | + * which need net ref. |
| 205 | + */ |
| 206 | + sk = smc->clcsock->sk; |
| 207 | +- __netns_tracker_free(net, &sk->ns_tracker, false); |
| 208 | +- sk->sk_net_refcnt = 1; |
| 209 | +- get_net_track(net, &sk->ns_tracker, GFP_KERNEL); |
| 210 | +- sock_inuse_add(net, 1); |
| 211 | ++ sk_net_refcnt_upgrade(sk); |
| 212 | + return 0; |
| 213 | + } |
| 214 | + |
| 215 | +diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c |
| 216 | +index 3bfbb789c4be..780a54548372 100644 |
| 217 | +--- a/net/sunrpc/svcsock.c |
| 218 | ++++ b/net/sunrpc/svcsock.c |
| 219 | +@@ -1541,10 +1541,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, |
| 220 | + newlen = error; |
| 221 | + |
| 222 | + if (protocol == IPPROTO_TCP) { |
| 223 | +- __netns_tracker_free(net, &sock->sk->ns_tracker, false); |
| 224 | +- sock->sk->sk_net_refcnt = 1; |
| 225 | +- get_net_track(net, &sock->sk->ns_tracker, GFP_KERNEL); |
| 226 | +- sock_inuse_add(net, 1); |
| 227 | ++ sk_net_refcnt_upgrade(sock->sk); |
| 228 | + if ((error = kernel_listen(sock, 64)) < 0) |
| 229 | + goto bummer; |
| 230 | + } |
| 231 | +diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c |
| 232 | +index 171ad4e2523f..e077f5718e8b 100644 |
| 233 | +--- a/net/sunrpc/xprtsock.c |
| 234 | ++++ b/net/sunrpc/xprtsock.c |
| 235 | +@@ -1940,12 +1940,8 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt, |
| 236 | + goto out; |
| 237 | + } |
| 238 | + |
| 239 | +- if (protocol == IPPROTO_TCP) { |
| 240 | +- __netns_tracker_free(xprt->xprt_net, &sock->sk->ns_tracker, false); |
| 241 | +- sock->sk->sk_net_refcnt = 1; |
| 242 | +- get_net_track(xprt->xprt_net, &sock->sk->ns_tracker, GFP_KERNEL); |
| 243 | +- sock_inuse_add(xprt->xprt_net, 1); |
| 244 | +- } |
| 245 | ++ if (protocol == IPPROTO_TCP) |
| 246 | ++ sk_net_refcnt_upgrade(sock->sk); |
| 247 | + |
| 248 | + filp = sock_alloc_file(sock, O_NONBLOCK, NULL); |
| 249 | + if (IS_ERR(filp)) |
| 250 | +-- |
| 251 | +2.25.1 |
| 252 | + |
0 commit comments