Skip to content

Commit 543cf9b

Browse files
q2ven authored and Martin KaFai Lau committed
net: Introduce net.core.bypass_prot_mem sysctl.
If a socket has sk->sk_bypass_prot_mem flagged, the socket opts out
of the global protocol memory accounting.

Let's control the flag by a new sysctl knob.

The flag is written once during socket(2) and is inherited by child
sockets.

Tested with a script that creates local socket pairs and send()s a
bunch of data without recv()ing.

Setup:

  # mkdir /sys/fs/cgroup/test
  # echo $$ >> /sys/fs/cgroup/test/cgroup.procs
  # sysctl -q net.ipv4.tcp_mem="1000 1000 1000"
  # ulimit -n 524288

Without net.core.bypass_prot_mem, charged to tcp_mem & memcg

  # python3 pressure.py &
  # cat /sys/fs/cgroup/test/memory.stat | grep sock
  sock 22642688 <-------------------------------------- charged to memcg
  # cat /proc/net/sockstat | grep TCP
  TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 5376 <-- charged to tcp_mem
  # ss -tn | head -n 5
  State Recv-Q Send-Q Local Address:Port  Peer Address:Port
  ESTAB 2000   0          127.0.0.1:34479    127.0.0.1:53188
  ESTAB 2000   0          127.0.0.1:34479    127.0.0.1:49972
  ESTAB 2000   0          127.0.0.1:34479    127.0.0.1:53868
  ESTAB 2000   0          127.0.0.1:34479    127.0.0.1:53554
  # nstat | grep Pressure || echo no pressure
  TcpExtTCPMemoryPressures        1                  0.0

With net.core.bypass_prot_mem=1, charged to memcg only:

  # sysctl -q net.core.bypass_prot_mem=1
  # python3 pressure.py &
  # cat /sys/fs/cgroup/test/memory.stat | grep sock
  sock 2757468160 <------------------------------------ charged to memcg
  # cat /proc/net/sockstat | grep TCP
  TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 0 <- NOT charged to tcp_mem
  # ss -tn | head -n 5
  State Recv-Q Send-Q Local Address:Port  Peer Address:Port
  ESTAB 111000 0          127.0.0.1:36019    127.0.0.1:49026
  ESTAB 110000 0          127.0.0.1:36019    127.0.0.1:45630
  ESTAB 110000 0          127.0.0.1:36019    127.0.0.1:44870
  ESTAB 111000 0          127.0.0.1:36019    127.0.0.1:45274
  # nstat | grep Pressure || echo no pressure
  no pressure

Signed-off-by: Kuniyuki Iwashima <[email protected]>
Signed-off-by: Martin KaFai Lau <[email protected]>
Reviewed-by: Shakeel Butt <[email protected]>
Reviewed-by: Eric Dumazet <[email protected]>
Acked-by: Roman Gushchin <[email protected]>
Link: https://patch.msgid.link/[email protected]
1 parent d5728fe commit 543cf9b

File tree

4 files changed

+23
-0
lines changed

4 files changed

+23
-0
lines changed

Documentation/admin-guide/sysctl/net.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,14 @@ mem_pcpu_rsv
212212

213213
Per-cpu reserved forward alloc cache size in page units. Default 1MB per CPU.
214214

215+
bypass_prot_mem
216+
---------------
217+
218+
Skip charging socket buffers to the global per-protocol memory
219+
accounting controlled by net.ipv4.tcp_mem, net.ipv4.udp_mem, etc.
220+
221+
Default: 0 (off)
222+
215223
rmem_default
216224
------------
217225

include/net/netns/core.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ struct netns_core {
1616
int sysctl_optmem_max;
1717
u8 sysctl_txrehash;
1818
u8 sysctl_tstamp_allow_data;
19+
u8 sysctl_bypass_prot_mem;
1920

2021
#ifdef CONFIG_PROC_FS
2122
struct prot_inuse __percpu *prot_inuse;

net/core/sock.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2306,8 +2306,13 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
23062306
* why we need sk_prot_creator -acme
23072307
*/
23082308
sk->sk_prot = sk->sk_prot_creator = prot;
2309+
2310+
if (READ_ONCE(net->core.sysctl_bypass_prot_mem))
2311+
sk->sk_bypass_prot_mem = 1;
2312+
23092313
sk->sk_kern_sock = kern;
23102314
sock_lock_init(sk);
2315+
23112316
sk->sk_net_refcnt = kern ? 0 : 1;
23122317
if (likely(sk->sk_net_refcnt)) {
23132318
get_net_track(net, &sk->ns_tracker, priority);

net/core/sysctl_net_core.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -676,6 +676,15 @@ static struct ctl_table netns_core_table[] = {
676676
.extra1 = SYSCTL_ZERO,
677677
.extra2 = SYSCTL_ONE
678678
},
679+
{
680+
.procname = "bypass_prot_mem",
681+
.data = &init_net.core.sysctl_bypass_prot_mem,
682+
.maxlen = sizeof(u8),
683+
.mode = 0644,
684+
.proc_handler = proc_dou8vec_minmax,
685+
.extra1 = SYSCTL_ZERO,
686+
.extra2 = SYSCTL_ONE
687+
},
679688
/* sysctl_core_net_init() will set the values after this
680689
* to readonly in network namespaces
681690
*/

0 commit comments

Comments
 (0)