Skip to content

Commit 25b74c0

Browse files
q2ven authored and Kernel Patches Daemon committed
net-memcg: Introduce net.core.memcg_exclusive sysctl.
If net.core.memcg_exclusive is 1 when sk->sk_memcg is allocated, the socket is flagged with SK_MEMCG_EXCLUSIVE internally and skips the global per-protocol memory accounting.

OTOH, for accept()ed child sockets, this flag is inherited from the listening socket in sk_clone_lock() and set in __inet_accept(). This is to preserve the decision by BPF which will be supported later.

Given sk->sk_memcg can be accessed in the fast path, it would be preferable to place the flag field in the same cache line as sk->sk_memcg. However, struct sock does not have such a 1-byte hole.

Let's store the flag in the lowest bit of sk->sk_memcg and check it in mem_cgroup_sk_exclusive().

Tested with a script that creates local socket pairs and send()s a bunch of data without recv()ing.

Setup:

  # mkdir /sys/fs/cgroup/test
  # echo $$ >> /sys/fs/cgroup/test/cgroup.procs
  # sysctl -q net.ipv4.tcp_mem="1000 1000 1000"

Without net.core.memcg_exclusive, charged to memcg & tcp_mem:

  # prlimit -n=524288:524288 bash -c "python3 pressure.py" &
  # cat /sys/fs/cgroup/test/memory.stat | grep sock
  sock 22642688 <-------------------------------------- charged to memcg
  # cat /proc/net/sockstat| grep TCP
  TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 5376 <-- charged to tcp_mem
  # ss -tn | head -n 5
  State Recv-Q Send-Q Local Address:Port Peer Address:Port
  ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:53188
  ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:49972
  ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:53868
  ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:53554
  # nstat | grep Pressure || echo no pressure
  TcpExtTCPMemoryPressures 1 0.0

With net.core.memcg_exclusive=1, only charged to memcg:

  # sysctl -q net.core.memcg_exclusive=1
  # prlimit -n=524288:524288 bash -c "python3 pressure.py" &
  # cat /sys/fs/cgroup/test/memory.stat | grep sock
  sock 2757468160 <------------------------------------ charged to memcg
  # cat /proc/net/sockstat | grep TCP
  TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 0 <- NOT charged to tcp_mem
  # ss -tn | head -n 5
  State Recv-Q Send-Q Local Address:Port Peer Address:Port
  ESTAB 111000 0 127.0.0.1:36019 127.0.0.1:49026
  ESTAB 110000 0 127.0.0.1:36019 127.0.0.1:45630
  ESTAB 110000 0 127.0.0.1:36019 127.0.0.1:44870
  ESTAB 111000 0 127.0.0.1:36019 127.0.0.1:45274
  # nstat | grep Pressure || echo no pressure
  no pressure

Signed-off-by: Kuniyuki Iwashima <[email protected]>
1 parent 14ad8eb commit 25b74c0

File tree

7 files changed

+76
-3
lines changed

7 files changed

+76
-3
lines changed

Documentation/admin-guide/sysctl/net.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,15 @@ mem_pcpu_rsv
212212

213213
Per-cpu reserved forward alloc cache size in page units. Default 1MB per CPU.
214214

215+
memcg_exclusive
216+
---------------
217+
218+
Skip charging socket buffers to the per-protocol global memory accounting
219+
(controlled by net.ipv4.tcp_mem, etc) if they are already charged to the
220+
cgroup memory controller ("sock" in memory.stat file).
221+
222+
Default: 0
223+
215224
rmem_default
216225
------------
217226

include/net/netns/core.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ struct netns_core {
1616
int sysctl_optmem_max;
1717
u8 sysctl_txrehash;
1818
u8 sysctl_tstamp_allow_data;
19+
#ifdef CONFIG_MEMCG
20+
u8 sysctl_memcg_exclusive;
21+
#endif
1922

2023
#ifdef CONFIG_PROC_FS
2124
struct prot_inuse __percpu *prot_inuse;

include/net/sock.h

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2614,10 +2614,36 @@ static inline gfp_t gfp_memcg_charge(void)
26142614
return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
26152615
}
26162616

2617+
enum {
2618+
SK_MEMCG_EXCLUSIVE = (1UL << 0),
2619+
SK_MEMCG_FLAG_MAX = (1UL << 1),
2620+
};
2621+
2622+
#define SK_MEMCG_FLAG_MASK (SK_MEMCG_FLAG_MAX - 1)
2623+
/* Mask that clears the flag bits of sk->sk_memcg, leaving the pointer bits; parenthesized so the expansion is always a single operand. */
#define SK_MEMCG_PTR_MASK	(~SK_MEMCG_FLAG_MASK)
2624+
26172625
#ifdef CONFIG_MEMCG
26182626
static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk)
26192627
{
2620-
return sk->sk_memcg;
2628+
unsigned long val = (unsigned long)sk->sk_memcg;
2629+
2630+
val &= SK_MEMCG_PTR_MASK;
2631+
return (struct mem_cgroup *)val;
2632+
}
2633+
2634+
static inline void mem_cgroup_sk_set_flags(struct sock *sk, unsigned short flags)
2635+
{
2636+
unsigned long val = (unsigned long)mem_cgroup_from_sk(sk);
2637+
2638+
val |= flags;
2639+
sk->sk_memcg = (struct mem_cgroup *)val;
2640+
}
2641+
2642+
static inline unsigned short mem_cgroup_sk_get_flags(const struct sock *sk)
2643+
{
2644+
unsigned long val = (unsigned long)sk->sk_memcg;
2645+
2646+
return val & SK_MEMCG_FLAG_MASK;
26212647
}
26222648

26232649
static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
@@ -2627,7 +2653,7 @@ static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
26272653

26282654
static inline bool mem_cgroup_sk_exclusive(const struct sock *sk)
26292655
{
2630-
return false;
2656+
return mem_cgroup_sk_get_flags(sk) & SK_MEMCG_EXCLUSIVE;
26312657
}
26322658

26332659
static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
@@ -2652,6 +2678,15 @@ static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk)
26522678
return NULL;
26532679
}
26542680

2681+
static inline void mem_cgroup_sk_set_flags(struct sock *sk, unsigned short flags)
2682+
{
2683+
}
2684+
2685+
static inline unsigned short mem_cgroup_sk_get_flags(const struct sock *sk)
2686+
{
2687+
return 0;
2688+
}
2689+
26552690
static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
26562691
{
26572692
return false;

mm/memcontrol.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4995,6 +4995,16 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
49954995
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
49964996
EXPORT_SYMBOL(memcg_sockets_enabled_key);
49974997

4998+
static void mem_cgroup_sk_set(struct sock *sk, struct mem_cgroup *memcg)
4999+
{
5000+
sk->sk_memcg = memcg;
5001+
5002+
#ifdef CONFIG_NET
5003+
if (READ_ONCE(sock_net(sk)->core.sysctl_memcg_exclusive))
5004+
mem_cgroup_sk_set_flags(sk, SK_MEMCG_EXCLUSIVE);
5005+
#endif
5006+
}
5007+
49985008
void mem_cgroup_sk_alloc(struct sock *sk)
49995009
{
50005010
struct mem_cgroup *memcg;
@@ -5013,7 +5023,7 @@ void mem_cgroup_sk_alloc(struct sock *sk)
50135023
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg1_tcpmem_active(memcg))
50145024
goto out;
50155025
if (css_tryget(&memcg->css))
5016-
sk->sk_memcg = memcg;
5026+
mem_cgroup_sk_set(sk, memcg);
50175027
out:
50185028
rcu_read_unlock();
50195029
}

net/core/sock.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2520,6 +2520,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
25202520
#ifdef CONFIG_MEMCG
25212521
/* sk->sk_memcg will be populated at accept() time */
25222522
newsk->sk_memcg = NULL;
2523+
mem_cgroup_sk_set_flags(newsk, mem_cgroup_sk_get_flags(sk));
25232524
#endif
25242525

25252526
cgroup_sk_clone(&newsk->sk_cgrp_data);

net/core/sysctl_net_core.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -676,6 +676,17 @@ static struct ctl_table netns_core_table[] = {
676676
.extra1 = SYSCTL_ZERO,
677677
.extra2 = SYSCTL_ONE
678678
},
679+
#ifdef CONFIG_MEMCG
680+
{
681+
.procname = "memcg_exclusive",
682+
.data = &init_net.core.sysctl_memcg_exclusive,
683+
.maxlen = sizeof(u8),
684+
.mode = 0644,
685+
.proc_handler = proc_dou8vec_minmax,
686+
.extra1 = SYSCTL_ZERO,
687+
.extra2 = SYSCTL_ONE
688+
},
689+
#endif
679690
/* sysctl_core_net_init() will set the values after this
680691
* to readonly in network namespaces
681692
*/

net/ipv4/af_inet.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -760,12 +760,16 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *new
760760
if (mem_cgroup_sockets_enabled &&
761761
(!IS_ENABLED(CONFIG_IP_SCTP) || sk_is_tcp(newsk))) {
762762
gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
763+
unsigned short flags;
763764

765+
flags = mem_cgroup_sk_get_flags(newsk);
764766
mem_cgroup_sk_alloc(newsk);
765767

766768
if (mem_cgroup_from_sk(newsk)) {
767769
int amt;
768770

771+
mem_cgroup_sk_set_flags(newsk, flags);
772+
769773
/* The socket has not been accepted yet, no need
770774
* to look at newsk->sk_wmem_queued.
771775
*/

0 commit comments

Comments
 (0)