Skip to content

Commit 8caccfa

Browse files
q2ven authored and Kernel Patches Daemon committed
net-memcg: Introduce net.core.memcg_exclusive sysctl.
If net.core.memcg_exclusive is 1 when sk->sk_memcg is allocated, the socket is flagged with SK_MEMCG_EXCLUSIVE internally and skips the global per-protocol memory accounting.

OTOH, for accept()ed child sockets, this flag is inherited from the listening socket in sk_clone_lock() and set in __inet_accept(). This is to preserve the decision by BPF which will be supported later.

Given sk->sk_memcg can be accessed in the fast path, it would be preferable to place the flag field in the same cache line as sk->sk_memcg.

However, struct sock does not have such a 1-byte hole.

Let's store the flag in the lowest bit of sk->sk_memcg and check it in mem_cgroup_sk_exclusive().

Tested with a script that creates local socket pairs and send()s a bunch of data without recv()ing.

Setup:

  # mkdir /sys/fs/cgroup/test
  # echo $$ >> /sys/fs/cgroup/test/cgroup.procs
  # sysctl -q net.ipv4.tcp_mem="1000 1000 1000"

Without net.core.memcg_exclusive, charged to memcg & tcp_mem:

  # prlimit -n=524288:524288 bash -c "python3 pressure.py" &
  # cat /sys/fs/cgroup/test/memory.stat | grep sock
  sock 22642688 <-------------------------------------- charged to memcg
  # cat /proc/net/sockstat| grep TCP
  TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 5376 <-- charged to tcp_mem
  # ss -tn | head -n 5
  State Recv-Q Send-Q Local Address:Port Peer Address:Port
  ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:53188
  ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:49972
  ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:53868
  ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:53554
  # nstat | grep Pressure || echo no pressure
  TcpExtTCPMemoryPressures 1 0.0

With net.core.memcg_exclusive=1, only charged to memcg:

  # sysctl -q net.core.memcg_exclusive=1
  # prlimit -n=524288:524288 bash -c "python3 pressure.py" &
  # cat /sys/fs/cgroup/test/memory.stat | grep sock
  sock 2757468160 <------------------------------------ charged to memcg
  # cat /proc/net/sockstat | grep TCP
  TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 0 <- NOT charged to tcp_mem
  # ss -tn | head -n 5
  State Recv-Q Send-Q Local Address:Port Peer Address:Port
  ESTAB 111000 0 127.0.0.1:36019 127.0.0.1:49026
  ESTAB 110000 0 127.0.0.1:36019 127.0.0.1:45630
  ESTAB 110000 0 127.0.0.1:36019 127.0.0.1:44870
  ESTAB 111000 0 127.0.0.1:36019 127.0.0.1:45274
  # nstat | grep Pressure || echo no pressure
  no pressure

Signed-off-by: Kuniyuki Iwashima <[email protected]>
1 parent 9026d0a commit 8caccfa

File tree

7 files changed

+76
-3
lines changed

7 files changed

+76
-3
lines changed

Documentation/admin-guide/sysctl/net.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,15 @@ mem_pcpu_rsv
212212

213213
Per-cpu reserved forward alloc cache size in page units. Default 1MB per CPU.
214214

215+
memcg_exclusive
216+
---------------
217+
218+
Skip charging socket buffers to the per-protocol global memory accounting
219+
(controlled by net.ipv4.tcp_mem, etc) if they are already charged to the
220+
cgroup memory controller ("sock" in memory.stat file).
221+
222+
Default: 0
223+
215224
rmem_default
216225
------------
217226

include/net/netns/core.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ struct netns_core {
1616
int sysctl_optmem_max;
1717
u8 sysctl_txrehash;
1818
u8 sysctl_tstamp_allow_data;
19+
#ifdef CONFIG_MEMCG
20+
u8 sysctl_memcg_exclusive;
21+
#endif
1922

2023
#ifdef CONFIG_PROC_FS
2124
struct prot_inuse __percpu *prot_inuse;

include/net/sock.h

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2596,10 +2596,36 @@ static inline gfp_t gfp_memcg_charge(void)
25962596
return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
25972597
}
25982598

2599+
enum {
2600+
SK_MEMCG_EXCLUSIVE = (1UL << 0),
2601+
SK_MEMCG_FLAG_MAX = (1UL << 1),
2602+
};
2603+
2604+
#define SK_MEMCG_FLAG_MASK (SK_MEMCG_FLAG_MAX - 1)
2605+
#define SK_MEMCG_PTR_MASK ~SK_MEMCG_FLAG_MASK
2606+
25992607
#ifdef CONFIG_MEMCG
26002608
static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk)
26012609
{
2602-
return sk->sk_memcg;
2610+
unsigned long val = (unsigned long)sk->sk_memcg;
2611+
2612+
val &= SK_MEMCG_PTR_MASK;
2613+
return (struct mem_cgroup *)val;
2614+
}
2615+
2616+
static inline void mem_cgroup_sk_set_flags(struct sock *sk, unsigned short flags)
2617+
{
2618+
unsigned long val = (unsigned long)mem_cgroup_from_sk(sk);
2619+
2620+
val |= flags;
2621+
sk->sk_memcg = (struct mem_cgroup *)val;
2622+
}
2623+
2624+
static inline unsigned short mem_cgroup_sk_get_flags(const struct sock *sk)
2625+
{
2626+
unsigned long val = (unsigned long)sk->sk_memcg;
2627+
2628+
return val & SK_MEMCG_FLAG_MASK;
26032629
}
26042630

26052631
static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
@@ -2609,7 +2635,7 @@ static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
26092635

26102636
static inline bool mem_cgroup_sk_exclusive(const struct sock *sk)
26112637
{
2612-
return false;
2638+
return mem_cgroup_sk_get_flags(sk) & SK_MEMCG_EXCLUSIVE;
26132639
}
26142640

26152641
static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
@@ -2634,6 +2660,15 @@ static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk)
26342660
return NULL;
26352661
}
26362662

2663+
static inline void mem_cgroup_sk_set_flags(struct sock *sk, unsigned short flags)
2664+
{
2665+
}
2666+
2667+
static inline unsigned short mem_cgroup_sk_get_flags(const struct sock *sk)
2668+
{
2669+
return 0;
2670+
}
2671+
26372672
static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
26382673
{
26392674
return false;

mm/memcontrol.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4995,6 +4995,16 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
49954995
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
49964996
EXPORT_SYMBOL(memcg_sockets_enabled_key);
49974997

4998+
static void mem_cgroup_sk_set(struct sock *sk, struct mem_cgroup *memcg)
4999+
{
5000+
sk->sk_memcg = memcg;
5001+
5002+
#ifdef CONFIG_NET
5003+
if (READ_ONCE(sock_net(sk)->core.sysctl_memcg_exclusive))
5004+
mem_cgroup_sk_set_flags(sk, SK_MEMCG_EXCLUSIVE);
5005+
#endif
5006+
}
5007+
49985008
void mem_cgroup_sk_alloc(struct sock *sk)
49995009
{
50005010
struct mem_cgroup *memcg;
@@ -5013,7 +5023,7 @@ void mem_cgroup_sk_alloc(struct sock *sk)
50135023
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg1_tcpmem_active(memcg))
50145024
goto out;
50155025
if (css_tryget(&memcg->css))
5016-
sk->sk_memcg = memcg;
5026+
mem_cgroup_sk_set(sk, memcg);
50175027
out:
50185028
rcu_read_unlock();
50195029
}

net/core/sock.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2519,6 +2519,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
25192519
#ifdef CONFIG_MEMCG
25202520
/* sk->sk_memcg will be populated at accept() time */
25212521
newsk->sk_memcg = NULL;
2522+
mem_cgroup_sk_set_flags(newsk, mem_cgroup_sk_get_flags(sk));
25222523
#endif
25232524

25242525
cgroup_sk_clone(&newsk->sk_cgrp_data);

net/core/sysctl_net_core.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -676,6 +676,17 @@ static struct ctl_table netns_core_table[] = {
676676
.extra1 = SYSCTL_ZERO,
677677
.extra2 = SYSCTL_ONE
678678
},
679+
#ifdef CONFIG_MEMCG
680+
{
681+
.procname = "memcg_exclusive",
682+
.data = &init_net.core.sysctl_memcg_exclusive,
683+
.maxlen = sizeof(u8),
684+
.mode = 0644,
685+
.proc_handler = proc_dou8vec_minmax,
686+
.extra1 = SYSCTL_ZERO,
687+
.extra2 = SYSCTL_ONE
688+
},
689+
#endif
679690
/* sysctl_core_net_init() will set the values after this
680691
* to readonly in network namespaces
681692
*/

net/ipv4/af_inet.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -758,12 +758,16 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *new
758758
if (mem_cgroup_sockets_enabled &&
759759
(!IS_ENABLED(CONFIG_IP_SCTP) || sk_is_tcp(newsk))) {
760760
gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
761+
unsigned short flags;
761762

763+
flags = mem_cgroup_sk_get_flags(newsk);
762764
mem_cgroup_sk_alloc(newsk);
763765

764766
if (mem_cgroup_from_sk(newsk)) {
765767
int amt;
766768

769+
mem_cgroup_sk_set_flags(newsk, flags);
770+
767771
/* The socket has not been accepted yet, no need
768772
* to look at newsk->sk_wmem_queued.
769773
*/

0 commit comments

Comments
 (0)