Skip to content

Commit 3681b34

Browse files
q2ven authored and Kernel Patches Daemon committed
net-memcg: Allow decoupling memcg from global protocol memory accounting.
Some protocols (e.g., TCP, UDP) implement memory accounting for socket buffers and charge memory to per-protocol global counters pointed to by sk->sk_proto->memory_allocated. When running under a non-root cgroup, this memory is also charged to the memcg as "sock" in memory.stat. We do not need to pay costs for two orthogonal memory accounting mechanisms. Let's decouple sockets in memcg from the global per-protocol memory accounting if sockets have SK_BPF_MEMCG_SOCK_ISOLATED in sk->sk_memcg. Note that this does NOT disable memcg, but rather the per-protocol one. If mem_cgroup_sk_isolated(sk) returns true, the per-protocol memory accounting is skipped. # cat /sys/fs/cgroup/test/memory.stat | grep sock sock 2757468160 <--------------------------------- charged to memcg # cat /proc/net/sockstat | grep TCP TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 0 <-- not charged to tcp_mem In __inet_accept(), we need to reclaim counts that are already charged for child sockets because we do not allocate sk->sk_memcg until accept(). trace_sock_exceed_buf_limit() will always show 0 as accounted for the isolated sockets, but this can be obtained via memory.stat. Most changes are inline and hard to trace, but a microbenchmark on __sk_mem_raise_allocated() during neper/tcp_stream showed that more samples completed faster with bpf. This will be more visible under tcp_mem pressure. 
# bpftrace -e 'kprobe:__sk_mem_raise_allocated { @start[tid] = nsecs; } kretprobe:__sk_mem_raise_allocated /@start[tid]/ { @EnD[tid] = nsecs - @start[tid]; @times = hist(@EnD[tid]); delete(@start[tid]); }' # tcp_stream -6 -F 1000 -N -T 256 Without bpf prog: [128, 256) 3846 | | [256, 512) 1505326 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [512, 1K) 1371006 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [1K, 2K) 198207 |@@@@@@ | [2K, 4K) 31199 |@ | With bpf prog in the next patch: (must be attached before tcp_stream) # bpftool prog load sk_memcg.bpf.o /sys/fs/bpf/sk_memcg type cgroup/sock_create # bpftool cgroup attach /sys/fs/cgroup/test cgroup_inet_sock_create pinned /sys/fs/bpf/sk_memcg [128, 256) 6413 | | [256, 512) 1868425 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [512, 1K) 1101697 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [1K, 2K) 117031 |@@@@ | [2K, 4K) 11773 | | Signed-off-by: Kuniyuki Iwashima <[email protected]> Nacked-by: Johannes Weiner <[email protected]>
1 parent 7ae7939 commit 3681b34

File tree

9 files changed

+90
-32
lines changed

9 files changed

+90
-32
lines changed

include/net/proto_memory.h

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,22 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
3131
if (!sk->sk_prot->memory_pressure)
3232
return false;
3333

34-
if (mem_cgroup_sk_enabled(sk) &&
35-
mem_cgroup_sk_under_memory_pressure(sk))
36-
return true;
34+
if (mem_cgroup_sk_enabled(sk)) {
35+
if (mem_cgroup_sk_under_memory_pressure(sk))
36+
return true;
37+
38+
if (mem_cgroup_sk_isolated(sk))
39+
return false;
40+
}
3741

3842
return !!READ_ONCE(*sk->sk_prot->memory_pressure);
3943
}
4044

45+
static inline bool sk_should_enter_memory_pressure(struct sock *sk)
46+
{
47+
return !mem_cgroup_sk_enabled(sk) || !mem_cgroup_sk_isolated(sk);
48+
}
49+
4150
static inline long
4251
proto_memory_allocated(const struct proto *prot)
4352
{

include/net/tcp.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -275,9 +275,13 @@ extern unsigned long tcp_memory_pressure;
275275
/* optimized version of sk_under_memory_pressure() for TCP sockets */
276276
static inline bool tcp_under_memory_pressure(const struct sock *sk)
277277
{
278-
if (mem_cgroup_sk_enabled(sk) &&
279-
mem_cgroup_sk_under_memory_pressure(sk))
280-
return true;
278+
if (mem_cgroup_sk_enabled(sk)) {
279+
if (mem_cgroup_sk_under_memory_pressure(sk))
280+
return true;
281+
282+
if (mem_cgroup_sk_isolated(sk))
283+
return false;
284+
}
281285

282286
return READ_ONCE(tcp_memory_pressure);
283287
}

net/core/sock.c

Lines changed: 44 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1046,17 +1046,21 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
10461046
if (!charged)
10471047
return -ENOMEM;
10481048

1049-
/* pre-charge to forward_alloc */
1050-
sk_memory_allocated_add(sk, pages);
1051-
allocated = sk_memory_allocated(sk);
1052-
/* If the system goes into memory pressure with this
1053-
* precharge, give up and return error.
1054-
*/
1055-
if (allocated > sk_prot_mem_limits(sk, 1)) {
1056-
sk_memory_allocated_sub(sk, pages);
1057-
mem_cgroup_sk_uncharge(sk, pages);
1058-
return -ENOMEM;
1049+
if (!mem_cgroup_sk_isolated(sk)) {
1050+
/* pre-charge to forward_alloc */
1051+
sk_memory_allocated_add(sk, pages);
1052+
allocated = sk_memory_allocated(sk);
1053+
1054+
/* If the system goes into memory pressure with this
1055+
* precharge, give up and return error.
1056+
*/
1057+
if (allocated > sk_prot_mem_limits(sk, 1)) {
1058+
sk_memory_allocated_sub(sk, pages);
1059+
mem_cgroup_sk_uncharge(sk, pages);
1060+
return -ENOMEM;
1061+
}
10591062
}
1063+
10601064
sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
10611065

10621066
WRITE_ONCE(sk->sk_reserved_mem,
@@ -3154,8 +3158,11 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
31543158
if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
31553159
return true;
31563160

3157-
sk_enter_memory_pressure(sk);
3161+
if (sk_should_enter_memory_pressure(sk))
3162+
sk_enter_memory_pressure(sk);
3163+
31583164
sk_stream_moderate_sndbuf(sk);
3165+
31593166
return false;
31603167
}
31613168
EXPORT_SYMBOL(sk_page_frag_refill);
@@ -3268,18 +3275,30 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
32683275
{
32693276
bool memcg_enabled = false, charged = false;
32703277
struct proto *prot = sk->sk_prot;
3271-
long allocated;
3272-
3273-
sk_memory_allocated_add(sk, amt);
3274-
allocated = sk_memory_allocated(sk);
3278+
long allocated = 0;
32753279

32763280
if (mem_cgroup_sk_enabled(sk)) {
3281+
bool isolated = mem_cgroup_sk_isolated(sk);
3282+
32773283
memcg_enabled = true;
32783284
charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
3279-
if (!charged)
3285+
3286+
if (isolated && charged)
3287+
return 1;
3288+
3289+
if (!charged) {
3290+
if (!isolated) {
3291+
sk_memory_allocated_add(sk, amt);
3292+
allocated = sk_memory_allocated(sk);
3293+
}
3294+
32803295
goto suppress_allocation;
3296+
}
32813297
}
32823298

3299+
sk_memory_allocated_add(sk, amt);
3300+
allocated = sk_memory_allocated(sk);
3301+
32833302
/* Under limit. */
32843303
if (allocated <= sk_prot_mem_limits(sk, 0)) {
32853304
sk_leave_memory_pressure(sk);
@@ -3358,7 +3377,8 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
33583377

33593378
trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
33603379

3361-
sk_memory_allocated_sub(sk, amt);
3380+
if (allocated)
3381+
sk_memory_allocated_sub(sk, amt);
33623382

33633383
if (charged)
33643384
mem_cgroup_sk_uncharge(sk, amt);
@@ -3397,11 +3417,15 @@ EXPORT_SYMBOL(__sk_mem_schedule);
33973417
*/
33983418
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
33993419
{
3400-
sk_memory_allocated_sub(sk, amount);
3401-
3402-
if (mem_cgroup_sk_enabled(sk))
3420+
if (mem_cgroup_sk_enabled(sk)) {
34033421
mem_cgroup_sk_uncharge(sk, amount);
34043422

3423+
if (mem_cgroup_sk_isolated(sk))
3424+
return;
3425+
}
3426+
3427+
sk_memory_allocated_sub(sk, amount);
3428+
34053429
if (sk_under_global_memory_pressure(sk) &&
34063430
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
34073431
sk_leave_memory_pressure(sk);

net/ipv4/af_inet.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@
9595
#include <net/checksum.h>
9696
#include <net/ip.h>
9797
#include <net/protocol.h>
98+
#include <net/proto_memory.h>
9899
#include <net/arp.h>
99100
#include <net/route.h>
100101
#include <net/ip_fib.h>
@@ -773,8 +774,17 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *new
773774
*/
774775
amt = sk_mem_pages(newsk->sk_forward_alloc +
775776
atomic_read(&newsk->sk_rmem_alloc));
776-
if (amt)
777+
if (amt) {
778+
/* This amt is already charged globally to
779+
* sk_prot->memory_allocated due to lack of
780+
* sk_memcg until accept(), thus we need to
781+
* reclaim it here if newsk is isolated.
782+
*/
783+
if (mem_cgroup_sk_isolated(newsk))
784+
sk_memory_allocated_sub(newsk, amt);
785+
777786
mem_cgroup_sk_charge(newsk, amt, gfp);
787+
}
778788
}
779789

780790
kmem_cache_charge(newsk, gfp);

net/ipv4/inet_connection_sock.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <net/tcp.h>
2323
#include <net/sock_reuseport.h>
2424
#include <net/addrconf.h>
25+
#include <net/proto_memory.h>
2526

2627
#if IS_ENABLED(CONFIG_IPV6)
2728
/* match_sk*_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses

net/ipv4/tcp.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -908,7 +908,8 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
908908
}
909909
__kfree_skb(skb);
910910
} else {
911-
sk->sk_prot->enter_memory_pressure(sk);
911+
if (sk_should_enter_memory_pressure(sk))
912+
tcp_enter_memory_pressure(sk);
912913
sk_stream_moderate_sndbuf(sk);
913914
}
914915
return NULL;

net/ipv4/tcp_output.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3574,12 +3574,18 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
35743574
delta = size - sk->sk_forward_alloc;
35753575
if (delta <= 0)
35763576
return;
3577+
35773578
amt = sk_mem_pages(delta);
35783579
sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3579-
sk_memory_allocated_add(sk, amt);
35803580

3581-
if (mem_cgroup_sk_enabled(sk))
3581+
if (mem_cgroup_sk_enabled(sk)) {
35823582
mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
3583+
3584+
if (mem_cgroup_sk_isolated(sk))
3585+
return;
3586+
}
3587+
3588+
sk_memory_allocated_add(sk, amt);
35833589
}
35843590

35853591
/* Send a FIN. The caller locks the socket for us.

net/mptcp/protocol.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <net/inet_common.h>
1717
#include <net/inet_hashtables.h>
1818
#include <net/protocol.h>
19+
#include <net/proto_memory.h>
1920
#include <net/tcp_states.h>
2021
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
2122
#include <net/transp_v6.h>
@@ -1016,7 +1017,7 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
10161017
mptcp_for_each_subflow(msk, subflow) {
10171018
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
10181019

1019-
if (first)
1020+
if (first && sk_should_enter_memory_pressure(ssk))
10201021
tcp_enter_memory_pressure(ssk);
10211022
sk_stream_moderate_sndbuf(ssk);
10221023

net/tls/tls_device.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include <linux/netdevice.h>
3636
#include <net/dst.h>
3737
#include <net/inet_connection_sock.h>
38+
#include <net/proto_memory.h>
3839
#include <net/tcp.h>
3940
#include <net/tls.h>
4041
#include <linux/skbuff_ref.h>
@@ -371,7 +372,8 @@ static int tls_do_allocation(struct sock *sk,
371372
if (!offload_ctx->open_record) {
372373
if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
373374
sk->sk_allocation))) {
374-
READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
375+
if (sk_should_enter_memory_pressure(sk))
376+
READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
375377
sk_stream_moderate_sndbuf(sk);
376378
return -ENOMEM;
377379
}

0 commit comments

Comments (0)