From 3b5e1d318e112557adeb832f099ac437f960845c Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Tue, 26 Aug 2025 18:38:07 +0000
Subject: [PATCH 1/5] tcp: Save lock_sock() for memcg in inet_csk_accept().

If memcg is enabled, accept() acquires lock_sock() twice for each new
TCP/MPTCP socket, in inet_csk_accept() and in __inet_accept().

Let's move the memcg operations from inet_csk_accept() to
__inet_accept().

Note that SCTP somehow allocates a new socket by sk_alloc() in
sk->sk_prot->accept() and clones fields manually, instead of using
sk_clone_lock().  mem_cgroup_sk_alloc() is called for SCTP before
__inet_accept(), so a protocol check is added in __inet_accept(); it
can be removed once SCTP uses sk_clone_lock().

Signed-off-by: Kuniyuki Iwashima
Reviewed-by: Shakeel Butt
---
 net/ipv4/af_inet.c              | 23 +++++++++++++++++++++++
 net/ipv4/inet_connection_sock.c | 25 -------------------------
 2 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 76e38092cd8a3..d42757f74c6e6 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -753,6 +753,29 @@ EXPORT_SYMBOL(inet_stream_connect);
 void __inet_accept(struct socket *sock, struct socket *newsock,
                    struct sock *newsk)
 {
+        /* TODO: use sk_clone_lock() in SCTP and remove protocol checks */
+        if (mem_cgroup_sockets_enabled &&
+            (!IS_ENABLED(CONFIG_IP_SCTP) ||
+             sk_is_tcp(newsk) || sk_is_mptcp(newsk))) {
+                gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
+
+                mem_cgroup_sk_alloc(newsk);
+
+                if (mem_cgroup_from_sk(newsk)) {
+                        int amt;
+
+                        /* The socket has not been accepted yet, no need
+                         * to look at newsk->sk_wmem_queued.
+                         */
+                        amt = sk_mem_pages(newsk->sk_forward_alloc +
+                                           atomic_read(&newsk->sk_rmem_alloc));
+                        if (amt)
+                                mem_cgroup_sk_charge(newsk, amt, gfp);
+                }
+
+                kmem_cache_charge(newsk, gfp);
+        }
+
         sock_rps_record_flow(newsk);
         WARN_ON(!((1 << newsk->sk_state) &
                   (TCPF_ESTABLISHED | TCPF_SYN_RECV |
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 0ef1eacd539d1..ed10b959a906b 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -708,31 +708,6 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg)
 
         release_sock(sk);
 
-        if (mem_cgroup_sockets_enabled) {
-                gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
-                int amt = 0;
-
-                /* atomically get the memory usage, set and charge the
-                 * newsk->sk_memcg.
-                 */
-                lock_sock(newsk);
-
-                mem_cgroup_sk_alloc(newsk);
-                if (mem_cgroup_from_sk(newsk)) {
-                        /* The socket has not been accepted yet, no need
-                         * to look at newsk->sk_wmem_queued.
-                         */
-                        amt = sk_mem_pages(newsk->sk_forward_alloc +
-                                           atomic_read(&newsk->sk_rmem_alloc));
-                }
-
-                if (amt)
-                        mem_cgroup_sk_charge(newsk, amt, gfp);
-                kmem_cache_charge(newsk, gfp);
-
-                release_sock(newsk);
-        }
-
         if (req)
                 reqsk_put(req);

From 5b697e2b2c9f72ea74e4fa04ce1ce97c5983bdd6 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Tue, 26 Aug 2025 18:38:08 +0000
Subject: [PATCH 2/5] bpf: Support bpf_setsockopt() for
 BPF_CGROUP_INET_SOCK_CREATE.

We will store a flag in sk->sk_memcg by bpf_setsockopt() during
socket() or before sk->sk_memcg is set in accept().

BPF_CGROUP_INET_SOCK_CREATE is invoked by __cgroup_bpf_run_filter_sk(),
which passes a pointer to struct sock to the bpf prog as void *ctx,
but there is no bpf_func_proto for bpf_setsockopt() that receives the
ctx as a pointer to struct sock.

Let's add a new bpf_setsockopt() variant for
BPF_CGROUP_INET_SOCK_CREATE.

Note that inet_create() is not under lock_sock().
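With this in place, a prog attached to BPF_CGROUP_INET_SOCK_CREATE can
call bpf_setsockopt() directly on the ctx it receives.  For example
(a minimal sketch mirroring the selftest at the end of this series;
SK_BPF_MEMCG_FLAGS and SK_BPF_MEMCG_SOCK_ISOLATED are introduced by a
later patch):

  SEC("cgroup/sock_create")
  int sock_create(struct bpf_sock *ctx)
  {
          int flags = SK_BPF_MEMCG_SOCK_ISOLATED;

          /* ctx here is really the new struct sock */
          bpf_setsockopt(ctx, SOL_SOCKET, SK_BPF_MEMCG_FLAGS,
                         &flags, sizeof(flags));
          return 1;
  }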
Signed-off-by: Kuniyuki Iwashima
---
 net/core/filter.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index 63f3baee2dafa..443d12b7d3b27 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5743,6 +5743,23 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
         .arg5_type      = ARG_CONST_SIZE,
 };
 
+BPF_CALL_5(bpf_unlocked_sock_setsockopt, struct sock *, sk, int, level,
+           int, optname, char *, optval, int, optlen)
+{
+        return _bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+static const struct bpf_func_proto bpf_unlocked_sock_setsockopt_proto = {
+        .func           = bpf_unlocked_sock_setsockopt,
+        .gpl_only       = false,
+        .ret_type       = RET_INTEGER,
+        .arg1_type      = ARG_PTR_TO_CTX,
+        .arg2_type      = ARG_ANYTHING,
+        .arg3_type      = ARG_ANYTHING,
+        .arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+        .arg5_type      = ARG_CONST_SIZE,
+};
+
 static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
                                 int optname, const u8 **start)
 {
@@ -8051,6 +8068,13 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                 return &bpf_sk_storage_get_cg_sock_proto;
         case BPF_FUNC_ktime_get_coarse_ns:
                 return &bpf_ktime_get_coarse_ns_proto;
+        case BPF_FUNC_setsockopt:
+                switch (prog->expected_attach_type) {
+                case BPF_CGROUP_INET_SOCK_CREATE:
+                        return &bpf_unlocked_sock_setsockopt_proto;
+                default:
+                        return NULL;
+                }
         default:
                 return bpf_base_func_proto(func_id, prog);
         }

From 9b1985239dae92bd81cc232076f5dcb79c01ece0 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Tue, 26 Aug 2025 18:38:09 +0000
Subject: [PATCH 3/5] bpf: Introduce SK_BPF_MEMCG_FLAGS and
 SK_BPF_MEMCG_SOCK_ISOLATED.

We will decouple sockets from the global protocol memory accounting
if sockets have SK_BPF_MEMCG_SOCK_ISOLATED.

This can be flagged, during socket() or before sk->sk_memcg is set in
accept(), via bpf_setsockopt():

  flags = SK_BPF_MEMCG_SOCK_ISOLATED;
  bpf_setsockopt(ctx, SOL_SOCKET, SK_BPF_MEMCG_FLAGS,
                 &flags, sizeof(flags));

Given sk->sk_memcg can be accessed in the fast path, it would be
preferable to place the flag field in the same cache line as
sk->sk_memcg.  However, struct sock does not have such a 1-byte hole.

Let's store the flag in the lowest bit of sk->sk_memcg and add a
helper to check the bit.

In the next patch, if mem_cgroup_sk_isolated() returns true, the
socket will not be charged to sk->sk_prot->memory_allocated.

The main targets are BPF_CGROUP_INET_SOCK_CREATE and
BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB as demonstrated in the selftest.

Note that we do not support modifying the flag once sk->sk_memcg is
set, especially because UDP charges memory under
sk->sk_receive_queue.lock instead of lock_sock().

Signed-off-by: Kuniyuki Iwashima
---
 include/net/sock.h             | 48 ++++++++++++++++++++++++++++++
 include/uapi/linux/bpf.h       |  6 +++++
 net/core/filter.c              | 28 +++++++++++++++++++++++++-
 net/ipv4/af_inet.c             |  4 +++
 tools/include/uapi/linux/bpf.h |  6 +++++
 5 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 63a6a48afb48a..d41a2f8f8b30c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2596,10 +2596,39 @@ static inline gfp_t gfp_memcg_charge(void)
         return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
 }
 
+#define SK_BPF_MEMCG_FLAG_MASK  (SK_BPF_MEMCG_FLAG_MAX - 1)
+#define SK_BPF_MEMCG_PTR_MASK   ~SK_BPF_MEMCG_FLAG_MASK
+
 #ifdef CONFIG_MEMCG
+static inline void mem_cgroup_sk_set_flags(struct sock *sk, unsigned short flags)
+{
+        unsigned long val = (unsigned long)sk->sk_memcg;
+
+        val |= flags;
+        sk->sk_memcg = (struct mem_cgroup *)val;
+}
+
+static inline unsigned short mem_cgroup_sk_get_flags(const struct sock *sk)
+{
+#ifdef CONFIG_CGROUP_BPF
+        unsigned long val = (unsigned long)sk->sk_memcg;
+
+        return val & SK_BPF_MEMCG_FLAG_MASK;
+#else
+        return 0;
+#endif
+}
+
 static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk)
 {
+#ifdef CONFIG_CGROUP_BPF
+        unsigned long val = (unsigned long)sk->sk_memcg;
+
+        val &= SK_BPF_MEMCG_PTR_MASK;
+        return (struct mem_cgroup *)val;
+#else
         return sk->sk_memcg;
+#endif
 }
 
 static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
@@ -2607,6 +2636,11 @@ static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
         return mem_cgroup_sockets_enabled && mem_cgroup_from_sk(sk);
 }
 
+static inline bool mem_cgroup_sk_isolated(const struct sock *sk)
+{
+        return mem_cgroup_sk_get_flags(sk) & SK_BPF_MEMCG_SOCK_ISOLATED;
+}
+
 static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
 {
         struct mem_cgroup *memcg = mem_cgroup_from_sk(sk);
@@ -2624,6 +2658,15 @@ static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
         return false;
 }
 #else
+static inline void mem_cgroup_sk_set_flags(struct sock *sk, unsigned short flags)
+{
+}
+
+static inline unsigned short mem_cgroup_sk_get_flags(const struct sock *sk)
+{
+        return 0;
+}
+
 static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk)
 {
         return NULL;
@@ -2634,6 +2677,11 @@ static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
         return false;
 }
 
+static inline bool mem_cgroup_sk_isolated(const struct sock *sk)
+{
+        return false;
+}
+
 static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
 {
         return false;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 233de8677382e..52b8c2278589b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7182,6 +7182,7 @@ enum {
         TCP_BPF_SYN_MAC         = 1007, /* Copy the MAC, IP[46], and TCP header */
         TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
         SK_BPF_CB_FLAGS         = 1009, /* Get or set sock ops flags in socket */
+        SK_BPF_MEMCG_FLAGS      = 1010, /* Get or Set flags saved in sk->sk_memcg */
 };
 
 enum {
@@ -7204,6 +7205,11 @@ enum {
          */
 };
 
+enum {
+        SK_BPF_MEMCG_SOCK_ISOLATED      = (1UL << 0),
+        SK_BPF_MEMCG_FLAG_MAX           = (1UL << 1),
+};
+
 struct bpf_perf_event_value {
         __u64 counter;
         __u64 enabled;
diff --git a/net/core/filter.c b/net/core/filter.c
index 443d12b7d3b27..943ae6d7d637d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5267,6 +5267,27 @@ static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt)
         return 0;
 }
 
+static int sk_bpf_set_get_memcg_flags(struct sock *sk, int *optval, bool getopt)
+{
+        if (!sk_has_account(sk))
+                return -EOPNOTSUPP;
+
+        if (getopt) {
+                *optval = mem_cgroup_sk_get_flags(sk);
+                return 0;
+        }
+
+        if (sock_owned_by_user_nocheck(sk) && mem_cgroup_from_sk(sk))
+                return -EBUSY;
+
+        if (*optval <= 0 || *optval >= SK_BPF_MEMCG_FLAG_MAX)
+                return -EINVAL;
+
+        mem_cgroup_sk_set_flags(sk, *optval);
+
+        return 0;
+}
+
 static int sol_socket_sockopt(struct sock *sk, int optname,
                               char *optval, int *optlen,
                               bool getopt)
@@ -5284,6 +5305,7 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
         case SO_BINDTOIFINDEX:
         case SO_TXREHASH:
         case SK_BPF_CB_FLAGS:
+        case SK_BPF_MEMCG_FLAGS:
                 if (*optlen != sizeof(int))
                         return -EINVAL;
                 break;
@@ -5293,8 +5315,12 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
                 return -EINVAL;
         }
 
-        if (optname == SK_BPF_CB_FLAGS)
+        switch (optname) {
+        case SK_BPF_CB_FLAGS:
                 return sk_bpf_set_get_cb_flags(sk, optval, getopt);
+        case SK_BPF_MEMCG_FLAGS:
+                return sk_bpf_set_get_memcg_flags(sk, (int *)optval, getopt);
+        }
 
         if (getopt) {
                 if (optname == SO_BINDTODEVICE)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d42757f74c6e6..9b62f1ae13baa 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -758,12 +758,16 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *new
             (!IS_ENABLED(CONFIG_IP_SCTP) ||
              sk_is_tcp(newsk) || sk_is_mptcp(newsk))) {
                 gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
+                unsigned short flags;
 
+                flags = mem_cgroup_sk_get_flags(newsk);
                 mem_cgroup_sk_alloc(newsk);
 
                 if (mem_cgroup_from_sk(newsk)) {
                         int amt;
 
+                        mem_cgroup_sk_set_flags(newsk, flags);
+
                         /* The socket has not been accepted yet, no need
                          * to look at newsk->sk_wmem_queued.
                          */
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 233de8677382e..52b8c2278589b 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7182,6 +7182,7 @@ enum {
         TCP_BPF_SYN_MAC         = 1007, /* Copy the MAC, IP[46], and TCP header */
         TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
         SK_BPF_CB_FLAGS         = 1009, /* Get or set sock ops flags in socket */
+        SK_BPF_MEMCG_FLAGS      = 1010, /* Get or Set flags saved in sk->sk_memcg */
 };
 
 enum {
@@ -7204,6 +7205,11 @@ enum {
          */
 };
 
+enum {
+        SK_BPF_MEMCG_SOCK_ISOLATED      = (1UL << 0),
+        SK_BPF_MEMCG_FLAG_MAX           = (1UL << 1),
+};
+
 struct bpf_perf_event_value {
         __u64 counter;
         __u64 enabled;

From d561392a21ea70e80d1c906395d1494e7216aef3 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Tue, 26 Aug 2025 18:38:10 +0000
Subject: [PATCH 4/5] net-memcg: Allow decoupling memcg from global protocol
 memory accounting.

Some protocols (e.g., TCP, UDP) implement memory accounting for socket
buffers and charge memory to per-protocol global counters pointed to by
sk->sk_prot->memory_allocated.

When running under a non-root cgroup, this memory is also charged to
the memcg as "sock" in memory.stat.

Even when a memcg controls memory usage, sockets of such protocols are
still subject to the global limits (e.g., /proc/sys/net/ipv4/tcp_mem).

This makes it difficult to accurately estimate and configure
appropriate global limits, especially in multi-tenant environments.

If all workloads were guaranteed to be controlled under memcg, the
issue could be worked around by setting tcp_mem[0~2] to UINT_MAX.

In reality, this assumption does not always hold, and processes not
controlled by memcg would lose the seatbelt and could consume memory
up to the global limit, becoming a noisy neighbour.

Let's decouple sockets in memcg from the global per-protocol memory
accounting if they have SK_BPF_MEMCG_SOCK_ISOLATED set in
sk->sk_memcg.  This simplifies memcg configuration while keeping the
global limits within a reasonable range.

If mem_cgroup_sk_isolated(sk) returns true, the per-protocol memory
accounting is skipped.

In __inet_accept(), we need to reclaim counts that are already charged
for child sockets because we do not allocate sk->sk_memcg until
accept().
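The resulting charge path in __sk_mem_raise_allocated() can be
summarized as follows (a simplified sketch of the change below; the
pressure checks and the !charged bookkeeping are omitted):

  if (mem_cgroup_sk_enabled(sk)) {
          charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());

          /* Isolated and charged: memcg-only accounting, done. */
          if (mem_cgroup_sk_isolated(sk) && charged)
                  return 1;

          if (!charged)
                  goto suppress_allocation;
  }

  /* Non-isolated sockets still hit the global counters. */
  sk_memory_allocated_add(sk, amt);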
Note that trace_sock_exceed_buf_limit() will always show 0 as
allocated for isolated sockets, but the actual usage can be obtained
via memory.stat.

Tested with a script that creates local socket pairs and send()s a
bunch of data without recv()ing.

Setup:

  # mkdir /sys/fs/cgroup/test
  # echo $$ >> /sys/fs/cgroup/test/cgroup.procs
  # sysctl -q net.ipv4.tcp_mem="1000 1000 1000"

Without bpf prog:

  # prlimit -n=524288:524288 bash -c "python3 pressure.py" &

  # cat /sys/fs/cgroup/test/memory.stat | grep sock
  sock 22642688

  # cat /proc/net/sockstat | grep TCP
  TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 5376

  # ss -tn | head -n 5
  State Recv-Q Send-Q Local Address:Port  Peer Address:Port
  ESTAB 2000   0          127.0.0.1:34479    127.0.0.1:53188
  ESTAB 2000   0          127.0.0.1:34479    127.0.0.1:49972
  ESTAB 2000   0          127.0.0.1:34479    127.0.0.1:53868
  ESTAB 2000   0          127.0.0.1:34479    127.0.0.1:53554

  # nstat | grep Pressure || echo no pressure
  TcpExtTCPMemoryPressures        1                  0.0

With bpf prog in the next patch:

  # bpftool prog load sk_memcg.bpf.o /sys/fs/bpf/sk_memcg_create type cgroup/sock_create
  # bpftool prog load sk_memcg.bpf.o /sys/fs/bpf/sk_memcg_estab type sockops
  # bpftool cgroup attach /sys/fs/cgroup/test cgroup_inet_sock_create pinned /sys/fs/bpf/sk_memcg_create
  # bpftool cgroup attach /sys/fs/cgroup/test cgroup_sock_ops pinned /sys/fs/bpf/sk_memcg_estab

  # prlimit -n=524288:524288 bash -c "python3 pressure.py" &

  # cat /sys/fs/cgroup/test/memory.stat | grep sock
  sock 2757468160

  # cat /proc/net/sockstat | grep TCP
  TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 0

  # ss -tn | head -n 5
  State  Recv-Q Send-Q Local Address:Port  Peer Address:Port
  ESTAB  111000 0          127.0.0.1:36019    127.0.0.1:49026
  ESTAB  110000 0          127.0.0.1:36019    127.0.0.1:45630
  ESTAB  110000 0          127.0.0.1:36019    127.0.0.1:44870
  ESTAB  111000 0          127.0.0.1:36019    127.0.0.1:45274

  # nstat | grep Pressure || echo no pressure
  no pressure

Signed-off-by: Kuniyuki Iwashima
---
 include/net/proto_memory.h      | 15 ++++++--
 include/net/tcp.h               | 10 ++++--
 net/core/sock.c                 | 64 ++++++++++++++++++++++-----------
 net/ipv4/af_inet.c              | 12 ++++++-
 net/ipv4/inet_connection_sock.c |  1 +
 net/ipv4/tcp.c                  |  3 +-
 net/ipv4/tcp_output.c           | 10 ++++--
 net/mptcp/protocol.c            |  3 +-
 net/tls/tls_device.c            |  4 ++-
 9 files changed, 90 insertions(+), 32 deletions(-)

diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h
index 8e91a8fa31b52..8e8432b135150 100644
--- a/include/net/proto_memory.h
+++ b/include/net/proto_memory.h
@@ -31,13 +31,22 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
         if (!sk->sk_prot->memory_pressure)
                 return false;
 
-        if (mem_cgroup_sk_enabled(sk) &&
-            mem_cgroup_sk_under_memory_pressure(sk))
-                return true;
+        if (mem_cgroup_sk_enabled(sk)) {
+                if (mem_cgroup_sk_under_memory_pressure(sk))
+                        return true;
+
+                if (mem_cgroup_sk_isolated(sk))
+                        return false;
+        }
 
         return !!READ_ONCE(*sk->sk_prot->memory_pressure);
 }
 
+static inline bool sk_should_enter_memory_pressure(struct sock *sk)
+{
+        return !mem_cgroup_sk_enabled(sk) || !mem_cgroup_sk_isolated(sk);
+}
+
 static inline long proto_memory_allocated(const struct proto *prot)
 {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2936b8175950f..0191a4585bba0 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -275,9 +275,13 @@ extern unsigned long tcp_memory_pressure;
 /* optimized version of sk_under_memory_pressure() for TCP sockets */
 static inline bool tcp_under_memory_pressure(const struct sock *sk)
 {
-        if (mem_cgroup_sk_enabled(sk) &&
-            mem_cgroup_sk_under_memory_pressure(sk))
-                return true;
+        if (mem_cgroup_sk_enabled(sk)) {
+                if (mem_cgroup_sk_under_memory_pressure(sk))
+                        return true;
+
+                if (mem_cgroup_sk_isolated(sk))
+                        return false;
+        }
 
         return READ_ONCE(tcp_memory_pressure);
 }
diff --git a/net/core/sock.c b/net/core/sock.c
index 8002ac6293dca..be5574f9a0255 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1046,17 +1046,21 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
         if (!charged)
                 return -ENOMEM;
 
-        /* pre-charge to forward_alloc */
-        sk_memory_allocated_add(sk, pages);
-        allocated = sk_memory_allocated(sk);
-        /* If the system goes into memory pressure with this
-         * precharge, give up and return error.
-         */
-        if (allocated > sk_prot_mem_limits(sk, 1)) {
-                sk_memory_allocated_sub(sk, pages);
-                mem_cgroup_sk_uncharge(sk, pages);
-                return -ENOMEM;
+        if (!mem_cgroup_sk_isolated(sk)) {
+                /* pre-charge to forward_alloc */
+                sk_memory_allocated_add(sk, pages);
+                allocated = sk_memory_allocated(sk);
+
+                /* If the system goes into memory pressure with this
+                 * precharge, give up and return error.
+                 */
+                if (allocated > sk_prot_mem_limits(sk, 1)) {
+                        sk_memory_allocated_sub(sk, pages);
+                        mem_cgroup_sk_uncharge(sk, pages);
+                        return -ENOMEM;
+                }
         }
+
         sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
 
         WRITE_ONCE(sk->sk_reserved_mem,
@@ -3153,8 +3157,11 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
                 return true;
 
-        sk_enter_memory_pressure(sk);
+        if (sk_should_enter_memory_pressure(sk))
+                sk_enter_memory_pressure(sk);
+
         sk_stream_moderate_sndbuf(sk);
+
         return false;
 }
 EXPORT_SYMBOL(sk_page_frag_refill);
@@ -3267,18 +3274,30 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
 {
         bool memcg_enabled = false, charged = false;
         struct proto *prot = sk->sk_prot;
-        long allocated;
-
-        sk_memory_allocated_add(sk, amt);
-        allocated = sk_memory_allocated(sk);
+        long allocated = 0;
 
         if (mem_cgroup_sk_enabled(sk)) {
+                bool isolated = mem_cgroup_sk_isolated(sk);
+
                 memcg_enabled = true;
                 charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
-                if (!charged)
+
+                if (isolated && charged)
+                        return 1;
+
+                if (!charged) {
+                        if (!isolated) {
+                                sk_memory_allocated_add(sk, amt);
+                                allocated = sk_memory_allocated(sk);
+                        }
+
                         goto suppress_allocation;
+                }
         }
 
+        sk_memory_allocated_add(sk, amt);
+        allocated = sk_memory_allocated(sk);
+
         /* Under limit. */
         if (allocated <= sk_prot_mem_limits(sk, 0)) {
                 sk_leave_memory_pressure(sk);
@@ -3357,7 +3376,8 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
 
         trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
 
-        sk_memory_allocated_sub(sk, amt);
+        if (allocated)
+                sk_memory_allocated_sub(sk, amt);
 
         if (charged)
                 mem_cgroup_sk_uncharge(sk, amt);
@@ -3396,11 +3416,15 @@ EXPORT_SYMBOL(__sk_mem_schedule);
  */
 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
 {
-        sk_memory_allocated_sub(sk, amount);
-
-        if (mem_cgroup_sk_enabled(sk))
+        if (mem_cgroup_sk_enabled(sk)) {
                 mem_cgroup_sk_uncharge(sk, amount);
 
+                if (mem_cgroup_sk_isolated(sk))
+                        return;
+        }
+
+        sk_memory_allocated_sub(sk, amount);
+
         if (sk_under_global_memory_pressure(sk) &&
             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
                 sk_leave_memory_pressure(sk);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 9b62f1ae13baa..adbc8bcb760b1 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -95,6 +95,7 @@
 #include
 #include
 #include
+#include <net/proto_memory.h>
 #include
 #include
 #include
@@ -773,8 +774,17 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *new
                          */
                         amt = sk_mem_pages(newsk->sk_forward_alloc +
                                            atomic_read(&newsk->sk_rmem_alloc));
-                        if (amt)
+                        if (amt) {
+                                /* This amt is already charged globally to
+                                 * sk_prot->memory_allocated due to lack of
+                                 * sk_memcg until accept(), thus we need to
+                                 * reclaim it here if newsk is isolated.
+                                 */
+                                if (mem_cgroup_sk_isolated(newsk))
+                                        sk_memory_allocated_sub(newsk, amt);
+
                                 mem_cgroup_sk_charge(newsk, amt, gfp);
+                        }
                 }
 
                 kmem_cache_charge(newsk, gfp);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index ed10b959a906b..f8dd53d40dcf0 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include <net/proto_memory.h>
 
 #if IS_ENABLED(CONFIG_IPV6)
 /* match_sk*_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 71a956fbfc553..dcbd49e2f8af3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -908,7 +908,8 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
                 }
                 __kfree_skb(skb);
         } else {
-                sk->sk_prot->enter_memory_pressure(sk);
+                if (sk_should_enter_memory_pressure(sk))
+                        tcp_enter_memory_pressure(sk);
                 sk_stream_moderate_sndbuf(sk);
         }
         return NULL;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index dfbac0876d96e..f7aa86661219e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3574,12 +3574,18 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
         delta = size - sk->sk_forward_alloc;
         if (delta <= 0)
                 return;
+
         amt = sk_mem_pages(delta);
         sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
-        sk_memory_allocated_add(sk, amt);
 
-        if (mem_cgroup_sk_enabled(sk))
+        if (mem_cgroup_sk_enabled(sk)) {
                 mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
+
+                if (mem_cgroup_sk_isolated(sk))
+                        return;
+        }
+
+        sk_memory_allocated_add(sk, amt);
 }
 
 /* Send a FIN. The caller locks the socket for us.
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 9a287b75c1b31..f7487e22a3f8a 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include <net/proto_memory.h>
 #include
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
 #include
@@ -1016,7 +1017,7 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
         mptcp_for_each_subflow(msk, subflow) {
                 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
-                if (first)
+                if (first && sk_should_enter_memory_pressure(ssk))
                         tcp_enter_memory_pressure(ssk);
 
                 sk_stream_moderate_sndbuf(ssk);
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index f672a62a9a52f..6696ef8371163 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -35,6 +35,7 @@
 #include
 #include
 #include
+#include <net/proto_memory.h>
 #include
 #include
 #include
@@ -371,7 +372,8 @@ static int tls_do_allocation(struct sock *sk,
         if (!offload_ctx->open_record) {
                 if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
                                                    sk->sk_allocation))) {
-                        READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
+                        if (sk_should_enter_memory_pressure(sk))
+                                READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
                         sk_stream_moderate_sndbuf(sk);
                         return -ENOMEM;
                 }

From a5200d313d9326b33e25565cebc163e038192b04 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Tue, 26 Aug 2025 18:38:11 +0000
Subject: [PATCH 5/5] selftest: bpf: Add test for SK_BPF_MEMCG_SOCK_ISOLATED.

The test does the following for IPv4/IPv6 x TCP/UDP sockets
with/without a BPF prog attached:

  1. Create socket pairs
  2. Send a bunch of data that requires more than 1000 pages
  3. Read memory_allocated from /proc/net/protocols
  4. Check if unread data is charged to memory_allocated

If a BPF prog is attached, memory_allocated should not change, but we
allow a small error (up to 10 pages) in case the test runs
concurrently with other tests using TCP/UDP sockets.

Signed-off-by: Kuniyuki Iwashima
---
 .../selftests/bpf/prog_tests/sk_memcg.c      | 218 ++++++++++++++++++
 tools/testing/selftests/bpf/progs/sk_memcg.c |  30 +++
 2 files changed, 248 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_memcg.c
 create mode 100644 tools/testing/selftests/bpf/progs/sk_memcg.c

diff --git a/tools/testing/selftests/bpf/prog_tests/sk_memcg.c b/tools/testing/selftests/bpf/prog_tests/sk_memcg.c
new file mode 100644
index 0000000000000..a45dc30c5ab44
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sk_memcg.c
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2025 Google LLC */
+
+#include <test_progs.h>
+#include "sk_memcg.skel.h"
+#include "network_helpers.h"
+
+#define NR_SOCKETS	128
+#define NR_SEND		128
+#define BUF_SINGLE	1024
+#define BUF_TOTAL	(BUF_SINGLE * NR_SEND)
+
+struct test_case {
+	char name[10];	/* protocols (%-9s) in /proc/net/protocols, see proto_seq_printf(). */
+	int family;
+	int type;
+	int (*create_sockets)(struct test_case *test_case, int sk[], int len);
+};
+
+static int tcp_create_sockets(struct test_case *test_case, int sk[], int len)
+{
+	int server, i;
+
+	server = start_server(test_case->family, test_case->type, NULL, 0, 0);
+	if (!ASSERT_GE(server, 0, "start_server"))
+		return server;
+
+	for (i = 0; i < len / 2; i++) {
+		sk[i * 2] = connect_to_fd(server, 0);
+		if (!ASSERT_GE(sk[i * 2], 0, "connect_to_fd"))
+			return sk[i * 2];
+
+		sk[i * 2 + 1] = accept(server, NULL, NULL);
+		if (!ASSERT_GE(sk[i * 2 + 1], 0, "accept"))
+			return sk[i * 2 + 1];
+	}
+
+	close(server);
+
+	return 0;
+}
+
+static int udp_create_sockets(struct test_case *test_case, int sk[], int len)
+{
+	int i, err, rcvbuf = BUF_TOTAL;
+
+	for (i = 0; i < len / 2; i++) {
+		sk[i * 2] = start_server(test_case->family, test_case->type, NULL, 0, 0);
+		if (!ASSERT_GE(sk[i * 2], 0, "start_server"))
+			return sk[i * 2];
+
+		sk[i * 2 + 1] = connect_to_fd(sk[i * 2], 0);
+		if (!ASSERT_GE(sk[i * 2 + 1], 0, "connect_to_fd"))
+			return sk[i * 2 + 1];
+
+		err = connect_fd_to_fd(sk[i * 2], sk[i * 2 + 1], 0);
+		if (!ASSERT_EQ(err, 0, "connect_fd_to_fd"))
+			return err;
+
+		err = setsockopt(sk[i * 2], SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(int));
+		if (!ASSERT_EQ(err, 0, "setsockopt(SO_RCVBUF)"))
+			return err;
+
+		err = setsockopt(sk[i * 2 + 1], SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(int));
+		if (!ASSERT_EQ(err, 0, "setsockopt(SO_RCVBUF)"))
+			return err;
+	}
+
+	return 0;
+}
+
+static long get_memory_allocated(struct test_case *test_case)
+{
+	long memory_allocated = -1;
+	char *line = NULL;
+	size_t unused = 0;
+	FILE *f;
+
+	f = fopen("/proc/net/protocols", "r");
+	if (!ASSERT_OK_PTR(f, "fopen"))
+		goto out;
+
+	while (getline(&line, &unused, f) != -1) {
+		unsigned int unused_0;
+		int unused_1;
+		int ret;
+
+		if (strncmp(line, test_case->name, sizeof(test_case->name) - 1))
+			continue;
+
+		ret = sscanf(line + sizeof(test_case->name), "%4u %6d %6ld",
+			     &unused_0, &unused_1, &memory_allocated);
+		ASSERT_EQ(ret, 3, "sscanf");
+		break;
+	}
+
+	ASSERT_NEQ(memory_allocated, -1, "get_memory_allocated");
+
+	free(line);
+	fclose(f);
+out:
+	return memory_allocated;
+}
+
+static int check_isolated(struct test_case *test_case, bool isolated)
+{
+	char buf[BUF_SINGLE] = {};
+	long memory_allocated[2];
+	int sk[NR_SOCKETS] = {};
+	int err = -1, i, j;
+
+	memory_allocated[0] = get_memory_allocated(test_case);
+	if (!ASSERT_GE(memory_allocated[0], 0, "memory_allocated[0]"))
+		goto out;
+
+	err = test_case->create_sockets(test_case, sk, ARRAY_SIZE(sk));
+	if (err)
+		goto close;
+
+	/* Must allocate pages >= net.core.mem_pcpu_rsv */
+	for (i = 0; i < ARRAY_SIZE(sk); i++) {
+		for (j = 0; j < NR_SEND; j++) {
+			int bytes = send(sk[i], buf, sizeof(buf), 0);
+
+			/* Avoid too noisy logs when something failed. */
+			if (bytes != sizeof(buf))
+				ASSERT_EQ(bytes, sizeof(buf), "send");
+		}
+	}
+
+	memory_allocated[1] = get_memory_allocated(test_case);
+	if (!ASSERT_GE(memory_allocated[1], 0, "memory_allocated[1]"))
+		goto close;
+
+	if (isolated)
+		ASSERT_LE(memory_allocated[1], memory_allocated[0] + 10, "isolated");
+	else
+		ASSERT_GT(memory_allocated[1], memory_allocated[0] + 1000, "not isolated");
+
+close:
+	for (i = 0; i < ARRAY_SIZE(sk); i++)
+		close(sk[i]);
+
+	/* Let RCU destruct sockets */
+	sleep(1);
+out:
+	return err;
+}
+
+static void run_test(struct test_case *test_case)
+{
+	struct sk_memcg *skel;
+	int cgroup, err;
+
+	skel = sk_memcg__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		return;
+
+	cgroup = test__join_cgroup("/sk_memcg");
+	if (!ASSERT_GE(cgroup, 0, "join_cgroup"))
+		goto destroy_skel;
+
+	err = check_isolated(test_case, false);
+	if (!ASSERT_EQ(err, 0, "check_isolated(false)"))
+		goto close_cgroup;
+
+	skel->links.sock_create = bpf_program__attach_cgroup(skel->progs.sock_create, cgroup);
+	if (!ASSERT_OK_PTR(skel->links.sock_create, "attach_cgroup(sock_create)"))
+		goto close_cgroup;
+
+	skel->links.skops_setsockopt = bpf_program__attach_cgroup(skel->progs.skops_setsockopt, cgroup);
+	if (!ASSERT_OK_PTR(skel->links.skops_setsockopt, "attach_cgroup(skops_setsockopt)"))
+		goto close_cgroup;
+
+	err = check_isolated(test_case, true);
+	ASSERT_EQ(err, 0, "check_isolated(true)");
+
+close_cgroup:
+	close(cgroup);
+destroy_skel:
+	sk_memcg__destroy(skel);
+}
+
+static struct test_case test_cases[] = {
+	{
+		.name = "TCP      ",
+		.family = AF_INET,
+		.type = SOCK_STREAM,
+		.create_sockets = tcp_create_sockets,
+	},
+	{
+		.name = "UDP      ",
+		.family = AF_INET,
+		.type = SOCK_DGRAM,
+		.create_sockets = udp_create_sockets,
+	},
+	{
+		.name = "TCPv6    ",
+		.family = AF_INET6,
+		.type = SOCK_STREAM,
+		.create_sockets = tcp_create_sockets,
+	},
+	{
+		.name = "UDPv6    ",
+		.family = AF_INET6,
+		.type = SOCK_DGRAM,
+		.create_sockets = udp_create_sockets,
+	},
+};
+
+void test_sk_memcg(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
+		if (test__start_subtest(test_cases[i].name))
+			run_test(&test_cases[i]);
+	}
+}
diff --git a/tools/testing/selftests/bpf/progs/sk_memcg.c b/tools/testing/selftests/bpf/progs/sk_memcg.c
new file mode 100644
index 0000000000000..1b0ea991099d3
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sk_memcg.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2025 Google LLC */
+
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_helpers.h>
+
+static void isolate_memcg(void *ctx)
+{
+	int flags = SK_BPF_MEMCG_SOCK_ISOLATED;
+
+	bpf_setsockopt(ctx, SOL_SOCKET, SK_BPF_MEMCG_FLAGS,
+		       &flags, sizeof(flags));
+}
+
+SEC("cgroup/sock_create")
+int sock_create(struct bpf_sock *ctx)
+{
+	isolate_memcg(ctx);
+
+	return 1;
+}
+
+SEC("sockops")
+int skops_setsockopt(struct bpf_sock_ops *skops)
+{
+	if (skops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
+		isolate_memcg(skops);
+
+	return 1;
+}
+
+char LICENSE[] SEC("license") = "GPL";
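The new test is driven by the usual BPF selftest runner; assuming a
built tools/testing/selftests/bpf tree, something like:

  # cd tools/testing/selftests/bpf
  # ./test_progs -t sk_memcg

should exercise the four subtests (IPv4/IPv6 x TCP/UDP), each checking
memory_allocated both with and without the BPF programs attached.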