Skip to content

Commit 03de843

Browse files
author
Martin KaFai Lau
committed
Merge branch 'bpf-allow-opt-out-from-sk-sk_prot-memory_allocated'
Kuniyuki Iwashima says: ==================== bpf: Allow opt-out from sk->sk_prot->memory_allocated. This series allows opting out of the global per-protocol memory accounting if the socket is configured as such by sysctl or BPF prog. This series is the successor of the series below [0], but the changes now fall in net and bpf subsystems only. I discussed with Roman Gushchin offlist, and he suggested not mixing two independent subsystems and it would be cleaner not to depend on memcg. So, sk->sk_memcg and memcg code are no longer touched, and instead we use another hole near sk->sk_prot to store a flag for the pure net opt-out feature. Overview of the series: patch 1 is misc cleanup patch 2 allows opt-out from sk->sk_prot->memory_allocated patch 3 introduces net.core.bypass_prot_mem patches 4 & 5 support flagging sk->sk_bypass_prot_mem via bpf_setsockopt() patch 6 is selftest Thank you very much for all your help, Shakeel, Roman, Martin, and Eric! [0]: https://lore.kernel.org/bpf/[email protected]/ Changes: v2: * Patch 2: * Fill kdoc for skc_bypass_prot_mem * Patch 6 * Fix server fd leak in tcp_create_sockets() * Avoid close(0) in check_bypass() v1: https://lore.kernel.org/bpf/[email protected]/ ==================== Link: https://patch.msgid.link/[email protected] Signed-off-by: Martin KaFai Lau <[email protected]>
2 parents 55db64d + 5f941dd commit 03de843

File tree

18 files changed

+577
-38
lines changed

18 files changed

+577
-38
lines changed

Documentation/admin-guide/sysctl/net.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,14 @@ mem_pcpu_rsv
212212

213213
Per-cpu reserved forward alloc cache size in page units. Default 1MB per CPU.
214214

215+
bypass_prot_mem
216+
---------------
217+
218+
Skip charging socket buffers to the global per-protocol memory
219+
accounting controlled by net.ipv4.tcp_mem, net.ipv4.udp_mem, etc.
220+
221+
Default: 0 (off)
222+
215223
rmem_default
216224
------------
217225

include/net/netns/core.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ struct netns_core {
1717
int sysctl_optmem_max;
1818
u8 sysctl_txrehash;
1919
u8 sysctl_tstamp_allow_data;
20+
u8 sysctl_bypass_prot_mem;
2021

2122
#ifdef CONFIG_PROC_FS
2223
struct prot_inuse __percpu *prot_inuse;

include/net/proto_memory.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
3535
mem_cgroup_sk_under_memory_pressure(sk))
3636
return true;
3737

38+
if (sk->sk_bypass_prot_mem)
39+
return false;
40+
3841
return !!READ_ONCE(*sk->sk_prot->memory_pressure);
3942
}
4043

include/net/sock.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ typedef __u64 __bitwise __addrpair;
118118
* @skc_reuseport: %SO_REUSEPORT setting
119119
* @skc_ipv6only: socket is IPV6 only
120120
* @skc_net_refcnt: socket is using net ref counting
121+
* @skc_bypass_prot_mem: bypass the per-protocol memory accounting for skb
121122
* @skc_bound_dev_if: bound device index if != 0
122123
* @skc_bind_node: bind hash linkage for various protocol lookup tables
123124
* @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
@@ -174,6 +175,7 @@ struct sock_common {
174175
unsigned char skc_reuseport:1;
175176
unsigned char skc_ipv6only:1;
176177
unsigned char skc_net_refcnt:1;
178+
unsigned char skc_bypass_prot_mem:1;
177179
int skc_bound_dev_if;
178180
union {
179181
struct hlist_node skc_bind_node;
@@ -381,6 +383,7 @@ struct sock {
381383
#define sk_reuseport __sk_common.skc_reuseport
382384
#define sk_ipv6only __sk_common.skc_ipv6only
383385
#define sk_net_refcnt __sk_common.skc_net_refcnt
386+
#define sk_bypass_prot_mem __sk_common.skc_bypass_prot_mem
384387
#define sk_bound_dev_if __sk_common.skc_bound_dev_if
385388
#define sk_bind_node __sk_common.skc_bind_node
386389
#define sk_prot __sk_common.skc_prot

include/net/tcp.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,9 @@ static inline bool tcp_under_memory_pressure(const struct sock *sk)
303303
mem_cgroup_sk_under_memory_pressure(sk))
304304
return true;
305305

306+
if (sk->sk_bypass_prot_mem)
307+
return false;
308+
306309
return READ_ONCE(tcp_memory_pressure);
307310
}
308311
/*

include/uapi/linux/bpf.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7200,6 +7200,8 @@ enum {
72007200
TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */
72017201
TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
72027202
SK_BPF_CB_FLAGS = 1009, /* Get or set sock ops flags in socket */
7203+
SK_BPF_BYPASS_PROT_MEM = 1010, /* Get or Set sk->sk_bypass_prot_mem */
7204+
72037205
};
72047206

72057207
enum {

net/core/filter.c

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5733,6 +5733,77 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
57335733
.arg5_type = ARG_CONST_SIZE,
57345734
};
57355735

5736+
static int sk_bpf_set_get_bypass_prot_mem(struct sock *sk,
5737+
char *optval, int optlen,
5738+
bool getopt)
5739+
{
5740+
int val;
5741+
5742+
if (optlen != sizeof(int))
5743+
return -EINVAL;
5744+
5745+
if (!sk_has_account(sk))
5746+
return -EOPNOTSUPP;
5747+
5748+
if (getopt) {
5749+
*(int *)optval = sk->sk_bypass_prot_mem;
5750+
return 0;
5751+
}
5752+
5753+
val = *(int *)optval;
5754+
if (val < 0 || val > 1)
5755+
return -EINVAL;
5756+
5757+
sk->sk_bypass_prot_mem = val;
5758+
return 0;
5759+
}
5760+
5761+
/* setsockopt() helper for BPF_CGROUP_INET_SOCK_CREATE programs.
 * Intercepts SK_BPF_BYPASS_PROT_MEM; all other options fall through
 * to the generic __bpf_setsockopt() path.
 */
BPF_CALL_5(bpf_sock_create_setsockopt, struct sock *, sk, int, level,
	   int, optname, char *, optval, int, optlen)
{
	if (level != SOL_SOCKET || optname != SK_BPF_BYPASS_PROT_MEM)
		return __bpf_setsockopt(sk, level, optname, optval, optlen);

	return sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, false);
}
5769+
5770+
/* Verifier signature for the sock-create setsockopt helper: ctx is the
 * socket, optval is read-only caller memory of optlen bytes.
 */
static const struct bpf_func_proto bpf_sock_create_setsockopt_proto = {
	.func = bpf_sock_create_setsockopt,
	.gpl_only = false,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_ANYTHING,
	.arg3_type = ARG_ANYTHING,
	.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg5_type = ARG_CONST_SIZE,
};
5780+
5781+
/* getsockopt() helper for BPF_CGROUP_INET_SOCK_CREATE programs.
 * Intercepts SK_BPF_BYPASS_PROT_MEM; all other options fall through
 * to the generic __bpf_getsockopt() path.
 */
BPF_CALL_5(bpf_sock_create_getsockopt, struct sock *, sk, int, level,
	   int, optname, char *, optval, int, optlen)
{
	int err;

	if (level != SOL_SOCKET || optname != SK_BPF_BYPASS_PROT_MEM)
		return __bpf_getsockopt(sk, level, optname, optval, optlen);

	err = sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, true);
	if (err)
		/* optval is uninit BPF-visible memory: never leak it on failure. */
		memset(optval, 0, optlen);

	return err;
}
5795+
5796+
/* Verifier signature for the sock-create getsockopt helper: ctx is the
 * socket, optval is uninitialized caller memory that the helper fills
 * (or zeroes on error).
 */
static const struct bpf_func_proto bpf_sock_create_getsockopt_proto = {
	.func = bpf_sock_create_getsockopt,
	.gpl_only = false,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_ANYTHING,
	.arg3_type = ARG_ANYTHING,
	.arg4_type = ARG_PTR_TO_UNINIT_MEM,
	.arg5_type = ARG_CONST_SIZE,
};
5806+
57365807
BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
57375808
int, level, int, optname, char *, optval, int, optlen)
57385809
{
@@ -8062,6 +8133,20 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
80628133
return &bpf_sk_storage_get_cg_sock_proto;
80638134
case BPF_FUNC_ktime_get_coarse_ns:
80648135
return &bpf_ktime_get_coarse_ns_proto;
8136+
case BPF_FUNC_setsockopt:
8137+
switch (prog->expected_attach_type) {
8138+
case BPF_CGROUP_INET_SOCK_CREATE:
8139+
return &bpf_sock_create_setsockopt_proto;
8140+
default:
8141+
return NULL;
8142+
}
8143+
case BPF_FUNC_getsockopt:
8144+
switch (prog->expected_attach_type) {
8145+
case BPF_CGROUP_INET_SOCK_CREATE:
8146+
return &bpf_sock_create_getsockopt_proto;
8147+
default:
8148+
return NULL;
8149+
}
80658150
default:
80668151
return bpf_base_func_proto(func_id, prog);
80678152
}

net/core/sock.c

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1046,9 +1046,13 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
10461046
if (!charged)
10471047
return -ENOMEM;
10481048

1049+
if (sk->sk_bypass_prot_mem)
1050+
goto success;
1051+
10491052
/* pre-charge to forward_alloc */
10501053
sk_memory_allocated_add(sk, pages);
10511054
allocated = sk_memory_allocated(sk);
1055+
10521056
/* If the system goes into memory pressure with this
10531057
* precharge, give up and return error.
10541058
*/
@@ -1057,6 +1061,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
10571061
mem_cgroup_sk_uncharge(sk, pages);
10581062
return -ENOMEM;
10591063
}
1064+
1065+
success:
10601066
sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
10611067

10621068
WRITE_ONCE(sk->sk_reserved_mem,
@@ -2300,8 +2306,13 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
23002306
* why we need sk_prot_creator -acme
23012307
*/
23022308
sk->sk_prot = sk->sk_prot_creator = prot;
2309+
2310+
if (READ_ONCE(net->core.sysctl_bypass_prot_mem))
2311+
sk->sk_bypass_prot_mem = 1;
2312+
23032313
sk->sk_kern_sock = kern;
23042314
sock_lock_init(sk);
2315+
23052316
sk->sk_net_refcnt = kern ? 0 : 1;
23062317
if (likely(sk->sk_net_refcnt)) {
23072318
get_net_track(net, &sk->ns_tracker, priority);
@@ -3145,8 +3156,11 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
31453156
if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
31463157
return true;
31473158

3148-
sk_enter_memory_pressure(sk);
3159+
if (!sk->sk_bypass_prot_mem)
3160+
sk_enter_memory_pressure(sk);
3161+
31493162
sk_stream_moderate_sndbuf(sk);
3163+
31503164
return false;
31513165
}
31523166
EXPORT_SYMBOL(sk_page_frag_refill);
@@ -3263,10 +3277,12 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
32633277
{
32643278
bool memcg_enabled = false, charged = false;
32653279
struct proto *prot = sk->sk_prot;
3266-
long allocated;
3280+
long allocated = 0;
32673281

3268-
sk_memory_allocated_add(sk, amt);
3269-
allocated = sk_memory_allocated(sk);
3282+
if (!sk->sk_bypass_prot_mem) {
3283+
sk_memory_allocated_add(sk, amt);
3284+
allocated = sk_memory_allocated(sk);
3285+
}
32703286

32713287
if (mem_cgroup_sk_enabled(sk)) {
32723288
memcg_enabled = true;
@@ -3275,6 +3291,9 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
32753291
goto suppress_allocation;
32763292
}
32773293

3294+
if (!allocated)
3295+
return 1;
3296+
32783297
/* Under limit. */
32793298
if (allocated <= sk_prot_mem_limits(sk, 0)) {
32803299
sk_leave_memory_pressure(sk);
@@ -3353,7 +3372,8 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
33533372

33543373
trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
33553374

3356-
sk_memory_allocated_sub(sk, amt);
3375+
if (allocated)
3376+
sk_memory_allocated_sub(sk, amt);
33573377

33583378
if (charged)
33593379
mem_cgroup_sk_uncharge(sk, amt);
@@ -3392,11 +3412,14 @@ EXPORT_SYMBOL(__sk_mem_schedule);
33923412
*/
33933413
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
33943414
{
3395-
sk_memory_allocated_sub(sk, amount);
3396-
33973415
if (mem_cgroup_sk_enabled(sk))
33983416
mem_cgroup_sk_uncharge(sk, amount);
33993417

3418+
if (sk->sk_bypass_prot_mem)
3419+
return;
3420+
3421+
sk_memory_allocated_sub(sk, amount);
3422+
34003423
if (sk_under_global_memory_pressure(sk) &&
34013424
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
34023425
sk_leave_memory_pressure(sk);

net/core/sysctl_net_core.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -683,6 +683,15 @@ static struct ctl_table netns_core_table[] = {
683683
.extra1 = SYSCTL_ZERO,
684684
.extra2 = SYSCTL_ONE
685685
},
686+
{
687+
.procname = "bypass_prot_mem",
688+
.data = &init_net.core.sysctl_bypass_prot_mem,
689+
.maxlen = sizeof(u8),
690+
.mode = 0644,
691+
.proc_handler = proc_dou8vec_minmax,
692+
.extra1 = SYSCTL_ZERO,
693+
.extra2 = SYSCTL_ONE
694+
},
686695
/* sysctl_core_net_init() will set the values after this
687696
* to readonly in network namespaces
688697
*/

net/ipv4/af_inet.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -755,6 +755,28 @@ EXPORT_SYMBOL(inet_stream_connect);
755755

756756
void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk)
757757
{
758+
/* TODO: use sk_clone_lock() in SCTP and remove protocol checks */
759+
if (mem_cgroup_sockets_enabled &&
760+
(!IS_ENABLED(CONFIG_IP_SCTP) || sk_is_tcp(newsk))) {
761+
gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
762+
763+
mem_cgroup_sk_alloc(newsk);
764+
765+
if (mem_cgroup_from_sk(newsk)) {
766+
int amt;
767+
768+
/* The socket has not been accepted yet, no need
769+
* to look at newsk->sk_wmem_queued.
770+
*/
771+
amt = sk_mem_pages(newsk->sk_forward_alloc +
772+
atomic_read(&newsk->sk_rmem_alloc));
773+
if (amt)
774+
mem_cgroup_sk_charge(newsk, amt, gfp);
775+
}
776+
777+
kmem_cache_charge(newsk, gfp);
778+
}
779+
758780
sock_rps_record_flow(newsk);
759781
WARN_ON(!((1 << newsk->sk_state) &
760782
(TCPF_ESTABLISHED | TCPF_SYN_RECV |

0 commit comments

Comments
 (0)