Skip to content

Commit d5728fe

Browse files
q2ven authored and Martin KaFai Lau committed
net: Allow opt-out from global protocol memory accounting.
Some protocols (e.g., TCP, UDP) implement memory accounting for socket buffers and charge memory to per-protocol global counters pointed to by sk->sk_prot->memory_allocated. Sometimes, system processes do not want that limitation. For a similar purpose, there is SO_RESERVE_MEM for sockets under memcg. Also, by opting out of the per-protocol accounting, sockets under memcg can avoid paying costs for two orthogonal memory accounting mechanisms. A microbenchmark result is in the subsequent bpf patch. Let's allow opt-out from the per-protocol memory accounting if sk->sk_bypass_prot_mem is true. sk->sk_bypass_prot_mem and sk->sk_prot are placed in the same cache line, and sk_has_account() always fetches sk->sk_prot before accessing sk->sk_bypass_prot_mem, so there is no extra cache miss for this patch. The following patches will set sk->sk_bypass_prot_mem to true, and then, the per-protocol memory accounting will be skipped. Note that this does NOT disable memcg, but rather the per-protocol one. Another option, which avoids using the hole in struct sock_common, is to create sk_prot variants like tcp_prot_bypass, but this would complicate SOCKMAP logic, tcp_bpf_prots etc. Signed-off-by: Kuniyuki Iwashima <[email protected]> Signed-off-by: Martin KaFai Lau <[email protected]> Reviewed-by: Shakeel Butt <[email protected]> Reviewed-by: Eric Dumazet <[email protected]> Acked-by: Roman Gushchin <[email protected]> Link: https://patch.msgid.link/[email protected]
1 parent 8c52ab2 commit d5728fe

File tree

8 files changed

+48
-13
lines changed

8 files changed

+48
-13
lines changed

include/net/proto_memory.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
3535
mem_cgroup_sk_under_memory_pressure(sk))
3636
return true;
3737

38+
if (sk->sk_bypass_prot_mem)
39+
return false;
40+
3841
return !!READ_ONCE(*sk->sk_prot->memory_pressure);
3942
}
4043

include/net/sock.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ typedef __u64 __bitwise __addrpair;
118118
* @skc_reuseport: %SO_REUSEPORT setting
119119
* @skc_ipv6only: socket is IPV6 only
120120
* @skc_net_refcnt: socket is using net ref counting
121+
* @skc_bypass_prot_mem: bypass the per-protocol memory accounting for skb
121122
* @skc_bound_dev_if: bound device index if != 0
122123
* @skc_bind_node: bind hash linkage for various protocol lookup tables
123124
* @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
@@ -174,6 +175,7 @@ struct sock_common {
174175
unsigned char skc_reuseport:1;
175176
unsigned char skc_ipv6only:1;
176177
unsigned char skc_net_refcnt:1;
178+
unsigned char skc_bypass_prot_mem:1;
177179
int skc_bound_dev_if;
178180
union {
179181
struct hlist_node skc_bind_node;
@@ -380,6 +382,7 @@ struct sock {
380382
#define sk_reuseport __sk_common.skc_reuseport
381383
#define sk_ipv6only __sk_common.skc_ipv6only
382384
#define sk_net_refcnt __sk_common.skc_net_refcnt
385+
#define sk_bypass_prot_mem __sk_common.skc_bypass_prot_mem
383386
#define sk_bound_dev_if __sk_common.skc_bound_dev_if
384387
#define sk_bind_node __sk_common.skc_bind_node
385388
#define sk_prot __sk_common.skc_prot

include/net/tcp.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,9 @@ static inline bool tcp_under_memory_pressure(const struct sock *sk)
303303
mem_cgroup_sk_under_memory_pressure(sk))
304304
return true;
305305

306+
if (sk->sk_bypass_prot_mem)
307+
return false;
308+
306309
return READ_ONCE(tcp_memory_pressure);
307310
}
308311
/*

net/core/sock.c

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1046,9 +1046,13 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
10461046
if (!charged)
10471047
return -ENOMEM;
10481048

1049+
if (sk->sk_bypass_prot_mem)
1050+
goto success;
1051+
10491052
/* pre-charge to forward_alloc */
10501053
sk_memory_allocated_add(sk, pages);
10511054
allocated = sk_memory_allocated(sk);
1055+
10521056
/* If the system goes into memory pressure with this
10531057
* precharge, give up and return error.
10541058
*/
@@ -1057,6 +1061,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
10571061
mem_cgroup_sk_uncharge(sk, pages);
10581062
return -ENOMEM;
10591063
}
1064+
1065+
success:
10601066
sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
10611067

10621068
WRITE_ONCE(sk->sk_reserved_mem,
@@ -3136,8 +3142,11 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
31363142
if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
31373143
return true;
31383144

3139-
sk_enter_memory_pressure(sk);
3145+
if (!sk->sk_bypass_prot_mem)
3146+
sk_enter_memory_pressure(sk);
3147+
31403148
sk_stream_moderate_sndbuf(sk);
3149+
31413150
return false;
31423151
}
31433152
EXPORT_SYMBOL(sk_page_frag_refill);
@@ -3254,10 +3263,12 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
32543263
{
32553264
bool memcg_enabled = false, charged = false;
32563265
struct proto *prot = sk->sk_prot;
3257-
long allocated;
3266+
long allocated = 0;
32583267

3259-
sk_memory_allocated_add(sk, amt);
3260-
allocated = sk_memory_allocated(sk);
3268+
if (!sk->sk_bypass_prot_mem) {
3269+
sk_memory_allocated_add(sk, amt);
3270+
allocated = sk_memory_allocated(sk);
3271+
}
32613272

32623273
if (mem_cgroup_sk_enabled(sk)) {
32633274
memcg_enabled = true;
@@ -3266,6 +3277,9 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
32663277
goto suppress_allocation;
32673278
}
32683279

3280+
if (!allocated)
3281+
return 1;
3282+
32693283
/* Under limit. */
32703284
if (allocated <= sk_prot_mem_limits(sk, 0)) {
32713285
sk_leave_memory_pressure(sk);
@@ -3344,7 +3358,8 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
33443358

33453359
trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
33463360

3347-
sk_memory_allocated_sub(sk, amt);
3361+
if (allocated)
3362+
sk_memory_allocated_sub(sk, amt);
33483363

33493364
if (charged)
33503365
mem_cgroup_sk_uncharge(sk, amt);
@@ -3383,11 +3398,14 @@ EXPORT_SYMBOL(__sk_mem_schedule);
33833398
*/
33843399
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
33853400
{
3386-
sk_memory_allocated_sub(sk, amount);
3387-
33883401
if (mem_cgroup_sk_enabled(sk))
33893402
mem_cgroup_sk_uncharge(sk, amount);
33903403

3404+
if (sk->sk_bypass_prot_mem)
3405+
return;
3406+
3407+
sk_memory_allocated_sub(sk, amount);
3408+
33913409
if (sk_under_global_memory_pressure(sk) &&
33923410
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
33933411
sk_leave_memory_pressure(sk);

net/ipv4/tcp.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -928,7 +928,8 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
928928
}
929929
__kfree_skb(skb);
930930
} else {
931-
sk->sk_prot->enter_memory_pressure(sk);
931+
if (!sk->sk_bypass_prot_mem)
932+
tcp_enter_memory_pressure(sk);
932933
sk_stream_moderate_sndbuf(sk);
933934
}
934935
return NULL;

net/ipv4/tcp_output.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3732,12 +3732,17 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
37323732
delta = size - sk->sk_forward_alloc;
37333733
if (delta <= 0)
37343734
return;
3735+
37353736
amt = sk_mem_pages(delta);
37363737
sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3737-
sk_memory_allocated_add(sk, amt);
37383738

37393739
if (mem_cgroup_sk_enabled(sk))
37403740
mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
3741+
3742+
if (sk->sk_bypass_prot_mem)
3743+
return;
3744+
3745+
sk_memory_allocated_add(sk, amt);
37413746
}
37423747

37433748
/* Send a FIN. The caller locks the socket for us.

net/mptcp/protocol.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,11 +1065,12 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
10651065
mptcp_for_each_subflow(msk, subflow) {
10661066
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
10671067

1068-
if (first)
1068+
if (first && !ssk->sk_bypass_prot_mem) {
10691069
tcp_enter_memory_pressure(ssk);
1070-
sk_stream_moderate_sndbuf(ssk);
1070+
first = false;
1071+
}
10711072

1072-
first = false;
1073+
sk_stream_moderate_sndbuf(ssk);
10731074
}
10741075
__mptcp_sync_sndbuf(sk);
10751076
}

net/tls/tls_device.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -373,7 +373,8 @@ static int tls_do_allocation(struct sock *sk,
373373
if (!offload_ctx->open_record) {
374374
if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
375375
sk->sk_allocation))) {
376-
READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
376+
if (!sk->sk_bypass_prot_mem)
377+
READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
377378
sk_stream_moderate_sndbuf(sk);
378379
return -ENOMEM;
379380
}

0 commit comments

Comments
 (0)