
Commit 0ced0b6

teawater authored and Kernel Patches Daemon committed
memcg: add eBPF struct ops support for memory charging
Add eBPF struct ops support to the memory controller, enabling dynamic memory management policies via eBPF programs. This allows users to implement custom memory charging and reclamation strategies without kernel recompilation.

The implementation introduces:

- A new BPF struct ops type `memcg_ops` with a `try_charge_memcg` hook for intercepting memory charge operations
- Integration into the `try_charge_memcg()` function to call registered eBPF handlers
- Safe registration/unregistration via the BPF struct ops infrastructure
- Reference counting using percpu_ref to track handler lifecycle
- Static branch keys to minimize overhead when disabled
- A new Kconfig option, CONFIG_MEMCG_BPF, to control the feature

The eBPF handler receives a `try_charge_memcg` struct containing:

- The memory cgroup and the affected memory cgroup
- GFP flags and page count
- Reclamation options
- Current charge status

Handlers can inspect this context and modify certain fields (e.g., nr_pages) to adjust reclamation behavior. The design enforces a single active handler to avoid conflicts.

Use cases include:

- Custom memory policies for specialized workloads
- Memory pressure telemetry and monitoring
- Integration with container management systems
- Runtime memory management experimentation

Design decisions:

- Uses RCU synchronization for safe handler replacement
- Zero overhead when the feature is disabled (via static keys)
- A single-handler model prevents complexity and race conditions
- eBPF verifier restrictions ensure memory safety
- Minimal context exposure to reduce attack surface

Signed-off-by: Geliang Tang <[email protected]>
Signed-off-by: Hui Zhu <[email protected]>
1 parent 73c6b0b commit 0ced0b6
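
The commit itself carries no sample program, so the following is a minimal sketch of what a memcg_ops handler could look like. It assumes a vmlinux.h generated from a kernel carrying this patch; the program and map names are made up, and every context field other than nr_pages (the only member the verifier lets programs write, per memcg_ops_btf_struct_access() in mm/memcontrol_bpf.c below) is left alone.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical memcg_ops handler; not part of this commit. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

/* Invoked on each charge attempt. Writing tcm->nr_pages adjusts the
 * reclaim target; the return convention (0 on success) is assumed from
 * default_try_charge_memcg() in mm/memcontrol_bpf.c. */
SEC("struct_ops/try_charge_memcg")
int BPF_PROG(sample_try_charge, struct try_charge_memcg *tcm)
{
	/* Illustrative policy: double the reclaim target to build
	 * headroom whenever a charge attempt needs reclaim. */
	tcm->nr_pages *= 2;
	return 0;
}

SEC(".struct_ops.link")
struct memcg_ops sample_ops = {
	.try_charge_memcg = (void *)sample_try_charge,
};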

File tree

6 files changed, +363 −7 lines changed

MAINTAINERS

Lines changed: 2 additions & 0 deletions

@@ -6352,6 +6352,8 @@ F: include/linux/page_counter.h
 F:	mm/memcontrol.c
 F:	mm/memcontrol-v1.c
 F:	mm/memcontrol-v1.h
+F:	mm/memcontrol_bpf.c
+F:	mm/memcontrol_bpf.h
 F:	mm/page_counter.c
 F:	mm/swap_cgroup.c
 F:	samples/cgroup/*

init/Kconfig

Lines changed: 38 additions & 0 deletions

@@ -1063,6 +1063,44 @@ config MEMCG_V1
 
 	  Say N if unsure.
 
+config MEMCG_BPF
+	bool "Memory controller eBPF support"
+	depends on MEMCG
+	depends on BPF_SYSCALL
+	default n
+	help
+	  This option enables eBPF support for the memory controller,
+	  allowing eBPF programs to hook into memory charging
+	  operations and implement custom memory management policies
+	  at runtime.
+
+	  With this feature enabled, administrators can load eBPF
+	  programs to monitor and adjust memory charging behavior
+	  without recompiling the kernel. This enables:
+
+	  - Custom memory reclamation strategies for specialized
+	    workloads
+	  - Dynamic memory pressure telemetry and monitoring
+	  - Memory accounting adjustments based on runtime conditions
+	  - Integration with container orchestration systems
+	  - Experimentation with novel memory management algorithms
+
+	  The eBPF handler is invoked during memory charge attempts
+	  and can inspect memory cgroup context and optionally modify
+	  parameters like reclamation size.
+
+	  When this feature is disabled or no eBPF program is loaded,
+	  there is zero performance overhead. When enabled with an
+	  active program, overhead is minimal (one indirect function
+	  call per charge attempt). The eBPF verifier ensures memory
+	  safety of loaded programs.
+
+	  Only one eBPF program can be active at a time. Loading a
+	  new program requires appropriate BPF permissions
+	  (CAP_PERFMON or CAP_SYS_ADMIN).
+
+	  Say N if unsure.
+
 config BLK_CGROUP
 	bool "IO controller"
 	depends on BLOCK
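
For illustration, a userspace loader for the handler sketched earlier might look like this with libbpf. The object and map names (memcg_ops.bpf.o, sample_ops) are assumptions carried over from that sketch:

// Hypothetical loader; assumes the BPF sketch above was compiled to
// memcg_ops.bpf.o. Needs CAP_PERFMON or CAP_SYS_ADMIN, per the
// Kconfig help text.
#include <stdio.h>
#include <unistd.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_object *obj;
	struct bpf_map *map;
	struct bpf_link *link;

	obj = bpf_object__open_file("memcg_ops.bpf.o", NULL);
	if (!obj || bpf_object__load(obj)) {
		fprintf(stderr, "failed to open/load BPF object\n");
		return 1;
	}

	/* The struct_ops map is named after the global variable in
	 * the BPF program (sample_ops in the sketch above). */
	map = bpf_object__find_map_by_name(obj, "sample_ops");
	if (!map) {
		fprintf(stderr, "struct_ops map not found\n");
		return 1;
	}

	link = bpf_map__attach_struct_ops(map);
	if (!link) {
		fprintf(stderr, "failed to attach memcg_ops\n");
		return 1;
	}

	pause();	/* handler stays registered until we exit */

	bpf_link__destroy(link);
	bpf_object__close(obj);
	return 0;
}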

mm/Makefile

Lines changed: 1 addition & 0 deletions

@@ -102,6 +102,7 @@ obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
 obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
 obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o
 obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
+obj-$(CONFIG_MEMCG_BPF) += memcontrol_bpf.o
 ifdef CONFIG_SWAP
 obj-$(CONFIG_MEMCG) += swap_cgroup.o
 endif

mm/memcontrol.c

Lines changed: 19 additions & 7 deletions

@@ -68,6 +68,7 @@
 #include <net/ip.h>
 #include "slab.h"
 #include "memcontrol-v1.h"
+#include "memcontrol_bpf.h"
 
 #include <linux/uaccess.h>
 
@@ -2301,13 +2302,14 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	int nr_retries = MAX_RECLAIM_RETRIES;
 	struct mem_cgroup *mem_over_limit;
 	struct page_counter *counter;
-	unsigned long nr_reclaimed;
+	unsigned long nr_reclaime, nr_reclaimed;
 	bool passed_oom = false;
 	unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
 	bool drained = false;
 	bool raised_max_event = false;
 	unsigned long pflags;
 	bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
+	bool charge_done = false;
 
 retry:
 	if (consume_stock(memcg, nr_pages))
@@ -2320,20 +2322,30 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	if (!do_memsw_account() ||
 	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
 		if (page_counter_try_charge(&memcg->memory, batch, &counter))
-			goto done_restock;
-		if (do_memsw_account())
-			page_counter_uncharge(&memcg->memsw, batch);
-		mem_over_limit = mem_cgroup_from_counter(counter, memory);
+			charge_done = true;
+		else {
+			if (do_memsw_account())
+				page_counter_uncharge(&memcg->memsw, batch);
+			mem_over_limit = mem_cgroup_from_counter(counter, memory);
+		}
 	} else {
 		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
 		reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
 	}
 
-	if (batch > nr_pages) {
+	if (!charge_done && batch > nr_pages) {
 		batch = nr_pages;
 		goto retry;
 	}
 
+	nr_reclaime = bpf_try_charge_memcg(memcg, gfp_mask, nr_pages,
+					   mem_over_limit,
+					   reclaim_options,
+					   charge_done);
+
+	if (charge_done)
+		goto done_restock;
+
 	/*
 	 * Prevent unbounded recursion when reclaim operations need to
 	 * allocate memory. This might exceed the limits temporarily,
@@ -2353,7 +2365,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	raised_max_event = true;
 
 	psi_memstall_enter(&pflags);
-	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
+	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_reclaime,
 						    gfp_mask, reclaim_options, NULL);
 	psi_memstall_leave(&pflags);

mm/memcontrol_bpf.c

Lines changed: 200 additions & 0 deletions

@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory Controller eBPF support
+ *
+ * Author: Hui Zhu <[email protected]>
+ * Copyright (C) 2025 KylinSoft Corporation.
+ */
+
+#include <linux/cgroup-defs.h>
+#include <linux/page_counter.h>
+#include <linux/memcontrol.h>
+#include <linux/cgroup.h>
+#include <linux/rcupdate.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/module.h>
+#include "memcontrol_bpf.h"
+
+struct memcg_ops __rcu *memcg_ops;
+DEFINE_STATIC_KEY_FALSE(memcg_bpf_enable);
+
+static void memcg_ops_release(struct percpu_ref *ref)
+{
+	struct memcg_ops *ops = container_of(ref,
+			struct memcg_ops, refcount);
+
+	/* Signal destruction completion */
+	complete(&ops->destroy_done);
+}
+
+static int memcg_ops_btf_struct_access(struct bpf_verifier_log *log,
+					const struct bpf_reg_state *reg,
+					int off, int size)
+{
+	size_t end;
+
+	switch (off) {
+	case offsetof(struct try_charge_memcg, nr_pages):
+		end = offsetofend(struct try_charge_memcg, nr_pages);
+		break;
+	default:
+		return -EACCES;
+	}
+
+	if (off + size > end)
+		return -EACCES;
+
+	return 0;
+}
+
+static bool memcg_ops_is_valid_access(int off, int size, enum bpf_access_type type,
+				      const struct bpf_prog *prog,
+				      struct bpf_insn_access_aux *info)
+{
+	return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+const struct bpf_verifier_ops bpf_memcg_verifier_ops = {
+	.get_func_proto = bpf_base_func_proto,
+	.btf_struct_access = memcg_ops_btf_struct_access,
+	.is_valid_access = memcg_ops_is_valid_access,
+};
+
+static int cfi_try_charge_memcg(struct try_charge_memcg *tcm)
+{
+	return -EINVAL;
+}
+
+static struct memcg_ops cfi_bpf_memcg_ops = {
+	.try_charge_memcg = cfi_try_charge_memcg,
+};
+
+static int bpf_memcg_ops_init(struct btf *btf)
+{
+	return 0;
+}
+
+static int bpf_memcg_ops_check_member(const struct btf_type *t,
+				      const struct btf_member *member,
+				      const struct bpf_prog *prog)
+{
+	u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+	switch (moff) {
+	case offsetof(struct memcg_ops, try_charge_memcg):
+	case offsetof(struct memcg_ops, refcount):
+	case offsetof(struct memcg_ops, destroy_done):
+		break;
+	default:
+		if (prog->sleepable)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int default_try_charge_memcg(struct try_charge_memcg *tcm)
+{
+	return 0;
+}
+
+static int bpf_memcg_ops_init_member(const struct btf_type *t,
+				     const struct btf_member *member,
+				     void *kdata, const void *udata)
+{
+	struct memcg_ops *ops = (struct memcg_ops *)kdata;
+	u32 moff = __btf_member_bit_offset(t, member) / 8;
+	int ret;
+
+	if (moff == offsetof(struct memcg_ops, refcount)) {
+		ret = percpu_ref_init(&ops->refcount, memcg_ops_release, 0, GFP_KERNEL);
+		if (ret) {
+			pr_err("Failed to percpu_ref_init: %d\n", ret);
+			return ret;
+		}
+
+		init_completion(&ops->destroy_done);
+
+		if (!ops->try_charge_memcg)
+			ops->try_charge_memcg = default_try_charge_memcg;
+	}
+
+	return 0;
+}
+
+static int bpf_memcg_ops_reg(void *kdata, struct bpf_link *link)
+{
+	struct memcg_ops *new_ops, *old_ops;
+
+	/*
+	 * Check whether an ops instance already exists. Reading
+	 * old_ops without holding a lock is fine here because the
+	 * caller holds st_map->lock.
+	 */
+	rcu_read_lock();
+	old_ops = rcu_dereference(memcg_ops);
+	rcu_read_unlock();
+	if (old_ops)
+		return -EEXIST;
+
+	new_ops = kdata;
+
+	/* Atomically set ops pointer (should be NULL at this point) */
+	old_ops = rcu_replace_pointer(memcg_ops, new_ops, true);
+	WARN_ON(old_ops);
+
+	static_branch_enable(&memcg_bpf_enable);
+
+	return 0;
+}
+
+/* Unregister the struct ops instance */
+static void bpf_memcg_ops_unreg(void *kdata, struct bpf_link *link)
+{
+	struct memcg_ops *ops;
+
+	static_branch_disable(&memcg_bpf_enable);
+
+	/* No locking needed here, for the same reason as in bpf_memcg_ops_reg(). */
+	ops = rcu_replace_pointer(memcg_ops, NULL, true);
+	if (ops) {
+		synchronize_rcu();
+
+		percpu_ref_kill(&ops->refcount);
+		wait_for_completion(&ops->destroy_done);
+
+		percpu_ref_exit(&ops->refcount);
+	}
+}
+
+static struct bpf_struct_ops bpf_memcg_ops = {
+	.verifier_ops = &bpf_memcg_verifier_ops,
+	.init = bpf_memcg_ops_init,
+	.check_member = bpf_memcg_ops_check_member,
+	.init_member = bpf_memcg_ops_init_member,
+	.reg = bpf_memcg_ops_reg,
+	.unreg = bpf_memcg_ops_unreg,
+	.name = "memcg_ops",
+	.owner = THIS_MODULE,
+	.cfi_stubs = &cfi_bpf_memcg_ops,
+};
+
+static int __init memcontrol_bpf_init(void)
+{
+	int err;
+
+	RCU_INIT_POINTER(memcg_ops, NULL);
+
+	err = register_bpf_struct_ops(&bpf_memcg_ops, memcg_ops);
+	if (err) {
+		pr_warn("error while registering bpf memcg_ops: %d\n", err);
+		return err;
+	}
+
+	pr_info("bpf memcg_ops registered successfully\n");
+	return 0;
+}
+late_initcall(memcontrol_bpf_init);
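
Note the teardown ordering in bpf_memcg_ops_unreg(): the static branch is disabled first so new charge attempts skip the hook, the ops pointer is swapped to NULL and synchronize_rcu() waits out readers that may have already dereferenced it, and only then is the percpu_ref killed, drained via wait_for_completion(), and released with percpu_ref_exit(). A registered handler can also be managed from userspace with bpftool's struct_ops subcommands; assuming the object and map names from the earlier sketches:

bpftool struct_ops register memcg_ops.bpf.o
bpftool struct_ops list
bpftool struct_ops unregister name sample_ops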
