5 changes: 5 additions & 0 deletions MAINTAINERS
@@ -6352,9 +6352,14 @@ F: include/linux/page_counter.h
F: mm/memcontrol.c
F: mm/memcontrol-v1.c
F: mm/memcontrol-v1.h
F: mm/memcontrol_bpf.c
F: mm/memcontrol_bpf.h
F: mm/page_counter.c
F: mm/swap_cgroup.c
F: samples/cgroup/*
F: samples/memcg_printk.bpf.c
F: samples/memcg_printk.c
F: tools/testing/selftests/bpf/*/memcg_ops.c
F: tools/testing/selftests/cgroup/memcg_protection.m
F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c
F: tools/testing/selftests/cgroup/test_kmem.c
38 changes: 38 additions & 0 deletions init/Kconfig
@@ -1063,6 +1063,44 @@ config MEMCG_V1

Say N if unsure.

config MEMCG_BPF
bool "Memory controller eBPF support"
depends on MEMCG
depends on BPF_SYSCALL
default n
help
This option enables eBPF support for the memory controller,
allowing eBPF programs to hook into memory charging
operations and implement custom memory management policies
at runtime.

With this feature enabled, administrators can load eBPF
programs to monitor and adjust memory charging behavior
without recompiling the kernel. This enables:

- Custom memory reclamation strategies for specialized
workloads
- Dynamic memory pressure telemetry and monitoring
- Memory accounting adjustments based on runtime conditions
- Integration with container orchestration systems
- Experimentation with novel memory management algorithms

The eBPF handler is invoked during memory charge attempts
and can inspect memory cgroup context and optionally modify
parameters like reclamation size.

When this feature is disabled or no eBPF program is loaded,
there is zero performance overhead. When enabled with an
active program, overhead is minimal (one indirect function
call per charge attempt). The eBPF verifier ensures memory
safety of loaded programs.

Only one eBPF program can be active at a time. Loading a
new program requires appropriate BPF permissions
(CAP_PERFMON or CAP_SYS_ADMIN).

Say N if unsure.

config BLK_CGROUP
bool "IO controller"
depends on BLOCK
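For illustration, a minimal sketch of a BPF-side program written against the interface this Kconfig text describes. The memcg_ops struct_ops name and the try_charge_memcg callback come from this patch; the vmlinux.h types, section names, and the nr_pages adjustment policy are assumptions made for the sketch, and samples/memcg_printk.bpf.c listed in MAINTAINERS is the authoritative example.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical BPF-side sketch; not part of this patch. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("struct_ops/try_charge_memcg")
int BPF_PROG(charge_hook, struct try_charge_memcg *tcm)
{
	/* Observe the charge attempt; nr_pages is the one field the
	 * verifier hook in this patch lets the program write. */
	bpf_printk("memcg charge attempt: nr_pages=%lu",
		   (unsigned long)tcm->nr_pages);

	/* Purely illustrative policy: never ask reclaim for fewer than 64 pages. */
	if (tcm->nr_pages < 64)
		tcm->nr_pages = 64;

	return 0;
}

SEC(".struct_ops.link")
struct memcg_ops memcg_printk_ops = {
	.try_charge_memcg = (void *)charge_hook,
};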
1 change: 1 addition & 0 deletions mm/Makefile
@@ -102,6 +102,7 @@ obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
obj-$(CONFIG_MEMCG_BPF) += memcontrol_bpf.o
ifdef CONFIG_SWAP
obj-$(CONFIG_MEMCG) += swap_cgroup.o
endif
26 changes: 19 additions & 7 deletions mm/memcontrol.c
@@ -68,6 +68,7 @@
#include <net/ip.h>
#include "slab.h"
#include "memcontrol-v1.h"
#include "memcontrol_bpf.h"

#include <linux/uaccess.h>

@@ -2301,13 +2302,14 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
- unsigned long nr_reclaimed;
+ unsigned long nr_reclaime, nr_reclaimed;
bool passed_oom = false;
unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
bool drained = false;
bool raised_max_event = false;
unsigned long pflags;
bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
+ bool charge_done = false;

retry:
if (consume_stock(memcg, nr_pages))
@@ -2320,20 +2322,30 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (!do_memsw_account() ||
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
if (page_counter_try_charge(&memcg->memory, batch, &counter))
- goto done_restock;
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, batch);
- mem_over_limit = mem_cgroup_from_counter(counter, memory);
+ charge_done = true;
+ else {
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, batch);
+ mem_over_limit = mem_cgroup_from_counter(counter, memory);
+ }
} else {
mem_over_limit = mem_cgroup_from_counter(counter, memsw);
reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
}

- if (batch > nr_pages) {
+ if (!charge_done && batch > nr_pages) {
batch = nr_pages;
goto retry;
}

+ nr_reclaime = bpf_try_charge_memcg(memcg, gfp_mask, nr_pages,
+ mem_over_limit,
+ reclaim_options,
+ charge_done);
+
+ if (charge_done)
+ goto done_restock;
+
/*
* Prevent unbounded recursion when reclaim operations need to
* allocate memory. This might exceed the limits temporarily,
@@ -2353,7 +2365,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
raised_max_event = true;

psi_memstall_enter(&pflags);
- nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
+ nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_reclaime,
gfp_mask, reclaim_options, NULL);
psi_memstall_leave(&pflags);

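mm/memcontrol_bpf.h, which declares the bpf_try_charge_memcg() call used above, is not part of this view. As a hedged sketch only, inferred from the call site (with no program loaded the reclaim size must remain nr_pages so behaviour is unchanged) and from the static key, RCU pointer, and percpu refcount in mm/memcontrol_bpf.c below, the helper plausibly looks like the following; the exact signature, the try_charge_memcg context fields other than nr_pages, and the !CONFIG_MEMCG_BPF stub are assumptions.

/* Hypothetical shape of bpf_try_charge_memcg(); the real declaration lives in
 * mm/memcontrol_bpf.h, which this diff view does not show. */
static inline unsigned long bpf_try_charge_memcg(struct mem_cgroup *memcg,
						 gfp_t gfp_mask,
						 unsigned int nr_pages,
						 struct mem_cgroup *mem_over_limit,
						 unsigned int reclaim_options,
						 bool charge_done)
{
	unsigned long nr_reclaim = nr_pages;	/* default: behaviour unchanged */
	struct memcg_ops *ops;
	struct try_charge_memcg tcm = {
		.nr_pages = nr_pages,
		/* remaining context fields (memcg, gfp_mask, mem_over_limit, ...)
		 * elided -- their layout is an assumption */
	};

	if (!static_branch_unlikely(&memcg_bpf_enable))
		return nr_reclaim;

	rcu_read_lock();
	ops = rcu_dereference(memcg_ops);
	if (ops && percpu_ref_tryget(&ops->refcount)) {
		ops->try_charge_memcg(&tcm);	/* program may rewrite tcm.nr_pages */
		nr_reclaim = tcm.nr_pages;
		percpu_ref_put(&ops->refcount);
	}
	rcu_read_unlock();

	return nr_reclaim;
}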
200 changes: 200 additions & 0 deletions mm/memcontrol_bpf.c
@@ -0,0 +1,200 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Memory Controller eBPF support
*
* Author: Hui Zhu <[email protected]>
* Copyright (C) 2025 KylinSoft Corporation.
*/

#include <linux/cgroup-defs.h>
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/bpf_verifier.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/module.h>
#include "memcontrol_bpf.h"

struct memcg_ops __rcu *memcg_ops;
DEFINE_STATIC_KEY_FALSE(memcg_bpf_enable);

static void memcg_ops_release(struct percpu_ref *ref)
{
struct memcg_ops *ops = container_of(ref,
struct memcg_ops, refcount);

/* Signal destruction completion */
complete(&ops->destroy_done);
}

static int memcg_ops_btf_struct_access(struct bpf_verifier_log *log,
const struct bpf_reg_state *reg,
int off, int size)
{
size_t end;

switch (off) {
case offsetof(struct try_charge_memcg, nr_pages):
end = offsetofend(struct try_charge_memcg, nr_pages);
break;
default:
return -EACCES;
}

if (off + size > end)
return -EACCES;

return 0;
}

static bool memcg_ops_is_valid_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
}

const struct bpf_verifier_ops bpf_memcg_verifier_ops = {
.get_func_proto = bpf_base_func_proto,
.btf_struct_access = memcg_ops_btf_struct_access,
.is_valid_access = memcg_ops_is_valid_access,
};

static int cfi_try_charge_memcg(struct try_charge_memcg *tcm)
{
return -EINVAL;
}

static struct memcg_ops cfi_bpf_memcg_ops = {
.try_charge_memcg = cfi_try_charge_memcg,
};

static int bpf_memcg_ops_init(struct btf *btf)
{
return 0;
}

static int bpf_memcg_ops_check_member(const struct btf_type *t,
const struct btf_member *member,
const struct bpf_prog *prog)
{
u32 moff = __btf_member_bit_offset(t, member) / 8;

switch (moff) {
case offsetof(struct memcg_ops, try_charge_memcg):
case offsetof(struct memcg_ops, refcount):
case offsetof(struct memcg_ops, destroy_done):
break;
default:
if (prog->sleepable)
return -EINVAL;
}

return 0;
}

static int default_try_charge_memcg(struct try_charge_memcg *tcm)
{
return 0;
}

static int bpf_memcg_ops_init_member(const struct btf_type *t,
const struct btf_member *member,
void *kdata, const void *udata)
{
struct memcg_ops *ops = (struct memcg_ops *)kdata;
u32 moff = __btf_member_bit_offset(t, member) / 8;
int ret;

if (moff == offsetof(struct memcg_ops, refcount)) {
ret = percpu_ref_init(&ops->refcount, memcg_ops_release, 0, GFP_KERNEL);
if (ret) {
pr_err("Failed to percpu_ref_init: %d\n", ret);
return ret;
}

init_completion(&ops->destroy_done);

if (!ops->try_charge_memcg)
ops->try_charge_memcg = default_try_charge_memcg;
}

return 0;
}

static int bpf_memcg_ops_reg(void *kdata, struct bpf_link *link)
{
struct memcg_ops *new_ops, *old_ops;

/*
* Check whether an ops instance already exists.
* We only read old_ops without taking a lock here because
* the caller already holds st_map->lock.
*/
rcu_read_lock();
old_ops = rcu_dereference(memcg_ops);
rcu_read_unlock();
if (old_ops)
return -EEXIST;

new_ops = kdata;

/* Atomically set ops pointer (should be NULL at this point) */
old_ops = rcu_replace_pointer(memcg_ops, new_ops, true);
WARN_ON(old_ops);

static_branch_enable(&memcg_bpf_enable);

return 0;
}

/* Unregister the struct ops instance */
static void bpf_memcg_ops_unreg(void *kdata, struct bpf_link *link)
{
struct memcg_ops *ops;

static_branch_disable(&memcg_bpf_enable);

/* No extra locking here, same reasoning as in bpf_memcg_ops_reg(). */
ops = rcu_replace_pointer(memcg_ops, NULL, true);
if (ops) {
synchronize_rcu();

percpu_ref_kill(&ops->refcount);
wait_for_completion(&ops->destroy_done);

percpu_ref_exit(&ops->refcount);
}
}

static struct bpf_struct_ops bpf_memcg_ops = {
.verifier_ops = &bpf_memcg_verifier_ops,
.init = bpf_memcg_ops_init,
.check_member = bpf_memcg_ops_check_member,
.init_member = bpf_memcg_ops_init_member,
.reg = bpf_memcg_ops_reg,
.unreg = bpf_memcg_ops_unreg,
.name = "memcg_ops",
.owner = THIS_MODULE,
.cfi_stubs = &cfi_bpf_memcg_ops,
};

static int __init memcontrol_bpf_init(void)
{
int err;

RCU_INIT_POINTER(memcg_ops, NULL);

err = register_bpf_struct_ops(&bpf_memcg_ops, memcg_ops);
if (err) {
pr_warn("error while registering bpf memcg_ops: %d\n", err);
return err;
}

pr_info("bpf memcg_ops registered successfully\n");
return 0;
}
late_initcall(memcontrol_bpf_init);
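A user-space loader for such a program would follow the standard libbpf struct_ops flow: attaching the struct_ops map reaches bpf_memcg_ops_reg() above (which returns -EEXIST if a program is already active), and destroying the link reaches bpf_memcg_ops_unreg(). The skeleton header memcg_printk.skel.h and the map name below mirror the hypothetical BPF-side sketch after the Kconfig hunk and the sample paths added to MAINTAINERS; both are assumptions about how the sample is built.

/* Hypothetical libbpf loader sketch; skeleton and map names are assumptions. */
#include <stdio.h>
#include <bpf/libbpf.h>
#include "memcg_printk.skel.h"

int main(void)
{
	struct memcg_printk *skel;
	struct bpf_link *link;

	skel = memcg_printk__open_and_load();
	if (!skel) {
		fprintf(stderr, "failed to open/load BPF skeleton\n");
		return 1;
	}

	/* Registers the memcg_ops instance; needs CAP_PERFMON or CAP_SYS_ADMIN. */
	link = bpf_map__attach_struct_ops(skel->maps.memcg_printk_ops);
	if (!link) {
		fprintf(stderr, "failed to attach memcg_ops struct_ops\n");
		memcg_printk__destroy(skel);
		return 1;
	}

	printf("memcg_ops attached; press Enter to detach\n");
	getchar();

	bpf_link__destroy(link);	/* triggers bpf_memcg_ops_unreg() */
	memcg_printk__destroy(skel);
	return 0;
}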