diff --git a/MAINTAINERS b/MAINTAINERS index 48aabeeed0297..3a70521cfdd42 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6352,9 +6352,14 @@ F: include/linux/page_counter.h F: mm/memcontrol.c F: mm/memcontrol-v1.c F: mm/memcontrol-v1.h +F: mm/memcontrol_bpf.c +F: mm/memcontrol_bpf.h F: mm/page_counter.c F: mm/swap_cgroup.c F: samples/cgroup/* +F: samples/memcg_printk.bpf.c +F: samples/memcg_printk.c +F: tools/testing/selftests/bpf/*/memcg_ops.c F: tools/testing/selftests/cgroup/memcg_protection.m F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c F: tools/testing/selftests/cgroup/test_kmem.c diff --git a/init/Kconfig b/init/Kconfig index cab3ad28ca49e..cde8f5cb5ffa0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1063,6 +1063,44 @@ config MEMCG_V1 Say N if unsure. +config MEMCG_BPF + bool "Memory controller eBPF support" + depends on MEMCG + depends on BPF_SYSCALL + default n + help + This option enables eBPF support for the memory controller, + allowing eBPF programs to hook into memory charging + operations and implement custom memory management policies + at runtime. + + With this feature enabled, administrators can load eBPF + programs to monitor and adjust memory charging behavior + without recompiling the kernel. This enables: + + - Custom memory reclamation strategies for specialized + workloads + - Dynamic memory pressure telemetry and monitoring + - Memory accounting adjustments based on runtime conditions + - Integration with container orchestration systems + - Experimentation with novel memory management algorithms + + The eBPF handler is invoked during memory charge attempts + and can inspect memory cgroup context and optionally modify + parameters like reclamation size. + + When this feature is disabled or no eBPF program is loaded, + there is zero performance overhead. When enabled with an + active program, overhead is minimal (one indirect function + call per charge attempt). The eBPF verifier ensures memory + safety of loaded programs. 
+ + Only one eBPF program can be active at a time. Loading a + new program requires appropriate BPF permissions + (CAP_PERFMON or CAP_SYS_ADMIN). + + Say N if unsure. + config BLK_CGROUP bool "IO controller" depends on BLOCK diff --git a/mm/Makefile b/mm/Makefile index 21abb33535501..5ac2fa7a8a74b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -102,6 +102,7 @@ obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o +obj-$(CONFIG_MEMCG_BPF) += memcontrol_bpf.o ifdef CONFIG_SWAP obj-$(CONFIG_MEMCG) += swap_cgroup.o endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4deda33625f41..104c9e9309f20 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -68,6 +68,7 @@ #include #include "slab.h" #include "memcontrol-v1.h" +#include "memcontrol_bpf.h" #include @@ -2301,13 +2302,14 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, int nr_retries = MAX_RECLAIM_RETRIES; - struct mem_cgroup *mem_over_limit; + struct mem_cgroup *mem_over_limit = NULL; struct page_counter *counter; - unsigned long nr_reclaimed; + unsigned long nr_reclaime, nr_reclaimed; bool passed_oom = false; unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP; bool drained = false; bool raised_max_event = false; unsigned long pflags; bool allow_spinning = gfpflags_allow_spinning(gfp_mask); + bool charge_done = false; retry: if (consume_stock(memcg, nr_pages)) @@ -2320,20 +2322,30 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, if (!do_memsw_account() || page_counter_try_charge(&memcg->memsw, batch, &counter)) { if (page_counter_try_charge(&memcg->memory, batch, &counter)) - goto done_restock; - if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, batch); - mem_over_limit = mem_cgroup_from_counter(counter, memory); + charge_done = true; + else { + if (do_memsw_account()) + page_counter_uncharge(&memcg->memsw, batch); + mem_over_limit = 
mem_cgroup_from_counter(counter, memory); + } } else { mem_over_limit = mem_cgroup_from_counter(counter, memsw); reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP; } - if (batch > nr_pages) { + if (!charge_done && batch > nr_pages) { batch = nr_pages; goto retry; } + nr_reclaime = bpf_try_charge_memcg(memcg, gfp_mask, nr_pages, + mem_over_limit, + reclaim_options, + charge_done); + + if (charge_done) + goto done_restock; + /* * Prevent unbounded recursion when reclaim operations need to * allocate memory. This might exceed the limits temporarily, @@ -2353,7 +2365,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, raised_max_event = true; psi_memstall_enter(&pflags); - nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, + nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_reclaime, gfp_mask, reclaim_options, NULL); psi_memstall_leave(&pflags); diff --git a/mm/memcontrol_bpf.c b/mm/memcontrol_bpf.c new file mode 100644 index 0000000000000..0bdb2a147a509 --- /dev/null +++ b/mm/memcontrol_bpf.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory Controller eBPF support + * + * Author: Hui Zhu + * Copyright (C) 2025 KylinSoft Corporation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "memcontrol_bpf.h" + +struct memcg_ops __rcu *memcg_ops; +DEFINE_STATIC_KEY_FALSE(memcg_bpf_enable); + +static void memcg_ops_release(struct percpu_ref *ref) +{ + struct memcg_ops *ops = container_of(ref, + struct memcg_ops, refcount); + + /* Signal destruction completion */ + complete(&ops->destroy_done); +} + +static int memcg_ops_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + size_t end; + + switch (off) { + case offsetof(struct try_charge_memcg, nr_pages): + end = offsetofend(struct try_charge_memcg, nr_pages); + break; + default: + return -EACCES; + } + + if (off + size > end) + return -EACCES; + + return 0; +} + +static bool memcg_ops_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +const struct bpf_verifier_ops bpf_memcg_verifier_ops = { + .get_func_proto = bpf_base_func_proto, + .btf_struct_access = memcg_ops_btf_struct_access, + .is_valid_access = memcg_ops_is_valid_access, +}; + +static int cfi_try_charge_memcg(struct try_charge_memcg *tcm) +{ + return -EINVAL; +} + +static struct memcg_ops cfi_bpf_memcg_ops = { + .try_charge_memcg = cfi_try_charge_memcg, +}; + +static int bpf_memcg_ops_init(struct btf *btf) +{ + return 0; +} + +static int bpf_memcg_ops_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff = __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct memcg_ops, try_charge_memcg): + case offsetof(struct memcg_ops, refcount): + case offsetof(struct memcg_ops, destroy_done): + break; + default: + if (prog->sleepable) + return -EINVAL; + } + + return 0; +} + +static int default_try_charge_memcg(struct try_charge_memcg *tcm) +{ + 
return 0; +} + +static int bpf_memcg_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + struct memcg_ops *ops = (struct memcg_ops *)kdata; + u32 moff = __btf_member_bit_offset(t, member) / 8; + int ret; + + if (moff == offsetof(struct memcg_ops, refcount)) { + ret = percpu_ref_init(&ops->refcount, memcg_ops_release, 0, GFP_KERNEL); + if (ret) { + pr_err("Failed to percpu_ref_init: %d\n", ret); + return ret; + } + + init_completion(&ops->destroy_done); + + if (!ops->try_charge_memcg) + ops->try_charge_memcg = default_try_charge_memcg; + } + + return 0; +} + +static int bpf_memcg_ops_reg(void *kdata, struct bpf_link *link) +{ + struct memcg_ops *new_ops, *old_ops; + + /* + * Check if ops already exists. + * just get old_ops but not keep lock because + * caller has locked st_map->lock. + */ + rcu_read_lock(); + old_ops = rcu_dereference(memcg_ops); + rcu_read_unlock(); + if (old_ops) + return -EEXIST; + + new_ops = kdata; + + /* Atomically set ops pointer (should be NULL at this point) */ + old_ops = rcu_replace_pointer(memcg_ops, new_ops, true); + WARN_ON(old_ops); + + static_branch_enable(&memcg_bpf_enable); + + return 0; +} + +/* Unregister the struct ops instance */ +static void bpf_memcg_ops_unreg(void *kdata, struct bpf_link *link) +{ + struct memcg_ops *ops; + + static_branch_disable(&memcg_bpf_enable); + + /* Not lock same with bpf_memcg_ops_reg. 
*/ + ops = rcu_replace_pointer(memcg_ops, NULL, true); + if (ops) { + synchronize_rcu(); + + percpu_ref_kill(&ops->refcount); + wait_for_completion(&ops->destroy_done); + + percpu_ref_exit(&ops->refcount); + } +} + +static struct bpf_struct_ops bpf_memcg_ops = { + .verifier_ops = &bpf_memcg_verifier_ops, + .init = bpf_memcg_ops_init, + .check_member = bpf_memcg_ops_check_member, + .init_member = bpf_memcg_ops_init_member, + .reg = bpf_memcg_ops_reg, + .unreg = bpf_memcg_ops_unreg, + .name = "memcg_ops", + .owner = THIS_MODULE, + .cfi_stubs = &cfi_bpf_memcg_ops, +}; + +static int __init memcontrol_bpf_init(void) +{ + int err; + + RCU_INIT_POINTER(memcg_ops, NULL); + + err = register_bpf_struct_ops(&bpf_memcg_ops, memcg_ops); + if (err) { + pr_warn("error while registering bpf memcg_ops: %d\n", err); + return err; + } + + pr_info("bpf memcg_ops registered successfully\n"); + return 0; +} +late_initcall(memcontrol_bpf_init); diff --git a/mm/memcontrol_bpf.h b/mm/memcontrol_bpf.h new file mode 100644 index 0000000000000..ee2815fc3d057 --- /dev/null +++ b/mm/memcontrol_bpf.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* memcontrol_bpf.h - Memory Controller eBPF support + * + * Author: Hui Zhu + * Copyright (C) 2025 KylinSoft Corporation. 
+ */ + +#ifndef _LINUX_MEMCONTROL_BPF_H +#define _LINUX_MEMCONTROL_BPF_H + +#ifdef CONFIG_MEMCG_BPF + +struct try_charge_memcg { + struct mem_cgroup *memcg; + gfp_t gfp_mask; + unsigned long nr_pages; + struct mem_cgroup *mem_over_limit; + unsigned int reclaim_options; + bool charge_done; +}; + +struct memcg_ops { + int (*try_charge_memcg)(struct try_charge_memcg *tcm); + struct percpu_ref refcount; + struct completion destroy_done; +}; + +extern struct memcg_ops __rcu *memcg_ops; +DECLARE_STATIC_KEY_FALSE(memcg_bpf_enable); + +static inline struct memcg_ops *memcg_ops_get(void) +{ + struct memcg_ops *ops; + + rcu_read_lock(); + ops = rcu_dereference(memcg_ops); + if (likely(ops)) { + if (unlikely(!percpu_ref_tryget_live(&ops->refcount))) + ops = NULL; + } + rcu_read_unlock(); + + return ops; +} + +static inline void memcg_ops_put(struct memcg_ops *ops) +{ + percpu_ref_put(&ops->refcount); +} + +static inline unsigned long +bpf_try_charge_memcg(struct mem_cgroup *memcg, + gfp_t gfp_mask, + unsigned int nr_pages, + struct mem_cgroup *mem_over_limit, + unsigned int reclaim_options, + bool charge_done) +{ + struct memcg_ops *ops; + struct try_charge_memcg tcm; + int update_nr_pages; + + if (likely(!static_branch_unlikely(&memcg_bpf_enable))) + goto out; + + ops = memcg_ops_get(); + if (unlikely(!ops)) + goto out; + + tcm.memcg = memcg; + tcm.gfp_mask = gfp_mask; + tcm.nr_pages = nr_pages; + tcm.mem_over_limit = mem_over_limit; + tcm.reclaim_options = reclaim_options; + tcm.charge_done = charge_done; + + update_nr_pages = ops->try_charge_memcg(&tcm); + + memcg_ops_put(ops); + + if (update_nr_pages && !charge_done && tcm.nr_pages && + tcm.nr_pages <= HPAGE_PMD_NR) + nr_pages = tcm.nr_pages; + +out: + return nr_pages; +} + +#else + +#define bpf_try_charge_memcg(memcg, gfp_mask, nr_pages, \ + mem_over_limit, reclaim_options, \ + charge_done) \ + ((void)(memcg), \ + (void)(gfp_mask), \ + (void)(mem_over_limit), \ + (void)(reclaim_options), \ + (void)(charge_done), \ + (nr_pages)) + 
+#endif + +#endif diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 95a4fa1f1e447..d50e958fc8d50 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -37,6 +37,7 @@ tprogs-y += xdp_fwd tprogs-y += task_fd_query tprogs-y += ibumad tprogs-y += hbm +tprogs-y += memcg_printk # Libbpf dependencies LIBBPF_SRC = $(TOOLS_PATH)/lib/bpf @@ -122,6 +123,7 @@ always-y += task_fd_query_kern.o always-y += ibumad_kern.o always-y += hbm_out_kern.o always-y += hbm_edt_kern.o +always-y += memcg_printk.bpf.o COMMON_CFLAGS = $(TPROGS_USER_CFLAGS) TPROGS_LDFLAGS = $(TPROGS_USER_LDFLAGS) diff --git a/samples/bpf/memcg_printk.bpf.c b/samples/bpf/memcg_printk.bpf.c new file mode 100644 index 0000000000000..66c87bf4cbcbe --- /dev/null +++ b/samples/bpf/memcg_printk.bpf.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" + +#include +#include + +SEC("struct_ops/try_charge_memcg") +int BPF_PROG(handle_try_charge_memcg, struct try_charge_memcg *tcm) +{ + bpf_printk( + "memcg %s gfp_mask 0x%x nr_pages %lu reclaim_options 0x%lx\n", + tcm->memcg->css.cgroup->kn->name, + tcm->gfp_mask, + tcm->nr_pages, + tcm->reclaim_options); + if (!tcm->charge_done) + bpf_printk("memcg %s mem_over_limit %s\n", + tcm->memcg->css.cgroup->kn->name, + tcm->mem_over_limit->css.cgroup->kn->name); + + return 0; +} + +SEC(".struct_ops") +struct memcg_ops mcg_ops = { + .try_charge_memcg = (void *)handle_try_charge_memcg, +}; + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/memcg_printk.c b/samples/bpf/memcg_printk.c new file mode 100644 index 0000000000000..a2c5be2415eac --- /dev/null +++ b/samples/bpf/memcg_printk.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0-only +#define _GNU_SOURCE + +#include +#include +#include + +static bool exiting; + +static void sig_handler(int sig) +{ + exiting = true; +} + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + return vfprintf(stderr, format, args); +} + 
+int main(int argc, char **argv) +{ + struct bpf_object *obj = NULL; + struct bpf_link *link = NULL; + struct bpf_map *map; + char filename[256]; + int err; + + exiting = false; + + signal(SIGINT, sig_handler); + signal(SIGTERM, sig_handler); + + libbpf_set_print(libbpf_print_fn); + + snprintf(filename, sizeof(filename), "%s.bpf.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + err = libbpf_get_error(obj); + if (err) { + fprintf(stderr, "Failed to open BPF object file: %d\n", + err); + obj = NULL; + goto cleanup; + } + + err = bpf_object__load(obj); + if (err) { + fprintf(stderr, "Failed to load BPF object file: %d\n", + err); + goto cleanup; + } + + map = bpf_object__find_map_by_name(obj, "mcg_ops"); + if (!map) { + fprintf(stderr, "Failed to find struct_ops map 'mcg_ops'\n"); + err = -ENOENT; + goto cleanup; + } + + link = bpf_map__attach_struct_ops(map); + err = libbpf_get_error(link); + if (err) { + fprintf(stderr, "Failed to attach struct ops: %d\n", + err); + link = NULL; + goto cleanup; + } + + printf("Press Ctrl+C to exit...\n"); + + while (!exiting) + sleep(1); + + printf("Bye!\n"); + +cleanup: + if (link) + bpf_link__destroy(link); + if (obj) + bpf_object__close(obj); + + return err; +} diff --git a/tools/testing/selftests/bpf/prog_tests/memcg_ops.c b/tools/testing/selftests/bpf/prog_tests/memcg_ops.c new file mode 100644 index 0000000000000..3f989bcfb8c44 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/memcg_ops.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory controller eBPF struct ops test + */ + +#include +#include + +void test_memcg_ops_load(void) +{ + struct bpf_object *obj; + int err; + + obj = bpf_object__open_file("memcg_ops.bpf.o", NULL); + err = libbpf_get_error(obj); + if (CHECK_FAIL(err)) { + obj = NULL; + goto out; + } + + err = bpf_object__load(obj); + if (CHECK_FAIL(err)) + goto out; + +out: + if (obj) + bpf_object__close(obj); +} + +void test_memcg_ops_attach(void) +{ + struct bpf_object 
*obj; + struct bpf_map *map; + struct bpf_link *link = NULL; + int err; + + obj = bpf_object__open_file("memcg_ops.bpf.o", NULL); + err = libbpf_get_error(obj); + if (CHECK_FAIL(err)) { + obj = NULL; + goto out; + } + + err = bpf_object__load(obj); + if (CHECK_FAIL(err)) + goto out; + + map = bpf_object__find_map_by_name(obj, "mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name")) + goto out; + + link = bpf_map__attach_struct_ops(map); + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) + goto out; + +out: + if (link) + bpf_link__destroy(link); + if (obj) + bpf_object__close(obj); +} + +void test_memcg_ops_double_attach(void) +{ + struct bpf_object *obj, *obj2 = NULL; + struct bpf_map *map, *map2; + struct bpf_link *link = NULL, *link2 = NULL; + int err; + + obj = bpf_object__open_file("memcg_ops.bpf.o", NULL); + err = libbpf_get_error(obj); + if (CHECK_FAIL(err)) { + obj = NULL; + goto out; + } + + err = bpf_object__load(obj); + if (CHECK_FAIL(err)) + goto out; + + map = bpf_object__find_map_by_name(obj, "mcg_ops"); + if (!ASSERT_OK_PTR(map, "bpf_object__find_map_by_name")) + goto out; + + link = bpf_map__attach_struct_ops(map); + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) + goto out; + + obj2 = bpf_object__open_file("memcg_ops.bpf.o", NULL); + err = libbpf_get_error(obj2); + if (CHECK_FAIL(err)) { + obj2 = NULL; + goto out; + } + + err = bpf_object__load(obj2); + if (CHECK_FAIL(err)) + goto out; + + map2 = bpf_object__find_map_by_name(obj2, "mcg_ops"); + if (!ASSERT_OK_PTR(map2, "bpf_object__find_map_by_name")) + goto out; + + link2 = bpf_map__attach_struct_ops(map2); + if (!ASSERT_ERR_PTR(link2, "bpf_map__attach_struct_ops")) { + bpf_link__destroy(link2); + goto out; + } + +out: + if (link) + bpf_link__destroy(link); + if (obj) + bpf_object__close(obj); + if (obj2) + bpf_object__close(obj2); +} diff --git a/tools/testing/selftests/bpf/progs/memcg_ops.c b/tools/testing/selftests/bpf/progs/memcg_ops.c new file mode 100644 index 
0000000000000..a21fbe859fd3b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/memcg_ops.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" + +#include +#include + +char _license[] SEC("license") = "GPL"; + +SEC("struct_ops/try_charge_memcg") +int BPF_PROG(test_try_charge_memcg, + struct try_charge_memcg *tcm) +{ + return 0; +} + +SEC(".struct_ops") +struct memcg_ops mcg_ops = { + .try_charge_memcg = (void *)test_try_charge_memcg, +};