5 changes: 5 additions & 0 deletions MAINTAINERS
@@ -6352,9 +6352,14 @@ F: include/linux/page_counter.h
F: mm/memcontrol.c
F: mm/memcontrol-v1.c
F: mm/memcontrol-v1.h
F: mm/memcontrol_bpf.c
F: mm/memcontrol_bpf.h
F: mm/page_counter.c
F: mm/swap_cgroup.c
F: samples/cgroup/*
F: samples/memcg_printk.bpf.c
F: samples/memcg_printk.c
F: tools/testing/selftests/bpf/*/memcg_ops.c
F: tools/testing/selftests/cgroup/memcg_protection.m
F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c
F: tools/testing/selftests/cgroup/test_kmem.c
38 changes: 38 additions & 0 deletions init/Kconfig
@@ -1063,6 +1063,44 @@ config MEMCG_V1

Say N if unsure.

config MEMCG_BPF
bool "Memory controller eBPF support"
depends on MEMCG
depends on BPF_SYSCALL
default n
help
This option enables eBPF support for the memory controller,
allowing eBPF programs to hook into memory charging
operations and implement custom memory management policies
at runtime.

With this feature enabled, administrators can load eBPF
programs to monitor and adjust memory charging behavior
without recompiling the kernel. This enables:

- Custom memory reclamation strategies for specialized
workloads
- Dynamic memory pressure telemetry and monitoring
- Memory accounting adjustments based on runtime conditions
- Integration with container orchestration systems
- Experimentation with novel memory management algorithms

The eBPF handler is invoked during memory charge attempts
and can inspect memory cgroup context and optionally modify
parameters like reclamation size.

When this feature is disabled or no eBPF program is loaded,
there is zero performance overhead. When enabled with an
active program, overhead is minimal (one indirect function
call per charge attempt). The eBPF verifier ensures memory
safety of loaded programs.

Only one eBPF program can be active at a time. Loading a
new program requires appropriate BPF permissions
(CAP_PERFMON or CAP_SYS_ADMIN).

Say N if unsure.

config BLK_CGROUP
bool "IO controller"
depends on BLOCK
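For illustration, a minimal sketch of a BPF-side program written against the interface this Kconfig text describes. The memcg_ops struct_ops name and the try_charge_memcg callback come from this patch; the vmlinux.h types, section names, and the nr_pages adjustment policy are assumptions made for the sketch, and samples/memcg_printk.bpf.c listed in MAINTAINERS is the authoritative example.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical BPF-side sketch; not part of this patch. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("struct_ops/try_charge_memcg")
int BPF_PROG(charge_hook, struct try_charge_memcg *tcm)
{
	/* Observe the charge attempt; nr_pages is the one field the
	 * verifier hook in this patch lets the program write. */
	bpf_printk("memcg charge attempt: nr_pages=%lu",
		   (unsigned long)tcm->nr_pages);

	/* Purely illustrative policy: never ask reclaim for fewer than 64 pages. */
	if (tcm->nr_pages < 64)
		tcm->nr_pages = 64;

	return 0;
}

SEC(".struct_ops.link")
struct memcg_ops memcg_printk_ops = {
	.try_charge_memcg = (void *)charge_hook,
};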
1 change: 1 addition & 0 deletions mm/Makefile
@@ -102,6 +102,7 @@ obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
obj-$(CONFIG_MEMCG_BPF) += memcontrol_bpf.o
ifdef CONFIG_SWAP
obj-$(CONFIG_MEMCG) += swap_cgroup.o
endif
26 changes: 19 additions & 7 deletions mm/memcontrol.c
@@ -68,6 +68,7 @@
#include <net/ip.h>
#include "slab.h"
#include "memcontrol-v1.h"
#include "memcontrol_bpf.h"

#include <linux/uaccess.h>

@@ -2301,13 +2302,14 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
- unsigned long nr_reclaimed;
+ unsigned long nr_reclaime, nr_reclaimed;
bool passed_oom = false;
unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
bool drained = false;
bool raised_max_event = false;
unsigned long pflags;
bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
+ bool charge_done = false;

retry:
if (consume_stock(memcg, nr_pages))
@@ -2320,20 +2322,30 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (!do_memsw_account() ||
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
if (page_counter_try_charge(&memcg->memory, batch, &counter))
- goto done_restock;
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, batch);
- mem_over_limit = mem_cgroup_from_counter(counter, memory);
+ charge_done = true;
+ else {
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, batch);
+ mem_over_limit = mem_cgroup_from_counter(counter, memory);
+ }
} else {
mem_over_limit = mem_cgroup_from_counter(counter, memsw);
reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
}

- if (batch > nr_pages) {
+ if (!charge_done && batch > nr_pages) {
batch = nr_pages;
goto retry;
}

+ nr_reclaime = bpf_try_charge_memcg(memcg, gfp_mask, nr_pages,
+ mem_over_limit,
+ reclaim_options,
+ charge_done);
+
+ if (charge_done)
+ goto done_restock;
+
/*
* Prevent unbounded recursion when reclaim operations need to
* allocate memory. This might exceed the limits temporarily,
@@ -2353,7 +2365,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
raised_max_event = true;

psi_memstall_enter(&pflags);
- nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
+ nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_reclaime,
gfp_mask, reclaim_options, NULL);
psi_memstall_leave(&pflags);

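mm/memcontrol_bpf.h, which declares the bpf_try_charge_memcg() call used above, is not part of this view. As a hedged sketch only, inferred from the call site (with no program loaded the reclaim size must remain nr_pages so behaviour is unchanged) and from the static key, RCU pointer, and percpu refcount in mm/memcontrol_bpf.c below, the helper plausibly looks like the following; the exact signature, the try_charge_memcg context fields other than nr_pages, and the !CONFIG_MEMCG_BPF stub are assumptions.

/* Hypothetical shape of bpf_try_charge_memcg(); the real declaration lives in
 * mm/memcontrol_bpf.h, which this diff view does not show. */
static inline unsigned long bpf_try_charge_memcg(struct mem_cgroup *memcg,
						 gfp_t gfp_mask,
						 unsigned int nr_pages,
						 struct mem_cgroup *mem_over_limit,
						 unsigned int reclaim_options,
						 bool charge_done)
{
	unsigned long nr_reclaim = nr_pages;	/* default: behaviour unchanged */
	struct memcg_ops *ops;
	struct try_charge_memcg tcm = {
		.nr_pages = nr_pages,
		/* remaining context fields (memcg, gfp_mask, mem_over_limit, ...)
		 * elided -- their layout is an assumption */
	};

	if (!static_branch_unlikely(&memcg_bpf_enable))
		return nr_reclaim;

	rcu_read_lock();
	ops = rcu_dereference(memcg_ops);
	if (ops && percpu_ref_tryget(&ops->refcount)) {
		ops->try_charge_memcg(&tcm);	/* program may rewrite tcm.nr_pages */
		nr_reclaim = tcm.nr_pages;
		percpu_ref_put(&ops->refcount);
	}
	rcu_read_unlock();

	return nr_reclaim;
}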
200 changes: 200 additions & 0 deletions mm/memcontrol_bpf.c
@@ -0,0 +1,200 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Memory Controller eBPF support
*
* Author: Hui Zhu <[email protected]>
* Copyright (C) 2025 KylinSoft Corporation.
*/

#include <linux/cgroup-defs.h>
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/bpf_verifier.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/module.h>
#include "memcontrol_bpf.h"

struct memcg_ops __rcu *memcg_ops;
DEFINE_STATIC_KEY_FALSE(memcg_bpf_enable);

static void memcg_ops_release(struct percpu_ref *ref)
{
struct memcg_ops *ops = container_of(ref,
struct memcg_ops, refcount);

/* Signal destruction completion */
complete(&ops->destroy_done);
}

static int memcg_ops_btf_struct_access(struct bpf_verifier_log *log,
const struct bpf_reg_state *reg,
int off, int size)
{
size_t end;

switch (off) {
case offsetof(struct try_charge_memcg, nr_pages):
end = offsetofend(struct try_charge_memcg, nr_pages);
break;
default:
return -EACCES;
}

if (off + size > end)
return -EACCES;

return 0;
}

static bool memcg_ops_is_valid_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
}

const struct bpf_verifier_ops bpf_memcg_verifier_ops = {
.get_func_proto = bpf_base_func_proto,
.btf_struct_access = memcg_ops_btf_struct_access,
.is_valid_access = memcg_ops_is_valid_access,
};

static int cfi_try_charge_memcg(struct try_charge_memcg *tcm)
{
return -EINVAL;
}

static struct memcg_ops cfi_bpf_memcg_ops = {
.try_charge_memcg = cfi_try_charge_memcg,
};

static int bpf_memcg_ops_init(struct btf *btf)
{
return 0;
}

static int bpf_memcg_ops_check_member(const struct btf_type *t,
const struct btf_member *member,
const struct bpf_prog *prog)
{
u32 moff = __btf_member_bit_offset(t, member) / 8;

switch (moff) {
case offsetof(struct memcg_ops, try_charge_memcg):
case offsetof(struct memcg_ops, refcount):
case offsetof(struct memcg_ops, destroy_done):
break;
default:
if (prog->sleepable)
return -EINVAL;
}

return 0;
}

static int default_try_charge_memcg(struct try_charge_memcg *tcm)
{
return 0;
}

static int bpf_memcg_ops_init_member(const struct btf_type *t,
const struct btf_member *member,
void *kdata, const void *udata)
{
struct memcg_ops *ops = (struct memcg_ops *)kdata;
u32 moff = __btf_member_bit_offset(t, member) / 8;
int ret;

if (moff == offsetof(struct memcg_ops, refcount)) {
ret = percpu_ref_init(&ops->refcount, memcg_ops_release, 0, GFP_KERNEL);
if (ret) {
pr_err("Failed to percpu_ref_init: %d\n", ret);
return ret;
}

init_completion(&ops->destroy_done);

if (!ops->try_charge_memcg)
ops->try_charge_memcg = default_try_charge_memcg;
}

return 0;
}

static int bpf_memcg_ops_reg(void *kdata, struct bpf_link *link)
{
struct memcg_ops *new_ops, *old_ops;

/*
* Check whether an ops instance already exists.
* We only read old_ops without taking a lock here because
* the caller already holds st_map->lock.
*/
rcu_read_lock();
old_ops = rcu_dereference(memcg_ops);
rcu_read_unlock();
if (old_ops)
return -EEXIST;

new_ops = kdata;

/* Atomically set ops pointer (should be NULL at this point) */
old_ops = rcu_replace_pointer(memcg_ops, new_ops, true);
WARN_ON(old_ops);

static_branch_enable(&memcg_bpf_enable);

return 0;
}

/* Unregister the struct ops instance */
static void bpf_memcg_ops_unreg(void *kdata, struct bpf_link *link)
{
struct memcg_ops *ops;

static_branch_disable(&memcg_bpf_enable);

/* No extra locking here, same reasoning as in bpf_memcg_ops_reg(). */
ops = rcu_replace_pointer(memcg_ops, NULL, true);
if (ops) {
synchronize_rcu();

percpu_ref_kill(&ops->refcount);
wait_for_completion(&ops->destroy_done);

percpu_ref_exit(&ops->refcount);
}
}

static struct bpf_struct_ops bpf_memcg_ops = {
.verifier_ops = &bpf_memcg_verifier_ops,
.init = bpf_memcg_ops_init,
.check_member = bpf_memcg_ops_check_member,
.init_member = bpf_memcg_ops_init_member,
.reg = bpf_memcg_ops_reg,
.unreg = bpf_memcg_ops_unreg,
.name = "memcg_ops",
.owner = THIS_MODULE,
.cfi_stubs = &cfi_bpf_memcg_ops,
};

static int __init memcontrol_bpf_init(void)
{
int err;

RCU_INIT_POINTER(memcg_ops, NULL);

err = register_bpf_struct_ops(&bpf_memcg_ops, memcg_ops);
if (err) {
pr_warn("error while registering bpf memcg_ops: %d\n", err);
return err;
}

pr_info("bpf memcg_ops registered successfully\n");
return 0;
}
late_initcall(memcontrol_bpf_init);
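A user-space loader for such a program would follow the standard libbpf struct_ops flow: attaching the struct_ops map reaches bpf_memcg_ops_reg() above (which returns -EEXIST if a program is already active), and destroying the link reaches bpf_memcg_ops_unreg(). The skeleton header memcg_printk.skel.h and the map name below mirror the hypothetical BPF-side sketch after the Kconfig hunk and the sample paths added to MAINTAINERS; both are assumptions about how the sample is built.

/* Hypothetical libbpf loader sketch; skeleton and map names are assumptions. */
#include <stdio.h>
#include <bpf/libbpf.h>
#include "memcg_printk.skel.h"

int main(void)
{
	struct memcg_printk *skel;
	struct bpf_link *link;

	skel = memcg_printk__open_and_load();
	if (!skel) {
		fprintf(stderr, "failed to open/load BPF skeleton\n");
		return 1;
	}

	/* Registers the memcg_ops instance; needs CAP_PERFMON or CAP_SYS_ADMIN. */
	link = bpf_map__attach_struct_ops(skel->maps.memcg_printk_ops);
	if (!link) {
		fprintf(stderr, "failed to attach memcg_ops struct_ops\n");
		memcg_printk__destroy(skel);
		return 1;
	}

	printf("memcg_ops attached; press Enter to detach\n");
	getchar();

	bpf_link__destroy(link);	/* triggers bpf_memcg_ops_unreg() */
	memcg_printk__destroy(skel);
	return 0;
}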