diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2f190c90192d0..5a1527b3b6f0e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -6,6 +6,8 @@
 #include <linux/fs.h> /* only for vma_is_dax() */
 #include
+#include
+#include
 
 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -54,6 +56,7 @@ enum transparent_hugepage_flag {
 	TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
 	TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
 	TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
+	TRANSPARENT_HUGEPAGE_BPF_ATTACHED,	/* BPF prog is attached */
 };
 
 struct kobject;
@@ -190,6 +193,16 @@ static inline bool hugepage_global_always(void)
 			(1<<TRANSPARENT_HUGEPAGE_FLAG);
 }
 
+#ifdef CONFIG_EXPERIMENTAL_BPF_ORDER_SELECTION
+int get_suggested_order(struct mm_struct *mm, unsigned long tva_flags, int order);
+#else
+static inline int
+get_suggested_order(struct mm_struct *mm, unsigned long tva_flags, int order)
+{
+	return order;
+}
+#endif
+
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -1,6 +1,8 @@
+#include <linux/huge_mm.h>
+
 extern unsigned int khugepaged_max_ptes_none __read_mostly;
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern struct attribute_group khugepaged_attr_group;
@@ -20,7 +22,15 @@ extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 {
-	if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags))
+	/*
+	 * THP allocation policy can be dynamically modified via BPF. If a
+	 * long-lived task was previously allowed to allocate THP but is no
+	 * longer permitted under the new policy, we must ensure its forked
+	 * child processes also inherit this restriction.
+	 * The MMF_VM_HUGEPAGE flag will be cleared by khugepaged.
+	 */
+	if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags) &&
+	    get_suggested_order(mm, 0, PMD_ORDER) == PMD_ORDER)
 		__khugepaged_enter(mm);
 }
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 781be3240e216..5d05a537ecdeb 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -908,6 +908,18 @@ config NO_PAGE_MAPCOUNT
 
 	  EXPERIMENTAL because the impact of some changes is still unclear.
 
+config EXPERIMENTAL_BPF_ORDER_SELECTION
+	bool "BPF-based THP order selection (EXPERIMENTAL)"
+	depends on TRANSPARENT_HUGEPAGE && BPF_SYSCALL
+
+	help
+	  Enable dynamic THP order selection using BPF programs. This
+	  experimental feature allows custom BPF logic to determine optimal
+	  transparent hugepage allocation sizes at runtime.
+
+	  Warning: This feature is unstable and may change in future kernel
+	  versions.
+
 endif # TRANSPARENT_HUGEPAGE
 
 # simple helper to make the code a bit easier to read
diff --git a/mm/Makefile b/mm/Makefile
index 1a7a11d4933d1..562525e6a28a8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -99,6 +99,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_NUMA) += memory-tiers.o
 obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
+obj-$(CONFIG_EXPERIMENTAL_BPF_ORDER_SELECTION) += bpf_thp.o
 obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
 obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o
 obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
diff --git a/mm/bpf_thp.c b/mm/bpf_thp.c
new file mode 100644
index 0000000000000..3b10a97acc31e
--- /dev/null
+++ b/mm/bpf_thp.c
@@ -0,0 +1,255 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/huge_mm.h>
+#include <linux/memcontrol.h>
+
+struct bpf_thp_ops {
+	/**
+	 * @get_suggested_order: Get the suggested highest THP order for allocation
+	 * @mm: mm_struct associated with the THP allocation
+	 * @tva_flags: TVA flags for the current context
+	 *             %TVA_IN_PF: set when in page fault context
+	 *             Other flags: reserved for future use
+	 * @order: The highest order being considered for this THP allocation:
+	 *         %PUD_ORDER for PUD-mapped allocations
+	 *         %PMD_ORDER for PMD-mapped allocations
+	 *         %PMD_ORDER - 1 for mTHP allocations
+	 *
+	 * Return: Suggested highest THP order to use for allocation. The returned
+	 * order will never exceed the input @order value.
+	 */
+	int (*get_suggested_order)(struct mm_struct *mm, unsigned long tva_flags, int order) __rcu;
+};
+
+static struct bpf_thp_ops bpf_thp;
+static DEFINE_SPINLOCK(thp_ops_lock);
+
+int get_suggested_order(struct mm_struct *mm, unsigned long tva_flags, int order)
+{
+	int (*bpf_suggested_order)(struct mm_struct *mm, unsigned long tva_flags, int order);
+	int suggested_order = order;
+
+	/* No BPF program is attached */
+	if (!test_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED,
+		      &transparent_hugepage_flags))
+		return suggested_order;
+
+	rcu_read_lock();
+	bpf_suggested_order = rcu_dereference(bpf_thp.get_suggested_order);
+	if (!bpf_suggested_order)
+		goto out;
+
+	suggested_order = bpf_suggested_order(mm, tva_flags, order);
+	if (suggested_order > order)
+		suggested_order = order;
+
+out:
+	rcu_read_unlock();
+	return suggested_order;
+}
+
+static bool bpf_thp_ops_is_valid_access(int off, int size,
+					enum bpf_access_type type,
+					const struct bpf_prog *prog,
+					struct bpf_insn_access_aux *info)
+{
+	return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+static const struct bpf_func_proto *
+bpf_thp_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	return bpf_base_func_proto(func_id, prog);
+}
+
+static const struct bpf_verifier_ops thp_bpf_verifier_ops = {
+	.get_func_proto = bpf_thp_get_func_proto,
+	.is_valid_access = bpf_thp_ops_is_valid_access,
+};
+
+static int bpf_thp_init(struct btf *btf)
+{
+	return 0;
+}
+
+static int bpf_thp_init_member(const struct btf_type *t,
+			       const struct btf_member *member,
+			       void *kdata, const void *udata)
+{
+	return 0;
+}
+
+static int bpf_thp_reg(void *kdata, struct bpf_link *link)
+{
+	struct bpf_thp_ops *ops = kdata;
+
+	spin_lock(&thp_ops_lock);
+	if (test_and_set_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED,
+			     &transparent_hugepage_flags)) {
+		spin_unlock(&thp_ops_lock);
+		return -EBUSY;
+	}
+	WARN_ON_ONCE(bpf_thp.get_suggested_order);
+	WRITE_ONCE(bpf_thp.get_suggested_order, ops->get_suggested_order);
+	spin_unlock(&thp_ops_lock);
+	return 0;
+}
+
+static void bpf_thp_unreg(void *kdata, struct bpf_link *link)
+{
+	spin_lock(&thp_ops_lock);
+	clear_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, &transparent_hugepage_flags);
+	WARN_ON_ONCE(!bpf_thp.get_suggested_order);
+	rcu_replace_pointer(bpf_thp.get_suggested_order, NULL, lockdep_is_held(&thp_ops_lock));
+	spin_unlock(&thp_ops_lock);
+
+	synchronize_rcu();
+}
+
+static int bpf_thp_update(void *kdata, void *old_kdata, struct bpf_link *link)
+{
+	struct bpf_thp_ops *ops = kdata;
+	struct bpf_thp_ops *old = old_kdata;
+
+	if (!ops || !old)
+		return -EINVAL;
+
+	spin_lock(&thp_ops_lock);
+	if (!test_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, &transparent_hugepage_flags))
+		goto out;
+	rcu_replace_pointer(bpf_thp.get_suggested_order, ops->get_suggested_order,
+			    lockdep_is_held(&thp_ops_lock));
+
+out:
+	spin_unlock(&thp_ops_lock);
+	synchronize_rcu();
+	return 0;
+}
+
+static int bpf_thp_validate(void *kdata)
+{
+	struct bpf_thp_ops *ops = kdata;
+
+	if (!ops->get_suggested_order) {
+		pr_err("bpf_thp: required ops isn't implemented\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int suggested_order(struct mm_struct *mm, unsigned long vm_flags, int order)
+{
+	return order;
+}
+
+static struct bpf_thp_ops __bpf_thp_ops = {
+	.get_suggested_order = suggested_order,
+};
+
+static struct bpf_struct_ops bpf_bpf_thp_ops = {
+	.verifier_ops = &thp_bpf_verifier_ops,
+	.init = bpf_thp_init,
+	.init_member = bpf_thp_init_member,
+	.reg = bpf_thp_reg,
+	.unreg = bpf_thp_unreg,
+	.update = bpf_thp_update,
+	.validate = bpf_thp_validate,
+	.cfi_stubs = &__bpf_thp_ops,
+	.owner = THIS_MODULE,
+	.name = "bpf_thp_ops",
+};
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_mm_get_mem_cgroup - Get the memory cgroup associated with a mm_struct.
+ * @mm: The mm_struct to query
+ *
+ * The obtained mem_cgroup must be released by calling bpf_put_mem_cgroup().
+ *
+ * Return: The associated mem_cgroup on success, or NULL on failure. Note that
+ * this function depends on CONFIG_MEMCG being enabled - it will always return
+ * NULL if CONFIG_MEMCG is not configured.
+ */
+__bpf_kfunc struct mem_cgroup *bpf_mm_get_mem_cgroup(struct mm_struct *mm)
+{
+	return get_mem_cgroup_from_mm(mm);
+}
+
+/**
+ * bpf_put_mem_cgroup - Release a memory cgroup obtained from bpf_mm_get_mem_cgroup()
+ * @memcg: The memory cgroup to release
+ */
+__bpf_kfunc void bpf_put_mem_cgroup(struct mem_cgroup *memcg)
+{
+#ifdef CONFIG_MEMCG
+	if (!memcg)
+		return;
+	css_put(&memcg->css);
+#endif
+}
+
+/**
+ * bpf_mm_get_task - Get the task struct associated with a mm_struct.
+ * @mm: The mm_struct to query
+ *
+ * The obtained task_struct must be released by calling bpf_task_release().
+ *
+ * Return: The associated task_struct on success, or NULL on failure. Note that
+ * this function depends on CONFIG_MEMCG being enabled - it will always return
+ * NULL if CONFIG_MEMCG is not configured.
+ */
+__bpf_kfunc struct task_struct *bpf_mm_get_task(struct mm_struct *mm)
+{
+#ifdef CONFIG_MEMCG
+	struct task_struct *task;
+
+	if (!mm)
+		return NULL;
+	rcu_read_lock();
+	task = rcu_dereference(mm->owner);
+	if (!task)
+		goto out;
+	if (!refcount_inc_not_zero(&task->rcu_users))
+		goto out;
+
+	rcu_read_unlock();
+	return task;
+
+out:
+	rcu_read_unlock();
+#endif
+	return NULL;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(bpf_thp_ids)
+BTF_ID_FLAGS(func, bpf_mm_get_mem_cgroup, KF_TRUSTED_ARGS | KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_mm_get_task, KF_TRUSTED_ARGS | KF_ACQUIRE | KF_RET_NULL)
+BTF_KFUNCS_END(bpf_thp_ids)
+
+static const struct btf_kfunc_id_set bpf_thp_set = {
+	.owner = THIS_MODULE,
+	.set = &bpf_thp_ids,
+};
+
+static int __init bpf_thp_ops_init(void)
+{
+	int err;
+
+	err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_thp_set);
+	if (err) {
+		pr_err("bpf_thp: Failed to register kfunc sets (%d)\n", err);
+		return err;
+	}
+
+	err = register_bpf_struct_ops(&bpf_bpf_thp_ops, bpf_thp_ops);
+	if (err)
+		pr_err("bpf_thp: Failed to register struct_ops (%d)\n", err);
+	return err;
+}
+late_initcall(bpf_thp_ops_init);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d3e66136e41a3..e504b601205f9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1328,6 +1328,15 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 		return ret;
 	khugepaged_enter_vma(vma, vma->vm_flags);
 
+	/*
+	 * This check must come after khugepaged_enter_vma() because the
+	 * BPF policy may allow khugepaged to collapse this mm into THPs
+	 * while simultaneously disallowing THP allocation in the page
+	 * fault path.
+	 */
+	if (get_suggested_order(vma->vm_mm, TVA_IN_PF, PMD_ORDER) != PMD_ORDER)
+		return VM_FAULT_FALLBACK;
+
 	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
 			!mm_forbids_zeropage(vma->vm_mm) &&
 			transparent_hugepage_use_zero_page()) {
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 15203ea7d0073..d0b6c1b203428 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -475,7 +475,8 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
 	    hugepage_pmd_enabled()) {
 		if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
-					    PMD_ORDER))
+					    PMD_ORDER) &&
+		    get_suggested_order(vma->vm_mm, 0, PMD_ORDER) == PMD_ORDER)
 			__khugepaged_enter(vma->vm_mm);
 	}
 }
@@ -1448,6 +1449,11 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
 		/* khugepaged_mm_lock actually not necessary for the below */
 		mm_slot_free(mm_slot_cache, mm_slot);
 		mmdrop(mm);
+	} else if (get_suggested_order(mm, 0, PMD_ORDER) != PMD_ORDER) {
+		hash_del(&slot->hash);
+		list_del(&slot->mm_node);
+		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+		mm_slot_free(mm_slot_cache, mm_slot);
 	}
 }
 
@@ -2390,6 +2396,10 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 	 * the next mm on the list.
 	 */
 	vma = NULL;
+
+	/* If this mm is not suitable for the scan list, we should remove it. */
+	if (get_suggested_order(mm, 0, PMD_ORDER) != PMD_ORDER)
+		goto breakouterloop_mmap_lock;
 	if (unlikely(!mmap_read_trylock(mm)))
 		goto breakouterloop_mmap_lock;
@@ -2407,7 +2417,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 			break;
 		}
 		if (!thp_vma_allowable_order(vma, vma->vm_flags,
-					     TVA_ENFORCE_SYSFS, PMD_ORDER)) {
+					     TVA_ENFORCE_SYSFS, PMD_ORDER) ||
+		    get_suggested_order(vma->vm_mm, 0, PMD_ORDER) != PMD_ORDER) {
 skip:
 			progress++;
 			continue;
@@ -2746,6 +2757,9 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
 		return -EINVAL;
 
+	if (get_suggested_order(vma->vm_mm, 0, PMD_ORDER) != PMD_ORDER)
+		return -EINVAL;
+
 	cc = kmalloc(sizeof(*cc), GFP_KERNEL);
 	if (!cc)
 		return -ENOMEM;
diff --git a/mm/memory.c b/mm/memory.c
index b0cda5aab3985..ff3e4c92a2a24 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4375,6 +4375,7 @@ static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset,
 static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
+	int order, suggested_order;
 	unsigned long orders;
 	struct folio *folio;
 	unsigned long addr;
@@ -4382,7 +4383,6 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 	spinlock_t *ptl;
 	pte_t *pte;
 	gfp_t gfp;
-	int order;
 
 	/*
 	 * If uffd is active for the vma we need per-page fault fidelity to
@@ -4399,13 +4399,16 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 	if (!zswap_never_enabled())
 		goto fallback;
 
+	suggested_order = get_suggested_order(vma->vm_mm, TVA_IN_PF, PMD_ORDER - 1);
+	if (!suggested_order)
+		goto fallback;
 	entry = pte_to_swp_entry(vmf->orig_pte);
 	/*
 	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
 	 * and suitable for swapping THP.
 	 */
 	orders = thp_vma_allowable_orders(vma, vma->vm_flags,
-			TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+			TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(suggested_order + 1) - 1);
 	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
 	orders = thp_swap_suitable_orders(swp_offset(entry),
 					  vmf->address, orders);
@@ -4933,12 +4936,12 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	int order, suggested_order;
 	unsigned long orders;
 	struct folio *folio;
 	unsigned long addr;
 	pte_t *pte;
 	gfp_t gfp;
-	int order;
 
 	/*
 	 * If uffd is active for the vma we need per-page fault fidelity to
@@ -4947,13 +4950,16 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 	if (unlikely(userfaultfd_armed(vma)))
 		goto fallback;
 
+	suggested_order = get_suggested_order(vma->vm_mm, TVA_IN_PF, PMD_ORDER - 1);
+	if (!suggested_order)
+		goto fallback;
 	/*
 	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
 	 * for this vma. Then filter out the orders that can't be allocated over
 	 * the faulting address and still be fully contained in the vma.
 	 */
 	orders = thp_vma_allowable_orders(vma, vma->vm_flags,
-			TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+			TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(suggested_order + 1) - 1);
 	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
 
 	if (!orders)
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 7247833fe623b..1b1b44a4ed8ab 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -119,3 +119,5 @@ CONFIG_XDP_SOCKETS=y
 CONFIG_XFRM_INTERFACE=y
 CONFIG_TCP_CONG_DCTCP=y
 CONFIG_TCP_CONG_BBR=y
+CONFIG_TRANSPARENT_HUGEPAGE=y
+CONFIG_MEMCG=y
diff --git a/tools/testing/selftests/bpf/prog_tests/thp_adjust.c b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c
new file mode 100644
index 0000000000000..31d03383cbb8b
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <sys/mman.h>
+#include <test_progs.h>
+#include "cgroup_helpers.h"
+#include "test_thp_adjust.skel.h"
+#include "test_thp_adjust_failure.skel.h"
+
+#define LEN (16 * 1024 * 1024) /* 16MB */
+#define THP_ENABLED_PATH "/sys/kernel/mm/transparent_hugepage/enabled"
+
+static char *thp_addr;
+static char old_mode[32];
+
+int thp_mode_save(void)
+{
+	const char *start, *end;
+	char buf[128];
+	int fd, err;
+	size_t len;
+
+	fd = open(THP_ENABLED_PATH, O_RDONLY);
+	if (fd == -1)
+		return -1;
+
+	err = read(fd, buf, sizeof(buf) - 1);
+	if (err == -1)
+		goto close;
+	buf[err] = '\0'; /* NUL-terminate before strchr() */
+
+	start = strchr(buf, '[');
+	end = start ? strchr(start, ']') : NULL;
+	if (!start || !end || end <= start) {
+		err = -1;
+		goto close;
+	}
+
+	len = end - start - 1;
+	if (len >= sizeof(old_mode))
+		len = sizeof(old_mode) - 1;
+	strncpy(old_mode, start + 1, len);
+	old_mode[len] = '\0';
+
+close:
+	close(fd);
+	return err;
+}
+
+int thp_set(const char *desired_mode)
+{
+	int fd, err;
+
+	fd = open(THP_ENABLED_PATH, O_RDWR);
+	if (fd == -1)
+		return -1;
+
+	err = write(fd, desired_mode, strlen(desired_mode));
+	close(fd);
+	return err;
+}
+
+int thp_reset(void)
+{
+	int fd, err;
+
+	fd = open(THP_ENABLED_PATH, O_WRONLY);
+	if (fd == -1)
+		return -1;
+
+	err = write(fd, old_mode, strlen(old_mode));
+	close(fd);
+	return err;
+}
+
+int thp_alloc(void)
+{
+	int err, i;
+
+	thp_addr = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+	if (thp_addr == MAP_FAILED)
+		return -1;
+
+	err = madvise(thp_addr, LEN, MADV_HUGEPAGE);
+	if (err == -1)
+		goto unmap;
+
+	for (i = 0; i < LEN; i += 4096)
+		thp_addr[i] = 1;
+	return 0;
+
+unmap:
+	munmap(thp_addr, LEN);
+	return -1;
+}
+
+void thp_free(void)
+{
+	if (!thp_addr)
+		return;
+	munmap(thp_addr, LEN);
+}
+
+void subtest_thp_adjust(void)
+{
+	struct bpf_link *fentry_link, *ops_link;
+	struct test_thp_adjust *skel;
+	int err, cgrp_fd, cgrp_id;
+
+	err = setup_cgroup_environment();
+	if (!ASSERT_OK(err, "cgrp_env_setup"))
+		return;
+
+	cgrp_fd = create_and_get_cgroup("thp_adjust");
+	if (!ASSERT_GE(cgrp_fd, 0, "create_and_get_cgroup"))
+		goto cleanup;
+
+	err = join_cgroup("thp_adjust");
+	if (!ASSERT_OK(err, "join_cgroup"))
+		goto close_fd;
+
+	cgrp_id = get_cgroup_id("thp_adjust");
+	if (!ASSERT_GE(cgrp_id, 0, "get_cgroup_id"))
+		goto join_root;
+
+	if (!ASSERT_NEQ(thp_mode_save(), -1, "THP mode save"))
+		goto join_root;
+	if (!ASSERT_GE(thp_set("madvise"), 0, "THP mode set"))
+		goto join_root;
+
+	skel = test_thp_adjust__open();
+	if (!ASSERT_OK_PTR(skel, "open"))
+		goto thp_reset;
+
+	skel->bss->cgrp_id = cgrp_id;
+	skel->bss->target_pid = getpid();
+
+	err = test_thp_adjust__load(skel);
+	if (!ASSERT_OK(err, "load"))
+		goto destroy;
+
+	fentry_link = bpf_program__attach_trace(skel->progs.thp_run);
+	if (!ASSERT_OK_PTR(fentry_link, "attach fentry"))
+		goto destroy;
+
+	ops_link = bpf_map__attach_struct_ops(skel->maps.thp);
+	if (!ASSERT_OK_PTR(ops_link, "attach struct_ops"))
+		goto destroy;
+
+	if (!ASSERT_NEQ(thp_alloc(), -1, "THP alloc"))
+		goto destroy;
+
+	/* After attaching the struct_ops, THP will be allocated only by khugepaged. */
+	if (!ASSERT_EQ(skel->bss->pf_alloc, 0, "alloc_in_pf"))
+		goto thp_free;
+	if (!ASSERT_GT(skel->bss->pf_disallow, 0, "disallow_in_pf"))
+		goto thp_free;
+
+	if (!ASSERT_GT(skel->bss->khugepaged_alloc, 0, "alloc_in_khugepaged"))
+		goto thp_free;
+	ASSERT_EQ(skel->bss->khugepaged_disallow, 0, "disallow_in_khugepaged");
+
+thp_free:
+	thp_free();
+destroy:
+	test_thp_adjust__destroy(skel);
+thp_reset:
+	ASSERT_GE(thp_reset(), 0, "THP mode reset");
+join_root:
+	/* We must join the root cgroup before removing the created cgroup. */
+	err = join_root_cgroup();
+	ASSERT_OK(err, "join_cgroup to root");
+close_fd:
+	close(cgrp_fd);
+	remove_cgroup("thp_adjust");
+cleanup:
+	cleanup_cgroup_environment();
+}
+
+void test_thp_adjust(void)
+{
+	if (test__start_subtest("thp_adjust"))
+		subtest_thp_adjust();
+	RUN_TESTS(test_thp_adjust_failure);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_thp_adjust.c b/tools/testing/selftests/bpf/progs/test_thp_adjust.c
new file mode 100644
index 0000000000000..bb4aad50c7a80
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_thp_adjust.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+#define TVA_IN_PF (1 << 1)
+
+int pf_alloc, pf_disallow, khugepaged_alloc, khugepaged_disallow;
+int cgrp_id, target_pid;
+
+/* Detecting whether a task can successfully allocate THP is unreliable because
+ * it may be influenced by system memory pressure. Instead of making the result
+ * dependent on unpredictable factors, we simply check the return value of
+ * get_suggested_order(), which is deterministic.
+ */
+SEC("fexit/get_suggested_order")
+int BPF_PROG(thp_run, struct mm_struct *mm, unsigned long tva_flags, int order, int retval)
+{
+	struct task_struct *current = bpf_get_current_task_btf();
+
+	if (current->pid != target_pid || order != 9)
+		return 0;
+
+	if (tva_flags & TVA_IN_PF) {
+		if (retval == 9)
+			pf_alloc++;
+		else if (!retval)
+			pf_disallow++;
+	} else {
+		if (retval == 9)
+			khugepaged_alloc++;
+		else if (!retval)
+			khugepaged_disallow++;
+	}
+	return 0;
+}
+
+SEC("struct_ops/get_suggested_order")
+int BPF_PROG(bpf_suggested_order, struct mm_struct *mm, unsigned long tva_flags, int order)
+{
+	struct mem_cgroup *memcg = bpf_mm_get_mem_cgroup(mm);
+	int suggested_order = order;
+
+	/* Only works when CONFIG_MEMCG is enabled. */
+	if (!memcg)
+		return suggested_order;
+
+	if (memcg->css.cgroup->kn->id == cgrp_id) {
+		/* BPF THP allocation policy:
+		 * - Disallow PMD allocation in page fault context
+		 */
+		if (tva_flags & TVA_IN_PF && order == 9) {
+			suggested_order = 0;
+			goto out;
+		}
+	}
+
+out:
+	bpf_put_mem_cgroup(memcg);
+	return suggested_order;
+}
+
+SEC(".struct_ops.link")
+struct bpf_thp_ops thp = {
+	.get_suggested_order = (void *)bpf_suggested_order,
+};
diff --git a/tools/testing/selftests/bpf/progs/test_thp_adjust_failure.c b/tools/testing/selftests/bpf/progs/test_thp_adjust_failure.c
new file mode 100644
index 0000000000000..b080aead9b878
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_thp_adjust_failure.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+SEC("struct_ops/get_suggested_order")
+__failure __msg("Unreleased reference")
+int BPF_PROG(unreleased_task, struct mm_struct *mm, bool vma_madvised)
+{
+	struct task_struct *p = bpf_mm_get_task(mm);
+
+	/* The task should be released with bpf_task_release() */
+	return p ? 9 : 0;
+}
+
+SEC(".struct_ops.link")
+struct bpf_thp_ops thp = {
+	.get_suggested_order = (void *)unreleased_task,
+};
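
For illustration of how the hook's return value is consumed: in alloc_anon_folio() and alloc_swap_folio() the suggested order becomes an upper bound on the candidate-order bitmask via BIT(suggested_order + 1) - 1, and a return of 0 takes the small-page fallback before the mask is even built. The standalone userspace sketch below is not part of the patch; it only replays that arithmetic with a locally redefined BIT() and an assumed suggested order of 4.

#include <stdio.h>

/* Local stand-in for the kernel's BIT() macro, for illustration only. */
#define BIT(n) (1UL << (n))

int main(void)
{
	/* Suppose the BPF policy returned 4 when asked about PMD_ORDER - 1 (8). */
	int suggested_order = 4;
	/* Same arithmetic as alloc_anon_folio()/alloc_swap_folio(). */
	unsigned long orders = BIT(suggested_order + 1) - 1;
	int order;

	for (order = 0; order <= 8; order++)
		printf("order %d: %s\n", order,
		       (orders & BIT(order)) ? "candidate" : "filtered out");
	return 0;
}

With suggested_order = 4 this reports orders 0-4 as candidates and 5-8 as filtered out, which is exactly the set later narrowed further by thp_vma_allowable_orders() and thp_vma_suitable_orders().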
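A policy does not have to use the memcg kfuncs at all. The following is a minimal illustrative sketch, not part of the patch, of a global policy that defers PMD-sized THP to khugepaged for every task; like the selftest it hardcodes the TVA_IN_PF bit and assumes order 9 PMDs (2 MB on 4 KB pages).

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

/* Must match the kernel's TVA_IN_PF; the selftest hardcodes the same bit. */
#define TVA_IN_PF (1 << 1)
#define ASSUMED_PMD_ORDER 9	/* assumption: 2 MB PMD THP on 4 KB pages */

SEC("struct_ops/get_suggested_order")
int BPF_PROG(defer_to_khugepaged, struct mm_struct *mm, unsigned long tva_flags, int order)
{
	/* Never allocate PMD-sized THP synchronously in the fault path;
	 * khugepaged may still collapse the range to PMD order later.
	 */
	if ((tva_flags & TVA_IN_PF) && order == ASSUMED_PMD_ORDER)
		return 0;
	return order;
}

SEC(".struct_ops.link")
struct bpf_thp_ops thp = {
	.get_suggested_order = (void *)defer_to_khugepaged,
};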
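The failure selftest shows the verifier rejecting an unreleased task reference obtained from bpf_mm_get_task(). For contrast, a correctly paired acquire/release might look like the sketch below; the __ksym declarations and the pid-based policy condition are illustrative assumptions, and bpf_task_release() is the release kfunc named in the kerneldoc above.

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

int target_pid;

/* Assumed kfunc prototypes, declared via __ksym for this sketch. */
struct task_struct *bpf_mm_get_task(struct mm_struct *mm) __ksym;
void bpf_task_release(struct task_struct *p) __ksym;

SEC("struct_ops/get_suggested_order")
int BPF_PROG(released_task, struct mm_struct *mm, unsigned long tva_flags, int order)
{
	struct task_struct *p = bpf_mm_get_task(mm);
	int suggested_order = order;

	if (!p)
		return suggested_order;

	/* Illustrative policy input: only the target task keeps the full order. */
	if (p->pid != target_pid)
		suggested_order = 0;

	/* Pair every successful acquire with a release to satisfy the verifier. */
	bpf_task_release(p);
	return suggested_order;
}

SEC(".struct_ops.link")
struct bpf_thp_ops thp = {
	.get_suggested_order = (void *)released_task,
};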