diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 1654211cc6cf2..f6991c674329f 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -738,3 +738,42 @@ support enabled just fine as always. No difference can be noted in hugetlbfs other than there will be less overall fragmentation. All usual features belonging to hugetlbfs are preserved and unaffected. libhugetlbfs will also work fine as usual. + +BPF THP +======= + +Overview +-------- + +When the system is configured with "always" or "madvise" THP mode, a BPF program +can be used to adjust THP allocation policies dynamically. This enables +fine-grained control over THP decisions based on various factors including +workload identity, allocation context, and system memory pressure. + +Program Interface +----------------- + +This feature implements a struct_ops BPF program with the following interface:: + + int thp_get_order(struct vm_area_struct *vma, + enum tva_type type, + unsigned long orders); + +Parameters:: + + @vma: vm_area_struct associated with the THP allocation + @type: TVA type for current @vma + @orders: Bitmask of available THP orders for this allocation + +Return value:: + + The suggested THP order for allocation from the BPF program. Must be + a valid, available order. + +Implementation Notes +-------------------- + +This is currently an experimental feature. CONFIG_BPF_THP (EXPERIMENTAL) must be +enabled to use it. Only one BPF program can be attached at a time, but the +program can be updated dynamically to adjust policies without requiring affected +tasks to be restarted. diff --git a/MAINTAINERS b/MAINTAINERS index 46126ce2f968e..7febdd8b17b3f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16520,8 +16520,11 @@ F: include/linux/huge_mm.h F: include/linux/khugepaged.h F: include/trace/events/huge_memory.h F: mm/huge_memory.c +F: mm/huge_memory_bpf.c F: mm/khugepaged.c F: mm/mm_slot.h +F: tools/testing/selftests/bpf/prog_tests/thp_adjust.c +F: tools/testing/selftests/bpf/progs/test_thp_adjust* F: tools/testing/selftests/mm/khugepaged.c F: tools/testing/selftests/mm/split_huge_page_test.c F: tools/testing/selftests/mm/transhuge-stress.c diff --git a/fs/exec.c b/fs/exec.c index 6b70c6726d316..41d7703368e96 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -890,6 +890,7 @@ static int exec_mmap(struct mm_struct *mm) activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); + bpf_thp_retain_mm(mm, old_mm); lru_gen_add_mm(mm); task_unlock(tsk); lru_gen_use_mm(mm); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fc35a0543f019..e713d1905750d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1369,8 +1369,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %8u\n", - !!thp_vma_allowable_orders(vma, vma->vm_flags, TVA_SMAPS, - THP_ORDERS_ALL)); + !!thp_vma_allowable_orders(vma, TVA_SMAPS, THP_ORDERS_ALL)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f327d62fc9852..9e4088ae0a322 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -96,13 +96,14 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; enum tva_type { TVA_SMAPS, /* Exposing "THPeligible:" in smaps. */ - TVA_PAGEFAULT, /* Serving a page fault. */ + TVA_PAGEFAULT, /* Serving a non-swap page fault. */ TVA_KHUGEPAGED, /* Khugepaged collapse. */ TVA_FORCED_COLLAPSE, /* Forced collapse (e.g. MADV_COLLAPSE). */ + TVA_SWAP_PAGEFAULT, /* serving a swap page fault. */ }; -#define thp_vma_allowable_order(vma, vm_flags, type, order) \ - (!!thp_vma_allowable_orders(vma, vm_flags, type, BIT(order))) +#define thp_vma_allowable_order(vma, type, order) \ + (!!thp_vma_allowable_orders(vma, type, BIT(order))) #define split_folio(f) split_folio_to_list(f, NULL) @@ -266,14 +267,47 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, } unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, - vm_flags_t vm_flags, enum tva_type type, unsigned long orders); +#ifdef CONFIG_BPF_THP + +unsigned long +bpf_hook_thp_get_orders(struct vm_area_struct *vma, enum tva_type type, + unsigned long orders); + +void bpf_thp_exit_mm(struct mm_struct *mm); +void bpf_thp_retain_mm(struct mm_struct *mm, struct mm_struct *old_mm); +void bpf_thp_fork(struct mm_struct *mm, struct mm_struct *old_mm); + +#else + +static inline unsigned long +bpf_hook_thp_get_orders(struct vm_area_struct *vma, enum tva_type type, + unsigned long orders) +{ + return orders; +} + +static inline void bpf_thp_ops_exit(struct mm_struct *mm) +{ +} + +static inline void +bpf_thp_retain_mm(struct mm_struct *mm, struct mm_struct *old_mm) +{ +} + +static inline void +bpf_thp_fork(struct mm_struct *mm, struct mm_struct *old_mm) +{ +} + +#endif + /** * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma * @vma: the vm area to check - * @vm_flags: use these vm_flags instead of vma->vm_flags * @type: TVA type * @orders: bitfield of all orders to consider * @@ -287,10 +321,16 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, */ static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, - vm_flags_t vm_flags, enum tva_type type, unsigned long orders) { + vm_flags_t vm_flags = vma->vm_flags; + + /* The BPF-specified order overrides which order is selected. */ + orders &= bpf_hook_thp_get_orders(vma, type, orders); + if (!orders) + return 0; + /* * Optimization to check if required orders are enabled early. Only * forced collapse ignores sysfs configs. @@ -309,7 +349,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, return 0; } - return __thp_vma_allowable_orders(vma, vm_flags, type, orders); + return __thp_vma_allowable_orders(vma, type, orders); } struct thpsize { @@ -329,8 +369,10 @@ struct thpsize { * through madvise or prctl. */ static inline bool vma_thp_disabled(struct vm_area_struct *vma, - vm_flags_t vm_flags, bool forced_collapse) + bool forced_collapse) { + vm_flags_t vm_flags = vma->vm_flags; + /* Are THPs disabled for this VMA? */ if (vm_flags & VM_NOHUGEPAGE) return true; @@ -560,7 +602,6 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, } static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, - vm_flags_t vm_flags, enum tva_type type, unsigned long orders) { diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index eb1946a70cff7..b30814d3d6658 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -13,8 +13,8 @@ extern void khugepaged_destroy(void); extern int start_stop_khugepaged(void); extern void __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); -extern void khugepaged_enter_vma(struct vm_area_struct *vma, - vm_flags_t vm_flags); +extern void khugepaged_enter_vma(struct vm_area_struct *vma); +extern void khugepaged_enter_mm(struct mm_struct *mm); extern void khugepaged_min_free_kbytes_update(void); extern bool current_is_khugepaged(void); extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, @@ -38,8 +38,10 @@ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm static inline void khugepaged_exit(struct mm_struct *mm) { } -static inline void khugepaged_enter_vma(struct vm_area_struct *vma, - vm_flags_t vm_flags) +static inline void khugepaged_enter_vma(struct vm_area_struct *vma) +{ +} +static inline void khugepaged_enter_mm(struct mm_struct *mm) { } static inline int collapse_pte_mapped_thp(struct mm_struct *mm, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 90e5790c318f0..9d4dc61690b35 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -33,6 +33,7 @@ struct address_space; struct futex_private_hash; struct mem_cgroup; +struct bpf_mm_ops; typedef struct { unsigned long f; @@ -930,6 +931,19 @@ struct mm_cid { }; #endif +#ifdef CONFIG_BPF_THP +struct bpf_thp_ops; +#endif + +#ifdef CONFIG_BPF_MM +struct bpf_mm_ops { +#ifdef CONFIG_BPF_THP + struct bpf_thp_ops __rcu *bpf_thp; + struct list_head bpf_thp_list; +#endif +}; +#endif + /* * Opaque type representing current mm_struct flag state. Must be accessed via * mm_flags_xxx() helper functions. @@ -1227,6 +1241,10 @@ struct mm_struct { #ifdef CONFIG_MM_ID mm_id_t mm_id; #endif /* CONFIG_MM_ID */ + +#ifdef CONFIG_BPF_MM + struct bpf_mm_ops bpf_mm; +#endif } __randomize_layout; /* diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c908015b2d34b..503c325aef1f5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7054,6 +7054,9 @@ BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state) { /* RCU trusted: these fields are trusted in RCU CS and can be NULL */ BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) { struct file __rcu *exe_file; +#ifdef CONFIG_MEMCG + struct task_struct __rcu *owner; +#endif }; /* skb->sk, req->sk are not RCU protected, but we mark them as such @@ -7093,6 +7096,10 @@ BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) { struct sock *sk; }; +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct) { + struct mm_struct *vm_mm; +}; + static bool type_is_rcu(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *field_name, u32 btf_id) @@ -7134,6 +7141,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, { BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted_or_null"); diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a95..dc24f3d012df9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1130,6 +1130,7 @@ static inline void __mmput(struct mm_struct *mm) exit_aio(mm); ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ + bpf_thp_exit_mm(mm); exit_mmap(mm); mm_put_huge_zero_folio(mm); set_mm_exe_file(mm, NULL); diff --git a/mm/Kconfig b/mm/Kconfig index 0e26f4fc8717b..21cf0c814611c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1372,6 +1372,28 @@ config PT_RECLAIM config FIND_NORMAL_PAGE def_bool n +menuconfig BPF_MM + bool "BPF-based Memory Management (EXPERIMENTAL)" + depends on BPF_SYSCALL + + help + Enable BPF-based Memory Management Policy. This feature is currently + experimental. + + WARNING: This feature is unstable and may change in future kernel + +if BPF_MM +config BPF_THP + bool "BPF-based THP Policy (EXPERIMENTAL)" + depends on TRANSPARENT_HUGEPAGE && BPF_MM + + help + Enable dynamic THP policy adjustment using BPF programs. This feature + is currently experimental. + + WARNING: This feature is unstable and may change in future kernel +endif # BPF_MM + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index 21abb33535501..4efca1c8a919b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -99,6 +99,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_NUMA) += memory-tiers.o obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o +obj-$(CONFIG_BPF_THP) += huge_memory_bpf.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1b81680b4225f..2b155a734c783 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -98,14 +98,14 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) } unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, - vm_flags_t vm_flags, enum tva_type type, unsigned long orders) { const bool smaps = type == TVA_SMAPS; - const bool in_pf = type == TVA_PAGEFAULT; + const bool in_pf = (type == TVA_PAGEFAULT || type == TVA_SWAP_PAGEFAULT); const bool forced_collapse = type == TVA_FORCED_COLLAPSE; unsigned long supported_orders; + vm_flags_t vm_flags = vma->vm_flags; /* Check the intersection of requested and supported orders. */ if (vma_is_anonymous(vma)) @@ -122,7 +122,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, if (!vma->vm_mm) /* vdso */ return 0; - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma, forced_collapse)) return 0; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ @@ -1346,7 +1346,6 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) ret = vmf_anon_prepare(vmf); if (ret) return ret; - khugepaged_enter_vma(vma, vma->vm_flags); if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && diff --git a/mm/huge_memory_bpf.c b/mm/huge_memory_bpf.c new file mode 100644 index 0000000000000..24ab432cbbaa5 --- /dev/null +++ b/mm/huge_memory_bpf.c @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BPF-based THP policy management + * + * Author: Yafang Shao + */ + +#include +#include +#include +#include + +/** + * @thp_order_fn_t: Get the suggested THP order from a BPF program for allocation + * @vma: vm_area_struct associated with the THP allocation + * @type: TVA type for current @vma + * @orders: Bitmask of available THP orders for this allocation + * + * Return: The suggested THP order for allocation from the BPF program. Must be + * a valid, available order. + */ +typedef int thp_order_fn_t(struct vm_area_struct *vma, + enum tva_type type, + unsigned long orders); + +struct bpf_thp_mm_list { + struct list_head list; +}; + +struct bpf_thp_ops { + pid_t pid; /* The pid to attach */ + thp_order_fn_t *thp_get_order; + + /* private*/ + /* The list of mm_struct this ops is operated on */ + struct bpf_thp_mm_list mm_list; +}; + +static DEFINE_SPINLOCK(thp_ops_lock); + +void bpf_thp_exit_mm(struct mm_struct *mm) +{ + if (!rcu_access_pointer(mm->bpf_mm.bpf_thp)) + return; + + spin_lock(&thp_ops_lock); + if (!rcu_access_pointer(mm->bpf_mm.bpf_thp)) { + spin_unlock(&thp_ops_lock); + return; + } + list_del(&mm->bpf_mm.bpf_thp_list); + RCU_INIT_POINTER(mm->bpf_mm.bpf_thp, NULL); + spin_unlock(&thp_ops_lock); + +} + +void bpf_thp_retain_mm(struct mm_struct *mm, struct mm_struct *old_mm) +{ + struct bpf_thp_ops *bpf_thp; + + if (!old_mm || !rcu_access_pointer(old_mm->bpf_mm.bpf_thp)) + return; + + spin_lock(&thp_ops_lock); + bpf_thp = rcu_dereference_protected(old_mm->bpf_mm.bpf_thp, + lockdep_is_held(&thp_ops_lock)); + if (!bpf_thp) { + spin_unlock(&thp_ops_lock); + return; + } + + /* The new mm is still under initilization */ + RCU_INIT_POINTER(mm->bpf_mm.bpf_thp, bpf_thp); + + /* The old mm is destroying */ + RCU_INIT_POINTER(old_mm->bpf_mm.bpf_thp, NULL); + list_replace(&old_mm->bpf_mm.bpf_thp_list, &mm->bpf_mm.bpf_thp_list); + spin_unlock(&thp_ops_lock); +} + +void bpf_thp_fork(struct mm_struct *mm, struct mm_struct *old_mm) +{ + struct bpf_thp_mm_list *mm_list; + struct bpf_thp_ops *bpf_thp; + + if (!rcu_access_pointer(old_mm->bpf_mm.bpf_thp)) + return; + + spin_lock(&thp_ops_lock); + bpf_thp = rcu_dereference_protected(old_mm->bpf_mm.bpf_thp, + lockdep_is_held(&thp_ops_lock)); + if (!bpf_thp) { + spin_unlock(&thp_ops_lock); + return; + } + + /* The new mm is still under initilization */ + RCU_INIT_POINTER(mm->bpf_mm.bpf_thp, bpf_thp); + + mm_list = &bpf_thp->mm_list; + list_add_tail(&mm->bpf_mm.bpf_thp_list, &mm_list->list); + spin_unlock(&thp_ops_lock); +} + +unsigned long bpf_hook_thp_get_orders(struct vm_area_struct *vma, + enum tva_type type, + unsigned long orders) +{ + struct mm_struct *mm = vma->vm_mm; + struct bpf_thp_ops *bpf_thp; + int bpf_order; + + if (!mm) + return orders; + + rcu_read_lock(); + bpf_thp = rcu_dereference(mm->bpf_mm.bpf_thp); + if (!bpf_thp || !bpf_thp->thp_get_order) + goto out; + + bpf_order = bpf_thp->thp_get_order(vma, type, orders); + orders &= BIT(bpf_order); + +out: + rcu_read_unlock(); + return orders; +} + +static bool bpf_thp_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +static const struct bpf_func_proto * +bpf_thp_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return bpf_base_func_proto(func_id, prog); +} + +static const struct bpf_verifier_ops thp_bpf_verifier_ops = { + .get_func_proto = bpf_thp_get_func_proto, + .is_valid_access = bpf_thp_ops_is_valid_access, +}; + +static int bpf_thp_init(struct btf *btf) +{ + return 0; +} + +static int bpf_thp_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + /* The call site operates under RCU protection. */ + if (prog->sleepable) + return -EINVAL; + return 0; +} + +static int bpf_thp_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct bpf_thp_ops *ubpf_thp; + struct bpf_thp_ops *kbpf_thp; + u32 moff; + + ubpf_thp = (const struct bpf_thp_ops *)udata; + kbpf_thp = (struct bpf_thp_ops *)kdata; + + moff = __btf_member_bit_offset(t, member) / 8; + switch (moff) { + case offsetof(struct bpf_thp_ops, pid): + kbpf_thp->pid = ubpf_thp->pid; + return 1; + } + return 0; +} + +static int bpf_thp_reg(void *kdata, struct bpf_link *link) +{ + struct bpf_thp_ops *bpf_thp = kdata; + struct bpf_thp_mm_list *mm_list; + struct task_struct *p; + struct mm_struct *mm; + int err = -EINVAL; + pid_t pid; + + pid = bpf_thp->pid; + p = find_get_task_by_vpid(pid); + if (!p || p->flags & PF_EXITING) + return -EINVAL; + + mm = get_task_mm(p); + put_task_struct(p); + if (!mm) + goto out; + + err = -EBUSY; + spin_lock(&thp_ops_lock); + if (rcu_access_pointer(mm->bpf_mm.bpf_thp)) + goto out_lock; + err = 0; + rcu_assign_pointer(mm->bpf_mm.bpf_thp, bpf_thp); + + mm_list = &bpf_thp->mm_list; + INIT_LIST_HEAD(&mm_list->list); + list_add_tail(&mm->bpf_mm.bpf_thp_list, &mm_list->list); +out_lock: + spin_unlock(&thp_ops_lock); +out: + mmput(mm); + return err; +} + + +static void bpf_thp_unreg(void *kdata, struct bpf_link *link) +{ + struct bpf_thp_ops *bpf_thp = kdata; + struct bpf_mm_ops *bpf_mm; + struct list_head *pos, *n; + + spin_lock(&thp_ops_lock); + list_for_each_safe(pos, n, &bpf_thp->mm_list.list) { + bpf_mm = list_entry(pos, struct bpf_mm_ops, bpf_thp_list); + WARN_ON_ONCE(!bpf_mm); + rcu_replace_pointer(bpf_mm->bpf_thp, NULL, lockdep_is_held(&thp_ops_lock)); + list_del(pos); + } + spin_unlock(&thp_ops_lock); + + synchronize_rcu(); +} + +static int bpf_thp_update(void *kdata, void *old_kdata, struct bpf_link *link) +{ + struct bpf_thp_ops *old_bpf_thp = old_kdata; + struct bpf_thp_ops *bpf_thp = kdata; + struct bpf_mm_ops *bpf_mm; + struct list_head *pos, *n; + + INIT_LIST_HEAD(&bpf_thp->mm_list.list); + + spin_lock(&thp_ops_lock); + list_for_each_safe(pos, n, &old_bpf_thp->mm_list.list) { + bpf_mm = list_entry(pos, struct bpf_mm_ops, bpf_thp_list); + WARN_ON_ONCE(!bpf_mm); + rcu_replace_pointer(bpf_mm->bpf_thp, bpf_thp, lockdep_is_held(&thp_ops_lock)); + list_del(pos); + list_add_tail(&bpf_mm->bpf_thp_list, &bpf_thp->mm_list.list); + } + spin_unlock(&thp_ops_lock); + + synchronize_rcu(); + return 0; +} + +static int bpf_thp_validate(void *kdata) +{ + struct bpf_thp_ops *ops = kdata; + + if (!ops->thp_get_order) { + pr_err("bpf_thp: required ops isn't implemented\n"); + return -EINVAL; + } + return 0; +} + +static int bpf_thp_get_order(struct vm_area_struct *vma, + enum tva_type type, + unsigned long orders) +{ + return -1; +} + +static struct bpf_thp_ops __bpf_thp_ops = { + .thp_get_order = (thp_order_fn_t __rcu *)bpf_thp_get_order, +}; + +static struct bpf_struct_ops bpf_bpf_thp_ops = { + .verifier_ops = &thp_bpf_verifier_ops, + .init = bpf_thp_init, + .check_member = bpf_thp_check_member, + .init_member = bpf_thp_init_member, + .reg = bpf_thp_reg, + .unreg = bpf_thp_unreg, + .update = bpf_thp_update, + .validate = bpf_thp_validate, + .cfi_stubs = &__bpf_thp_ops, + .owner = THIS_MODULE, + .name = "bpf_thp_ops", +}; + +static int __init bpf_thp_ops_init(void) +{ + int err; + + err = register_bpf_struct_ops(&bpf_bpf_thp_ops, bpf_thp_ops); + if (err) + pr_err("bpf_thp: Failed to register struct_ops (%d)\n", err); + return err; +} +late_initcall(bpf_thp_ops_init); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index abe54f0043c73..4326ccd76c276 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -353,12 +353,6 @@ int hugepage_madvise(struct vm_area_struct *vma, #endif *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; - /* - * If the vma become good for khugepaged to scan, - * register it here without waiting a page fault that - * may not happen any time soon. - */ - khugepaged_enter_vma(vma, *vm_flags); break; case MADV_NOHUGEPAGE: *vm_flags &= ~VM_HUGEPAGE; @@ -457,14 +451,21 @@ void __khugepaged_enter(struct mm_struct *mm) wake_up_interruptible(&khugepaged_wait); } -void khugepaged_enter_vma(struct vm_area_struct *vma, - vm_flags_t vm_flags) +void khugepaged_enter_mm(struct mm_struct *mm) { - if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && - hugepage_pmd_enabled()) { - if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) - __khugepaged_enter(vma->vm_mm); - } + if (mm_flags_test(MMF_VM_HUGEPAGE, mm)) + return; + if (!hugepage_pmd_enabled()) + return; + + __khugepaged_enter(mm); +} + +void khugepaged_enter_vma(struct vm_area_struct *vma) +{ + if (!thp_vma_allowable_order(vma, TVA_KHUGEPAGED, PMD_ORDER)) + return; + khugepaged_enter_mm(vma->vm_mm); } void __khugepaged_exit(struct mm_struct *mm) @@ -913,7 +914,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; - if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, type, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -1527,7 +1528,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * and map it by a PMD, regardless of sysfs THP settings. As such, let's * analogously elide sysfs THP settings here and force collapse. */ - if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, TVA_FORCED_COLLAPSE, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -2422,7 +2423,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, progress++; break; } - if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { + if (!thp_vma_allowable_order(vma, TVA_KHUGEPAGED, PMD_ORDER)) { skip: progress++; continue; @@ -2753,7 +2754,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, BUG_ON(vma->vm_start > start); BUG_ON(vma->vm_end < end); - if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, TVA_FORCED_COLLAPSE, PMD_ORDER)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); diff --git a/mm/madvise.c b/mm/madvise.c index fb1c86e630b66..8de7c39305dda 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1425,6 +1425,13 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior) VM_WARN_ON_ONCE(madv_behavior->lock_mode != MADVISE_MMAP_WRITE_LOCK); error = madvise_update_vma(new_flags, madv_behavior); + /* + * If the vma become good for khugepaged to scan, + * register it here without waiting a page fault that + * may not happen any time soon. + */ + if (!error && new_flags & VM_HUGEPAGE) + khugepaged_enter_mm(vma->vm_mm); out: /* * madvise() returns EAGAIN if kernel resources, such as diff --git a/mm/memory.c b/mm/memory.c index 74b45e258323e..1e5f678293c53 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4514,7 +4514,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) * Get a list of all the (large) orders below PMD_ORDER that are enabled * and suitable for swapping THP. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + orders = thp_vma_allowable_orders(vma, TVA_SWAP_PAGEFAULT, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); orders = thp_swap_suitable_orders(swp_offset(entry), @@ -5063,7 +5063,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + orders = thp_vma_allowable_orders(vma, TVA_PAGEFAULT, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); @@ -5335,7 +5335,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa * PMD mappings if THPs are disabled. As we already have a THP, * behave as if we are forcing a collapse. */ - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags, + if (thp_disabled_by_hw() || vma_thp_disabled(vma, /* forced_collapse=*/ true)) return ret; @@ -6236,7 +6236,6 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; - vm_flags_t vm_flags = vma->vm_flags; pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; @@ -6251,7 +6250,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) { + thp_vma_allowable_order(vma, TVA_PAGEFAULT, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -6284,11 +6283,14 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, if (pud_trans_unstable(vmf.pud)) goto retry_pud; - if (pmd_none(*vmf.pmd) && - thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) { - ret = create_huge_pmd(&vmf); - if (!(ret & VM_FAULT_FALLBACK)) - return ret; + if (pmd_none(*vmf.pmd)) { + if (vma_is_anonymous(vma)) + khugepaged_enter_vma(vma); + if (thp_vma_allowable_order(vma, TVA_PAGEFAULT, PMD_ORDER)) { + ret = create_huge_pmd(&vmf); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } } else { vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); diff --git a/mm/mmap.c b/mm/mmap.c index 5fd3b80fda1d5..8ac7d3046a332 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1844,6 +1844,7 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) vma_iter_free(&vmi); if (!retval) { mt_set_in_rcu(vmi.mas.tree); + bpf_thp_fork(mm, oldmm); ksm_fork(mm, oldmm); khugepaged_fork(mm, oldmm); } else { diff --git a/mm/shmem.c b/mm/shmem.c index b9081b817d28f..546d39602a11a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1780,7 +1780,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, vm_flags_t vm_flags = vma ? vma->vm_flags : 0; unsigned int global_orders; - if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force))) + if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, shmem_huge_force))) return 0; global_orders = shmem_huge_global_enabled(inode, index, write_end, diff --git a/mm/vma.c b/mm/vma.c index abe0da33c8446..872eb3c922581 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -973,7 +973,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( if (err || commit_merge(vmg)) goto abort; - khugepaged_enter_vma(vmg->target, vmg->vm_flags); + khugepaged_enter_vma(vmg->target); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; @@ -1093,7 +1093,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) * following VMA if we have VMAs on both sides. */ if (vmg->target && !vma_expand(vmg)) { - khugepaged_enter_vma(vmg->target, vmg->vm_flags); + khugepaged_enter_vma(vmg->target); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; } @@ -2503,7 +2503,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) * call covers the non-merge case. */ if (!vma_is_anonymous(vma)) - khugepaged_enter_vma(vma, map->vm_flags); + khugepaged_enter_vma(vma); *vmap = vma; return 0; diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 70b28c1e653ea..c2d80c4bc6163 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -9,6 +9,7 @@ CONFIG_BPF_LIRC_MODE2=y CONFIG_BPF_LSM=y CONFIG_BPF_STREAM_PARSER=y CONFIG_BPF_SYSCALL=y +CONFIG_BPF_THP=y # CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set CONFIG_CGROUP_BPF=y CONFIG_CRYPTO_HMAC=y @@ -51,6 +52,7 @@ CONFIG_IPV6_TUNNEL=y CONFIG_KEYS=y CONFIG_LIRC=y CONFIG_LWTUNNEL=y +CONFIG_MEMCG=y CONFIG_MODULE_SIG=y CONFIG_MODULE_SRCVERSION_ALL=y CONFIG_MODULE_UNLOAD=y @@ -115,6 +117,7 @@ CONFIG_SECURITY=y CONFIG_SECURITYFS=y CONFIG_SYN_COOKIES=y CONFIG_TEST_BPF=m +CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_UDMABUF=y CONFIG_USERFAULTFD=y CONFIG_VSOCKETS=y diff --git a/tools/testing/selftests/bpf/prog_tests/thp_adjust.c b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c new file mode 100644 index 0000000000000..b69f519486663 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "test_thp_adjust.skel.h" + +#define LEN (16 * 1024 * 1024) /* 16MB */ +#define THP_ENABLED_FILE "/sys/kernel/mm/transparent_hugepage/enabled" +#define PMD_SIZE_FILE "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" + +static struct test_thp_adjust *skel; +static char old_mode[32]; +static long pagesize; + +static int thp_mode_save(void) +{ + const char *start, *end; + char buf[128]; + int fd, err; + size_t len; + + fd = open(THP_ENABLED_FILE, O_RDONLY); + if (fd == -1) + return -1; + + err = read(fd, buf, sizeof(buf) - 1); + if (err == -1) + goto close; + + start = strchr(buf, '['); + end = start ? strchr(start, ']') : NULL; + if (!start || !end || end <= start) { + err = -1; + goto close; + } + + len = end - start - 1; + if (len >= sizeof(old_mode)) + len = sizeof(old_mode) - 1; + strncpy(old_mode, start + 1, len); + old_mode[len] = '\0'; + +close: + close(fd); + return err; +} + +static int thp_mode_set(const char *desired_mode) +{ + int fd, err; + + fd = open(THP_ENABLED_FILE, O_RDWR); + if (fd == -1) + return -1; + + err = write(fd, desired_mode, strlen(desired_mode)); + close(fd); + return err; +} + +static int thp_mode_reset(void) +{ + int fd, err; + + fd = open(THP_ENABLED_FILE, O_WRONLY); + if (fd == -1) + return -1; + + err = write(fd, old_mode, strlen(old_mode)); + close(fd); + return err; +} + +static char *thp_alloc(void) +{ + char *addr; + int err, i; + + addr = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (addr == MAP_FAILED) + return NULL; + + err = madvise(addr, LEN, MADV_HUGEPAGE); + if (err == -1) + goto unmap; + + /* Accessing a single byte within a page is sufficient to trigger a page fault. */ + for (i = 0; i < LEN; i += pagesize) + addr[i] = 1; + return addr; + +unmap: + munmap(addr, LEN); + return NULL; +} + +static void thp_free(char *ptr) +{ + munmap(ptr, LEN); +} + +static int get_pmd_order(void) +{ + ssize_t bytes_read, size; + int fd, order, ret = -1; + char buf[64], *endptr; + + fd = open(PMD_SIZE_FILE, O_RDONLY); + if (fd < 0) + return -1; + + bytes_read = read(fd, buf, sizeof(buf) - 1); + if (bytes_read <= 0) + goto close_fd; + + /* Remove potential newline character */ + if (buf[bytes_read - 1] == '\n') + buf[bytes_read - 1] = '\0'; + + size = strtoul(buf, &endptr, 10); + if (endptr == buf || *endptr != '\0') + goto close_fd; + if (size % pagesize != 0) + goto close_fd; + ret = size / pagesize; + if ((ret & (ret - 1)) == 0) { + order = 0; + while (ret > 1) { + ret >>= 1; + order++; + } + ret = order; + } + +close_fd: + close(fd); + return ret; +} + +static int get_thp_eligible(pid_t pid, unsigned long addr) +{ + int this_vma = 0, eligible = -1; + unsigned long start, end; + char smaps_path[64]; + FILE *smaps_file; + char line[4096]; + + snprintf(smaps_path, sizeof(smaps_path), "/proc/%d/smaps", pid); + smaps_file = fopen(smaps_path, "r"); + if (!smaps_file) + return -1; + + while (fgets(line, sizeof(line), smaps_file)) { + if (sscanf(line, "%lx-%lx", &start, &end) == 2) { + /* addr is monotonic */ + if (addr < start) + break; + this_vma = (addr >= start && addr < end) ? 1 : 0; + continue; + } + + if (!this_vma) + continue; + + if (strstr(line, "THPeligible:")) { + sscanf(line, "THPeligible: %d", &eligible); + break; + } + } + + fclose(smaps_file); + return eligible; +} + +static void subtest_thp_eligible(void) +{ + struct bpf_link *ops_link; + int elighble; + char *ptr; + + ops_link = bpf_map__attach_struct_ops(skel->maps.thp_eligible_ops); + if (!ASSERT_OK_PTR(ops_link, "attach struct_ops")) + return; + + ptr = thp_alloc(); + if (!ASSERT_OK_PTR(ptr, "THP alloc")) + goto detach; + + elighble = get_thp_eligible(getpid(), (unsigned long)ptr); + ASSERT_EQ(elighble, 1, "THPeligible"); + + thp_free(ptr); +detach: + bpf_link__destroy(ops_link); +} + +static int thp_adjust_setup(void) +{ + int err = -1, pmd_order; + + pagesize = sysconf(_SC_PAGESIZE); + pmd_order = get_pmd_order(); + if (!ASSERT_NEQ(pmd_order, -1, "get_pmd_order")) + return -1; + + if (!ASSERT_NEQ(thp_mode_save(), -1, "THP mode save")) + return -1; + if (!ASSERT_GE(thp_mode_set("madvise"), 0, "THP mode set")) + return -1; + + skel = test_thp_adjust__open(); + if (!ASSERT_OK_PTR(skel, "open")) + goto thp_reset; + + skel->bss->pmd_order = pmd_order; + skel->struct_ops.thp_eligible_ops->pid = getpid(); + + err = test_thp_adjust__load(skel); + if (!ASSERT_OK(err, "load")) + goto destroy; + return 0; + +destroy: + test_thp_adjust__destroy(skel); +thp_reset: + ASSERT_GE(thp_mode_reset(), 0, "THP mode reset"); + return err; +} + +static void thp_adjust_destroy(void) +{ + test_thp_adjust__destroy(skel); + ASSERT_GE(thp_mode_reset(), 0, "THP mode reset"); +} + +void test_thp_adjust(void) +{ + if (thp_adjust_setup() == -1) + return; + + if (test__start_subtest("thp_eligible")) + subtest_thp_eligible(); + + thp_adjust_destroy(); +} diff --git a/tools/testing/selftests/bpf/progs/lsm.c b/tools/testing/selftests/bpf/progs/lsm.c index 0c13b7409947e..7de173daf27b6 100644 --- a/tools/testing/selftests/bpf/progs/lsm.c +++ b/tools/testing/selftests/bpf/progs/lsm.c @@ -89,14 +89,16 @@ SEC("lsm/file_mprotect") int BPF_PROG(test_int_hook, struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot, int ret) { - if (ret != 0) + struct mm_struct *mm = vma->vm_mm; + + if (ret != 0 || !mm) return ret; __s32 pid = bpf_get_current_pid_tgid() >> 32; int is_stack = 0; - is_stack = (vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack); + is_stack = (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack); if (is_stack && monitored_pid == pid) { mprotect_count++; diff --git a/tools/testing/selftests/bpf/progs/test_thp_adjust.c b/tools/testing/selftests/bpf/progs/test_thp_adjust.c new file mode 100644 index 0000000000000..bc062d7feed4f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_thp_adjust.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +int pmd_order; + +SEC("struct_ops/thp_get_order") +int BPF_PROG(thp_eligible, struct vm_area_struct *vma, enum tva_type type, + unsigned long orders) +{ + if (type != TVA_SMAPS) + return 0; + return pmd_order; +} + +SEC(".struct_ops.link") +struct bpf_thp_ops thp_eligible_ops = { + .thp_get_order = (void *)thp_eligible, +};