Skip to content
Closed
39 changes: 39 additions & 0 deletions Documentation/admin-guide/mm/transhuge.rst
Original file line number Diff line number Diff line change
Expand Up @@ -738,3 +738,42 @@ support enabled just fine as always. No difference can be noted in
hugetlbfs other than there will be less overall fragmentation. All
usual features belonging to hugetlbfs are preserved and
unaffected. libhugetlbfs will also work fine as usual.

BPF THP
=======

Overview
--------

When the system is configured with "always" or "madvise" THP mode, a BPF program
can be used to adjust THP allocation policies dynamically. This enables
fine-grained control over THP decisions based on various factors including
workload identity, allocation context, and system memory pressure.

Program Interface
-----------------

This feature implements a struct_ops BPF program with the following interface::

int thp_get_order(struct vm_area_struct *vma,
enum tva_type type,
unsigned long orders);

Parameters::

@vma: vm_area_struct associated with the THP allocation
@type: TVA type for current @vma
@orders: Bitmask of available THP orders for this allocation

Return value::

The THP order suggested by the BPF program for this allocation. It must
be one of the orders present in the @orders bitmask (i.e. a valid,
available order).

Implementation Notes
--------------------

This is currently an experimental feature. CONFIG_BPF_THP (EXPERIMENTAL) must be
enabled to use it. Only one BPF program can be attached at a time, but the
program can be updated dynamically to adjust policies without requiring affected
tasks to be restarted.
3 changes: 3 additions & 0 deletions MAINTAINERS
Original file line number Diff line number Diff line change
Expand Up @@ -16520,8 +16520,11 @@ F: include/linux/huge_mm.h
F: include/linux/khugepaged.h
F: include/trace/events/huge_memory.h
F: mm/huge_memory.c
F: mm/huge_memory_bpf.c
F: mm/khugepaged.c
F: mm/mm_slot.h
F: tools/testing/selftests/bpf/prog_tests/thp_adjust.c
F: tools/testing/selftests/bpf/progs/test_thp_adjust*
F: tools/testing/selftests/mm/khugepaged.c
F: tools/testing/selftests/mm/split_huge_page_test.c
F: tools/testing/selftests/mm/transhuge-stress.c
Expand Down
1 change: 1 addition & 0 deletions fs/exec.c
Original file line number Diff line number Diff line change
Expand Up @@ -890,6 +890,7 @@ static int exec_mmap(struct mm_struct *mm)
activate_mm(active_mm, mm);
if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
bpf_thp_retain_mm(mm, old_mm);
lru_gen_add_mm(mm);
task_unlock(tsk);
lru_gen_use_mm(mm);
Expand Down
3 changes: 1 addition & 2 deletions fs/proc/task_mmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -1369,8 +1369,7 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);

seq_printf(m, "THPeligible: %8u\n",
!!thp_vma_allowable_orders(vma, vma->vm_flags, TVA_SMAPS,
THP_ORDERS_ALL));
!!thp_vma_allowable_orders(vma, TVA_SMAPS, THP_ORDERS_ALL));

if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
Expand Down
59 changes: 50 additions & 9 deletions include/linux/huge_mm.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,13 +96,14 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;

/*
 * Context in which a VMA's THP eligibility is being queried; passed to
 * thp_vma_allowable_orders() so policy can differ per caller.
 *
 * Fix: the stale pre-patch "TVA_PAGEFAULT /* Serving a page fault. *&#47;"
 * line had been retained alongside its replacement, leaving a duplicate
 * enumerator that cannot compile; comment capitalization normalized.
 */
enum tva_type {
	TVA_SMAPS,		/* Exposing "THPeligible:" in smaps. */
	TVA_PAGEFAULT,		/* Serving a non-swap page fault. */
	TVA_KHUGEPAGED,		/* Khugepaged collapse. */
	TVA_FORCED_COLLAPSE,	/* Forced collapse (e.g. MADV_COLLAPSE). */
	TVA_SWAP_PAGEFAULT,	/* Serving a swap page fault. */
};

/*
 * Test whether a single THP @order is allowed for @vma in context @type.
 * Collapses the thp_vma_allowable_orders() bitmask to a boolean for the
 * one requested order.
 *
 * Fix: the stale pre-patch macro (taking a separate vm_flags argument)
 * had been retained alongside the new one, causing a macro redefinition.
 */
#define thp_vma_allowable_order(vma, type, order) \
	(!!thp_vma_allowable_orders(vma, type, BIT(order)))

#define split_folio(f) split_folio_to_list(f, NULL)

Expand Down Expand Up @@ -266,14 +267,47 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
}

/*
 * Core THP-order eligibility check; vm_flags are read from @vma itself
 * rather than passed separately.
 *
 * Fix: the stale pre-patch "vm_flags_t vm_flags," parameter line had
 * been retained, conflicting with the three-argument definition.
 */
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
					 enum tva_type type,
					 unsigned long orders);

#ifdef CONFIG_BPF_THP

/*
 * BPF THP policy hook: lets an attached struct_ops program narrow the
 * set of allowable THP orders for @vma in context @type (callers AND the
 * returned mask into @orders).
 */
unsigned long
bpf_hook_thp_get_orders(struct vm_area_struct *vma, enum tva_type type,
			unsigned long orders);

/* mm lifecycle notifications for the attached BPF THP policy. */
void bpf_thp_exit_mm(struct mm_struct *mm);
void bpf_thp_retain_mm(struct mm_struct *mm, struct mm_struct *old_mm);
void bpf_thp_fork(struct mm_struct *mm, struct mm_struct *old_mm);

#else

/*
 * !CONFIG_BPF_THP fallback: no BPF policy can be attached, so leave the
 * caller-supplied order bitmask untouched.
 */
static inline unsigned long
bpf_hook_thp_get_orders(struct vm_area_struct *vma, enum tva_type type,
			unsigned long orders)
{
	return orders;
}

/*
 * !CONFIG_BPF_THP stub.  Must be named bpf_thp_exit_mm() to match the
 * CONFIG_BPF_THP declaration and the unconditional caller in __mmput()
 * (kernel/fork.c); the original stub was misnamed bpf_thp_ops_exit(),
 * breaking builds with the option disabled.
 */
static inline void bpf_thp_exit_mm(struct mm_struct *mm)
{
}

/*
 * !CONFIG_BPF_THP stub: nothing to carry over from @old_mm on exec
 * (the real version is called from exec_mmap()).
 */
static inline void
bpf_thp_retain_mm(struct mm_struct *mm, struct mm_struct *old_mm)
{
}

/* !CONFIG_BPF_THP stub: nothing to propagate to the child mm on fork. */
static inline void
bpf_thp_fork(struct mm_struct *mm, struct mm_struct *old_mm)
{
}

#endif

/**
* thp_vma_allowable_orders - determine hugepage orders that are allowed for vma
* @vma: the vm area to check
* @vm_flags: use these vm_flags instead of vma->vm_flags
* @type: TVA type
* @orders: bitfield of all orders to consider
*
Expand All @@ -287,10 +321,16 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
*/
static inline
unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags,
enum tva_type type,
unsigned long orders)
{
vm_flags_t vm_flags = vma->vm_flags;

/* The BPF-specified order overrides which order is selected. */
orders &= bpf_hook_thp_get_orders(vma, type, orders);
if (!orders)
return 0;

/*
* Optimization to check if required orders are enabled early. Only
* forced collapse ignores sysfs configs.
Expand All @@ -309,7 +349,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
return 0;
}

return __thp_vma_allowable_orders(vma, vm_flags, type, orders);
return __thp_vma_allowable_orders(vma, type, orders);
}

struct thpsize {
Expand All @@ -329,8 +369,10 @@ struct thpsize {
* through madvise or prctl.
*/
static inline bool vma_thp_disabled(struct vm_area_struct *vma,
vm_flags_t vm_flags, bool forced_collapse)
bool forced_collapse)
{
vm_flags_t vm_flags = vma->vm_flags;

/* Are THPs disabled for this VMA? */
if (vm_flags & VM_NOHUGEPAGE)
return true;
Expand Down Expand Up @@ -560,7 +602,6 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
}

static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags,
enum tva_type type,
unsigned long orders)
{
Expand Down
10 changes: 6 additions & 4 deletions include/linux/khugepaged.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ extern void khugepaged_destroy(void);
extern int start_stop_khugepaged(void);
extern void __khugepaged_enter(struct mm_struct *mm);
extern void __khugepaged_exit(struct mm_struct *mm);
/*
 * vm_flags are taken from the VMA / mm; no separate flags argument.
 *
 * Fix: the stale pre-patch two-argument khugepaged_enter_vma() prototype
 * had been retained alongside its one-argument replacement, leaving
 * conflicting declarations.
 */
extern void khugepaged_enter_vma(struct vm_area_struct *vma);
extern void khugepaged_enter_mm(struct mm_struct *mm);
extern void khugepaged_min_free_kbytes_update(void);
extern bool current_is_khugepaged(void);
extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
Expand All @@ -38,8 +38,10 @@ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm
static inline void khugepaged_exit(struct mm_struct *mm)
{
}
/*
 * Stubs for configurations where khugepaged is compiled out.
 *
 * Fix: the stale pre-patch two-argument stub signature had been fused
 * with its one-argument replacement (two signatures, one body), which is
 * malformed C.
 */
static inline void khugepaged_enter_vma(struct vm_area_struct *vma)
{
}
static inline void khugepaged_enter_mm(struct mm_struct *mm)
{
}
static inline int collapse_pte_mapped_thp(struct mm_struct *mm,
Expand Down
18 changes: 18 additions & 0 deletions include/linux/mm_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
struct address_space;
struct futex_private_hash;
struct mem_cgroup;
struct bpf_mm_ops;

typedef struct {
unsigned long f;
Expand Down Expand Up @@ -930,6 +931,19 @@ struct mm_cid {
};
#endif

#ifdef CONFIG_BPF_THP
/* Opaque here; the full definition lives in mm/huge_memory_bpf.c. */
struct bpf_thp_ops;
#endif

#ifdef CONFIG_BPF_MM
/*
 * Per-mm state for BPF-based memory-management policies, embedded in
 * struct mm_struct as ->bpf_mm.
 */
struct bpf_mm_ops {
#ifdef CONFIG_BPF_THP
	/* Attached BPF THP policy; RCU-protected pointer. */
	struct bpf_thp_ops __rcu *bpf_thp;
	/* NOTE(review): presumably links mms sharing a policy — confirm. */
	struct list_head bpf_thp_list;
#endif
};
#endif

/*
* Opaque type representing current mm_struct flag state. Must be accessed via
* mm_flags_xxx() helper functions.
Expand Down Expand Up @@ -1227,6 +1241,10 @@ struct mm_struct {
#ifdef CONFIG_MM_ID
mm_id_t mm_id;
#endif /* CONFIG_MM_ID */

#ifdef CONFIG_BPF_MM
struct bpf_mm_ops bpf_mm;
#endif
} __randomize_layout;

/*
Expand Down
8 changes: 8 additions & 0 deletions kernel/bpf/verifier.c
Original file line number Diff line number Diff line change
Expand Up @@ -7054,6 +7054,9 @@ BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state) {
/* RCU trusted: these fields are trusted in RCU CS and can be NULL */
BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) {
	struct file __rcu *exe_file;
#ifdef CONFIG_MEMCG
	/* mm->owner exists only with CONFIG_MEMCG and may be NULL. */
	struct task_struct __rcu *owner;
#endif
};

/* skb->sk, req->sk are not RCU protected, but we mark them as such
Expand Down Expand Up @@ -7093,6 +7096,10 @@ BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) {
struct sock *sk;
};

/*
 * Trusted-or-NULL: reading vm_mm from a trusted vm_area_struct pointer
 * yields a trusted (possibly NULL) mm_struct pointer in BPF programs.
 */
BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct) {
	struct mm_struct *vm_mm;
};

static bool type_is_rcu(struct bpf_verifier_env *env,
struct bpf_reg_state *reg,
const char *field_name, u32 btf_id)
Expand Down Expand Up @@ -7134,6 +7141,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env,
{
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct));

return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id,
"__safe_trusted_or_null");
Expand Down
1 change: 1 addition & 0 deletions kernel/fork.c
Original file line number Diff line number Diff line change
Expand Up @@ -1130,6 +1130,7 @@ static inline void __mmput(struct mm_struct *mm)
exit_aio(mm);
ksm_exit(mm);
khugepaged_exit(mm); /* must run before exit_mmap */
bpf_thp_exit_mm(mm);
exit_mmap(mm);
mm_put_huge_zero_folio(mm);
set_mm_exe_file(mm, NULL);
Expand Down
22 changes: 22 additions & 0 deletions mm/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -1372,6 +1372,28 @@ config PT_RECLAIM
config FIND_NORMAL_PAGE
def_bool n

menuconfig BPF_MM
bool "BPF-based Memory Management (EXPERIMENTAL)"
depends on BPF_SYSCALL

help
Enable BPF-based Memory Management Policy. This feature is currently
experimental.

WARNING: This feature is unstable and may change in future kernel releases.

if BPF_MM
config BPF_THP
bool "BPF-based THP Policy (EXPERIMENTAL)"
depends on TRANSPARENT_HUGEPAGE && BPF_MM

help
Enable dynamic THP policy adjustment using BPF programs. This feature
is currently experimental.

WARNING: This feature is unstable and may change in future kernel releases.
endif # BPF_MM

source "mm/damon/Kconfig"

endmenu
1 change: 1 addition & 0 deletions mm/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_NUMA) += memory-tiers.o
obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_BPF_THP) += huge_memory_bpf.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
Expand Down
7 changes: 3 additions & 4 deletions mm/huge_memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,14 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
}

unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags,
enum tva_type type,
unsigned long orders)
{
const bool smaps = type == TVA_SMAPS;
const bool in_pf = type == TVA_PAGEFAULT;
const bool in_pf = (type == TVA_PAGEFAULT || type == TVA_SWAP_PAGEFAULT);
const bool forced_collapse = type == TVA_FORCED_COLLAPSE;
unsigned long supported_orders;
vm_flags_t vm_flags = vma->vm_flags;

/* Check the intersection of requested and supported orders. */
if (vma_is_anonymous(vma))
Expand All @@ -122,7 +122,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
if (!vma->vm_mm) /* vdso */
return 0;

if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse))
if (thp_disabled_by_hw() || vma_thp_disabled(vma, forced_collapse))
return 0;

/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
Expand Down Expand Up @@ -1346,7 +1346,6 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
ret = vmf_anon_prepare(vmf);
if (ret)
return ret;
khugepaged_enter_vma(vma, vma->vm_flags);

if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm) &&
Expand Down
Loading
Loading