13 changes: 13 additions & 0 deletions include/linux/huge_mm.h
@@ -6,6 +6,8 @@

#include <linux/fs.h> /* only for vma_is_dax() */
#include <linux/kobject.h>
#include <linux/pgtable.h>
#include <linux/mm.h>

vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -54,6 +56,7 @@ enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
TRANSPARENT_HUGEPAGE_BPF_ATTACHED, /* BPF prog is attached */
};

struct kobject;
@@ -190,6 +193,16 @@ static inline bool hugepage_global_always(void)
(1<<TRANSPARENT_HUGEPAGE_FLAG);
}

#ifdef CONFIG_EXPERIMENTAL_BPF_ORDER_SELECTION
int get_suggested_order(struct mm_struct *mm, unsigned long tva_flags, int order);
#else
static inline int
get_suggested_order(struct mm_struct *mm, unsigned long tva_flags, int order)
{
return order;
}
#endif

static inline int highest_order(unsigned long orders)
{
return fls_long(orders) - 1;
12 changes: 11 additions & 1 deletion include/linux/khugepaged.h
@@ -2,6 +2,8 @@
#ifndef _LINUX_KHUGEPAGED_H
#define _LINUX_KHUGEPAGED_H

#include <linux/huge_mm.h>

extern unsigned int khugepaged_max_ptes_none __read_mostly;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern struct attribute_group khugepaged_attr_group;
@@ -20,7 +22,15 @@ extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,

static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags))
/*
* THP allocation policy can be dynamically modified via BPF. If a
* long-lived task was previously allowed to allocate THP but is no
* longer permitted under the new policy, we must ensure its forked
* child processes also inherit this restriction.
* The MMF_VM_HUGEPAGE flag will be cleared by khugepaged.
*/
if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags) &&
get_suggested_order(mm, 0, PMD_ORDER) == PMD_ORDER)
__khugepaged_enter(mm);
}

12 changes: 12 additions & 0 deletions mm/Kconfig
@@ -908,6 +908,18 @@ config NO_PAGE_MAPCOUNT

EXPERIMENTAL because the impact of some changes is still unclear.

config EXPERIMENTAL_BPF_ORDER_SELECTION
bool "BPF-based THP order selection (EXPERIMENTAL)"
depends on TRANSPARENT_HUGEPAGE && BPF_SYSCALL
help
Enable dynamic THP order selection using BPF programs. This
experimental feature allows custom BPF logic to determine optimal
transparent hugepage allocation sizes at runtime.

Warning: This feature is unstable and may change in future kernel
versions.

endif # TRANSPARENT_HUGEPAGE

# simple helper to make the code a bit easier to read
1 change: 1 addition & 0 deletions mm/Makefile
@@ -99,6 +99,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_NUMA) += memory-tiers.o
obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_EXPERIMENTAL_BPF_ORDER_SELECTION) += bpf_thp.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
255 changes: 255 additions & 0 deletions mm/bpf_thp.c
@@ -0,0 +1,255 @@
// SPDX-License-Identifier: GPL-2.0

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/huge_mm.h>
#include <linux/khugepaged.h>

struct bpf_thp_ops {
/**
* @get_suggested_order: Get the suggested highest THP order for allocation
* @mm: mm_struct associated with the THP allocation
* @tva_flags: TVA flags for current context
* %TVA_IN_PF: Set when in page fault context
* Other flags: Reserved for future use
* @order: The highest order being considered for this THP allocation.
* %PUD_ORDER for PUD-mapped allocations
* %PMD_ORDER for PMD-mapped allocations
* %PMD_ORDER - 1 for mTHP allocations
*
* Return: Suggested highest THP order to use for allocation. The returned
* order will never exceed the input @order value.
*/
int (*get_suggested_order)(struct mm_struct *mm, unsigned long tva_flags, int order) __rcu;
};

static struct bpf_thp_ops bpf_thp;
static DEFINE_SPINLOCK(thp_ops_lock);

int get_suggested_order(struct mm_struct *mm, unsigned long tva_flags, int order)
{
int (*bpf_suggested_order)(struct mm_struct *mm, unsigned long tva_flags, int order);
int suggested_order = order;

/* No BPF program is attached */
if (!test_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED,
&transparent_hugepage_flags))
return suggested_order;

rcu_read_lock();
bpf_suggested_order = rcu_dereference(bpf_thp.get_suggested_order);
if (!bpf_suggested_order)
goto out;

suggested_order = bpf_suggested_order(mm, tva_flags, order);
if (suggested_order > order)
suggested_order = order;

out:
rcu_read_unlock();
return suggested_order;
}

static bool bpf_thp_ops_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
}

static const struct bpf_func_proto *
bpf_thp_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
return bpf_base_func_proto(func_id, prog);
}

static const struct bpf_verifier_ops thp_bpf_verifier_ops = {
.get_func_proto = bpf_thp_get_func_proto,
.is_valid_access = bpf_thp_ops_is_valid_access,
};

static int bpf_thp_init(struct btf *btf)
{
return 0;
}

static int bpf_thp_init_member(const struct btf_type *t,
const struct btf_member *member,
void *kdata, const void *udata)
{
return 0;
}

static int bpf_thp_reg(void *kdata, struct bpf_link *link)
{
struct bpf_thp_ops *ops = kdata;

spin_lock(&thp_ops_lock);
if (test_and_set_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED,
&transparent_hugepage_flags)) {
spin_unlock(&thp_ops_lock);
return -EBUSY;
}
WARN_ON_ONCE(bpf_thp.get_suggested_order);
rcu_assign_pointer(bpf_thp.get_suggested_order, ops->get_suggested_order);
spin_unlock(&thp_ops_lock);
return 0;
}

static void bpf_thp_unreg(void *kdata, struct bpf_link *link)
{
spin_lock(&thp_ops_lock);
clear_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, &transparent_hugepage_flags);
WARN_ON_ONCE(!bpf_thp.get_suggested_order);
rcu_replace_pointer(bpf_thp.get_suggested_order, NULL, lockdep_is_held(&thp_ops_lock));
spin_unlock(&thp_ops_lock);

synchronize_rcu();
}

static int bpf_thp_update(void *kdata, void *old_kdata, struct bpf_link *link)
{
struct bpf_thp_ops *ops = kdata;
struct bpf_thp_ops *old = old_kdata;

if (!ops || !old)
return -EINVAL;

spin_lock(&thp_ops_lock);
if (!test_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, &transparent_hugepage_flags))
goto out;
rcu_replace_pointer(bpf_thp.get_suggested_order, ops->get_suggested_order,
lockdep_is_held(&thp_ops_lock));

out:
spin_unlock(&thp_ops_lock);
synchronize_rcu();
return 0;
}

static int bpf_thp_validate(void *kdata)
{
struct bpf_thp_ops *ops = kdata;

if (!ops->get_suggested_order) {
pr_err("bpf_thp: required ops isn't implemented\n");
return -EINVAL;
}
return 0;
}

static int suggested_order(struct mm_struct *mm, unsigned long tva_flags, int order)
{
return order;
}

static struct bpf_thp_ops __bpf_thp_ops = {
.get_suggested_order = suggested_order,
};

static struct bpf_struct_ops bpf_bpf_thp_ops = {
.verifier_ops = &thp_bpf_verifier_ops,
.init = bpf_thp_init,
.init_member = bpf_thp_init_member,
.reg = bpf_thp_reg,
.unreg = bpf_thp_unreg,
.update = bpf_thp_update,
.validate = bpf_thp_validate,
.cfi_stubs = &__bpf_thp_ops,
.owner = THIS_MODULE,
.name = "bpf_thp_ops",
};

__bpf_kfunc_start_defs();

/**
* bpf_mm_get_mem_cgroup - Get the memory cgroup associated with a mm_struct.
* @mm: The mm_struct to query
*
* The obtained mem_cgroup must be released by calling bpf_put_mem_cgroup().
*
* Return: The associated mem_cgroup on success, or NULL on failure. Note that
* this function depends on CONFIG_MEMCG being enabled - it will always return
* NULL if CONFIG_MEMCG is not configured.
*/
__bpf_kfunc struct mem_cgroup *bpf_mm_get_mem_cgroup(struct mm_struct *mm)
{
return get_mem_cgroup_from_mm(mm);
}

/**
* bpf_put_mem_cgroup - Release a memory cgroup obtained from bpf_mm_get_mem_cgroup()
* @memcg: The memory cgroup to release
*/
__bpf_kfunc void bpf_put_mem_cgroup(struct mem_cgroup *memcg)
{
#ifdef CONFIG_MEMCG
if (!memcg)
return;
css_put(&memcg->css);
#endif
}

/**
* bpf_mm_get_task - Get the task struct associated with a mm_struct.
* @mm: The mm_struct to query
*
* The obtained task_struct must be released by calling bpf_task_release().
*
* Return: The associated task_struct on success, or NULL on failure. Note that
* this function depends on CONFIG_MEMCG being enabled - it will always return
* NULL if CONFIG_MEMCG is not configured.
*/
__bpf_kfunc struct task_struct *bpf_mm_get_task(struct mm_struct *mm)
{
#ifdef CONFIG_MEMCG
struct task_struct *task;

if (!mm)
return NULL;
rcu_read_lock();
task = rcu_dereference(mm->owner);
if (!task)
goto out;
if (!refcount_inc_not_zero(&task->rcu_users))
goto out;

rcu_read_unlock();
return task;

out:
rcu_read_unlock();
#endif
return NULL;
}

__bpf_kfunc_end_defs();

BTF_KFUNCS_START(bpf_thp_ids)
BTF_ID_FLAGS(func, bpf_mm_get_mem_cgroup, KF_TRUSTED_ARGS | KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_mm_get_task, KF_TRUSTED_ARGS | KF_ACQUIRE | KF_RET_NULL)
BTF_KFUNCS_END(bpf_thp_ids)

static const struct btf_kfunc_id_set bpf_thp_set = {
.owner = THIS_MODULE,
.set = &bpf_thp_ids,
};

static int __init bpf_thp_ops_init(void)
{
int err;

err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_thp_set);
if (err) {
pr_err("bpf_thp: Failed to register kfunc sets (%d)\n", err);
return err;
}

err = register_bpf_struct_ops(&bpf_bpf_thp_ops, bpf_thp_ops);
if (err)
pr_err("bpf_thp: Failed to register struct_ops (%d)\n", err);
return err;
}
late_initcall(bpf_thp_ops_init);
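For context on how the interface above is meant to be consumed (this is not part of the patch), a BPF-side implementation might look roughly like the sketch below. The file name, the map name thp_policy, the SEC() section names, and the local TVA_IN_PF definition are illustrative assumptions; the callback signature, the clamping behaviour, and the kfuncs follow the bpf_thp_ops definitions above.

// SPDX-License-Identifier: GPL-2.0
/* thp_policy.bpf.c - illustrative sketch only; not part of this patch. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

/* Assumed to mirror the kernel's TVA_IN_PF value; verify against huge_mm.h. */
#define TVA_IN_PF	(1 << 1)

/* kfuncs exported by this patch (registered for struct_ops programs). */
extern struct mem_cgroup *bpf_mm_get_mem_cgroup(struct mm_struct *mm) __ksym;
extern void bpf_put_mem_cgroup(struct mem_cgroup *memcg) __ksym;

SEC("struct_ops/get_suggested_order")
int BPF_PROG(suggest_order, struct mm_struct *mm, unsigned long tva_flags, int order)
{
	struct mem_cgroup *memcg;

	/*
	 * Example policy: refuse PMD-sized THP in the page fault path
	 * (do_huge_pmd_anonymous_page() falls back when the returned order
	 * differs from PMD_ORDER) while leaving khugepaged collapse alone.
	 * Anything returned above @order is clamped back to @order by the
	 * kernel-side get_suggested_order() wrapper.
	 */
	if (tva_flags & TVA_IN_PF)
		return 0;

	/*
	 * A cgroup-aware policy could inspect the mm's memcg here; any
	 * acquired reference must be released with bpf_put_mem_cgroup().
	 */
	memcg = bpf_mm_get_mem_cgroup(mm);
	if (memcg)
		bpf_put_mem_cgroup(memcg);

	return order;
}

SEC(".struct_ops.link")
struct bpf_thp_ops thp_policy = {
	.get_suggested_order = (void *)suggest_order,
};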
9 changes: 9 additions & 0 deletions mm/huge_memory.c
@@ -1328,6 +1328,15 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
return ret;
khugepaged_enter_vma(vma, vma->vm_flags);

/*
* This check must come after khugepaged_enter_vma(): the BPF policy
* may still allow this mm to get THP via khugepaged collapse while
* disallowing THP allocation from the page fault path.
*/
if (get_suggested_order(vma->vm_mm, TVA_IN_PF, PMD_ORDER) != PMD_ORDER)
return VM_FAULT_FALLBACK;

if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm) &&
transparent_hugepage_use_zero_page()) {
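Loading and attaching such a policy from user space would follow the usual libbpf struct_ops flow. A minimal sketch, assuming the BPF object above has been compiled and a skeleton header thp_policy.skel.h generated with bpftool (the loader file name and skeleton names are hypothetical):

/* load_thp_policy.c - hypothetical loader; not part of this patch. */
#include <stdio.h>
#include <unistd.h>
#include <bpf/libbpf.h>
#include "thp_policy.skel.h"	/* generated with: bpftool gen skeleton thp_policy.bpf.o */

int main(void)
{
	struct thp_policy_bpf *skel;
	struct bpf_link *link;

	skel = thp_policy_bpf__open_and_load();
	if (!skel) {
		fprintf(stderr, "failed to open/load thp_policy BPF object\n");
		return 1;
	}

	/*
	 * Registers the struct_ops map; the kernel-side bpf_thp_reg()
	 * returns -EBUSY if another policy is already attached.
	 */
	link = bpf_map__attach_struct_ops(skel->maps.thp_policy);
	if (!link) {
		fprintf(stderr, "failed to attach bpf_thp_ops\n");
		thp_policy_bpf__destroy(skel);
		return 1;
	}

	pause();	/* policy stays active until the link goes away (bpf_thp_unreg) */

	bpf_link__destroy(link);
	thp_policy_bpf__destroy(skel);
	return 0;
}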