
Commit ad73a4d

bpf: task work scheduling kfuncs
Implementation of the bpf_task_work_schedule kfuncs.

Main components:
 * struct bpf_task_work_context – metadata and state management for each task work item.
 * enum bpf_task_work_state – a state machine that serializes work scheduling and execution.
 * bpf_task_work_schedule() – the central helper that initiates scheduling.
 * bpf_task_work_callback() – invoked when the actual task_work runs.
 * bpf_task_work_irq() – an intermediate step (runs in softirq context) to enqueue task work.
 * bpf_task_work_cancel_and_free() – cleanup for deleted BPF map entries.

Flow of task work scheduling:
 1) bpf_task_work_schedule_* is called from BPF code.
 2) The state transitions from STANDBY to PENDING.
 3) irq_work_queue() schedules bpf_task_work_irq().
 4) The state transitions from PENDING to SCHEDULING.
 5) bpf_task_work_irq() attempts task_work_add(). If successful, the state transitions to SCHEDULED.
 6) The task work calls bpf_task_work_callback(), which transitions the state to RUNNING.
 7) The BPF callback is executed.
 8) The context is cleaned up, refcounts are released, and the state is set back to STANDBY.

Map value deletion:
If a map value that contains a bpf_task_work_context is deleted, the BPF map implementation calls bpf_task_work_cancel_and_free(). Deletion is handled by atomically setting the state to FREED and either releasing references there or letting the scheduling logic do so, depending on the last state before the deletion:
 * SCHEDULING: release references in bpf_task_work_cancel_and_free(); expect bpf_task_work_irq() to cancel the task work.
 * SCHEDULED: release references and try to cancel the task work in bpf_task_work_cancel_and_free().
 * other states: one of bpf_task_work_irq(), bpf_task_work_schedule(), or bpf_task_work_callback() should clean up upon detecting the state switching to FREED.

The state transitions are controlled with atomic_cmpxchg, ensuring:
 * Only one thread can successfully enqueue work.
 * Proper handling of concurrent deletes (BPF_TW_FREED).
 * Safe rollback if task_work_add() fails.

Signed-off-by: Mykyta Yatsenko <[email protected]>
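The serialization described above comes down to each actor claiming the context with a compare-and-swap on its state field and backing off, or taking over cleanup, when it loses the race or observes BPF_TW_FREED. The standalone userspace sketch below only illustrates that hand-off pattern with C11 atomics; the tw_* names and the reduced set of transitions are invented for the illustration and are not part of this commit.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum tw_state { TW_STANDBY, TW_PENDING, TW_SCHEDULING, TW_SCHEDULED, TW_RUNNING, TW_FREED };

struct tw_ctx {
        _Atomic enum tw_state state;
};

/* Scheduling path: only one caller wins the STANDBY -> PENDING transition. */
static bool tw_try_schedule(struct tw_ctx *ctx)
{
        enum tw_state expected = TW_STANDBY;

        return atomic_compare_exchange_strong(&ctx->state, &expected, TW_PENDING);
}

/* Deletion path: unconditionally mark FREED; the old state tells us who must clean up. */
static enum tw_state tw_free(struct tw_ctx *ctx)
{
        return atomic_exchange(&ctx->state, TW_FREED);
}

/* Callback path: run only if the context was not freed concurrently. */
static bool tw_start_running(struct tw_ctx *ctx)
{
        enum tw_state expected = TW_SCHEDULED;

        return atomic_compare_exchange_strong(&ctx->state, &expected, TW_RUNNING);
}

int main(void)
{
        struct tw_ctx ctx = { .state = TW_STANDBY };

        printf("first schedule wins:  %d\n", tw_try_schedule(&ctx));  /* 1 */
        printf("second schedule lost: %d\n", tw_try_schedule(&ctx));  /* 0: state is already PENDING */
        printf("state seen by free:   %d\n", (int)tw_free(&ctx));     /* TW_PENDING */
        printf("callback after free:  %d\n", tw_start_running(&ctx)); /* 0: observes FREED, so it cleans up */
        return 0;
}

In the kernel code below, the same pattern appears as the cmpxchg() calls on ctx->state in bpf_task_work_aquire_ctx(), bpf_task_work_irq() and bpf_task_work_callback(), and as the xchg() to BPF_TW_FREED in bpf_task_work_cancel_and_free().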

kernel/bpf/helpers.c

Lines changed: 232 additions & 2 deletions
@@ -25,6 +25,8 @@
 #include <linux/kasan.h>
 #include <linux/bpf_verifier.h>
 #include <linux/uaccess.h>
+#include <linux/task_work.h>
+#include <linux/irq_work.h>
 
 #include "../../lib/kstrtox.h"
 
@@ -3702,6 +3704,200 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
 
 typedef void (*bpf_task_work_callback_t)(struct bpf_map *, void *, void *);
 
+enum bpf_task_work_state {
+        /* bpf_task_work is ready to be used */
+        BPF_TW_STANDBY = 0,
+        /* bpf_task_work is getting scheduled into irq_work */
+        BPF_TW_PENDING,
+        /* task work scheduling in progress */
+        BPF_TW_SCHEDULING,
+        /* bpf_task_work is scheduled into task_work successfully */
+        BPF_TW_SCHEDULED,
+        /* callback is running */
+        BPF_TW_RUNNING,
+        /* BPF map value storing this bpf_task_work is deleted */
+        BPF_TW_FREED,
+};
+
+struct bpf_task_work_context {
+        /* map that contains this structure in a value */
+        struct bpf_map *map;
+        /* map value that contains pointer to this context */
+        void *map_val;
+        /* bpf_prog that schedules task work */
+        struct bpf_prog *prog;
+        /* task for which callback is scheduled */
+        struct task_struct *task;
+        /* notification mode for task work scheduling */
+        enum task_work_notify_mode mode;
+        /* bpf_task_work_state value, representing the state */
+        enum bpf_task_work_state state;
+        /* callback to call from task work */
+        bpf_task_work_callback_t callback_fn;
+        struct callback_head work;
+        struct irq_work irq_work;
+} __aligned(8);
+
+static bool task_work_match(struct callback_head *head, void *data)
+{
+        struct bpf_task_work_context *ctx = container_of(head, struct bpf_task_work_context, work);
+
+        return ctx == data;
+}
+
+static void bpf_reset_task_work_context(struct bpf_task_work_context *ctx)
+{
+        bpf_prog_put(ctx->prog);
+        bpf_task_release(ctx->task);
+}
+
+static void bpf_task_work_callback(struct callback_head *cb)
+{
+        enum bpf_task_work_state state;
+        struct bpf_task_work_context *ctx;
+        u32 idx;
+        void *key;
+
+        ctx = container_of(cb, struct bpf_task_work_context, work);
+
+        /*
+         * If map element is freed, context state is set to BPF_TW_FREED
+         * Otherwise it is safe to access map key value under the rcu_read_lock
+         */
+        rcu_read_lock_trace();
+        state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING);
+        if (state == BPF_TW_SCHEDULED)
+                state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING);
+        if (state == BPF_TW_FREED) {
+                rcu_read_unlock_trace();
+                bpf_reset_task_work_context(ctx);
+                goto free;
+        }
+
+        key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx);
+        migrate_disable();
+        ctx->callback_fn(ctx->map, key, ctx->map_val);
+        migrate_enable();
+
+        rcu_read_unlock_trace();
+        /* State is running or freed, either way reset. */
+        bpf_reset_task_work_context(ctx);
+        state = cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY);
+        if (state == BPF_TW_RUNNING)
+                return; /* Work is done, context is reset */
+
+free:
+        kfree(ctx);
+}
+
+static void bpf_task_work_irq(struct irq_work *irq_work)
+{
+        struct bpf_task_work_context *ctx;
+        enum bpf_task_work_state state;
+        int err;
+
+        ctx = container_of(irq_work, struct bpf_task_work_context, irq_work);
+
+        state = cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING);
+        if (state == BPF_TW_FREED)
+                goto free_context;
+
+        err = task_work_add(ctx->task, &ctx->work, ctx->mode);
+        if (err) {
+                bpf_reset_task_work_context(ctx);
+                state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY);
+                if (state == BPF_TW_FREED)
+                        kfree(ctx);
+                return;
+        }
+        state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED);
+        if (state == BPF_TW_FREED &&
+            task_work_cancel_match(ctx->task, task_work_match, ctx))
+                goto free_context;
+        /* otherwise task work is not cancelled, expect callback to cleanup */
+        return;
+
+free_context:
+        bpf_reset_task_work_context(ctx);
+        kfree(ctx);
+}
+
+static struct bpf_task_work_context *
+bpf_task_work_aquire_ctx(struct bpf_task_work *tw, struct bpf_map *map)
+{
+        struct bpf_task_work_context *ctx;
+        enum bpf_task_work_state state;
+        atomic_long_t *ctx_ptr = (atomic_long_t *)&tw->ctx;
+
+        ctx = (void *)atomic_long_read(ctx_ptr);
+        if (!ctx) {
+                ctx = bpf_map_kmalloc_node(map,
+                                           sizeof(struct bpf_task_work_context),
+                                           GFP_ATOMIC, map->numa_node);
+                if (!ctx)
+                        return ERR_PTR(-ENOMEM);
+                memset(ctx, 0, sizeof(*ctx));
+                if (atomic_long_cmpxchg(ctx_ptr, 0, (unsigned long)ctx) != 0) {
+                        kfree(ctx);
+                        return ERR_PTR(-EBUSY);
+                }
+        }
+        state = cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING);
+        if (state != BPF_TW_STANDBY)
+                return ERR_PTR(-EBUSY);
+
+        return ctx;
+}
+
+static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw,
+                                  struct bpf_map *map, bpf_task_work_callback_t callback_fn,
+                                  struct bpf_prog_aux *aux, enum task_work_notify_mode mode)
+{
+        struct bpf_prog *prog;
+        struct bpf_task_work_context *ctx = NULL;
+        int err;
+
+        BTF_TYPE_EMIT(struct bpf_task_work);
+
+        prog = bpf_prog_inc_not_zero(aux->prog);
+        if (IS_ERR(prog))
+                return -EPERM;
+
+        if (!atomic64_read(&map->usercnt)) {
+                err = -EPERM;
+                goto release_prog;
+        }
+        task = bpf_task_acquire(task);
+        if (!task) {
+                err = -EPERM;
+                goto release_prog;
+        }
+        ctx = bpf_task_work_aquire_ctx(tw, map);
+        if (IS_ERR(ctx)) {
+                err = PTR_ERR(ctx);
+                goto release_all;
+        }
+
+        ctx->task = task;
+        ctx->callback_fn = callback_fn;
+        ctx->prog = prog;
+        ctx->mode = mode;
+        ctx->map = map;
+        ctx->map_val = (void *)tw - map->record->task_work_off;
+        init_irq_work(&ctx->irq_work, bpf_task_work_irq);
+        init_task_work(&ctx->work, bpf_task_work_callback);
+
+        irq_work_queue(&ctx->irq_work);
+
+        return 0;
+
+release_all:
+        bpf_task_release(task);
+release_prog:
+        bpf_prog_put(prog);
+        return err;
+}
+
 /**
  * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode
  * @task: Task struct for which callback should be scheduled
@@ -3718,7 +3914,8 @@ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task,
                                               bpf_task_work_callback_t callback,
                                               void *aux__prog)
 {
-        return 0;
+        return bpf_task_work_schedule(task, tw, map__map,
+                                      callback, aux__prog, TWA_SIGNAL);
 }
 
 /**
@@ -3738,13 +3935,44 @@ __bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task,
                                               bpf_task_work_callback_t callback,
                                               void *aux__prog)
 {
-        return 0;
+        enum task_work_notify_mode mode;
+
+        mode = task == current && in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME;
+        return bpf_task_work_schedule(task, tw, map__map,
+                                      callback, aux__prog, mode);
 }
 
 __bpf_kfunc_end_defs();
 
 void bpf_task_work_cancel_and_free(void *val)
 {
+        struct bpf_task_work *tw = val;
+        struct bpf_task_work_context *ctx;
+        enum bpf_task_work_state state;
+
+        ctx = (void *)atomic_long_xchg((atomic_long_t *)&tw->ctx, 0);
+        if (!ctx)
+                return;
+
+        state = xchg(&ctx->state, BPF_TW_FREED);
+
+        switch (state) {
+        case BPF_TW_SCHEDULED:
+                /* If we can't cancel task work, rely on task work callback to free the context */
+                if (!task_work_cancel_match(ctx->task, task_work_match, ctx))
+                        break;
+                bpf_reset_task_work_context(ctx);
+                fallthrough;
+        case BPF_TW_STANDBY:
+                kfree(ctx);
+                break;
+        /* In all below cases scheduling logic should detect context state change and cleanup */
+        case BPF_TW_SCHEDULING:
+        case BPF_TW_PENDING:
+        case BPF_TW_RUNNING:
+        default:
+                break;
+        }
 }
 
 BTF_KFUNCS_START(generic_btf_ids)
@@ -3770,6 +3998,8 @@ BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS)
 
 #ifdef CONFIG_CGROUPS
 BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
