 #include <linux/kasan.h>
 #include <linux/bpf_verifier.h>
 #include <linux/uaccess.h>
+#include <linux/task_work.h>
+#include <linux/irq_work.h>
 
 #include "../../lib/kstrtox.h"
 
@@ -3737,6 +3739,292 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
 
 typedef void (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value);
 
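+/*
+ * Lifecycle of a bpf_task_work context, as implemented by the transitions
+ * below: STANDBY -> PENDING (scheduling kfunc called) -> SCHEDULING
+ * (irq_work running) -> SCHEDULED (task_work_add() succeeded) -> RUNNING
+ * (callback executing) -> back to STANDBY. bpf_task_work_cancel_and_free()
+ * can move any of these states to FREED.
+ */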
+enum bpf_task_work_state {
+	/* bpf_task_work is ready to be used */
+	BPF_TW_STANDBY = 0,
+	/* irq work scheduling in progress */
+	BPF_TW_PENDING,
+	/* task work scheduling in progress */
+	BPF_TW_SCHEDULING,
+	/* task work is scheduled successfully */
+	BPF_TW_SCHEDULED,
+	/* callback is running */
+	BPF_TW_RUNNING,
+	/* associated BPF map value is deleted */
+	BPF_TW_FREED,
+};
+
+struct bpf_task_work_ctx {
+	enum bpf_task_work_state state;
+	refcount_t refcnt;
+	struct callback_head work;
+	struct irq_work irq_work;
+	/* bpf_prog that schedules task work */
+	struct bpf_prog *prog;
+	/* task for which callback is scheduled */
+	struct task_struct *task;
+	/* the map and map value associated with this context */
+	struct bpf_map *map;
+	void *map_val;
+	enum task_work_notify_mode mode;
+	bpf_task_work_callback_t callback_fn;
+	struct rcu_head rcu;
+} __aligned(8);
+
+/* Actual type for struct bpf_task_work */
+struct bpf_task_work_kern {
+	struct bpf_task_work_ctx *ctx;
+};
+
+static void bpf_task_work_ctx_free_rcu_gp(struct rcu_head *rcu)
+{
+	struct bpf_task_work_ctx *ctx = container_of(rcu, struct bpf_task_work_ctx, rcu);
+
+	/* bpf_mem_free expects migration to be disabled */
+	migrate_disable();
+	bpf_mem_free(&bpf_global_ma, ctx);
+	migrate_enable();
+}
+
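+/*
+ * ctx is freed via call_rcu_tasks_trace() (see bpf_task_work_ctx_put());
+ * chain an extra regular RCU grace period here unless the tasks-trace
+ * grace period already implies one, so that readers under either RCU
+ * flavor remain safe.
+ */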
+static void bpf_task_work_ctx_free_mult_rcu_gp(struct rcu_head *rcu)
+{
+	if (rcu_trace_implies_rcu_gp())
+		bpf_task_work_ctx_free_rcu_gp(rcu);
+	else
+		call_rcu(rcu, bpf_task_work_ctx_free_rcu_gp);
+}
+
+static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx)
+{
+	if (ctx->prog) {
+		bpf_prog_put(ctx->prog);
+		ctx->prog = NULL;
+	}
+	if (ctx->task) {
+		bpf_task_release(ctx->task);
+		ctx->task = NULL;
+	}
+}
+
+static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx)
+{
+	return refcount_inc_not_zero(&ctx->refcnt);
+}
+
+static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx)
+{
+	if (!refcount_dec_and_test(&ctx->refcnt))
+		return;
+
+	bpf_task_work_ctx_reset(ctx);
+	call_rcu_tasks_trace(&ctx->rcu, bpf_task_work_ctx_free_mult_rcu_gp);
+}
+
+static bool task_work_match(struct callback_head *head, void *data)
+{
+	struct bpf_task_work_ctx *ctx = container_of(head, struct bpf_task_work_ctx, work);
+
+	return ctx == data;
+}
+
+static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx)
+{
+	/*
+	 * Scheduled task_work callback holds ctx ref, so if we successfully
+	 * cancelled, we put that ref on callback's behalf. If we couldn't
+	 * cancel, callback is inevitably run or has already completed
+	 * running, and it would have taken care of its ctx ref itself.
+	 */
+	if (task_work_cancel_match(ctx->task, task_work_match, ctx))
+		bpf_task_work_ctx_put(ctx);
+}
+
+static void bpf_task_work_callback(struct callback_head *cb)
+{
+	struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work);
+	enum bpf_task_work_state state;
+	u32 idx;
+	void *key;
+
+	/* Read lock is needed to protect ctx and map key/value access */
+	guard(rcu_tasks_trace)();
+	/*
+	 * This callback may start running before bpf_task_work_irq() switched to
+	 * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING.
+	 */
+	state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING);
+	if (state == BPF_TW_SCHEDULED)
+		state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING);
+	if (state == BPF_TW_FREED) {
+		bpf_task_work_ctx_put(ctx);
+		return;
+	}
+
+	key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx);
+
+	migrate_disable();
+	ctx->callback_fn(ctx->map, key, ctx->map_val);
+	migrate_enable();
+
+	bpf_task_work_ctx_reset(ctx);
+	(void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY);
+
+	bpf_task_work_ctx_put(ctx);
+}
+
+static void bpf_task_work_irq(struct irq_work *irq_work)
+{
+	struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+	enum bpf_task_work_state state;
+	int err;
+
+	guard(rcu_tasks_trace)();
+
+	if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) {
+		bpf_task_work_ctx_put(ctx);
+		return;
+	}
+
+	err = task_work_add(ctx->task, &ctx->work, ctx->mode);
+	if (err) {
+		bpf_task_work_ctx_reset(ctx);
+		/*
+		 * try to switch back to STANDBY for another task_work reuse, but we might have
+		 * gone to FREED already, which is fine as we already cleaned up after ourselves
+		 */
+		(void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY);
+
+		/* we don't have RCU protection, so put after switching state */
+		bpf_task_work_ctx_put(ctx);
+		return;
+	}
+
+	/*
+	 * It's technically possible for the just scheduled task_work callback to
+	 * complete running by now, going SCHEDULING -> RUNNING and then
+	 * dropping its ctx refcount. Instead of capturing an extra ref just to
+	 * protect the ctx->state access below, we rely on RCU protection to
+	 * perform the SCHEDULING -> SCHEDULED attempt below.
+	 */
+	state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED);
+	if (state == BPF_TW_FREED)
+		bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */
+}
+
+static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw,
+							 struct bpf_map *map)
+{
+	struct bpf_task_work_kern *twk = (void *)tw;
+	struct bpf_task_work_ctx *ctx, *old_ctx;
+
+	ctx = READ_ONCE(twk->ctx);
+	if (ctx)
+		return ctx;
+
+	ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx));
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+	memset(ctx, 0, sizeof(*ctx));
+	refcount_set(&ctx->refcnt, 1); /* map's own ref */
+	ctx->state = BPF_TW_STANDBY;
+
+	old_ctx = cmpxchg(&twk->ctx, NULL, ctx);
+	if (old_ctx) {
+		/*
+		 * tw->ctx is set by concurrent BPF program, release allocated
+		 * memory and try to reuse already set context.
+		 */
+		bpf_mem_free(&bpf_global_ma, ctx);
+		return old_ctx;
+	}
+
+	return ctx; /* Success */
+}
+
+static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw,
+							   struct bpf_map *map)
+{
+	struct bpf_task_work_ctx *ctx;
+
+	/* early check to avoid any work, we'll double check at the end again */
+	if (!atomic64_read(&map->usercnt))
+		return ERR_PTR(-EBUSY);
+
+	ctx = bpf_task_work_fetch_ctx(tw, map);
+	if (IS_ERR(ctx))
+		return ctx;
+
+	/* try to get ref for task_work callback to hold */
+	if (!bpf_task_work_ctx_tryget(ctx))
+		return ERR_PTR(-EBUSY);
+
+	if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) {
+		/* lost acquiring race or map_release_uref() stole it from us, put ref and bail */
+		bpf_task_work_ctx_put(ctx);
+		return ERR_PTR(-EBUSY);
+	}
+
+	/*
+	 * Double check that map->usercnt wasn't dropped while we were
+	 * preparing context, and if it was, we need to clean up as if
+	 * map_release_uref() was called; bpf_task_work_cancel_and_free()
+	 * is safe to be called twice on the same task work.
+	 */
+	if (!atomic64_read(&map->usercnt)) {
+		/* drop ref we just got for task_work callback itself */
+		bpf_task_work_ctx_put(ctx);
+		/* transfer map's ref into cancel_and_free() */
+		bpf_task_work_cancel_and_free(tw);
+		return ERR_PTR(-EBUSY);
+	}
+
+	return ctx;
+}
+
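+/*
+ * Common path for the scheduling kfuncs below: take prog and task
+ * references, bind this context to the map value, and defer the actual
+ * task_work_add() call to irq_work, since the kfuncs may run in contexts
+ * where it cannot be called directly.
+ */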
+static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw,
+				  struct bpf_map *map, bpf_task_work_callback_t callback_fn,
+				  struct bpf_prog_aux *aux, enum task_work_notify_mode mode)
+{
+	struct bpf_prog *prog;
+	struct bpf_task_work_ctx *ctx;
+	int err;
+
+	BTF_TYPE_EMIT(struct bpf_task_work);
+
+	prog = bpf_prog_inc_not_zero(aux->prog);
+	if (IS_ERR(prog))
+		return -EBADF;
+	task = bpf_task_acquire(task);
+	if (!task) {
+		err = -EPERM;
+		goto release_prog;
+	}
+
+	ctx = bpf_task_work_acquire_ctx(tw, map);
+	if (IS_ERR(ctx)) {
+		err = PTR_ERR(ctx);
+		goto release_all;
+	}
+
+	ctx->task = task;
+	ctx->callback_fn = callback_fn;
+	ctx->prog = prog;
+	ctx->mode = mode;
+	ctx->map = map;
+	ctx->map_val = (void *)tw - map->record->task_work_off;
+	init_task_work(&ctx->work, bpf_task_work_callback);
+	init_irq_work(&ctx->irq_work, bpf_task_work_irq);
+
+	irq_work_queue(&ctx->irq_work);
+	return 0;
+
+release_all:
+	bpf_task_release(task);
+release_prog:
+	bpf_prog_put(prog);
+	return err;
+}
+
 /**
  * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode
  * @task: Task struct for which callback should be scheduled
@@ -3751,7 +4039,7 @@ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct b
 					      struct bpf_map *map__map,
 					      bpf_task_work_callback_t callback, void *aux__prog)
 {
-	return 0;
+	return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL);
 }
 
 /**
@@ -3768,13 +4056,38 @@ __bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct b
 					      struct bpf_map *map__map,
 					      bpf_task_work_callback_t callback, void *aux__prog)
 {
-	return 0;
+	return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
 }
 
 __bpf_kfunc_end_defs();
 
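+/*
+ * Deferred cancellation: bpf_task_work_cancel_and_free() below avoids
+ * calling task_work_cancel_match() directly (see its NMI note) and instead
+ * queues this irq_work to cancel an already scheduled callback.
+ */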
+static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
+{
+	struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+
+	bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */
+	bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */
+}
+
 void bpf_task_work_cancel_and_free(void *val)
 {
+	struct bpf_task_work_kern *twk = val;
+	struct bpf_task_work_ctx *ctx;
+	enum bpf_task_work_state state;
+
+	ctx = xchg(&twk->ctx, NULL);
+	if (!ctx)
+		return;
+
+	state = xchg(&ctx->state, BPF_TW_FREED);
+	if (state == BPF_TW_SCHEDULED) {
+		/* run in irq_work to avoid locks in NMI */
+		init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled);
+		irq_work_queue(&ctx->irq_work);
+		return;
+	}
+
+	bpf_task_work_ctx_put(ctx); /* put bpf map's ref */
 }
 
 BTF_KFUNCS_START(generic_btf_ids)
@@ -3911,6 +4224,8 @@ BTF_ID_FLAGS(func, bpf_strnstr);
 BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
 #endif
 BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS)
 BTF_KFUNCS_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {