@@ -1092,6 +1092,12 @@ static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx)
         return (void *)value - round_up(map->key_size, 8);
 }
 
+enum bpf_async_state {
+        BPF_ASYNC_READY,
+        BPF_ASYNC_BUSY,
+        BPF_ASYNC_FREED,
+};
+
 struct bpf_async_cb {
         struct bpf_map *map;
         struct bpf_prog *prog;
@@ -1103,6 +1109,7 @@ struct bpf_async_cb {
         };
         u64 flags;
         refcount_t refcnt;
+        enum bpf_async_state state;
 };
 
 /* BPF map elements can contain 'struct bpf_timer'.
@@ -1140,11 +1147,6 @@ struct bpf_async_kern {
                 struct bpf_hrtimer *timer;
                 struct bpf_work *work;
         };
-        /* bpf_spin_lock is used here instead of spinlock_t to make
-         * sure that it always fits into space reserved by struct bpf_timer
-         * regardless of LOCKDEP and spinlock debug flags.
-         */
-        struct bpf_spin_lock lock;
 } __attribute__((aligned(8)));
 
 enum bpf_async_type {
@@ -1276,7 +1278,7 @@ static void bpf_timer_delete_work(struct work_struct *work)
 static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
                             enum bpf_async_type type)
 {
-        struct bpf_async_cb *cb;
+        struct bpf_async_cb *cb, *old_cb;
         struct bpf_hrtimer *t;
         struct bpf_work *w;
         clockid_t clockid;
@@ -1297,18 +1299,13 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
                 return -EINVAL;
         }
 
-        __bpf_spin_lock_irqsave(&async->lock);
-        t = async->timer;
-        if (t) {
-                ret = -EBUSY;
-                goto out;
-        }
+        cb = READ_ONCE(async->cb);
+        if (cb)
+                return -EBUSY;
 
         cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node);
-        if (!cb) {
-                ret = -ENOMEM;
-                goto out;
-        }
+        if (!cb)
+                return -ENOMEM;
 
         switch (type) {
         case BPF_ASYNC_TYPE_TIMER:
@@ -1331,9 +1328,16 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
         cb->map = map;
         cb->prog = NULL;
         cb->flags = flags;
+        cb->state = BPF_ASYNC_READY;
         rcu_assign_pointer(cb->callback_fn, NULL);
+        refcount_set(&cb->refcnt, 1); /* map's own ref */
 
-        WRITE_ONCE(async->cb, cb);
+        old_cb = cmpxchg(&async->cb, NULL, cb);
+        if (old_cb) {
+                /* Lost the race to initialize this bpf_async_kern, drop the allocated object */
+                kfree_nolock(cb);
+                return -EBUSY;
+        }
         /* Guarantee the order between async->cb and map->usercnt. So
          * when there are concurrent uref release and bpf timer init, either
          * bpf_timer_cancel_and_free() called by uref release reads a no-NULL
@@ -1344,12 +1348,17 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
                 /* maps with timers must be either held by user space
                  * or pinned in bpffs.
                  */
-                WRITE_ONCE(async->cb, NULL);
-                kfree_nolock(cb);
-                ret = -EPERM;
+                switch (type) {
+                case BPF_ASYNC_TYPE_TIMER:
+                        bpf_timer_cancel_and_free(async);
+                        break;
+                case BPF_ASYNC_TYPE_WQ:
+                        bpf_wq_cancel_and_free(async);
+                        break;
+                }
+                return -EPERM;
         }
-out:
-        __bpf_spin_unlock_irqrestore(&async->lock);
+
         return ret;
 }
 
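Note: the hunks above rework __bpf_async_init() to publish the freshly allocated bpf_async_cb with cmpxchg() instead of taking the per-element spin lock, with the loser of an init race freeing its own allocation. A minimal userspace sketch of that publish-or-free pattern, written with C11 atomics rather than the kernel's cmpxchg()/kfree_nolock(); all names and types below are illustrative, not the patch's code:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct cb { int dummy; };

struct async { _Atomic(struct cb *) cb; };

static int async_init(struct async *a)
{
    struct cb *expected = NULL;
    struct cb *fresh;

    if (atomic_load(&a->cb))        /* fast path: already initialized */
        return -1;                  /* models -EBUSY */

    fresh = calloc(1, sizeof(*fresh));
    if (!fresh)
        return -2;                  /* models -ENOMEM */

    /* Publish: only one racing initializer can swap NULL -> fresh. */
    if (!atomic_compare_exchange_strong(&a->cb, &expected, fresh)) {
        free(fresh);                /* lost the race, drop our allocation */
        return -1;                  /* models -EBUSY */
    }
    return 0;
}

int main(void)
{
    struct async a = { NULL };

    printf("first init:  %d\n", async_init(&a));   /* 0 */
    printf("second init: %d\n", async_init(&a));   /* -1 */
    free(atomic_load(&a.cb));
    return 0;
}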
@@ -1398,41 +1407,42 @@ static int bpf_async_swap_prog(struct bpf_async_cb *cb, struct bpf_prog *prog)
                 if (IS_ERR(prog))
                         return PTR_ERR(prog);
         }
+        /* Make sure only one thread runs bpf_prog_put() */
+        prev = xchg(&cb->prog, prog);
         if (prev)
                 /* Drop prev prog refcnt when swapping with new prog */
                 bpf_prog_put(prev);
 
-        cb->prog = prog;
         return 0;
 }
 
-static int bpf_async_update_callback(struct bpf_async_kern *async, void *callback_fn,
+static int bpf_async_update_callback(struct bpf_async_cb *cb, void *callback_fn,
                                      struct bpf_prog *prog)
 {
-        struct bpf_async_cb *cb;
+        enum bpf_async_state state;
         int err = 0;
 
-        __bpf_spin_lock_irqsave(&async->lock);
-        cb = async->cb;
-        if (!cb) {
-                err = -EINVAL;
-                goto out;
-        }
-        if (!atomic64_read(&cb->map->usercnt)) {
-                /* maps with timers must be either held by user space
-                 * or pinned in bpffs. Otherwise timer might still be
-                 * running even when bpf prog is detached and user space
-                 * is gone, since map_release_uref won't ever be called.
-                 */
-                err = -EPERM;
-                goto out;
-        }
+        state = cmpxchg(&cb->state, BPF_ASYNC_READY, BPF_ASYNC_BUSY);
+        if (state == BPF_ASYNC_BUSY)
+                return -EBUSY;
+        if (state == BPF_ASYNC_FREED)
+                goto drop;
 
         err = bpf_async_swap_prog(cb, prog);
         if (!err)
                 rcu_assign_pointer(cb->callback_fn, callback_fn);
-out:
-        __bpf_spin_unlock_irqrestore(&async->lock);
+
+        state = cmpxchg(&cb->state, BPF_ASYNC_BUSY, BPF_ASYNC_READY);
+        if (state == BPF_ASYNC_FREED) {
+                /*
+                 * cb is freed concurrently, we may have overwritten prog and callback,
+                 * make sure to drop them
+                 */
+drop:
+                bpf_async_swap_prog(cb, NULL);
+                rcu_assign_pointer(cb->callback_fn, NULL);
+                return -EPERM;
+        }
         return err;
 }
 
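Note: bpf_async_update_callback() above now serializes against teardown with a small READY/BUSY/FREED state machine rather than the spin lock: an updater claims the object with cmpxchg(READY -> BUSY) and, if it observes FREED on either transition, reverts whatever it installed. A rough userspace model of that handshake, assuming C11 atomics; names and error codes are illustrative:

#include <stdatomic.h>
#include <stdio.h>

enum state { READY, BUSY, FREED };

struct cb {
    _Atomic(enum state) state;
    _Atomic(void *) callback;
};

static int update_callback(struct cb *cb, void *fn)
{
    enum state old = READY;

    /* Claim the object: READY -> BUSY. */
    if (!atomic_compare_exchange_strong(&cb->state, &old, BUSY)) {
        if (old == BUSY)
            return -1;              /* another updater is in flight */
        goto drop;                  /* FREED: owner already detached it */
    }

    atomic_store(&cb->callback, fn);

    /* Release: BUSY -> READY, unless teardown marked it FREED meanwhile. */
    old = BUSY;
    if (!atomic_compare_exchange_strong(&cb->state, &old, READY)) {
drop:
        atomic_store(&cb->callback, NULL);  /* undo our update */
        return -2;
    }
    return 0;
}

int main(void)
{
    static int fn;
    struct cb cb = { READY, NULL };

    printf("update:            %d\n", update_callback(&cb, &fn));  /* 0 */
    atomic_store(&cb.state, FREED);     /* simulate cancel_and_free() */
    printf("update after free: %d\n", update_callback(&cb, &fn));  /* -2 */
    return 0;
}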
@@ -1441,11 +1451,18 @@ static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback
                                     enum bpf_async_type type)
 {
         struct bpf_prog *prog = aux->prog;
+        struct bpf_async_cb *cb;
 
         if (in_nmi())
                 return -EOPNOTSUPP;
 
-        return bpf_async_update_callback(async, callback_fn, prog);
+        guard(rcu)();
+
+        cb = READ_ONCE(async->cb);
+        if (!cb)
+                return -EINVAL;
+
+        return bpf_async_update_callback(cb, callback_fn, prog);
 }
 
 BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
@@ -1462,7 +1479,7 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = {
         .arg2_type      = ARG_PTR_TO_FUNC,
 };
 
-BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags)
+BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, flags)
 {
         struct bpf_hrtimer *t;
         int ret = 0;
@@ -1472,12 +1489,19 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, fla
                 return -EOPNOTSUPP;
         if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN))
                 return -EINVAL;
-        __bpf_spin_lock_irqsave(&timer->lock);
-        t = timer->timer;
-        if (!t || !t->cb.prog) {
-                ret = -EINVAL;
-                goto out;
-        }
+
+        guard(rcu)();
+
+        t = READ_ONCE(async->timer);
+        if (!t)
+                return -EINVAL;
+
+        /*
+         * Hold ref while scheduling timer, to make sure, we only cancel and free after
+         * hrtimer_start().
+         */
+        if (!bpf_async_tryget(&t->cb))
+                return -EINVAL;
 
         if (flags & BPF_F_TIMER_ABS)
                 mode = HRTIMER_MODE_ABS_SOFT;
@@ -1488,8 +1512,8 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, fla
                 mode |= HRTIMER_MODE_PINNED;
 
         hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
-out:
-        __bpf_spin_unlock_irqrestore(&timer->lock);
+
+        bpf_async_put(&t->cb, BPF_ASYNC_TYPE_TIMER);
         return ret;
 }
 
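Note: bpf_timer_start() above pins the object with bpf_async_tryget() for the duration of hrtimer_start() and drops that reference with bpf_async_put() afterwards, so a concurrent cancel-and-free cannot release it mid-arm. A small userspace model of that tryget/put window, using a plain C11 atomic counter in place of the kernel's refcount_t; the helper names here are illustrative:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct obj { _Atomic int refcnt; };

/* Take a reference only if the object is still live, mirroring
 * refcount_inc_not_zero() semantics. */
static bool tryget(struct obj *o)
{
    int old = atomic_load(&o->refcnt);

    do {
        if (old == 0)
            return false;           /* teardown already dropped the last ref */
    } while (!atomic_compare_exchange_weak(&o->refcnt, &old, old + 1));
    return true;
}

static void put(struct obj *o)
{
    if (atomic_fetch_sub(&o->refcnt, 1) == 1)
        free(o);                    /* last reference frees the object */
}

static int timer_start(struct obj *o)
{
    if (!tryget(o))
        return -1;                  /* object is being torn down */
    /* ... arm the timer while our reference keeps the object alive ... */
    put(o);
    return 0;
}

int main(void)
{
    struct obj *o = calloc(1, sizeof(*o));

    atomic_store(&o->refcnt, 1);    /* owner's ("map's own") reference */
    printf("start: %d\n", timer_start(o));  /* 0 */
    put(o);                         /* owner drops its ref; object is freed */
    return 0;
}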
@@ -1502,32 +1526,20 @@ static const struct bpf_func_proto bpf_timer_start_proto = {
         .arg3_type      = ARG_ANYTHING,
 };
 
-static void drop_prog_refcnt(struct bpf_async_cb *async)
-{
-        struct bpf_prog *prog = async->prog;
-
-        if (prog) {
-                bpf_prog_put(prog);
-                async->prog = NULL;
-                rcu_assign_pointer(async->callback_fn, NULL);
-        }
-}
-
-BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
+BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async)
 {
         struct bpf_hrtimer *t, *cur_t;
         bool inc = false;
         int ret = 0;
 
         if (in_nmi())
                 return -EOPNOTSUPP;
-        rcu_read_lock();
-        __bpf_spin_lock_irqsave(&timer->lock);
-        t = timer->timer;
-        if (!t) {
-                ret = -EINVAL;
-                goto out;
-        }
+
+        guard(rcu)();
+
+        t = READ_ONCE(async->timer);
+        if (!t)
+                return -EINVAL;
 
         cur_t = this_cpu_read(hrtimer_running);
         if (cur_t == t) {
@@ -1563,16 +1575,15 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
                 goto out;
         }
 drop:
-        drop_prog_refcnt(&t->cb);
+        bpf_async_update_callback(&t->cb, NULL, NULL);
 out:
-        __bpf_spin_unlock_irqrestore(&timer->lock);
         /* Cancel the timer and wait for associated callback to finish
          * if it was running.
          */
         ret = ret ?: hrtimer_cancel(&t->timer);
         if (inc)
                 atomic_dec(&t->cancelling);
-        rcu_read_unlock();
+
         return ret;
 }
 
@@ -1587,22 +1598,17 @@ static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *a
 {
         struct bpf_async_cb *cb;
 
-        /* Performance optimization: read async->cb without lock first. */
-        if (!READ_ONCE(async->cb))
-                return NULL;
-
-        __bpf_spin_lock_irqsave(&async->lock);
-        /* re-read it under lock */
-        cb = async->cb;
-        if (!cb)
-                goto out;
-        drop_prog_refcnt(cb);
-        /* The subsequent bpf_timer_start/cancel() helpers won't be able to use
+        /*
+         * The subsequent bpf_timer_start/cancel() helpers won't be able to use
          * this timer, since it won't be initialized.
          */
-        WRITE_ONCE(async->cb, NULL);
-out:
-        __bpf_spin_unlock_irqrestore(&async->lock);
+        cb = xchg(&async->cb, NULL);
+        if (!cb)
+                return NULL;
+
+        /* cb is detached, set state to FREED, so that concurrent users drop it */
+        xchg(&cb->state, BPF_ASYNC_FREED);
+        bpf_async_update_callback(cb, NULL, NULL);
         return cb;
 }
 
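Note: __bpf_async_cancel_and_free() above detaches the object with xchg() so only one caller can win the non-NULL pointer, then flips the state to FREED so racing updaters back out before the reference is put. A brief userspace model of that detach step, again with C11 atomics and illustrative types:

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

enum state { READY, BUSY, FREED };

struct cb { _Atomic(enum state) state; };

struct async { _Atomic(struct cb *) cb; };

static struct cb *cancel_and_free(struct async *a)
{
    /* Exchange guarantees exactly one caller wins the non-NULL pointer. */
    struct cb *cb = atomic_exchange(&a->cb, NULL);

    if (!cb)
        return NULL;

    /* Concurrent updaters that observe FREED revert whatever they installed
     * (see the state-machine sketch earlier), so the object can be put safely. */
    atomic_store(&cb->state, FREED);
    return cb;
}

int main(void)
{
    struct cb cb = { READY };
    struct async a = { &cb };

    printf("first cancel:  %p\n", (void *)cancel_and_free(&a));  /* &cb */
    printf("second cancel: %p\n", (void *)cancel_and_free(&a));  /* NULL */
    return 0;
}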
@@ -1670,7 +1676,7 @@ void bpf_timer_cancel_and_free(void *val)
         if (!t)
                 return;
 
-        bpf_timer_delete(t);
+        bpf_async_put(&t->cb, BPF_ASYNC_TYPE_TIMER); /* Put map's own reference */
 }
 
 /* This function is called by map_delete/update_elem for individual element and
@@ -1685,12 +1691,8 @@ void bpf_wq_cancel_and_free(void *val)
         work = (struct bpf_work *)__bpf_async_cancel_and_free(val);
         if (!work)
                 return;
-        /* Trigger cancel of the sleepable work, but *do not* wait for
-         * it to finish if it was running as we might not be in a
-         * sleepable context.
-         * kfree will be called once the work has finished.
-         */
-        schedule_work(&work->delete_work);
+
+        bpf_async_put(&work->cb, BPF_ASYNC_TYPE_WQ); /* Put map's own reference */
 }
 
 BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr)
@@ -3169,11 +3171,20 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
                 return -EOPNOTSUPP;
         if (flags)
                 return -EINVAL;
+
+        guard(rcu)();
+
         w = READ_ONCE(async->work);
         if (!w || !READ_ONCE(w->cb.prog))
                 return -EINVAL;
 
+        if (!bpf_async_tryget(&w->cb))
+                return -EINVAL;
+
         schedule_work(&w->work);
+
+        bpf_async_put(&w->cb, BPF_ASYNC_TYPE_WQ);
+
         return 0;
 }
 