Commit d4fdc91

mykyta5 authored and Kernel Patches Daemon committed
bpf: remove lock from bpf_async_cb
Remove the lock from bpf_async_cb and refactor the bpf_timer and bpf_wq kfuncs and helpers to run without it.

The bpf_async_cb lifetime is managed by the refcnt and RCU, so every function that uses it has to apply an RCU guard. The cancel_and_free() path detaches bpf_async_cb from the map value (struct bpf_async_kern) and atomically sets its state to the terminal BPF_ASYNC_FREED; concurrent readers may operate on a detached bpf_async_cb safely under the RCU read lock.

Guarantee a safe bpf_prog drop from bpf_async_cb by handling the BPF_ASYNC_FREED state in bpf_async_update_callback().

Signed-off-by: Mykyta Yatsenko <[email protected]>
1 parent 4793260 commit d4fdc91
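The hunks below replace the bpf_spin_lock serialization with a READY/BUSY/FREED state machine driven by cmpxchg(). The following standalone C11 sketch (not kernel code, and not part of this commit) models that protocol with atomic compare-and-swap in place of the kernel's cmpxchg(); refcounting and RCU are elided, and all names are illustrative.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum state { READY, BUSY, FREED };

struct cb {
	_Atomic enum state state;
	void *callback_fn;
};

/*
 * update_callback(): move READY -> BUSY, install the callback, then try to
 * move BUSY -> READY. If a concurrent cancel_and_free() set FREED in the
 * meantime, undo the installation so nothing stays attached to a dead object.
 */
static bool update_callback(struct cb *cb, void *fn)
{
	enum state expected = READY;

	if (!atomic_compare_exchange_strong(&cb->state, &expected, BUSY))
		return false;	/* BUSY: concurrent updater; FREED: object is gone */

	cb->callback_fn = fn;

	expected = BUSY;
	if (!atomic_compare_exchange_strong(&cb->state, &expected, READY)) {
		/* State became FREED concurrently: drop what we just installed. */
		cb->callback_fn = NULL;
		return false;
	}
	return true;
}

/* cancel_and_free(): publish the terminal state so in-flight updaters clean up. */
static void cancel_and_free(struct cb *cb)
{
	atomic_store(&cb->state, FREED);
}

int main(void)
{
	struct cb cb = { .state = READY, .callback_fn = NULL };

	printf("installed: %d\n", update_callback(&cb, (void *)0x1));
	cancel_and_free(&cb);
	printf("installed after free: %d\n", update_callback(&cb, (void *)0x1));
	return 0;
}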

File tree

1 file changed (+106, -95 lines)


kernel/bpf/helpers.c

Lines changed: 106 additions & 95 deletions
@@ -1092,6 +1092,12 @@ static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx)
 	return (void *)value - round_up(map->key_size, 8);
 }
 
+enum bpf_async_state {
+	BPF_ASYNC_READY,
+	BPF_ASYNC_BUSY,
+	BPF_ASYNC_FREED,
+};
+
 struct bpf_async_cb {
 	struct bpf_map *map;
 	struct bpf_prog *prog;
@@ -1103,6 +1109,7 @@ struct bpf_async_cb {
 	};
 	u64 flags;
 	refcount_t refcnt;
+	enum bpf_async_state state;
 };
 
 /* BPF map elements can contain 'struct bpf_timer'.
@@ -1140,11 +1147,6 @@ struct bpf_async_kern {
 		struct bpf_hrtimer *timer;
 		struct bpf_work *work;
 	};
-	/* bpf_spin_lock is used here instead of spinlock_t to make
-	 * sure that it always fits into space reserved by struct bpf_timer
-	 * regardless of LOCKDEP and spinlock debug flags.
-	 */
-	struct bpf_spin_lock lock;
 } __attribute__((aligned(8)));
 
 enum bpf_async_type {
@@ -1276,7 +1278,7 @@ static void bpf_timer_delete_work(struct work_struct *work)
 static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
 			    enum bpf_async_type type)
 {
-	struct bpf_async_cb *cb;
+	struct bpf_async_cb *cb, *old_cb;
 	struct bpf_hrtimer *t;
 	struct bpf_work *w;
 	clockid_t clockid;
@@ -1297,18 +1299,13 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 		return -EINVAL;
 	}
 
-	__bpf_spin_lock_irqsave(&async->lock);
-	t = async->timer;
-	if (t) {
-		ret = -EBUSY;
-		goto out;
-	}
+	cb = READ_ONCE(async->cb);
+	if (cb)
+		return -EBUSY;
 
 	cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node);
-	if (!cb) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (!cb)
+		return -ENOMEM;
 
 	switch (type) {
 	case BPF_ASYNC_TYPE_TIMER:
@@ -1331,9 +1328,16 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 	cb->map = map;
 	cb->prog = NULL;
 	cb->flags = flags;
+	cb->state = BPF_ASYNC_READY;
 	rcu_assign_pointer(cb->callback_fn, NULL);
+	refcount_set(&cb->refcnt, 1); /* map's own ref */
 
-	WRITE_ONCE(async->cb, cb);
+	old_cb = cmpxchg(&async->cb, NULL, cb);
+	if (old_cb) {
+		/* Lost the race to initialize this bpf_async_kern, drop the allocated object */
+		kfree_nolock(cb);
+		return -EBUSY;
+	}
 	/* Guarantee the order between async->cb and map->usercnt. So
 	 * when there are concurrent uref release and bpf timer init, either
 	 * bpf_timer_cancel_and_free() called by uref release reads a no-NULL
13441348
/* maps with timers must be either held by user space
13451349
* or pinned in bpffs.
13461350
*/
1347-
WRITE_ONCE(async->cb, NULL);
1348-
kfree_nolock(cb);
1349-
ret = -EPERM;
1351+
switch (type) {
1352+
case BPF_ASYNC_TYPE_TIMER:
1353+
bpf_timer_cancel_and_free(async);
1354+
break;
1355+
case BPF_ASYNC_TYPE_WQ:
1356+
bpf_wq_cancel_and_free(async);
1357+
break;
1358+
}
1359+
return -EPERM;
13501360
}
1351-
out:
1352-
__bpf_spin_unlock_irqrestore(&async->lock);
1361+
13531362
return ret;
13541363
}
13551364

@@ -1398,41 +1407,42 @@ static int bpf_async_swap_prog(struct bpf_async_cb *cb, struct bpf_prog *prog)
 		if (IS_ERR(prog))
 			return PTR_ERR(prog);
 	}
+	/* Make sure only one thread runs bpf_prog_put() */
+	prev = xchg(&cb->prog, prog);
 	if (prev)
 		/* Drop prev prog refcnt when swapping with new prog */
 		bpf_prog_put(prev);
 
-	cb->prog = prog;
 	return 0;
 }
 
-static int bpf_async_update_callback(struct bpf_async_kern *async, void *callback_fn,
+static int bpf_async_update_callback(struct bpf_async_cb *cb, void *callback_fn,
 				     struct bpf_prog *prog)
 {
-	struct bpf_async_cb *cb;
+	enum bpf_async_state state;
 	int err = 0;
 
-	__bpf_spin_lock_irqsave(&async->lock);
-	cb = async->cb;
-	if (!cb) {
-		err = -EINVAL;
-		goto out;
-	}
-	if (!atomic64_read(&cb->map->usercnt)) {
-		/* maps with timers must be either held by user space
-		 * or pinned in bpffs. Otherwise timer might still be
-		 * running even when bpf prog is detached and user space
-		 * is gone, since map_release_uref won't ever be called.
-		 */
-		err = -EPERM;
-		goto out;
-	}
+	state = cmpxchg(&cb->state, BPF_ASYNC_READY, BPF_ASYNC_BUSY);
+	if (state == BPF_ASYNC_BUSY)
+		return -EBUSY;
+	if (state == BPF_ASYNC_FREED)
+		goto drop;
 
 	err = bpf_async_swap_prog(cb, prog);
 	if (!err)
 		rcu_assign_pointer(cb->callback_fn, callback_fn);
-out:
-	__bpf_spin_unlock_irqrestore(&async->lock);
+
+	state = cmpxchg(&cb->state, BPF_ASYNC_BUSY, BPF_ASYNC_READY);
+	if (state == BPF_ASYNC_FREED) {
+		/*
+		 * cb is freed concurrently, we may have overwritten prog and callback,
+		 * make sure to drop them
+		 */
+drop:
+		bpf_async_swap_prog(cb, NULL);
+		rcu_assign_pointer(cb->callback_fn, NULL);
+		return -EPERM;
+	}
 	return err;
 }
 
@@ -1441,11 +1451,18 @@ static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback
 				    enum bpf_async_type type)
 {
 	struct bpf_prog *prog = aux->prog;
+	struct bpf_async_cb *cb;
 
 	if (in_nmi())
 		return -EOPNOTSUPP;
 
-	return bpf_async_update_callback(async, callback_fn, prog);
+	guard(rcu)();
+
+	cb = READ_ONCE(async->cb);
+	if (!cb)
+		return -EINVAL;
+
+	return bpf_async_update_callback(cb, callback_fn, prog);
 }
 
 BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
@@ -1462,7 +1479,7 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = {
 	.arg2_type	= ARG_PTR_TO_FUNC,
 };
 
-BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags)
+BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, flags)
 {
 	struct bpf_hrtimer *t;
 	int ret = 0;
@@ -1472,12 +1489,19 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, fla
 		return -EOPNOTSUPP;
 	if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN))
 		return -EINVAL;
-	__bpf_spin_lock_irqsave(&timer->lock);
-	t = timer->timer;
-	if (!t || !t->cb.prog) {
-		ret = -EINVAL;
-		goto out;
-	}
+
+	guard(rcu)();
+
+	t = READ_ONCE(async->timer);
+	if (!t)
+		return -EINVAL;
+
+	/*
+	 * Hold ref while scheduling timer, to make sure, we only cancel and free after
+	 * hrtimer_start().
+	 */
+	if (!bpf_async_tryget(&t->cb))
+		return -EINVAL;
 
 	if (flags & BPF_F_TIMER_ABS)
 		mode = HRTIMER_MODE_ABS_SOFT;
@@ -1488,8 +1512,8 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, fla
 		mode |= HRTIMER_MODE_PINNED;
 
 	hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
-out:
-	__bpf_spin_unlock_irqrestore(&timer->lock);
+
+	bpf_async_put(&t->cb, BPF_ASYNC_TYPE_TIMER);
 	return ret;
 }
 
@@ -1502,32 +1526,20 @@ static const struct bpf_func_proto bpf_timer_start_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
-static void drop_prog_refcnt(struct bpf_async_cb *async)
-{
-	struct bpf_prog *prog = async->prog;
-
-	if (prog) {
-		bpf_prog_put(prog);
-		async->prog = NULL;
-		rcu_assign_pointer(async->callback_fn, NULL);
-	}
-}
-
-BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
+BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async)
 {
 	struct bpf_hrtimer *t, *cur_t;
 	bool inc = false;
 	int ret = 0;
 
 	if (in_nmi())
 		return -EOPNOTSUPP;
-	rcu_read_lock();
-	__bpf_spin_lock_irqsave(&timer->lock);
-	t = timer->timer;
-	if (!t) {
-		ret = -EINVAL;
-		goto out;
-	}
+
+	guard(rcu)();
+
+	t = READ_ONCE(async->timer);
+	if (!t)
+		return -EINVAL;
 
 	cur_t = this_cpu_read(hrtimer_running);
 	if (cur_t == t) {
@@ -1563,16 +1575,15 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
 		goto out;
 	}
 drop:
-	drop_prog_refcnt(&t->cb);
+	bpf_async_update_callback(&t->cb, NULL, NULL);
 out:
-	__bpf_spin_unlock_irqrestore(&timer->lock);
 	/* Cancel the timer and wait for associated callback to finish
 	 * if it was running.
 	 */
 	ret = ret ?: hrtimer_cancel(&t->timer);
 	if (inc)
 		atomic_dec(&t->cancelling);
-	rcu_read_unlock();
+
 	return ret;
 }
 
@@ -1587,22 +1598,17 @@ static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *a
 {
 	struct bpf_async_cb *cb;
 
-	/* Performance optimization: read async->cb without lock first. */
-	if (!READ_ONCE(async->cb))
-		return NULL;
-
-	__bpf_spin_lock_irqsave(&async->lock);
-	/* re-read it under lock */
-	cb = async->cb;
-	if (!cb)
-		goto out;
-	drop_prog_refcnt(cb);
-	/* The subsequent bpf_timer_start/cancel() helpers won't be able to use
+	/*
+	 * The subsequent bpf_timer_start/cancel() helpers won't be able to use
 	 * this timer, since it won't be initialized.
 	 */
-	WRITE_ONCE(async->cb, NULL);
-out:
-	__bpf_spin_unlock_irqrestore(&async->lock);
+	cb = xchg(&async->cb, NULL);
+	if (!cb)
+		return NULL;
+
+	/* cb is detached, set state to FREED, so that concurrent users drop it */
+	xchg(&cb->state, BPF_ASYNC_FREED);
+	bpf_async_update_callback(cb, NULL, NULL);
 	return cb;
 }
 
@@ -1670,7 +1676,7 @@ void bpf_timer_cancel_and_free(void *val)
 	if (!t)
 		return;
 
-	bpf_timer_delete(t);
+	bpf_async_put(&t->cb, BPF_ASYNC_TYPE_TIMER); /* Put map's own reference */
 }
 
 /* This function is called by map_delete/update_elem for individual element and
@@ -1685,12 +1691,8 @@ void bpf_wq_cancel_and_free(void *val)
 	work = (struct bpf_work *)__bpf_async_cancel_and_free(val);
 	if (!work)
 		return;
-	/* Trigger cancel of the sleepable work, but *do not* wait for
-	 * it to finish if it was running as we might not be in a
-	 * sleepable context.
-	 * kfree will be called once the work has finished.
-	 */
-	schedule_work(&work->delete_work);
+
+	bpf_async_put(&work->cb, BPF_ASYNC_TYPE_WQ); /* Put map's own reference */
 }
 
 BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr)
@@ -3169,11 +3171,20 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
 		return -EOPNOTSUPP;
 	if (flags)
 		return -EINVAL;
+
+	guard(rcu)();
+
 	w = READ_ONCE(async->work);
 	if (!w || !READ_ONCE(w->cb.prog))
 		return -EINVAL;
 
+	if (!bpf_async_tryget(&w->cb))
+		return -EINVAL;
+
 	schedule_work(&w->work);
+
+	bpf_async_put(&w->cb, BPF_ASYNC_TYPE_WQ);
+
 	return 0;
 }
 
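For context on the call paths reworked above, here is a minimal BPF-side sketch of how the timer helpers are typically driven from a program. It is not part of this commit; the map, section, and callback names are illustrative, and the kernel-side bookkeeping behind struct bpf_timer is what the hunks above change.

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define CLOCK_MONOTONIC 1	/* UAPI clockid, not provided by vmlinux.h */

struct elem {
	struct bpf_timer t;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct elem);
} timer_map SEC(".maps");

/* Timer callback: invoked via the callback_fn pointer kept in bpf_async_cb. */
static int timer_cb(void *map, int *key, struct elem *val)
{
	return 0;
}

SEC("fentry/bpf_fentry_test1")
int BPF_PROG(arm_timer)
{
	struct elem *val;
	int key = 0;

	val = bpf_map_lookup_elem(&timer_map, &key);
	if (!val)
		return 0;

	/* init -> set_callback -> start: the paths refactored by this commit */
	bpf_timer_init(&val->t, &timer_map, CLOCK_MONOTONIC);
	bpf_timer_set_callback(&val->t, timer_cb);
	bpf_timer_start(&val->t, 1000000 /* 1 ms */, 0);
	return 0;
}

char _license[] SEC("license") = "GPL";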