
Commit 4612115

peilin-ye authored and Kernel Patches Daemon committed
bpf/helpers: Skip memcg accounting in __bpf_async_init()
Calling bpf_map_kmalloc_node() from __bpf_async_init() can cause various
locking issues; see the following stack trace (edited for style) as one
example:

  ...
  [10.011566]  do_raw_spin_lock.cold
  [10.011570]  try_to_wake_up               (5) double-acquiring the same
  [10.011575]  kick_pool                        rq_lock, causing a hardlockup
  [10.011579]  __queue_work
  [10.011582]  queue_work_on
  [10.011585]  kernfs_notify
  [10.011589]  cgroup_file_notify
  [10.011593]  try_charge_memcg             (4) memcg accounting raises an
  [10.011597]  obj_cgroup_charge_pages          MEMCG_MAX event
  [10.011599]  obj_cgroup_charge_account
  [10.011600]  __memcg_slab_post_alloc_hook
  [10.011603]  __kmalloc_node_noprof
  ...
  [10.011611]  bpf_map_kmalloc_node
  [10.011612]  __bpf_async_init
  [10.011615]  bpf_timer_init               (3) BPF calls bpf_timer_init()
  [10.011617]  bpf_prog_xxxxxxxxxxxxxxxx_fcg_runnable
  [10.011619]  bpf__sched_ext_ops_runnable
  [10.011620]  enqueue_task_scx             (2) BPF runs with rq_lock held
  [10.011622]  enqueue_task
  [10.011626]  ttwu_do_activate
  [10.011629]  sched_ttwu_pending           (1) grabs rq_lock
  ...

The above was reproduced on bpf-next (b338cf8) by modifying
./tools/sched_ext/scx_flatcg.bpf.c to call bpf_timer_init() during
ops.runnable(), and hacking [1] the memcg accounting code a bit to make it
(much more likely to) raise an MEMCG_MAX event from a bpf_timer_init() call.

We have also run into other similar variants both internally (without
applying the [1] hack) and on bpf-next, including:

 * run_timer_softirq() -> cgroup_file_notify() (grabs cgroup_file_kn_lock)
   -> try_to_wake_up() -> BPF calls bpf_timer_init() ->
   bpf_map_kmalloc_node() -> try_charge_memcg() raises MEMCG_MAX ->
   cgroup_file_notify() (tries to grab cgroup_file_kn_lock again)

 * __queue_work() (grabs worker_pool::lock) -> try_to_wake_up() -> BPF
   calls bpf_timer_init() -> bpf_map_kmalloc_node() -> try_charge_memcg()
   raises MEMCG_MAX -> cgroup_file_notify() -> __queue_work() (tries to
   grab the same worker_pool::lock)

 ...

As pointed out by Kumar, we can use bpf_mem_alloc() and friends for
bpf_hrtimer and bpf_work, to skip memcg accounting.

Tested with vmtest.sh (llvm-18, x86-64):

  $ ./test_progs -a '*timer*' -a '*wq*'
  ...
  Summary: 7/12 PASSED, 0 SKIPPED, 0 FAILED

[1] Making a bpf_timer_init() call (much more likely) to raise an MEMCG_MAX
    event (gist-only, for brevity):

  kernel/bpf/helpers.c:__bpf_async_init():

 	/* allocate hrtimer via map_kmalloc to use memcg accounting */
-	cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
+	cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC | __GFP_HACK,
+				  map->numa_node);

  mm/memcontrol.c:try_charge_memcg():

 	if (!do_memsw_account() ||
-	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
-		if (page_counter_try_charge(&memcg->memory, batch, &counter))
+	    page_counter_try_charge_hack(&memcg->memsw, batch, &counter,
+					 gfp_mask & __GFP_HACK)) {
+		if (page_counter_try_charge_hack(&memcg->memory, batch,
+						 &counter,
+						 gfp_mask & __GFP_HACK))
 			goto done_restock;

  mm/page_counter.c:page_counter_try_charge():

-bool page_counter_try_charge(struct page_counter *counter,
-			     unsigned long nr_pages,
-			     struct page_counter **fail)
+bool page_counter_try_charge_hack(struct page_counter *counter,
+				  unsigned long nr_pages,
+				  struct page_counter **fail, bool hack)
 {
 	...
-		if (new > c->max) {
+		if (hack || new > c->max) {	// goto failed;
 			atomic_long_sub(nr_pages, &c->usage);
 			/*

Fixes: b00628b ("bpf: Introduce bpf timers.")
Suggested-by: Kumar Kartikeya Dwivedi <[email protected]>
Signed-off-by: Peilin Ye <[email protected]>
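For reference, below is a minimal sketch of the kind of BPF-side repro described above. It is not the actual scx_flatcg.bpf.c modification; the program name, map layout, and the CLOCK_MONOTONIC define are illustrative, and the sched_ext struct_ops registration is omitted. The point is only that an ops.runnable()-style callback calling bpf_timer_init() reaches __bpf_async_init(), and before this patch bpf_map_kmalloc_node(), while the scheduler core already holds rq_lock (steps (2) and (3) in the trace).

  /* Hypothetical repro sketch (illustrative only), assuming vmlinux.h and
   * libbpf's bpf_helpers.h / bpf_tracing.h are available.
   */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  char _license[] SEC("license") = "GPL";

  #define CLOCK_MONOTONIC 1	/* clockid value, not exported by vmlinux.h */

  struct timer_elem {
  	struct bpf_timer t;
  };

  struct {
  	__uint(type, BPF_MAP_TYPE_ARRAY);
  	__uint(max_entries, 1);
  	__type(key, u32);
  	__type(value, struct timer_elem);
  } timer_map SEC(".maps");

  /* Corresponds to step (3) above: called while the kernel holds rq_lock
   * (step (2)); bpf_timer_init() then allocates a struct bpf_hrtimer.
   */
  SEC("struct_ops/repro_runnable")
  void BPF_PROG(repro_runnable, struct task_struct *p, u64 enq_flags)
  {
  	u32 key = 0;
  	struct timer_elem *e;

  	e = bpf_map_lookup_elem(&timer_map, &key);
  	if (e)
  		bpf_timer_init(&e->t, &timer_map, CLOCK_MONOTONIC);
  }

A callback along these lines, together with the [1] hack making the charge path raise MEMCG_MAX, is what triggers the cgroup_file_notify() -> try_to_wake_up() recursion shown in the trace.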
1 parent 83390c0 commit 4612115

File tree

1 file changed: 8 additions, 12 deletions


kernel/bpf/helpers.c

Lines changed: 8 additions & 12 deletions
@@ -1089,10 +1089,7 @@ struct bpf_async_cb {
 	struct bpf_prog *prog;
 	void __rcu *callback_fn;
 	void *value;
-	union {
-		struct rcu_head rcu;
-		struct work_struct delete_work;
-	};
+	struct work_struct delete_work;
 	u64 flags;
 };
 
@@ -1225,7 +1222,7 @@ static void bpf_wq_delete_work(struct work_struct *work)
 
 	cancel_work_sync(&w->work);
 
-	kfree_rcu(w, cb.rcu);
+	bpf_mem_free_rcu(&bpf_global_ma, w);
 }
 
 static void bpf_timer_delete_work(struct work_struct *work)
@@ -1234,13 +1231,13 @@ static void bpf_timer_delete_work(struct work_struct *work)
 
 	/* Cancel the timer and wait for callback to complete if it was running.
 	 * If hrtimer_cancel() can be safely called it's safe to call
-	 * kfree_rcu(t) right after for both preallocated and non-preallocated
+	 * bpf_mem_free_rcu(t) right after for both preallocated and non-preallocated
 	 * maps. The async->cb = NULL was already done and no code path can see
 	 * address 't' anymore. Timer if armed for existing bpf_hrtimer before
 	 * bpf_timer_cancel_and_free will have been cancelled.
 	 */
 	hrtimer_cancel(&t->timer);
-	kfree_rcu(t, cb.rcu);
+	bpf_mem_free_rcu(&bpf_global_ma, t);
 }
 
 static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
@@ -1274,8 +1271,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 		goto out;
 	}
 
-	/* allocate hrtimer via map_kmalloc to use memcg accounting */
-	cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
+	cb = bpf_mem_alloc(&bpf_global_ma, size);
 	if (!cb) {
 		ret = -ENOMEM;
 		goto out;
@@ -1571,7 +1567,7 @@ void bpf_timer_cancel_and_free(void *val)
 	 * callback_fn. In such case we don't call hrtimer_cancel() (since it
 	 * will deadlock) and don't call hrtimer_try_to_cancel() (since it will
 	 * just return -1). Though callback_fn is still running on this cpu it's
-	 * safe to do kfree(t) because bpf_timer_cb() read everything it needed
+	 * safe to free 't' because bpf_timer_cb() read everything it needed
	 * from 't'. The bpf subprog callback_fn won't be able to access 't',
 	 * since async->cb = NULL was already done. The timer will be
 	 * effectively cancelled because bpf_timer_cb() will return
@@ -1581,7 +1577,7 @@ void bpf_timer_cancel_and_free(void *val)
 	 * timer _before_ calling us, such that failing to cancel it here will
 	 * cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
 	 * Therefore, we _need_ to cancel any outstanding timers before we do
-	 * kfree_rcu, even though no more timers can be armed.
+	 * bpf_mem_free_rcu(), even though no more timers can be armed.
 	 *
 	 * Moreover, we need to schedule work even if timer does not belong to
 	 * the calling callback_fn, as on two different CPUs, we can end up in a
@@ -1608,7 +1604,7 @@ void bpf_timer_cancel_and_free(void *val)
 		 * completion.
 		 */
 		if (hrtimer_try_to_cancel(&t->timer) >= 0)
-			kfree_rcu(t, cb.rcu);
+			bpf_mem_free_rcu(&bpf_global_ma, t);
 		else
 			queue_work(system_unbound_wq, &t->cb.delete_work);
 	} else {
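One note on the struct bpf_async_cb change in the first hunk: kfree_rcu() requires a struct rcu_head embedded in the object being freed, which is what the cb.rcu union member provided, whereas bpf_mem_free_rcu() handles the RCU grace period inside the BPF allocator, so the embedded rcu_head (and the union around delete_work) is no longer needed. A schematic comparison of the two freeing patterns (simplified sketch, not the actual kernel code; the obj_* and free_* names are made up):

  #include <linux/bpf.h>			/* bpf_global_ma */
  #include <linux/bpf_mem_alloc.h>	/* bpf_mem_free_rcu() */
  #include <linux/rcupdate.h>		/* kfree_rcu() */
  #include <linux/workqueue.h>

  struct obj_old {
  	union {
  		struct rcu_head rcu;		/* needed by kfree_rcu() */
  		struct work_struct delete_work;
  	};
  };

  struct obj_new {
  	struct work_struct delete_work;		/* no rcu_head needed */
  };

  static void free_old(struct obj_old *o)
  {
  	kfree_rcu(o, rcu);			/* frees after a grace period */
  }

  static void free_new(struct obj_new *o)
  {
  	/* The allocator defers the free past an RCU grace period itself;
  	 * per the commit message, this path also skips memcg accounting,
  	 * unlike bpf_map_kmalloc_node().
  	 */
  	bpf_mem_free_rcu(&bpf_global_ma, o);
  }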

0 commit comments