Conversation

kernel-patches-daemon-bpf[bot]

Pull request for series with
subject: Pass external callchain entry to get_perf_callchain
version: 3
url: https://patchwork.kernel.org/project/netdevbpf/list/?series=1013305

Tao Chen added 2 commits October 19, 2025 10:07
From the BPF stack map, we want to use our own buffers to avoid an unnecessary
copy, so let callers pass the entry directly. BPF will use this in the next patch.

Signed-off-by: Tao Chen <[email protected]>
As Alexei noted, the get_perf_callchain() return value may be reused
if a task is preempted after the BPF program enters migrate-disable
mode. Drawing on the per-CPU design of bpf_bprintf_buffers, a per-CPU
BPF callchain entry is used here.

Signed-off-by: Tao Chen <[email protected]>
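
For context, the bpf_bprintf_buffers design referenced above pairs a small
per-CPU array of buffers with a per-CPU nesting counter, so a bounded number
of nested or preempting users on the same CPU each get their own slot. A
minimal sketch of that pattern follows; identifiers and sizes are illustrative
only, not the actual kernel/bpf/helpers.c code:

	#include <linux/percpu.h>
	#include <linux/errno.h>

	struct my_buf {
		char data[256];
	};

	#define MAX_NEST_LEVEL 3

	static DEFINE_PER_CPU(struct my_buf[MAX_NEST_LEVEL], my_bufs);
	static DEFINE_PER_CPU(int, my_nest_level);

	static int try_get_buf(struct my_buf **buf)
	{
		int nest = this_cpu_inc_return(my_nest_level);

		if (nest > MAX_NEST_LEVEL) {	/* too many users on this CPU */
			this_cpu_dec(my_nest_level);
			return -EBUSY;
		}
		*buf = this_cpu_ptr(&my_bufs[nest - 1]);
		return 0;
	}

	static void put_buf(void)
	{
		this_cpu_dec(my_nest_level);
	}

The patch below applies the same get/put-with-counter idea to perf callchain
entries.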
@kernel-patches-daemon-bpf
Author

Upstream branch: 7361c86
series: https://patchwork.kernel.org/project/netdevbpf/list/?series=1013305
version: 3

@kernel-patches-review-bot

commit d8e76e700a50de3165213495d434390d670007bf
Author: Tao Chen <[email protected]>

bpf: Use per-cpu BPF callchain entry to save callchain

This patch introduces per-CPU BPF callchain entries to prevent reuse
issues when a task is preempted after entering migrate-disable mode.
It replaces get_callchain_entry()/put_callchain_entry() with a new
counter-based resource management scheme.
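
As I read the patch, the intended contract is that every successful
bpf_get_perf_callchain_or_entry() call is balanced by exactly one
bpf_put_perf_callchain() on the same CPU. A caller-side sketch (not code
from the patch; regs, kernel, user and max_depth mirror __bpf_get_stack()'s
locals):

	struct perf_callchain_entry *trace;
	int err;

	/* reserve a per-CPU entry and, with get_callchain == true, fill it */
	err = bpf_get_perf_callchain_or_entry(&trace, regs, kernel, user,
					      max_depth, false, false, true);
	if (err)
		return err;

	/* ... copy ips out of trace ... */

	bpf_put_perf_callchain();	/* release the per-CPU slot exactly once */

The double-put concern discussed below is about exactly this pairing.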

> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index 94e46b7f3..3513077c5 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -31,6 +31,52 @@ struct bpf_stack_map {
>  	struct stack_map_bucket *buckets[] __counted_by(n_buckets);
>  };
>
> +struct bpf_perf_callchain_entry {
> +	u64 nr;
> +	u64 ip[PERF_MAX_STACK_DEPTH];
> +};
> +
> +#define MAX_PERF_CALLCHAIN_PREEMPT 3
> +static DEFINE_PER_CPU(struct bpf_perf_callchain_entry[MAX_PERF_CALLCHAIN_PREEMPT],
> +		      bpf_perf_callchain_entries);
> +static DEFINE_PER_CPU(int, bpf_perf_callchain_preempt_cnt);
> +
> +static int bpf_get_perf_callchain_or_entry(struct perf_callchain_entry **entry,
> +					   struct pt_regs *regs, bool kernel,
> +					   bool user, u32 max_stack, bool crosstack,
> +					   bool add_mark, bool get_callchain)
> +{
> +	struct bpf_perf_callchain_entry *bpf_entry;
> +	struct perf_callchain_entry *perf_entry;
> +	int preempt_cnt;
> +
> +	preempt_cnt = this_cpu_inc_return(bpf_perf_callchain_preempt_cnt);
> +	if (WARN_ON_ONCE(preempt_cnt > MAX_PERF_CALLCHAIN_PREEMPT)) {
> +		this_cpu_dec(bpf_perf_callchain_preempt_cnt);
> +		return -EBUSY;
> +	}
> +
> +	bpf_entry = this_cpu_ptr(&bpf_perf_callchain_entries[preempt_cnt - 1]);
> +	if (!get_callchain) {
> +		*entry = (struct perf_callchain_entry *)bpf_entry;
> +		return 0;
> +	}
> +
> +	perf_entry = get_perf_callchain(regs, (struct perf_callchain_entry *)bpf_entry,
> +					kernel, user, max_stack,
> +					crosstack, add_mark);
> +	*entry = perf_entry;
> +
> +	return 0;
> +}
> +
> +static void bpf_put_perf_callchain(void)
> +{
> +	if (WARN_ON_ONCE(this_cpu_read(bpf_perf_callchain_preempt_cnt) == 0))
> +		return;
> +	this_cpu_dec(bpf_perf_callchain_preempt_cnt);
> +}
> +
>  static inline bool stack_map_use_build_id(struct bpf_map *map)
>  {
>  	return (map->map_flags & BPF_F_STACK_BUILD_ID);
> @@ -192,11 +238,11 @@ get_callchain_entry_for_task(struct task_struct *task, u32 max_depth)
>  {
>  #ifdef CONFIG_STACKTRACE
>  	struct perf_callchain_entry *entry;
> -	int rctx;
> -
> -	entry = get_callchain_entry(&rctx);
> +	int ret;
>
> -	if (!entry)
> +	ret = bpf_get_perf_callchain_or_entry(&entry, NULL, false, false, 0, false, false,
> +					      false);
> +	if (ret)
>  		return NULL;
>
>  	entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip,
> @@ -216,7 +262,7 @@ get_callchain_entry_for_task(struct task_struct *task, u32 max_depth)
>  			to[i] = (u64)(from[i]);
>  	}
>
> -	put_callchain_entry(rctx);
> +	bpf_put_perf_callchain();
>
>  	return entry;
                   ^^^^

Does get_callchain_entry_for_task() need to call bpf_put_perf_callchain()
before returning the entry? The resource tracking counter gets decremented
here, but then the caller in __bpf_get_stack() calls bpf_put_perf_callchain()
again at the end of the function.

This creates a double-put scenario:
  bpf_get_task_stack()->__bpf_get_task_stack()->__bpf_get_stack():
    - calls get_callchain_entry_for_task() which does get+put
    - then does its own put at line 529 or 510
    - triggers WARN_ON_ONCE in bpf_put_perf_callchain()

The other path through __bpf_get_stack() that calls
bpf_get_perf_callchain_or_entry() directly expects the caller to do the
put, but get_callchain_entry_for_task() does the put internally.

>  #else /* CONFIG_STACKTRACE */
>  	return NULL;
>  #endif

[ ... ]

> @@ -443,20 +491,23 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
>  	if (sysctl_perf_event_max_stack < max_depth)
>  		max_depth = sysctl_perf_event_max_stack;
>
> -	if (may_fault)
> -		rcu_read_lock(); /* need RCU for perf's callchain below */
> -
>  	if (trace_in)
>  		trace = trace_in;
> -	else if (kernel && task)
> +	else if (kernel && task) {
>  		trace = get_callchain_entry_for_task(task, max_depth);
> -	else
> -		trace = get_perf_callchain(regs, NULL, kernel, user, max_depth,
> -					   crosstask, false);
> +	} else {
> +		err = bpf_get_perf_callchain_or_entry(&trace, regs, kernel, user, max_depth,
> +						      false, false, true);
> +		if (err)
> +			return err;
> +	}
> +
> +	if (unlikely(!trace))
> +		goto err_fault;
>
> -	if (unlikely(!trace) || trace->nr < skip) {
> -		if (may_fault)
> -			rcu_read_unlock();
> +	if (trace->nr < skip) {
> +		if (!trace_in)
> +			bpf_put_perf_callchain();
                        ^^^^

When the "kernel && task" path is taken above, get_callchain_entry_for_task()
already called bpf_put_perf_callchain() before returning. This second
bpf_put_perf_callchain() call will trigger the WARN_ON_ONCE since the
counter is already zero.

>  		goto err_fault;
>  	}
>
> @@ -475,9 +526,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
>  		memcpy(buf, ips, copy_len);
>  	}
>
> -	/* trace/ips should not be dereferenced after this point */
> -	if (may_fault)
> -		rcu_read_unlock();
> +	if (!trace_in)
> +		bpf_put_perf_callchain();
                ^^^^

Same double-put issue here on the success path when "kernel && task" was
used.

Should get_callchain_entry_for_task() avoid calling bpf_put_perf_callchain()
and leave it to the caller, similar to how the other path through
bpf_get_perf_callchain_or_entry() works?
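
One possible shape of that fix, as a minimal sketch only (assuming the caller
owns the put, and eliding the 32-bit ip[] fixup shown in the hunk above):

	static struct perf_callchain_entry *
	get_callchain_entry_for_task(struct task_struct *task, u32 max_depth)
	{
	#ifdef CONFIG_STACKTRACE
		struct perf_callchain_entry *entry;

		/* reserve a per-CPU entry; no callchain capture on the task path */
		if (bpf_get_perf_callchain_or_entry(&entry, NULL, false, false, 0,
						    false, false, false))
			return NULL;

		entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip,
						 max_depth, 0);
		/* ... 32-bit ip[] fixup as in the existing code ... */

		/*
		 * No bpf_put_perf_callchain() here: __bpf_get_stack() releases
		 * the slot on every exit path that did not use trace_in, so each
		 * get is paired with exactly one put.
		 */
		return entry;
	#else /* CONFIG_STACKTRACE */
		return NULL;
	#endif
	}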



AI reviewed your patch. Please fix the bug or reply by email explaining why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

In-Reply-To-Subject: bpf: Use per-cpu BPF callchain entry to save callchain
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/18633525045

@kernel-patches-daemon-bpf
Author

Forwarding comment 3419821144 via email
In-Reply-To: [email protected]
Patch: https://patchwork.kernel.org/project/netdevbpf/patch/[email protected]/
