
Commit d4dd977

anakryiko authored and Alexei Starovoitov committed
bpf: wire up sleepable bpf_get_stack() and bpf_get_task_stack() helpers
Add sleepable implementations of the bpf_get_stack() and bpf_get_task_stack() helpers and allow them to be used from sleepable BPF programs (e.g., sleepable uprobes).

Note that capturing the stack trace IPs itself is not sleepable (that would need to be a separate project); only the build ID fetching is sleepable and thus more reliable, as it will wait for data to be paged in, if necessary. For that we make use of the sleepable build_id_parse() implementation.

Now that the build ID related internals in kernel/bpf/stackmap.c can be used in both sleepable and non-sleepable contexts, we need to add rcu_read_lock()/rcu_read_unlock() protection around fetching the perf_callchain_entry, but with the refactoring in the previous commit this is now pretty straightforward. We make sure to do rcu_read_unlock() (in sleepable mode only) right before the stack_map_get_build_id_offset() call, which can sleep. By that time we have no further use for the perf_callchain_entry.

Note that bpf_get_task_stack() will fail for user mode if task != current, and for kernel mode build IDs are irrelevant. So in that sense adding a sleepable bpf_get_task_stack() implementation is a no-op. It feels right to wire this up for symmetry and completeness, but I'm open to just dropping it until we support the `user && crosstask` condition.

Reviewed-by: Eduard Zingerman <[email protected]>
Signed-off-by: Andrii Nakryiko <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Alexei Starovoitov <[email protected]>
1 parent 4f4c4fc commit d4dd977
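For illustration, here is a minimal sketch of the kind of program this enables: a sleepable uprobe that captures a user stack with build IDs via bpf_get_stack(). This is not part of the commit; the section name, map layout, and sizes are hypothetical, and the uprobe attach target would be configured by the loader.

/* Hypothetical sleepable uprobe capturing a user stack with build IDs.
 * The "uprobe.s" section marks the program sleepable, so build ID
 * resolution may fault pages in instead of falling back to raw IPs.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#define MAX_STACK_DEPTH 64

struct stack_event {
        struct bpf_stack_build_id entries[MAX_STACK_DEPTH];
};

struct {
        __uint(type, BPF_MAP_TYPE_RINGBUF);
        __uint(max_entries, 256 * 1024);
} events SEC(".maps");

SEC("uprobe.s") /* sleepable; attach point set by the loader */
int handle_my_func(struct pt_regs *ctx)
{
        struct stack_event *e;
        long len;

        e = bpf_ringbuf_reserve(&events, sizeof(*e), 0);
        if (!e)
                return 0;

        /* With a sleepable program, the verifier selects the sleepable
         * bpf_get_stack() proto, so BPF_F_USER_BUILD_ID can wait for
         * pages to be faulted in while resolving build IDs.
         */
        len = bpf_get_stack(ctx, e->entries, sizeof(e->entries),
                            BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
        if (len < 0) {
                bpf_ringbuf_discard(e, 0);
                return 0;
        }

        bpf_ringbuf_submit(e, 0);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";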

File tree

3 files changed: +77, -20 lines


include/linux/bpf.h

Lines changed: 2 additions & 0 deletions
@@ -3200,7 +3200,9 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
 extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_stack_proto;
+extern const struct bpf_func_proto bpf_get_stack_sleepable_proto;
 extern const struct bpf_func_proto bpf_get_task_stack_proto;
+extern const struct bpf_func_proto bpf_get_task_stack_sleepable_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto_pe;
 extern const struct bpf_func_proto bpf_get_stack_proto_pe;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;

kernel/bpf/stackmap.c

Lines changed: 72 additions & 18 deletions
@@ -124,6 +124,12 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
         return ERR_PTR(err);
 }
 
+static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, bool may_fault)
+{
+        return may_fault ? build_id_parse(vma, build_id, NULL)
+                         : build_id_parse_nofault(vma, build_id, NULL);
+}
+
 /*
  * Expects all id_offs[i].ip values to be set to correct initial IPs.
  * They will be subsequently:
@@ -135,7 +141,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
  * BPF_STACK_BUILD_ID_IP.
  */
 static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
-                                          u32 trace_nr, bool user)
+                                          u32 trace_nr, bool user, bool may_fault)
 {
         int i;
         struct mmap_unlock_irq_work *work = NULL;
@@ -166,7 +172,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
                         goto build_id_valid;
                 }
                 vma = find_vma(current->mm, ip);
-                if (!vma || build_id_parse_nofault(vma, id_offs[i].build_id, NULL)) {
+                if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) {
                         /* per entry fall back to ips */
                         id_offs[i].status = BPF_STACK_BUILD_ID_IP;
                         memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
@@ -257,7 +263,7 @@ static long __bpf_get_stackid(struct bpf_map *map,
                 id_offs = (struct bpf_stack_build_id *)new_bucket->data;
                 for (i = 0; i < trace_nr; i++)
                         id_offs[i].ip = ips[i];
-                stack_map_get_build_id_offset(id_offs, trace_nr, user);
+                stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */);
                 trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
                 if (hash_matches && bucket->nr == trace_nr &&
                     memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
@@ -398,7 +404,7 @@ const struct bpf_func_proto bpf_get_stackid_proto_pe = {
 
 static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
                             struct perf_callchain_entry *trace_in,
-                            void *buf, u32 size, u64 flags)
+                            void *buf, u32 size, u64 flags, bool may_fault)
 {
         u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
         bool user_build_id = flags & BPF_F_USER_BUILD_ID;
@@ -416,8 +422,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
         if (kernel && user_build_id)
                 goto clear;
 
-        elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
-                                            : sizeof(u64);
+        elem_size = user_build_id ? sizeof(struct bpf_stack_build_id) : sizeof(u64);
         if (unlikely(size % elem_size))
                 goto clear;
 
@@ -438,35 +443,45 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
         if (sysctl_perf_event_max_stack < max_depth)
                 max_depth = sysctl_perf_event_max_stack;
 
+        if (may_fault)
+                rcu_read_lock(); /* need RCU for perf's callchain below */
+
         if (trace_in)
                 trace = trace_in;
         else if (kernel && task)
                 trace = get_callchain_entry_for_task(task, max_depth);
         else
                 trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
                                            crosstask, false);
-        if (unlikely(!trace))
-                goto err_fault;
 
-        if (trace->nr < skip)
+        if (unlikely(!trace) || trace->nr < skip) {
+                if (may_fault)
+                        rcu_read_unlock();
                 goto err_fault;
+        }
 
         trace_nr = trace->nr - skip;
         trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
         copy_len = trace_nr * elem_size;
 
         ips = trace->ip + skip;
-        if (user && user_build_id) {
+        if (user_build_id) {
                 struct bpf_stack_build_id *id_offs = buf;
                 u32 i;
 
                 for (i = 0; i < trace_nr; i++)
                         id_offs[i].ip = ips[i];
-                stack_map_get_build_id_offset(buf, trace_nr, user);
         } else {
                 memcpy(buf, ips, copy_len);
         }
 
+        /* trace/ips should not be dereferenced after this point */
+        if (may_fault)
+                rcu_read_unlock();
+
+        if (user_build_id)
+                stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);
+
         if (size > copy_len)
                 memset(buf + copy_len, 0, size - copy_len);
         return copy_len;
@@ -481,7 +496,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
            u64, flags)
 {
-        return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+        return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
 }
 
 const struct bpf_func_proto bpf_get_stack_proto = {
@@ -494,8 +509,24 @@ const struct bpf_func_proto bpf_get_stack_proto = {
         .arg4_type      = ARG_ANYTHING,
 };
 
-BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
-           u32, size, u64, flags)
+BPF_CALL_4(bpf_get_stack_sleepable, struct pt_regs *, regs, void *, buf, u32, size,
+           u64, flags)
+{
+        return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, true /* may_fault */);
+}
+
+const struct bpf_func_proto bpf_get_stack_sleepable_proto = {
+        .func           = bpf_get_stack_sleepable,
+        .gpl_only       = true,
+        .ret_type       = RET_INTEGER,
+        .arg1_type      = ARG_PTR_TO_CTX,
+        .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
+        .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
+        .arg4_type      = ARG_ANYTHING,
+};
+
+static long __bpf_get_task_stack(struct task_struct *task, void *buf, u32 size,
+                                 u64 flags, bool may_fault)
 {
         struct pt_regs *regs;
         long res = -EINVAL;
@@ -505,12 +536,18 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
 
         regs = task_pt_regs(task);
         if (regs)
-                res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
+                res = __bpf_get_stack(regs, task, NULL, buf, size, flags, may_fault);
         put_task_stack(task);
 
         return res;
 }
 
+BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
+           u32, size, u64, flags)
+{
+        return __bpf_get_task_stack(task, buf, size, flags, false /* !may_fault */);
+}
+
 const struct bpf_func_proto bpf_get_task_stack_proto = {
         .func           = bpf_get_task_stack,
         .gpl_only       = false,
@@ -522,6 +559,23 @@ const struct bpf_func_proto bpf_get_task_stack_proto = {
         .arg4_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_task_stack_sleepable, struct task_struct *, task, void *, buf,
+           u32, size, u64, flags)
+{
+        return __bpf_get_task_stack(task, buf, size, flags, true /* !may_fault */);
+}
+
+const struct bpf_func_proto bpf_get_task_stack_sleepable_proto = {
+        .func           = bpf_get_task_stack_sleepable,
+        .gpl_only       = false,
+        .ret_type       = RET_INTEGER,
+        .arg1_type      = ARG_PTR_TO_BTF_ID,
+        .arg1_btf_id    = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+        .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
+        .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
+        .arg4_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
            void *, buf, u32, size, u64, flags)
 {
@@ -533,7 +587,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
         __u64 nr_kernel;
 
         if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
-                return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+                return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
 
         if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
                                BPF_F_USER_BUILD_ID)))
@@ -553,7 +607,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
                 __u64 nr = trace->nr;
 
                 trace->nr = nr_kernel;
-                err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+                err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
 
                 /* restore nr */
                 trace->nr = nr;
@@ -565,7 +619,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
                         goto clear;
 
                 flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
-                err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+                err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
         }
         return err;
 
kernel/trace/bpf_trace.c

Lines changed: 3 additions & 2 deletions
@@ -1507,7 +1507,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
         case BPF_FUNC_jiffies64:
                 return &bpf_jiffies64_proto;
         case BPF_FUNC_get_task_stack:
-                return &bpf_get_task_stack_proto;
+                return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
+                                       : &bpf_get_task_stack_proto;
         case BPF_FUNC_copy_from_user:
                 return &bpf_copy_from_user_proto;
         case BPF_FUNC_copy_from_user_task:
@@ -1563,7 +1564,7 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
         case BPF_FUNC_get_stackid:
                 return &bpf_get_stackid_proto;
         case BPF_FUNC_get_stack:
-                return &bpf_get_stack_proto;
+                return prog->sleepable ? &bpf_get_stack_sleepable_proto : &bpf_get_stack_proto;
 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
         case BPF_FUNC_override_return:
                 return &bpf_override_return_proto;
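The kernel/trace/bpf_trace.c hunks above pick the sleepable protos whenever prog->sleepable is set, which on the libbpf side corresponds to loading the program as sleepable (e.g., a section name with the ".s" suffix). Below is a hedged sketch of the task-stack variant, assuming a sleepable uprobe and an illustrative 8-entry buffer; none of these names or sizes come from this commit.

/* Hypothetical sleepable program using bpf_get_task_stack() on the current
 * task. Because the program is sleepable, bpf_tracing_func_proto() resolves
 * the call to bpf_get_task_stack_sleepable_proto.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

SEC("uprobe.s") /* sleepable; attach point set by the loader */
int dump_own_stack(struct pt_regs *ctx)
{
        /* 8 entries of struct bpf_stack_build_id stay well under the
         * 512-byte BPF stack limit. */
        struct bpf_stack_build_id buf[8] = {};
        struct task_struct *task = bpf_get_current_task_btf();
        long len;

        /* Per the commit message, build IDs for user stacks only work for
         * task == current, so pass the current task. */
        len = bpf_get_task_stack(task, buf, sizeof(buf),
                                 BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
        if (len > 0)
                bpf_printk("captured %ld bytes of build-id annotated stack", len);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";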
