Skip to content

Commit 376bd59

Browse files
Yonghong Song authored and anakryiko committed
bpf: Use fake pt_regs when doing bpf syscall tracepoint tracing
Salvatore Benedetto reported an issue: when doing syscall tracepoint tracing, the kernel stack is empty. For example, using the following command lines

  bpftrace -e 'tracepoint:syscalls:sys_enter_read { print("Kernel Stack\n"); print(kstack()); }'
  bpftrace -e 'tracepoint:syscalls:sys_exit_read { print("Kernel Stack\n"); print(kstack()); }'

the output for both commands is

  ===
  Kernel Stack
  ===

Further analysis shows that the pt_regs used for bpf syscall tracepoint tracing is the one constructed during the user->kernel transition. The call stack looks like

  perf_syscall_enter+0x88/0x7c0
  trace_sys_enter+0x41/0x80
  syscall_trace_enter+0x100/0x160
  do_syscall_64+0x38/0xf0
  entry_SYSCALL_64_after_hwframe+0x76/0x7e

The ip address stored in pt_regs is from user space, hence no kernel stack is printed.

To fix the issue, a kernel address in pt_regs is required. In the kernel repo, there are already a few cases like this. For example, in kernel/trace/bpf_trace.c, several perf_fetch_caller_regs(fake_regs_ptr) instances are used to supply the ip address or to use the ip address to construct the call stack.

Instead of allocating fake_regs on the stack, which may consume a lot of bytes, the function perf_trace_buf_alloc() in perf_syscall_{enter, exit}() is leveraged to create fake_regs, which will be passed to perf_call_bpf_{enter,exit}().
For the above bpftrace scripts, I got the following output with this patch:

for tracepoint:syscalls:sys_enter_read

  ===
  Kernel Stack
    syscall_trace_enter+407
    syscall_trace_enter+407
    do_syscall_64+74
    entry_SYSCALL_64_after_hwframe+75
  ===

and for tracepoint:syscalls:sys_exit_read

  ===
  Kernel Stack
    syscall_exit_work+185
    syscall_exit_work+185
    syscall_exit_to_user_mode+305
    do_syscall_64+118
    entry_SYSCALL_64_after_hwframe+75
  ===

Reported-by: Salvatore Benedetto <[email protected]>
Suggested-by: Andrii Nakryiko <[email protected]>
Signed-off-by: Yonghong Song <[email protected]>
Signed-off-by: Andrii Nakryiko <[email protected]>
Acked-by: Andrii Nakryiko <[email protected]>
Link: https://lore.kernel.org/bpf/[email protected]
1 parent 2bea33f commit 376bd59

File tree

1 file changed

+8
-4
lines changed

1 file changed

+8
-4
lines changed

kernel/trace/trace_syscalls.c

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,7 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re
564564
BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
565565

566566
/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
567+
perf_fetch_caller_regs(regs);
567568
*(struct pt_regs **)&param = regs;
568569
param.syscall_nr = rec->nr;
569570
for (i = 0; i < sys_data->nb_args; i++)
@@ -575,6 +576,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
575576
{
576577
struct syscall_metadata *sys_data;
577578
struct syscall_trace_enter *rec;
579+
struct pt_regs *fake_regs;
578580
struct hlist_head *head;
579581
unsigned long args[6];
580582
bool valid_prog_array;
@@ -602,7 +604,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
602604
size = ALIGN(size + sizeof(u32), sizeof(u64));
603605
size -= sizeof(u32);
604606

605-
rec = perf_trace_buf_alloc(size, NULL, &rctx);
607+
rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
606608
if (!rec)
607609
return;
608610

@@ -611,7 +613,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
611613
memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
612614

613615
if ((valid_prog_array &&
614-
!perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
616+
!perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
615617
hlist_empty(head)) {
616618
perf_swevent_put_recursion_context(rctx);
617619
return;
@@ -666,6 +668,7 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg
666668
} __aligned(8) param;
667669

668670
/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
671+
perf_fetch_caller_regs(regs);
669672
*(struct pt_regs **)&param = regs;
670673
param.syscall_nr = rec->nr;
671674
param.ret = rec->ret;
@@ -676,6 +679,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
676679
{
677680
struct syscall_metadata *sys_data;
678681
struct syscall_trace_exit *rec;
682+
struct pt_regs *fake_regs;
679683
struct hlist_head *head;
680684
bool valid_prog_array;
681685
int syscall_nr;
@@ -701,15 +705,15 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
701705
size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
702706
size -= sizeof(u32);
703707

704-
rec = perf_trace_buf_alloc(size, NULL, &rctx);
708+
rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
705709
if (!rec)
706710
return;
707711

708712
rec->nr = syscall_nr;
709713
rec->ret = syscall_get_return_value(current, regs);
710714

711715
if ((valid_prog_array &&
712-
!perf_call_bpf_exit(sys_data->exit_event, regs, rec)) ||
716+
!perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
713717
hlist_empty(head)) {
714718
perf_swevent_put_recursion_context(rctx);
715719
return;

0 commit comments

Comments
 (0)