Skip to content

Commit cce2246

Browse files
committed
feat: enable namespace PID translation
When enable_namespace_pid is set, translate host PIDs/TGIDs into the profiler’s PID namespace so that sidecar deployments report container PIDs (e.g. PID 1) instead of host PIDs. This is useful when the profiler is embedded in an application running in a sidecar container. The feature must be enabled via configuration and requires kernel BTF support to be available.
1 parent aea8e0e commit cce2246

File tree

3 files changed

+352
-18
lines changed

3 files changed

+352
-18
lines changed

support/ebpf/native_stack_trace.ebpf.c

Lines changed: 138 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,61 @@ BPF_RODATA_VAR(u32, task_stack_offset, 0)
2525
// The offset of struct pt_regs within the kernel entry stack.
2626
BPF_RODATA_VAR(u32, stack_ptregs_offset, 0)
2727

28+
// If enabled, the profiler translates host-level PIDs/TGIDs into the
29+
// corresponding IDs within a specific PID namespace. This is essential
30+
// for sidecar deployments to report PIDs consistent with the container's
31+
// internal view (e.g., reporting PID 1 instead of the host PID).
32+
// Requires kernel BTF support.
33+
BPF_RODATA_VAR(bool, pid_ns_translation_enabled, false)
34+
35+
// The inode number of the target PID namespace.
36+
// Obtained by calling stat() on /proc/[pid]/ns/pid.
37+
BPF_RODATA_VAR(u64, target_pid_ns_inode, 0)
38+
39+
// The device ID (st_dev) of the target PID namespace inode.
40+
// Required by the bpf_get_ns_current_pid_tgid helper to uniquely
41+
// identify the namespace filesystem (nsfs) instance.
42+
BPF_RODATA_VAR(u64, target_pid_ns_dev, 0)
43+
44+
// Offsets for walking kernel structures to translate host PIDs into a target PID
45+
// namespace (see parseBTFForNsTranslation). Hierarchy of related kernel types:
46+
//
47+
// task_struct nsproxy pid_namespace
48+
// +------------------+ +----------------+ +------------------+
49+
// | nsproxy |---------->| pid_ns_for_ |------>| ns.inum |
50+
// | thread_pid |--+ | children | +------------------+
51+
// | group_leader |--| +----------------+ ^
52+
// +------------------+ | |
53+
// | v |
54+
// | struct pid |
55+
// | +------------------+ |
56+
// +-------->| level | |
57+
// | numbers[] |--+ (array of struct upid)
58+
// +------------------+ | |
59+
// v v
60+
// struct upid struct upid ...
61+
// +----------+ +----------+
62+
// | nr | | nr | (PID value per level)
63+
// +----------+ +----------+
64+
//
65+
// task_struct:
66+
// offset of nsproxy (-> struct nsproxy)
67+
BPF_RODATA_VAR(u32, task_nsproxy_off, 0)
68+
// offset of thread_pid (-> struct pid)
69+
BPF_RODATA_VAR(u32, task_thread_pid_off, 0)
70+
// offset of group_leader (-> task_struct of main thread)
71+
BPF_RODATA_VAR(u32, task_group_leader_off, 0)
72+
// offset of pid_ns_for_children (-> struct pid_namespace)
73+
BPF_RODATA_VAR(u32, nsproxy_pid_ns_for_children_off, 0)
74+
// pid_namespace: one per PID namespace; ns.inum is the inode number (e.g. for /proc/pid/ns/pid).
75+
BPF_RODATA_VAR(u32, pid_ns_inum_off, 0) // offset of ns.inum within pid_namespace
76+
// pid: represents a PID across namespace levels; numbers[] has one upid per level.
77+
BPF_RODATA_VAR(u32, pid_level_off, 0) // offset of level
78+
BPF_RODATA_VAR(u32, pid_numbers_off, 0) // offset of numbers (array of struct upid)
79+
// upid: PID value in a single namespace; nr is the numeric PID in that namespace.
80+
BPF_RODATA_VAR(u32, upid_nr_off, 0) // offset of nr within struct upid
81+
BPF_RODATA_VAR(u32, upid_size, 0) // sizeof(struct upid), stride of pid.numbers[]
82+
2883
// Macro to create a map named exe_id_to_X_stack_deltas that is a nested map with a fileID for the
2984
// outer map and an array as inner map that holds up to 2^X stack delta entries for the given
3085
// fileID.
@@ -607,17 +662,93 @@ static EBPF_INLINE int unwind_native(struct pt_regs *ctx)
607662
return -1;
608663
}
609664

610-
SEC("perf_event/native_tracer_entry")
611-
int native_tracer_entry(struct bpf_perf_event_data *ctx)
665+
struct ns_pid_info {
666+
u32 ns_inode;
667+
u32 vpid;
668+
u32 vtgid;
669+
};
670+
671+
#ifdef TESTING_COREDUMP
672+
static int get_current_ns_pid_tgid(struct ns_pid_info *pid_info)
612673
{
613-
// Get the PID and TGID register.
614-
u64 id = bpf_get_current_pid_tgid();
615-
u32 pid = id >> 32;
616-
u32 tid = id & 0xFFFFFFFF;
674+
*pid_info = (struct ns_pid_info){0, 0, 0};
675+
return 0;
676+
}
677+
#else
678+
// Get namespace inode, virtual PID and virtual TGID using only offsets (no kernel struct defs).
679+
// Offsets must be set at load time via BPF_RODATA;
680+
// Return 0 on success, -1 on failure.
681+
static int get_current_ns_pid_tgid(struct ns_pid_info *pid_info)
682+
{
683+
*pid_info = (struct ns_pid_info){0, 0, 0};
684+
685+
u64 ptr_val;
686+
u32 level;
617687

618-
if (pid == 0 && filter_idle_frames) {
688+
if (!pid_ns_translation_enabled) {
619689
return 0;
620690
}
691+
void *task = (void *)bpf_get_current_task();
692+
char *t = (char *)task;
693+
694+
if (bpf_probe_read_kernel(&ptr_val, sizeof(ptr_val), t + task_nsproxy_off) == 0 && ptr_val) {
695+
if (
696+
bpf_probe_read_kernel(
697+
&ptr_val, sizeof(ptr_val), (char *)ptr_val + nsproxy_pid_ns_for_children_off) == 0 &&
698+
ptr_val) {
699+
bpf_probe_read_kernel(
700+
&pid_info->ns_inode, sizeof(pid_info->ns_inode), (char *)ptr_val + pid_ns_inum_off);
701+
}
702+
}
703+
704+
if (bpf_probe_read_kernel(&ptr_val, sizeof(ptr_val), t + task_thread_pid_off) != 0) {
705+
return -1;
706+
}
707+
if (
708+
bpf_probe_read_kernel(&level, sizeof(level), (char *)ptr_val + pid_level_off) != 0 ||
709+
level > 32) {
710+
return -1;
711+
}
712+
bpf_probe_read_kernel(
713+
&pid_info->vpid,
714+
sizeof(pid_info->vpid),
715+
(char *)ptr_val + pid_numbers_off + level * upid_size + upid_nr_off);
716+
717+
if (bpf_probe_read_kernel(&ptr_val, sizeof(ptr_val), t + task_group_leader_off) == 0 && ptr_val) {
718+
if (
719+
bpf_probe_read_kernel(&ptr_val, sizeof(ptr_val), (char *)ptr_val + task_thread_pid_off) ==
720+
0 &&
721+
ptr_val) {
722+
bpf_probe_read_kernel(
723+
&pid_info->vtgid,
724+
sizeof(pid_info->vtgid),
725+
(char *)ptr_val + pid_numbers_off + level * upid_size + upid_nr_off);
726+
}
727+
}
728+
return 0;
729+
}
730+
#endif
731+
732+
// perf_event entry point for native stack sampling.
//
// Determines the process ID (pid) and thread ID (tid) of the current task and
// hands the sample to collect_trace(). With pid_ns_translation_enabled set, the
// IDs are taken from the task's PID namespace and the sample is dropped unless
// that namespace's inode equals target_pid_ns_inode; otherwise the host-level
// IDs from bpf_get_current_pid_tgid() are used.
//
// Returns 0 when the sample is dropped, otherwise the result of collect_trace().
SEC("perf_event/native_tracer_entry")
int native_tracer_entry(struct bpf_perf_event_data *ctx)
{
  u32 pid = 0;
  u32 tid = 0;

  if (pid_ns_translation_enabled) {
    struct ns_pid_info pid_info = {0};
    if (get_current_ns_pid_tgid(&pid_info) != 0) {
      return 0;
    }
    if (pid_info.ns_inode != target_pid_ns_inode) {
      return 0;
    }
    // Kernel naming: the thread group ID is the process ID and the per-task
    // pid is the thread ID. This mirrors the host path below.
    pid = pid_info.vtgid;
    tid = pid_info.vpid;
  } else {
    // Upper 32 bits: TGID (process ID); lower 32 bits: PID (thread ID).
    u64 id = bpf_get_current_pid_tgid();
    pid    = id >> 32;
    tid    = id & 0xFFFFFFFF;
  }

  // Skip samples whose process ID is 0 when filter_idle_frames is set.
  if (pid == 0 && filter_idle_frames) {
    return 0;
  }

  u64 ts = bpf_ktime_get_ns();
  return collect_trace((struct pt_regs *)&ctx->regs, TRACE_SAMPLING, pid, tid, ts, 0);
}

support/ebpf/tracer.ebpf.amd64

5.71 KB
Binary file not shown.

0 commit comments

Comments
 (0)