Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions collector/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ type Config struct {
NoKernelVersionCheck bool `mapstructure:"no_kernel_version_check"`
MaxGRPCRetries uint32 `mapstructure:"max_grpc_retries"`
MaxRPCMsgSize int `mapstructure:"max_rpc_msg_size"`
EnableNamespacePID bool `mapstructure:"enable_namespace_pid"`
}

// Validate validates the config.
Expand Down
1 change: 1 addition & 0 deletions internal/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ func (c *Controller) Start(ctx context.Context) error {
ProbeLinks: c.config.ProbeLinks,
LoadProbe: c.config.LoadProbe,
ExecutableReporter: c.config.ExecutableReporter,
EnableNamespacePID: c.config.EnableNamespacePID,
})
if err != nil {
return fmt.Errorf("failed to load eBPF tracer: %w", err)
Expand Down
145 changes: 138 additions & 7 deletions support/ebpf/native_stack_trace.ebpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,61 @@ BPF_RODATA_VAR(u32, task_stack_offset, 0)
// The offset of struct pt_regs within the kernel entry stack.
BPF_RODATA_VAR(u32, stack_ptregs_offset, 0)

// If enabled, the profiler translates host-level PIDs/TGIDs into the
// corresponding IDs within a specific PID namespace. This is essential
// for sidecar deployments to report PIDs consistent with the container's
// internal view (e.g., reporting PID 1 instead of the host PID).
// This requires kernel BTF support.
BPF_RODATA_VAR(bool, pid_ns_translation_enabled, false)

// The inode number of the target PID namespace.
// Obtained by calling stat() on /proc/[pid]/ns/pid.
BPF_RODATA_VAR(u64, target_pid_ns_inode, 0)

// The device ID (st_dev) of the target PID namespace inode.
// Required by the bpf_get_ns_current_pid_tgid helper to uniquely
// identify the namespace filesystem (nsfs) instance.
BPF_RODATA_VAR(u64, target_pid_ns_dev, 0)

// Offsets for walking kernel structures to translate host PIDs into a target PID
// namespace (see parseBTFForNsTranslation). Hierarchy of related kernel types:
//
// task_struct nsproxy pid_namespace
// +------------------+ +----------------+ +------------------+
// | nsproxy |---------->| pid_ns_for_ |------>| ns.inum |
// | thread_pid |--+ | children | +------------------+
// | group_leader |--| +----------------+ ^
// +------------------+ | |
// | v |
// | struct pid |
// | +------------------+ |
// +-------->| level | |
// | numbers[] |--+ (array of struct upid)
// +------------------+ | |
// v v
// struct upid struct upid ...
// +----------+ +----------+
// | nr | | nr | (PID value per level)
// +----------+ +----------+
//
// task_struct:
// offset of nsproxy (-> struct nsproxy)
BPF_RODATA_VAR(u32, task_nsproxy_off, 0)
// offset of thread_pid (-> struct pid)
BPF_RODATA_VAR(u32, task_thread_pid_off, 0)
// offset of group_leader (-> task_struct of main thread)
BPF_RODATA_VAR(u32, task_group_leader_off, 0)
// offset of pid_ns_for_children (-> struct pid_namespace)
BPF_RODATA_VAR(u32, nsproxy_pid_ns_for_children_off, 0)
// pid_namespace: one per PID namespace; ns.inum is the inode number (e.g. for /proc/pid/ns/pid).
BPF_RODATA_VAR(u32, pid_ns_inum_off, 0) // offset of ns.inum within pid_namespace
// pid: represents a PID across namespace levels; numbers[] has one upid per level.
BPF_RODATA_VAR(u32, pid_level_off, 0) // offset of level
BPF_RODATA_VAR(u32, pid_numbers_off, 0) // offset of numbers (array of struct upid)
// upid: PID value in a single namespace; nr is the numeric PID in that namespace.
BPF_RODATA_VAR(u32, upid_nr_off, 0) // offset of nr within struct upid
BPF_RODATA_VAR(u32, upid_size, 0) // sizeof(struct upid), stride of pid.numbers[]

// Macro to create a map named exe_id_to_X_stack_deltas that is a nested maps with a fileID for the
// outer map and an array as inner map that holds up to 2^X stack delta entries for the given
// fileID.
Expand Down Expand Up @@ -607,17 +662,93 @@ static EBPF_INLINE int unwind_native(struct pt_regs *ctx)
return -1;
}

SEC("perf_event/native_tracer_entry")
int native_tracer_entry(struct bpf_perf_event_data *ctx)
// Result of resolving the current task's IDs within its PID namespace.
struct ns_pid_info {
  u32 ns_inode; // inode number (ns.inum) of the task's PID namespace
  u32 vpid;     // task's PID value at its namespace level (read via thread_pid)
  u32 vtgid;    // group leader's PID value at the same level (the TGID in that namespace)
};

#ifdef TESTING_COREDUMP
// Stub used when building for coredump-based testing: PID namespace
// translation is not performed in this mode, so report an all-zero
// ns_pid_info. Always returns 0 (success).
static int get_current_ns_pid_tgid(struct ns_pid_info *pid_info)
{
  *pid_info = (struct ns_pid_info){0, 0, 0};
  return 0;
}
#else
// Get namespace inode, virtual PID and virtual TGID for the current task
// using only pre-resolved structure offsets (no kernel struct definitions).
// Offsets must be set at load time via BPF_RODATA.
// Returns 0 on success, -1 on failure.
static int get_current_ns_pid_tgid(struct ns_pid_info *pid_info)
{
  *pid_info = (struct ns_pid_info){0, 0, 0};

  u64 ptr_val;
  u32 level;

  // Translation disabled: leave pid_info zeroed; the caller then uses host IDs.
  if (!pid_ns_translation_enabled) {
    return 0;
  }
  void *task = (void *)bpf_get_current_task();
  char *t = (char *)task;

  // Best effort: walk task->nsproxy->pid_ns_for_children->ns.inum.
  // On any failure, ns_inode stays 0 (will not match the target namespace).
  if (bpf_probe_read_kernel(&ptr_val, sizeof(ptr_val), t + task_nsproxy_off) == 0 && ptr_val) {
    if (
      bpf_probe_read_kernel(
        &ptr_val, sizeof(ptr_val), (char *)ptr_val + nsproxy_pid_ns_for_children_off) == 0 &&
      ptr_val) {
      bpf_probe_read_kernel(
        &pid_info->ns_inode, sizeof(pid_info->ns_inode), (char *)ptr_val + pid_ns_inum_off);
    }
  }

  // task->thread_pid (struct pid) is required to resolve the virtual PID.
  if (bpf_probe_read_kernel(&ptr_val, sizeof(ptr_val), t + task_thread_pid_off) != 0) {
    return -1;
  }
  // level indexes pid.numbers[]; the explicit bound keeps the verifier happy.
  if (
    bpf_probe_read_kernel(&level, sizeof(level), (char *)ptr_val + pid_level_off) != 0 ||
    level > 32) {
    return -1;
  }
  // numbers[level].nr is the PID value at the task's own namespace level.
  bpf_probe_read_kernel(
    &pid_info->vpid,
    sizeof(pid_info->vpid),
    (char *)ptr_val + pid_numbers_off + level * upid_size + upid_nr_off);

  // Same walk through group_leader->thread_pid yields the TGID at that
  // level (best effort: failures leave vtgid at 0).
  if (bpf_probe_read_kernel(&ptr_val, sizeof(ptr_val), t + task_group_leader_off) == 0 && ptr_val) {
    if (
      bpf_probe_read_kernel(&ptr_val, sizeof(ptr_val), (char *)ptr_val + task_thread_pid_off) ==
        0 &&
      ptr_val) {
      bpf_probe_read_kernel(
        &pid_info->vtgid,
        sizeof(pid_info->vtgid),
        (char *)ptr_val + pid_numbers_off + level * upid_size + upid_nr_off);
    }
  }
  return 0;
}
#endif

SEC("perf_event/native_tracer_entry")
int native_tracer_entry(struct bpf_perf_event_data *ctx)
{
u32 pid = 0;
u32 tid = 0;
if (pid_ns_translation_enabled) {
struct ns_pid_info pid_info = {0};
if (get_current_ns_pid_tgid(&pid_info) != 0) {
return 0;
}
if (pid_info.ns_inode != target_pid_ns_inode) {
return 0;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this incompatible with the whole-host nature of the profiler? E.g. won't this limit the processes that the profiler can profile to those seen from the namespace the profiler runs in?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for your comment !

This change does not restrict the profiler's native "whole-host" visibility. The eBPF remains attached to the host kernel and continues to intercept events across the entire system. The bpf_get_ns_current_pid_tgid helper is used specifically to perform an in-kernel translation to retrieve the "Container PID" only when a match is found with the target namespace.

Key points:
This feature is disabled by default. If not explicitly configured, no translation is performed, preserving the original behavior.
This is specifically designed for sidecar deployments (e.g., using Grafana Alloy) where security constraints favor shareProcessNamespace: true over the more permissive hostPID: true.

Copy link
Copy Markdown
Member

@christos68k christos68k Feb 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll run some tests locally to better understand the use case. Just looking at the code tells me that the whole-host visibility of the profiler is indeed restricted when running in this mode (we shouldn't only be looking at eBPF in isolation, but at the entire profiler as a system) but maybe I'm misunderstanding.

For more context, we've had similar requests in the past (e.g. limiting profiling to "special" processes only for performance reasons) that we decided not to support. Maybe it's worth it to make an exception in this case, but let's first understand better what the tradeoffs are.

Copy link
Copy Markdown
Author

@7za 7za Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to be sure we are on the same page:
the profiler itself when running this mode, will translate the host pid to the corresponding namespace PID. This is necessary when the application embedding the profiler runs inside a container without hostPID: true (for security reason).
In that case, the profiling application can only see the processes running inside containers of the same pod (using shareProcessNamespace). This is where translation is needed to map the PID seen from the namespace to the PID reported by the host.
This is not for perf reason, but more because of deployment (as a sidecar) and security constraints (do not use hostPID: true).
Using alloy (>= v1.11.0) with a simple collection of yaml for kind (I can share it) can be a good way to test this.

Copy link
Copy Markdown
Member

@christos68k christos68k Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The question here is whether we want to support this execution mode and assume the maintenance/support burden. We designed the profiler to be a whole system profiler and as such we require the profiler to be able to access all processes running on the host (not just limited to the container the userspace process executes in) and thus run with hostPID: true.

Other configurations and deployment scenarios of course exist but we're not required to support them. We've turned away people in the past that had similar (conflict with whole-host profiling) functionality requests which set a precedent. If we accept this PR we'd both be going against this precedent and also setting a new one.

Personally, I want to focus on the whole-host nature of the profiler and not be side-tracked with code that works against this paradigm but I'm not the only maintainer.

CC: @open-telemetry/ebpf-profiler-maintainers

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thank you for this clarification
I get your point, and I understand the overall design behind that (having a host profiler instead of a containerized profiler), although I didn't know that the eBPF profiler was required to work in this mode only.

}
pid = pid_info.vpid;
tid = pid_info.vtgid;
} else {
u64 id = bpf_get_current_pid_tgid();
pid = id >> 32;
tid = id & 0xFFFFFFFF;
}

u64 ts = bpf_ktime_get_ns();
return collect_trace((struct pt_regs *)&ctx->regs, TRACE_SAMPLING, pid, tid, ts, 0);
Expand Down
Binary file modified support/ebpf/tracer.ebpf.amd64
Binary file not shown.
41 changes: 29 additions & 12 deletions tracer/ebpf_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
package tracer_test

import (
"fmt"
"math"
"runtime"
"slices"
Expand Down Expand Up @@ -186,16 +187,32 @@ Loop:
}

// TestAllTracers verifies that a tracer with every tracer type enabled can
// be created and torn down, both with and without PID namespace translation.
func TestAllTracers(t *testing.T) {
	for _, enablePidTranslation := range []bool{true, false} {
		t.Run(fmt.Sprintf("enablePidTranslation=%t", enablePidTranslation), func(t *testing.T) {
			tr, err := tracer.NewTracer(t.Context(), &tracer.Config{
				Intervals:              &mockIntervals{},
				IncludeTracers:         tracertypes.AllTracers(),
				SamplesPerSecond:       20,
				ProbabilisticInterval:  100,
				ProbabilisticThreshold: 100,
				OffCPUThreshold:        uint32(math.MaxUint32 / 100),
				VerboseMode:            true,
				LoadProbe:              true,
				EnableNamespacePID:     enablePidTranslation,
			})
			require.NoError(t, err)
			// Close when this subtest finishes, not at the end of the loop.
			defer tr.Close()
		})
	}
}
Loading