Skip to content

Commit 159dcb0

Browse files
committed
Use new usdt arg support for gpu profiler
1 parent 74df5d2 commit 159dcb0

File tree

6 files changed

+134
-245
lines changed

6 files changed

+134
-245
lines changed

interpreter/gpu/cuda.go

Lines changed: 0 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package gpu // import "go.opentelemetry.io/ebpf-profiler/interpreter/gpu"
33
import (
44
"errors"
55
"fmt"
6-
"runtime"
76
"slices"
87
"strconv"
98
"sync"
@@ -87,11 +86,6 @@ func Loader(ebpf interpreter.EbpfHandler, info *interpreter.LoaderInfo) (interpr
8786
return nil, nil
8887
}
8988

90-
// Validate probe arguments match what cuda.ebpf.c expects
91-
if err := validateProbeArguments(parcagpuProbes, info.FileName()); err != nil {
92-
return nil, err
93-
}
94-
9589
d := &data{
9690
path: info.FileName(),
9791
probes: parcagpuProbes,
@@ -102,46 +96,6 @@ func Loader(ebpf interpreter.EbpfHandler, info *interpreter.LoaderInfo) (interpr
10296
return nil, nil
10397
}
10498

105-
// validateProbeArguments checks that the USDT probe arguments match the expectations
106-
// in cuda.ebpf.c and returns an error if they don't match.
107-
func validateProbeArguments(probes []pfelf.USDTProbe, path string) error {
108-
var expectedProbes map[string]string
109-
110-
switch runtime.GOARCH {
111-
case "amd64":
112-
expectedProbes = map[string]string{
113-
"cuda_correlation": "4@-44(%rbp) 4@-64(%rbp) 8@-40(%rbp)",
114-
"kernel_executed": "8@%rax 8@%rdx 4@%ecx 4@%esi 4@%edi 4@%r8d 8@%r9 8@%r10",
115-
}
116-
case "arm64":
117-
expectedProbes = map[string]string{
118-
"cuda_correlation": "4@[sp, 60] 4@[sp, 32] 8@[sp, 64]",
119-
"kernel_executed": "8@x1 8@x2 4@x3 4@x4 4@x5 4@x6 8@x7 8@x0",
120-
}
121-
default:
122-
return fmt.Errorf("unknown architecture %s, cannot validate USDT probe arguments for %s",
123-
runtime.GOARCH, path)
124-
}
125-
126-
probeMap := make(map[string]string)
127-
for _, probe := range probes {
128-
probeMap[probe.Name] = probe.Arguments
129-
}
130-
131-
for name, expectedArgs := range expectedProbes {
132-
actualArgs, ok := probeMap[name]
133-
if !ok {
134-
return fmt.Errorf("missing expected USDT probe '%s' in %s", name, path)
135-
}
136-
if actualArgs != expectedArgs {
137-
return fmt.Errorf("USDT probe '%s' in %s has incorrect arguments: "+
138-
"expected: %s"+
139-
"actual: %s",
140-
name, path, expectedArgs, actualArgs)
141-
}
142-
}
143-
return nil
144-
}
14599

146100
func (d *data) Attach(ebpf interpreter.EbpfHandler, pid libpf.PID, _ libpf.Address,
147101
_ remotememory.RemoteMemory) (interpreter.Instance, error) {

support/ebpf/cuda.ebpf.c

Lines changed: 31 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,11 @@
11
#include "bpfdefs.h"
22
#include "tracemgmt.h"
33
#include "types.h"
4+
#include "usdt_args.h"
45

5-
// cuda_correlation reads the following arguments from the USDT probe:
6-
// u32 correlationId
7-
// u32 callbackId
8-
// char* name
9-
// AMD64: 4@-44(%rbp) 4@-64(%rbp) 8@-40(%rbp)
10-
// ARM64: 4@[sp, 60] 4@[sp, 32] 8@[sp, 64]
11-
static EBPF_INLINE int cuda_correlation(struct pt_regs *ctx)
6+
// cuda_correlation reads the correlation ID from the USDT probe and records a trace.
7+
SEC("usdt/parcagpu/cuda_correlation")
8+
int BPF_USDT(cuda_correlation, u32 correlation_id, u32 cbid)
129
{
1310
u64 pid_tgid = bpf_get_current_pid_tgid();
1411
u32 pid = pid_tgid >> 32;
@@ -18,29 +15,6 @@ static EBPF_INLINE int cuda_correlation(struct pt_regs *ctx)
1815
return 0;
1916
}
2017

21-
u32 correlation_id, cbid = 0;
22-
int err;
23-
24-
#if defined(__aarch64__)
25-
// ARM64: Arguments: 4@[sp, 60] 4@[sp, 32]
26-
u64 addr = ctx->sp;
27-
err = bpf_probe_read_user(&correlation_id, sizeof(correlation_id), (void *)(addr + 60));
28-
if (err)
29-
return err;
30-
err = bpf_probe_read_user(&cbid, sizeof(cbid), (void *)(addr + 32));
31-
if (err)
32-
return err;
33-
#else
34-
// AMD64: Arguments: 4@-44(%rbp) 4@-64(%rbp)
35-
u64 rbp = ctx->bp;
36-
err = bpf_probe_read_user(&correlation_id, sizeof(correlation_id), (void *)rbp - 44);
37-
if (err)
38-
return err;
39-
err = bpf_probe_read_user(&cbid, sizeof(cbid), (void *)rbp - 64);
40-
if (err)
41-
return err;
42-
#endif
43-
4418
DEBUG_PRINT("cuda_correlation_probe: correlation_id=%u, cbid=%u", correlation_id, cbid);
4519

4620
u64 ts = bpf_ktime_get_ns();
@@ -67,45 +41,22 @@ bpf_map_def SEC("maps") cuda_timing_events = {
6741
.max_entries = 0,
6842
};
6943

70-
// u64 start
71-
// u64 end
72-
// u32 correlationId
73-
// u32 deviceId
74-
// u32 streamId
75-
// u32 graphId
76-
// const char *kernelName
77-
// AMD64 Arguments: 8@%rax 8@%rdx 4@%ecx 4@%esi 4@%edi 4@%r8d 8@%r9 8@%r10
78-
// ARM64 Arguments: 8@x1 8@x2 4@x3 4@x4 4@x5 4@x6 8@x7 8@x0
79-
static EBPF_INLINE int cuda_kernel_exec(struct pt_regs *ctx)
44+
SEC("usdt/parcagpu/cuda_kernel")
45+
int BPF_USDT(
46+
cuda_kernel_exec,
47+
u64 start,
48+
u64 end,
49+
u32 correlation_id,
50+
u32 device_id,
51+
u32 stream_id,
52+
u32 graph_id,
53+
u64 graph_node_id,
54+
u64 name_ptr)
8055
{
81-
u64 pid_tgid = bpf_get_current_pid_tgid();
82-
u32 pid = pid_tgid >> 32;
83-
84-
u64 start, end, graph_node_id = 0;
85-
u32 correlation_id, device_id, stream_id, graph_id = 0;
86-
const char *name;
87-
88-
#if defined(__aarch64__)
89-
start = ctx->regs[1]; // x1
90-
end = ctx->regs[2]; // x2
91-
correlation_id = ctx->regs[3]; // x3
92-
device_id = ctx->regs[4]; // x4
93-
stream_id = ctx->regs[5]; // x5
94-
graph_id = ctx->regs[6]; // x6
95-
graph_node_id = ctx->regs[7]; // x7
96-
name = (const char *)ctx->regs[0]; // x0
97-
#else
98-
start = ctx->ax;
99-
end = ctx->dx;
100-
correlation_id = ctx->cx;
101-
device_id = ctx->si;
102-
stream_id = ctx->di;
103-
graph_id = ctx->r8;
104-
graph_node_id = ctx->r9;
105-
name = (const char *)ctx->r10;
106-
#endif
107-
108-
u64 duration_ns = end - start;
56+
u64 pid_tgid = bpf_get_current_pid_tgid();
57+
u32 pid = pid_tgid >> 32;
58+
u64 duration_ns = end - start;
59+
const char *name = (const char *)name_ptr;
10960

11061
DEBUG_PRINT(
11162
"cuda_kernel_exec: correlation_id=%u, duration_ns=%llu, name=%s\n",
@@ -149,22 +100,19 @@ int cuda_probe(struct pt_regs *ctx)
149100
u64 full_cookie = bpf_get_attach_cookie(ctx);
150101
u32 cookie = (u32)(full_cookie & 0xFFFFFFFF);
151102
switch (cookie) {
152-
case 'c': return cuda_correlation(ctx);
153-
case 'k': return cuda_kernel_exec(ctx);
103+
case 'c': return BPF_USDT_CALL(cuda_correlation, correlation_id, cbid);
104+
case 'k':
105+
return BPF_USDT_CALL(
106+
cuda_kernel_exec,
107+
start,
108+
end,
109+
correlation_id,
110+
device_id,
111+
stream_id,
112+
graph_id,
113+
graph_node_id,
114+
name);
154115
default: DEBUG_PRINT("cuda_probe: unknown cookie %u", cookie); break;
155116
}
156117
return 0;
157118
}
158-
159-
// Individual probe entry points for single-shot mode
160-
SEC("usdt/parcagpu/cuda_correlation")
161-
int usdt_parcagpu_cuda_correlation(struct pt_regs *ctx)
162-
{
163-
return cuda_correlation(ctx);
164-
}
165-
166-
SEC("usdt/parcagpu/cuda_kernel")
167-
int usdt_parcagpu_cuda_kernel(struct pt_regs *ctx)
168-
{
169-
return cuda_kernel_exec(ctx);
170-
}

support/ebpf/tracer.ebpf.amd64

94.1 KB
Binary file not shown.

support/ebpf/tracer.ebpf.arm64

158 KB
Binary file not shown.

support/ebpf/usdt_args.h

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,4 +171,74 @@ static EBPF_INLINE UNUSED int bpf_usdt_arg(struct pt_regs *ctx, u64 arg_num, lon
171171
return 0;
172172
}
173173

174+
// clang-format off
175+
// Individual argument extraction macros
176+
// Usage: s32 arg0 = bpf_usdt_arg0(ctx);
177+
#define bpf_usdt_arg0(ctx) ({ long _arg; bpf_usdt_arg(ctx, 0, &_arg); _arg; })
178+
#define bpf_usdt_arg1(ctx) ({ long _arg; bpf_usdt_arg(ctx, 1, &_arg); _arg; })
179+
#define bpf_usdt_arg2(ctx) ({ long _arg; bpf_usdt_arg(ctx, 2, &_arg); _arg; })
180+
#define bpf_usdt_arg3(ctx) ({ long _arg; bpf_usdt_arg(ctx, 3, &_arg); _arg; })
181+
#define bpf_usdt_arg4(ctx) ({ long _arg; bpf_usdt_arg(ctx, 4, &_arg); _arg; })
182+
#define bpf_usdt_arg5(ctx) ({ long _arg; bpf_usdt_arg(ctx, 5, &_arg); _arg; })
183+
#define bpf_usdt_arg6(ctx) ({ long _arg; bpf_usdt_arg(ctx, 6, &_arg); _arg; })
184+
#define bpf_usdt_arg7(ctx) ({ long _arg; bpf_usdt_arg(ctx, 7, &_arg); _arg; })
185+
#define bpf_usdt_arg8(ctx) ({ long _arg; bpf_usdt_arg(ctx, 8, &_arg); _arg; })
186+
#define bpf_usdt_arg9(ctx) ({ long _arg; bpf_usdt_arg(ctx, 9, &_arg); _arg; })
187+
#define bpf_usdt_arg10(ctx) ({ long _arg; bpf_usdt_arg(ctx, 10, &_arg); _arg; })
188+
#define bpf_usdt_arg11(ctx) ({ long _arg; bpf_usdt_arg(ctx, 11, &_arg); _arg; })
189+
190+
// The rest of this code is from libbpf
191+
#ifndef ___bpf_concat
192+
#define ___bpf_concat(a, b) a##b
193+
#endif
194+
#ifndef ___bpf_apply
195+
#define ___bpf_apply(fn, n) ___bpf_concat(fn, n)
196+
#endif
197+
#ifndef ___bpf_nth
198+
#define ___bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _a, _b, _c, N, ...) N
199+
#endif
200+
#ifndef ___bpf_narg
201+
#define ___bpf_narg(...) \
202+
___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
203+
#endif
204+
205+
#define ___bpf_usdt_args0() ctx
206+
#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); _x; })
207+
#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); _x; })
208+
#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); _x; })
209+
#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); _x; })
210+
#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); _x; })
211+
#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); _x; })
212+
#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); _x; })
213+
#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); _x; })
214+
#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); _x; })
215+
#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); _x; })
216+
#define ___bpf_usdt_args11(x, args...) ___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); _x; })
217+
#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); _x; })
218+
#define ___bpf_usdt_args(args...) ___bpf_apply(___bpf_usdt_args, ___bpf_narg(args))(args)
219+
220+
/*
221+
* BPF_USDT serves the same purpose for USDT handlers as BPF_PROG for
222+
* tp_btf/fentry/fexit BPF programs and BPF_KPROBE for kprobes.
223+
* Original struct pt_regs * context is preserved as 'ctx' argument.
224+
*/
225+
#define BPF_USDT(name, args...) \
226+
name(struct pt_regs *ctx); \
227+
static EBPF_INLINE typeof(name(0)) \
228+
____##name(UNUSED struct pt_regs *ctx, ##args); \
229+
typeof(name(0)) name(struct pt_regs *ctx) \
230+
{ \
231+
_Pragma("GCC diagnostic push") \
232+
_Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \
233+
return ____##name(___bpf_usdt_args(args)); \
234+
_Pragma("GCC diagnostic pop") \
235+
} \
236+
static EBPF_INLINE typeof(name(0)) \
237+
____##name(UNUSED struct pt_regs *ctx, ##args)
238+
239+
#define BPF_USDT_CALL(name, args...) \
240+
____##name(___bpf_usdt_args(args))
241+
242+
// clang-format on
243+
174244
#endif // OPTI_USDT_ARGS_H

0 commit comments

Comments
 (0)