66 "bytes"
77 "context"
88 "flag"
9- "math"
109 "os"
1110 "testing"
1211 "time"
@@ -15,213 +14,15 @@ import (
1514 "github.com/cilium/ebpf/perf"
1615 "github.com/stretchr/testify/require"
1716
18- "go.opentelemetry.io/ebpf-profiler/interpreter"
1917 "go.opentelemetry.io/ebpf-profiler/interpreter/gpu"
2018 "go.opentelemetry.io/ebpf-profiler/libpf"
21- "go.opentelemetry.io/ebpf-profiler/libpf/pfelf"
22- "go.opentelemetry.io/ebpf-profiler/reporter/samples"
2319 "go.opentelemetry.io/ebpf-profiler/testutils"
24- "go.opentelemetry.io/ebpf-profiler/tracer"
2520 tracertypes "go.opentelemetry.io/ebpf-profiler/tracer/types"
2621 "go.opentelemetry.io/ebpf-profiler/util"
2722)
2823
// soPath is the command-line flag holding the location of the
// libparcagpucupti.so shared object that the tests open and attach probes to.
var soPath = flag.String("so-path", "/libparcagpucupti.so", "path to libparcagpucupti.so")
3025
// mockIntervals supplies fixed, short timing intervals. It is passed as the
// Intervals field of the tracer configuration built by the tests.
type mockIntervals struct{}

// MonitorInterval returns a fixed interval of one second.
func (mockIntervals) MonitorInterval() time.Duration { return time.Second }

// TracePollInterval returns a fixed interval of 250 milliseconds.
func (mockIntervals) TracePollInterval() time.Duration { return 250 * time.Millisecond }

// PIDCleanupInterval returns a fixed interval of one second.
func (mockIntervals) PIDCleanupInterval() time.Duration { return time.Second }

// ExecutableUnloadDelay returns a fixed delay of one second.
func (mockIntervals) ExecutableUnloadDelay() time.Duration { return time.Second }
37-
38- type mockReporter struct {}
39-
40- func (mockReporter ) ExecutableKnown (_ libpf.FileID ) bool { return true }
41-
42- // discardTraceReporter is a TraceReporter that silently discards all traces.
43- type discardTraceReporter struct {}
44-
45- func (discardTraceReporter ) ReportTraceEvent (_ * libpf.Trace , _ * samples.TraceEventMeta ) error {
46- return nil
47- }
48-
49- // parseProbes opens the .so and extracts the required parcagpu USDT probes.
50- func parseProbes (t * testing.T ) []pfelf.USDTProbe {
51- t .Helper ()
52-
53- ef , err := pfelf .Open (* soPath )
54- require .NoError (t , err , "failed to open %s" , * soPath )
55- defer ef .Close ()
56-
57- require .NoError (t , ef .LoadSections (), "failed to load sections" )
58-
59- allProbes , err := ef .ParseUSDTProbes ()
60- require .NoError (t , err , "failed to parse USDT probes" )
61-
62- var requiredProbes []pfelf.USDTProbe
63- for _ , probe := range allProbes {
64- if probe .Provider == "parcagpu" &&
65- (probe .Name == "cuda_correlation" || probe .Name == "kernel_executed" || probe .Name == "activity_batch" ) {
66- requiredProbes = append (requiredProbes , probe )
67- }
68- }
69- // Need cuda_correlation + at least one of kernel_executed/activity_batch
70- hasCorrelation := false
71- hasKernel := false
72- for _ , p := range requiredProbes {
73- switch p .Name {
74- case "cuda_correlation" :
75- hasCorrelation = true
76- case "kernel_executed" , "activity_batch" :
77- hasKernel = true
78- }
79- }
80- require .True (t , hasCorrelation , "missing cuda_correlation probe" )
81- require .True (t , hasKernel , "missing kernel_executed or activity_batch probe" )
82-
83- for _ , p := range requiredProbes {
84- t .Logf ("Found probe: provider=%s name=%s location=0x%x args=%s" ,
85- p .Provider , p .Name , p .Location , p .Arguments )
86- }
87- return requiredProbes
88- }
89-
90- // createTracer creates a Tracer with InstrumentCudaLaunch enabled so the CUDA
91- // eBPF programs (tail-call destinations) are loaded and the verifier runs.
92- func createTracer (t * testing.T ) (* tracer.Tracer , interpreter.EbpfHandler , context.CancelFunc ) {
93- t .Helper ()
94-
95- ctx , cancel := context .WithCancel (context .Background ())
96- enabledTracers , _ := tracertypes .Parse ("" )
97-
98- tr , err := tracer .NewTracer (ctx , & tracer.Config {
99- Intervals : & mockIntervals {},
100- IncludeTracers : enabledTracers ,
101- FilterErrorFrames : false ,
102- SamplesPerSecond : 20 ,
103- MapScaleFactor : 0 ,
104- KernelVersionCheck : false ,
105- BPFVerifierLogLevel : 0 ,
106- ProbabilisticInterval : 100 ,
107- ProbabilisticThreshold : 100 ,
108- OffCPUThreshold : 1 * math .MaxUint32 ,
109- InstrumentCudaLaunch : true ,
110- })
111- require .NoError (t , err , "failed to create tracer" )
112-
113- ebpfHandler := tr .GetEbpfHandler ()
114- return tr , ebpfHandler , cancel
115- }
116-
117- // buildCookiesAndProgNames builds the cookie and program-name slices that
118- // mirror interpreter/gpu/cuda.go Attach().
119- func buildCookiesAndProgNames (probes []pfelf.USDTProbe ) ([]uint64 , []string ) {
120- cookies := make ([]uint64 , len (probes ))
121- progNames := make ([]string , len (probes ))
122- for i , probe := range probes {
123- switch probe .Name {
124- case "cuda_correlation" :
125- cookies [i ] = 0 // CudaProgCorrelation
126- progNames [i ] = "cuda_correlation"
127- case "kernel_executed" :
128- cookies [i ] = 1 // CudaProgKernelExec
129- progNames [i ] = "cuda_kernel_exec"
130- case "activity_batch" :
131- cookies [i ] = 2 // CudaProgActivityBatch
132- progNames [i ] = "cuda_activity_batch"
133- }
134- }
135- return cookies , progNames
136- }
137-
138- // TestCUDAVerifierSingleShot verifies CUDA eBPF programs pass the BPF verifier
139- // using individual per-probe program attachment (works on kernel 5.15+).
140- // Forces single-shot mode so that AttachUSDTProbes uses per-probe attachment.
141- func TestCUDAVerifierSingleShot (t * testing.T ) {
142- if os .Getuid () != 0 {
143- t .Skip ("requires root to load eBPF programs" )
144- }
145- if ! util .HasBpfGetAttachCookie () {
146- t .Skip ("requires kernel support for bpf_get_attach_cookie (5.15+)" )
147- }
148-
149- // Force single-shot mode so loadUSDTProgram does not set
150- // AttachTraceUprobeMulti.
151- noMulti := false
152- util .SetTestOnlyMultiUprobeSupport (& noMulti )
153- defer util .SetTestOnlyMultiUprobeSupport (nil )
154-
155- probes := parseProbes (t )
156-
157- testutils .InitializeMetrics ()
158- tr , ebpfHandler , cancel := createTracer (t )
159- defer tr .Close ()
160- defer cancel ()
161-
162- cookies , progNames := buildCookiesAndProgNames (probes )
163-
164- lc , err := ebpfHandler .AttachUSDTProbes (
165- libpf .PID (os .Getpid ()),
166- * soPath ,
167- "" , // no multi-prog
168- probes ,
169- cookies ,
170- progNames ,
171- )
172- require .NoError (t , err , "AttachUSDTProbes (single-shot) failed — BPF verifier rejected CUDA programs" )
173- defer lc .Unload ()
174-
175- t .Log ("SingleShot: all CUDA eBPF programs passed the BPF verifier" )
176- }
177-
178- // TestCUDAVerifierMultiProbe verifies CUDA eBPF programs pass the BPF verifier
179- // using multi-uprobe attachment with cookies (requires kernel 6.6+).
180- func TestCUDAVerifierMultiProbe (t * testing.T ) {
181- if os .Getuid () != 0 {
182- t .Skip ("requires root to load eBPF programs" )
183- }
184- if ! util .HasBpfGetAttachCookie () {
185- t .Skip ("requires kernel support for bpf_get_attach_cookie (5.15+)" )
186- }
187- if ! util .HasMultiUprobeSupport () {
188- t .Skip ("requires kernel support for uprobe multi-attach (6.6+)" )
189- }
190-
191- probes := parseProbes (t )
192-
193- testutils .InitializeMetrics ()
194-
195- tr , ebpfHandler , cancel := createTracer (t )
196- defer tr .Close ()
197- defer cancel ()
198-
199- cookies , progNames := buildCookiesAndProgNames (probes )
200-
201- // Populate the tail-call prog array for activity_batch (the only tail-call
202- // target — correlation and kernel_exec are inlined in cuda_probe).
203- for _ , probe := range probes {
204- if probe .Name == "activity_batch" {
205- err := ebpfHandler .UpdateProgArray ("cuda_progs" , 0 , "cuda_activity_batch_tail" )
206- require .NoError (t , err , "UpdateProgArray failed for cuda_activity_batch" )
207- break
208- }
209- }
210-
211- lc , err := ebpfHandler .AttachUSDTProbes (
212- libpf .PID (os .Getpid ()),
213- * soPath ,
214- "cuda_probe" , // multi-probe program
215- probes ,
216- cookies ,
217- progNames ,
218- )
219- require .NoError (t , err , "AttachUSDTProbes (multi-probe) failed — BPF verifier rejected CUDA programs" )
220- defer lc .Unload ()
221-
222- t .Log ("MultiProbe: all CUDA eBPF programs passed the BPF verifier" )
223- }
224-
22526// runEndToEnd exercises the full process-manager driven GPU probe attachment flow:
22627//
22728// 1. Start the full tracer pipeline (PID event processor, map monitors, profiling).
@@ -247,69 +48,31 @@ func runEndToEnd(t *testing.T, multiProbe bool) {
24748 enabledTracers , _ := tracertypes .Parse ("" )
24849 enabledTracers .Enable (tracertypes .CUDATracer )
24950
250- tr , err := tracer .NewTracer (ctx , & tracer.Config {
251- TraceReporter : discardTraceReporter {},
252- Intervals : & mockIntervals {},
253- IncludeTracers : enabledTracers ,
254- FilterErrorFrames : false ,
255- SamplesPerSecond : 20 ,
256- MapScaleFactor : 0 ,
257- KernelVersionCheck : false ,
258- BPFVerifierLogLevel : 0 ,
259- ProbabilisticInterval : 100 ,
260- ProbabilisticThreshold : 100 ,
261- OffCPUThreshold : 1 * math .MaxUint32 ,
262- InstrumentCudaLaunch : true ,
263- VerboseMode : true ,
264- })
265- require .NoError (t , err , "failed to create tracer" )
266- defer tr .Close ()
267-
268- // Start the full pipeline: PID event processor, profiling, map monitors.
269- tr .StartPIDEventProcessor (ctx )
270- require .NoError (t , tr .AttachTracer (), "AttachTracer failed" )
271- require .NoError (t , tr .EnableProfiling (), "EnableProfiling failed" )
272- require .NoError (t , tr .AttachSchedMonitor (), "AttachSchedMonitor failed" )
273-
274- ebpfTraceCh := make (chan * libpf.EbpfTrace )
275- require .NoError (t , tr .StartMapMonitors (ctx , ebpfTraceCh ), "StartMapMonitors failed" )
276-
277- // Consume eBPF traces to prevent blocking the pipeline.
278- go func () {
279- for {
280- select {
281- case trace := <- ebpfTraceCh :
282- if trace != nil {
283- tr .HandleTrace (trace )
284- }
285- case <- ctx .Done ():
286- return
287- }
288- }
289- }()
51+ _ , trc := testutils .StartTracer (ctx , t , enabledTracers , false )
52+ defer trc .Close ()
29053
29154 // Trigger initial process sync for our PID so the tracer discovers our
29255 // mappings and attaches the dlopen uprobe to libc.
29356 pid := libpf .PID (uint32 (os .Getpid ()))
294- tr .ForceProcessPID (pid )
57+ trc .ForceProcessPID (pid )
29558
29659 // Wait until the process manager has processed our PID and attached
29760 // interpreter instances (the rtld instance attaches the dlopen uprobe
29861 // to libc as a side effect).
29962 require .Eventually (t , func () bool {
300- instances := tr .GetInterpretersForPID (pid )
63+ instances := trc .GetInterpretersForPID (pid )
30164 if len (instances ) > 0 {
30265 t .Logf ("process synced: %d interpreter(s) attached" , len (instances ))
30366 return true
30467 }
30568 t .Log ("waiting for initial process sync..." )
306- tr .ForceProcessPID (pid )
69+ trc .ForceProcessPID (pid )
30770 return false
30871 }, 30 * time .Second , 200 * time .Millisecond , "process manager never synced our PID" )
30972
31073 // Set up perf reader on the cuda_timing_events map BEFORE the dlopen so we
31174 // don't miss any events.
312- timingMap := tr .GetEbpfMaps ()["cuda_timing_events" ]
75+ timingMap := trc .GetEbpfMaps ()["cuda_timing_events" ]
31376 require .NotNil (t , timingMap , "cuda_timing_events map not found" )
31477
31578 reader , err := perf .NewReader (timingMap , 1024 * 1024 )
@@ -324,20 +87,20 @@ func runEndToEnd(t *testing.T, multiProbe bool) {
32487 defer cCleanupParcaGPU ()
32588
32689 // Speed up the re-sync after dlopen.
327- tr .ForceProcessPID (pid )
90+ trc .ForceProcessPID (pid )
32891
32992 // Wait until the GPU interpreter instance appears, confirming the USDT
33093 // probes were attached by the process manager.
33194 require .Eventually (t , func () bool {
332- instances := tr .GetInterpretersForPID (pid )
95+ instances := trc .GetInterpretersForPID (pid )
33396 for _ , inst := range instances {
33497 if _ , ok := inst .(* gpu.Instance ); ok {
33598 t .Log ("GPU interpreter instance attached" )
33699 return true
337100 }
338101 }
339102 t .Logf ("waiting for GPU interpreter instance (%d interpreters so far)..." , len (instances ))
340- tr .ForceProcessPID (pid )
103+ trc .ForceProcessPID (pid )
341104 return false
342105 }, 30 * time .Second , 200 * time .Millisecond , "GPU interpreter never attached after dlopen" )
343106
0 commit comments