1515static bool debug_enabled = false;
1616
1717// Activity buffer management
18- static size_t activityBufferSize = 10 * 1024 * 1024 ;
18+ // A kernel activity record is around 224 bytes, so a 128 KiB buffer
19+ // holds ~500 activities. We are a continuous profiler and flush
20+ // regularly, so we don't need a huge buffer like most CUPTI
21+ // profilers. A small size also keeps malloc from going straight
22+ // to mmap every time, so the allocator should cache and re-use
23+ // these buffers for us.
24+ static size_t activityBufferSize = 128 * 1024 ;
1925
2026// Global variables
2127static CUpti_SubscriberHandle subscriber = 0 ;
@@ -45,10 +51,11 @@ static void init_debug(void) {
4551static void parcagpuCuptiCallback (void * userdata , CUpti_CallbackDomain domain ,
4652 CUpti_CallbackId cbid ,
4753 const CUpti_CallbackData * cbdata );
48- static void bufferRequested (uint8_t * * buffer , size_t * size ,
49- size_t * maxNumRecords );
50- static void bufferCompleted (CUcontext ctx , uint32_t streamId , uint8_t * buffer ,
51- size_t size , size_t validSize );
54+ static void parcagpuBufferRequested (uint8_t * * buffer , size_t * size ,
55+ size_t * maxNumRecords );
56+ static void parcagpuBufferCompleted (CUcontext ctx , uint32_t streamId ,
57+ uint8_t * buffer , size_t size ,
58+ size_t validSize );
5259
5360void cleanup (void );
5461
@@ -79,49 +86,35 @@ int InitializeInjection(void) {
7986 return 1 ; // Still return success to not break the injection
8087 }
8188
82- // Try enabling driver API kernel launch callback like the example
83- result = cuptiEnableCallback (1 , subscriber , CUPTI_CB_DOMAIN_DRIVER_API ,
84- CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel );
85- if (result != CUPTI_SUCCESS ) {
86- const char * errstr ;
87- cuptiGetResultString (result , & errstr );
88- fprintf (stderr , "[CUPTI] Failed to enable cuLaunchKernel callback: %s\n" ,
89- errstr );
90- }
91-
92- // Enable runtime API callbacks for cudaLaunchKernel
93- result = cuptiEnableCallback (1 , subscriber , CUPTI_CB_DOMAIN_RUNTIME_API ,
94- CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000 );
95- if (result != CUPTI_SUCCESS ) {
96- const char * errstr ;
97- cuptiGetResultString (result , & errstr );
98- fprintf (stderr , "[CUPTI] Failed to enable cudaLaunchKernel callback: %s\n" ,
99- errstr );
100- }
101-
102- // Enable runtime API callbacks for cudaGraphLaunch
103- result = cuptiEnableCallback (1 , subscriber , CUPTI_CB_DOMAIN_RUNTIME_API ,
104- CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000 );
105- if (result != CUPTI_SUCCESS ) {
106- const char * errstr ;
107- cuptiGetResultString (result , & errstr );
108- fprintf (stderr , "[CUPTI] Failed to enable cudaGraphLaunch callback: %s\n" ,
109- errstr );
110- }
111-
112- // Enable runtime API callbacks for cudaGraphLaunch
113- result =
114- cuptiEnableCallback (1 , subscriber , CUPTI_CB_DOMAIN_RUNTIME_API ,
115- CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000 );
116- if (result != CUPTI_SUCCESS ) {
117- const char * errstr ;
118- cuptiGetResultString (result , & errstr );
119- fprintf (stderr , "[CUPTI] Failed to enable cudaGraphLaunch callback: %s\n" ,
120- errstr );
89+ // Enable all runtime API kernel launch callbacks
90+ CUpti_CallbackId launchCallbacks [] = {
91+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020 ,
92+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000 ,
93+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_ptsz_v7000 ,
94+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_ptsz_v7000 ,
95+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060 ,
96+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_ptsz_v11060 ,
97+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000 ,
98+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_ptsz_v9000 ,
99+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000 ,
100+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000 ,
101+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000 ,
102+ };
103+ for (size_t i = 0 ; i < sizeof (launchCallbacks ) / sizeof (launchCallbacks [0 ]);
104+ i ++ ) {
105+ result = cuptiEnableCallback (1 , subscriber , CUPTI_CB_DOMAIN_RUNTIME_API ,
106+ launchCallbacks [i ]);
107+ if (result != CUPTI_SUCCESS ) {
108+ const char * errstr ;
109+ cuptiGetResultString (result , & errstr );
110+ fprintf (stderr , "[CUPTI] Failed to enable runtime callback %d: %s\n" ,
111+ launchCallbacks [i ], errstr );
112+ }
121113 }
122114
123115 // Register activity buffer callbacks
124- result = cuptiActivityRegisterCallbacks (bufferRequested , bufferCompleted );
116+ result = cuptiActivityRegisterCallbacks (parcagpuBufferRequested ,
117+ parcagpuBufferCompleted );
125118 if (result != CUPTI_SUCCESS ) {
126119 const char * errstr ;
127120 cuptiGetResultString (result , & errstr );
@@ -130,7 +123,6 @@ int InitializeInjection(void) {
130123 return 1 ; // Still return success to not break the injection
131124 }
132125
133- // Enable multiple kernel activity recording types
134126 result = cuptiActivityEnable (CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL );
135127 if (result != CUPTI_SUCCESS ) {
136128 const char * errstr ;
@@ -141,54 +133,11 @@ int InitializeInjection(void) {
141133 DEBUG_PRINTF ("[CUPTI] Enabled CONCURRENT_KERNEL activity\n" );
142134 }
143135
144- // This activity kind serializes execution and gives me errors on a T4:
145- // CUPTI_ERROR_NOT_COMPATIBLE But its not a fatal error so do it anyways
146- // result = cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL);
147- // if (result != CUPTI_SUCCESS) {
148- // const char *errstr;
149- // cuptiGetResultString(result, &errstr);
150- // fprintf(stderr, "[CUPTI] Failed to enable kernel activity: %s\n",
151- // errstr);
152- // } else {
153- // DEBUG_PRINTF("[CUPTI] Enabled KERNEL activity\n");
154- // }
155-
156- // Also try enabling runtime activities
157- // result = cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME);
158- // if (result != CUPTI_SUCCESS) {
159- // const char *errstr;
160- // cuptiGetResultString(result, &errstr);
161- // fprintf(stderr, "[CUPTI] Failed to enable runtime activity: %s\n",
162- // errstr);
163- // } else {
164- // DEBUG_PRINTF("[CUPTI] Enabled RUNTIME activity\n");
165- // }
166-
167- // Try enabling graph activities
168- // result = cuptiActivityEnable(CUPTI_ACTIVITY_KIND_GRAPH_TRACE);
169- // if (result != CUPTI_SUCCESS) {
170- // const char *errstr;
171- // cuptiGetResultString(result, &errstr);
172- // fprintf(stderr, "[CUPTI] Failed to enable graph trace activity: %s\n",
173- // errstr);
174- // } else {
175- // DEBUG_PRINTF("[CUPTI] Enabled GRAPH_TRACE activity\n");
176- // }
177-
178136 atexit (cleanup );
179137
180138 DEBUG_PRINTF ("[CUPTI] Successfully initialized CUPTI callbacks with external "
181139 "correlation and activity API\n" );
182140
183- // NOTE: If automatic flush still doesn't work, you can implement manual
184- // periodic flushing:
185- // 1. Create a background thread that calls cuptiActivityFlushAll(0)
186- // periodically
187- // 2. Or call cuptiActivityFlushAll(0) from your application at regular
188- // intervals
189- // 3. Or hook into CUDA synchronization points (cudaDeviceSynchronize, etc.)
190- // to flush
191-
192141 return 1 ;
193142}
194143
@@ -210,45 +159,27 @@ static void print_backtrace(const char *prefix) {
210159 }
211160}
212161
213- // Callback handler for both runtime and driver API
162+ // Callback handler for runtime API
214163static void parcagpuCuptiCallback (void * userdata , CUpti_CallbackDomain domain ,
215164 CUpti_CallbackId cbid ,
216165 const CUpti_CallbackData * cbdata ) {
217- if (domain == CUPTI_CB_DOMAIN_RUNTIME_API ) {
218- // We hook on EXIT because that makes our probe overhead not add to GPU
219- // launch latency and hopefully covers some of the overhead in the shadow of
220- // GPU async work.
221- if (cbdata -> callbackSite == CUPTI_API_EXIT ) {
222- // Probablistic gate should go here.
223- uint32_t correlationId = cbdata -> correlationId ;
224- // Call stub functions for uprobe attachment
225- const char * name = cbdata -> functionName ;
226- switch (cbid ) {
227- case CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000 :
228- case CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060 :
229- if (cbdata -> symbolName ) {
230- DEBUG_PRINTF ("----------- %s\n" , cbdata -> symbolName );
231- name = cbdata -> symbolName ;
232- }
233- case CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000 :
234- case CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000 :
235- DEBUG_PRINTF ("[CUPTI] Runtime API callback: cbid=%d, correlationId=%u, "
236- "func=%s\n" ,
237- cbid , correlationId , cbdata -> functionName );
238- outstandingEvents ++ ;
239- DTRACE_PROBE3 (parcagpu , cuda_correlation , correlationId , cbid , name );
240- break ;
241- default :
242- // Debug: print any other runtime API callback we see with backtrace
243- DEBUG_PRINTF (
244- "[CUPTI] Other Runtime API callback: cbid=%d, correlationId=%u\n" ,
245- cbid , correlationId );
246- // Print backtrace to see who's calling this
247- if (debug_enabled ) {
248- print_backtrace ("[CUPTI]" );
249- }
250- }
251- }
166+ if (domain != CUPTI_CB_DOMAIN_RUNTIME_API ) {
167+ return ;
168+ }
169+
170+ // We hook on EXIT because that makes our probe overhead not add to GPU
171+ // launch latency and hopefully covers some of the overhead in the shadow of
172+ // GPU async work.
173+ if (cbdata -> callbackSite == CUPTI_API_EXIT ) {
174+ uint32_t correlationId = cbdata -> correlationId ;
175+ const char * name =
176+ cbdata -> symbolName ? cbdata -> symbolName : cbdata -> functionName ;
177+
178+ DEBUG_PRINTF (
179+ "[CUPTI] Runtime API callback: cbid=%d, correlationId=%u, func=%s\n" ,
180+ cbid , correlationId , name );
181+ outstandingEvents ++ ;
182+ DTRACE_PROBE3 (parcagpu , cuda_correlation , correlationId , cbid , name );
252183 }
253184 // If we let too many events pile up it overwhelms the perf_event buffers,
254185 // just another reason to explore just passing the activity buffer through to
@@ -257,13 +188,13 @@ static void parcagpuCuptiCallback(void *userdata, CUpti_CallbackDomain domain,
257188 DEBUG_PRINTF ("[CUPTI] Flushing: outstandingEvents=%zu\n" ,
258189 outstandingEvents );
259190 cuptiActivityFlushAll (0 );
191+ outstandingEvents = 0 ;
260192 }
261193}
262194
263195// Buffer request callback
264- static void bufferRequested (uint8_t * * buffer , size_t * size ,
265- size_t * maxNumRecords ) {
266- // Allocate 64MB buffer aligned to 8 bytes
196+ static void parcagpuBufferRequested (uint8_t * * buffer , size_t * size ,
197+ size_t * maxNumRecords ) {
267198 * buffer = (uint8_t * )aligned_alloc (8 , activityBufferSize );
268199 * size = activityBufferSize ;
269200 * maxNumRecords = 0 ; // Let CUPTI decide
@@ -273,8 +204,9 @@ static void bufferRequested(uint8_t **buffer, size_t *size,
273204}
274205
275206// Buffer completion callback
276- static void bufferCompleted (CUcontext ctx , uint32_t streamId , uint8_t * buffer ,
277- size_t size , size_t validSize ) {
207+ static void parcagpuBufferCompleted (CUcontext ctx , uint32_t streamId ,
208+ uint8_t * buffer , size_t size ,
209+ size_t validSize ) {
278210 CUptiResult result ;
279211 CUpti_Activity * record = NULL ;
280212 int recordCount = 0 ;
@@ -295,15 +227,8 @@ static void bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
295227 }
296228
297229 recordCount ++ ;
298- switch (record -> kind ) {
299- case CUPTI_ACTIVITY_KIND_RUNTIME : {
300- CUpti_ActivityAPI * r = (CUpti_ActivityAPI * )record ;
301- DEBUG_PRINTF ("[CUPTI] Runtime activity: correlationId=%u, cbid=%d,\n" ,
302- r -> correlationId , r -> cbid );
303- break ;
304- }
305- case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL :
306- case CUPTI_ACTIVITY_KIND_KERNEL : {
230+ if (record -> kind == CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL ||
231+ record -> kind == CUPTI_ACTIVITY_KIND_KERNEL ) {
307232 CUpti_ActivityKernel5 * k = (CUpti_ActivityKernel5 * )record ;
308233
309234 DEBUG_PRINTF ("[CUPTI] Kernel activity: graphId=%u graphNodeId=%lu "
@@ -315,26 +240,6 @@ static void bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
315240 DTRACE_PROBE8 (parcagpu , kernel_executed , k -> start , k -> end ,
316241 k -> correlationId , k -> deviceId , k -> streamId , k -> graphId ,
317242 k -> graphNodeId , k -> name );
318- break ;
319- }
320- // case CUPTI_ACTIVITY_KIND_GRAPH_TRACE: {
321- // CUpti_ActivityGraphTrace *g = (CUpti_ActivityGraphTrace *)record;
322-
323- // DEBUG_PRINTF(
324- // "[CUPTI] Graph activity: graphId=%u, correlationId=%u, deviceId=%u,
325- // " "streamId=%u, start=%lu, end=%lu, duration=%lu ns\n", g->graphId,
326- // g->correlationId, g->deviceId, g->streamId, g->start, g->end,
327- // g->end - g->start);
328- // // Call stub function for uprobe attachment
329- // uint64_t devCorrelationId =
330- // g->correlationId | ((uint64_t)g->deviceId << 32);
331- // DTRACE_PROBE5(parcagpu, graph_executed, g->start, g->end,
332- // devCorrelationId, g->streamId, g->graphId);
333- // break;
334- // }
335- default :
336- DEBUG_PRINTF ("[CUPTI] Activity record %d: kind=%d\n" , recordCount ,
337- record -> kind );
338243 }
339244 }
340245
@@ -347,7 +252,7 @@ static void bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
347252 outstandingEvents = 0 ;
348253
349254 // Free the buffer
350- DEBUG_PRINTF ("[CUPTI:bufferCompleted ] Freeing buffer %p\n" , buffer );
255+ DEBUG_PRINTF ("[CUPTI] Freeing buffer %p\n" , buffer );
351256 free (buffer );
352257
353258 // Report any records dropped due to buffer overflow
0 commit comments