Skip to content

Commit cc73758

Browse files
gnurizenclaude
andcommitted
Add realistic CUDA API function names to test stack traces
- Add fake cuLaunchKernel, cuGraphLaunch, cudaLaunchKernel, cudaGraphLaunch functions that appear in the call stack when the profiler captures samples - Use __attribute__((noinline)) to prevent compiler from inlining these - Use random kernel names for graph kernels instead of hardcoded "graph_kernel" - Add cuda_runtime.h include for runtime API types Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent dd1299f commit cc73758

File tree

1 file changed

+147
-32
lines changed

1 file changed

+147
-32
lines changed

test/test_cupti_prof.c

Lines changed: 147 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <unistd.h>
1616
#include <stdint.h>
1717
#include <cuda.h>
18+
#include <cuda_runtime.h>
1819
#include <cupti.h>
1920

2021
//=============================================================================
@@ -350,59 +351,172 @@ size_t get_queue_size(void) {
350351
// Counter for simulating dropped graph launches (for testing fallback cleanup)
351352
static atomic_uint_least32_t graph_launch_counter = ATOMIC_VAR_INIT(0);
352353

353-
void simulate_runtime_kernel_launch(uint32_t correlationId, CUpti_CallbackId cbid, bool should_generate_activities) {
354-
CUpti_CallbackData cbdata = {0};
354+
//-----------------------------------------------------------------------------
355+
// Fake CUDA API functions to make stack traces look realistic
356+
// These functions exist solely to appear in the call stack when the profiler
357+
// captures a stack sample during a kernel/graph launch callback.
358+
//-----------------------------------------------------------------------------
359+
360+
// Prevent inlining so these functions appear in the stack
361+
#define NOINLINE __attribute__((noinline))
362+
363+
// Thread-local storage for passing data to fake CUDA API functions
364+
static __thread uint32_t tls_correlation_id;
365+
static __thread CUpti_CallbackData tls_cbdata;
366+
static __thread bool tls_should_generate_activities;
367+
368+
// Driver API kernel launch - calls the callback with cuLaunchKernel in the stack
369+
NOINLINE CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
370+
unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY,
371+
unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream,
372+
void **kernelParams, void **extra) {
373+
(void)f; (void)gridDimX; (void)gridDimY; (void)gridDimZ;
374+
(void)blockDimX; (void)blockDimY; (void)blockDimZ;
375+
(void)sharedMemBytes; (void)hStream; (void)kernelParams; (void)extra;
355376

356-
if (!parcagpuCuptiCallback) {
357-
fprintf(stderr, "ERROR: parcagpuCuptiCallback is NULL!\n");
358-
return;
377+
// DRIVER ENTER callback
378+
tls_cbdata.callbackSite = CUPTI_API_ENTER;
379+
tls_cbdata.correlationId = tls_correlation_id;
380+
parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API,
381+
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, &tls_cbdata);
382+
383+
// DRIVER EXIT callback
384+
tls_cbdata.callbackSite = CUPTI_API_EXIT;
385+
parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API,
386+
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, &tls_cbdata);
387+
388+
return CUDA_SUCCESS;
389+
}
390+
391+
// Driver API graph launch - calls the callback with cuGraphLaunch in the stack
392+
NOINLINE CUresult cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream) {
393+
(void)hGraphExec; (void)hStream;
394+
395+
// DRIVER ENTER callback
396+
tls_cbdata.callbackSite = CUPTI_API_ENTER;
397+
tls_cbdata.correlationId = tls_correlation_id;
398+
parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API,
399+
CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, &tls_cbdata);
400+
401+
// DRIVER EXIT callback
402+
tls_cbdata.callbackSite = CUPTI_API_EXIT;
403+
parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API,
404+
CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, &tls_cbdata);
405+
406+
// Enqueue for activity generation
407+
if (tls_should_generate_activities) {
408+
enqueue_launched_kernel(tls_correlation_id);
359409
}
360410

411+
return CUDA_SUCCESS;
412+
}
413+
414+
// Runtime API kernel launch - calls the callback with cudaLaunchKernel in the stack
415+
NOINLINE cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
416+
void **args, size_t sharedMem, cudaStream_t stream) {
417+
(void)func; (void)gridDim; (void)blockDim; (void)args; (void)sharedMem; (void)stream;
418+
361419
// RUNTIME ENTER callback
362-
cbdata.callbackSite = CUPTI_API_ENTER;
363-
cbdata.correlationId = correlationId;
364-
parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API, cbid, &cbdata);
420+
tls_cbdata.callbackSite = CUPTI_API_ENTER;
421+
tls_cbdata.correlationId = tls_correlation_id;
422+
parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API,
423+
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, &tls_cbdata);
424+
425+
// Runtime internally calls driver - call through cuLaunchKernel so it appears in stack
426+
cuLaunchKernel(NULL, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL);
427+
428+
// RUNTIME EXIT callback
429+
tls_cbdata.callbackSite = CUPTI_API_EXIT;
430+
parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API,
431+
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, &tls_cbdata);
432+
433+
return cudaSuccess;
434+
}
365435

366-
// DRIVER ENTER callback (runtime internally calls driver)
367-
cbdata.callbackSite = CUPTI_API_ENTER;
436+
// Runtime API graph launch - calls the callback with cudaGraphLaunch in the stack
437+
NOINLINE cudaError_t cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream) {
438+
(void)graphExec; (void)stream;
439+
440+
// RUNTIME ENTER callback
441+
tls_cbdata.callbackSite = CUPTI_API_ENTER;
442+
tls_cbdata.correlationId = tls_correlation_id;
443+
parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API,
444+
CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000, &tls_cbdata);
445+
446+
// Runtime internally calls driver - use inline driver callback with cuGraphLaunch cbid
447+
// (We don't call cuGraphLaunch here to avoid double-queueing)
448+
tls_cbdata.callbackSite = CUPTI_API_ENTER;
368449
parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API,
369-
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, &cbdata);
450+
CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, &tls_cbdata);
370451

371-
// DRIVER EXIT callback
372-
cbdata.callbackSite = CUPTI_API_EXIT;
452+
tls_cbdata.callbackSite = CUPTI_API_EXIT;
373453
parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API,
374-
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, &cbdata);
454+
CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, &tls_cbdata);
375455

376456
// RUNTIME EXIT callback
377-
cbdata.callbackSite = CUPTI_API_EXIT;
378-
parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API, cbid, &cbdata);
457+
tls_cbdata.callbackSite = CUPTI_API_EXIT;
458+
parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API,
459+
CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000, &tls_cbdata);
379460

380-
// Enqueue this correlation ID for activity generation (unless simulating dropped activities)
381-
if (should_generate_activities) {
382-
enqueue_launched_kernel(correlationId);
461+
// Enqueue for activity generation
462+
if (tls_should_generate_activities) {
463+
enqueue_launched_kernel(tls_correlation_id);
383464
}
465+
466+
return cudaSuccess;
384467
}
385468

386-
void simulate_driver_kernel_launch(uint32_t correlationId, CUpti_CallbackId cbid, bool should_generate_activities) {
387-
CUpti_CallbackData cbdata = {0};
469+
//-----------------------------------------------------------------------------
470+
// Simulation functions that call through the fake CUDA APIs
471+
//-----------------------------------------------------------------------------
388472

473+
void simulate_runtime_kernel_launch(uint32_t correlationId, CUpti_CallbackId cbid, bool should_generate_activities) {
389474
if (!parcagpuCuptiCallback) {
390475
fprintf(stderr, "ERROR: parcagpuCuptiCallback is NULL!\n");
391476
return;
392477
}
393478

394-
// DRIVER ENTER callback
395-
cbdata.callbackSite = CUPTI_API_ENTER;
396-
cbdata.correlationId = correlationId;
397-
parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API, cbid, &cbdata);
479+
// Set up thread-local data for the fake CUDA API functions
480+
tls_correlation_id = correlationId;
481+
memset(&tls_cbdata, 0, sizeof(tls_cbdata));
482+
tls_should_generate_activities = should_generate_activities;
398483

399-
// DRIVER EXIT callback
400-
cbdata.callbackSite = CUPTI_API_EXIT;
401-
parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API, cbid, &cbdata);
484+
if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000) {
485+
// Graph launch - call through cudaGraphLaunch
486+
cudaGraphLaunch(NULL, NULL);
487+
} else {
488+
// Kernel launch - call through cudaLaunchKernel
489+
dim3 grid = {1, 1, 1};
490+
dim3 block = {1, 1, 1};
491+
cudaLaunchKernel(NULL, grid, block, NULL, 0, NULL);
492+
// Enqueue for activity generation
493+
if (should_generate_activities) {
494+
enqueue_launched_kernel(correlationId);
495+
}
496+
}
497+
}
498+
499+
void simulate_driver_kernel_launch(uint32_t correlationId, CUpti_CallbackId cbid, bool should_generate_activities) {
500+
if (!parcagpuCuptiCallback) {
501+
fprintf(stderr, "ERROR: parcagpuCuptiCallback is NULL!\n");
502+
return;
503+
}
402504

403-
// Enqueue this correlation ID for activity generation (unless simulating dropped activities)
404-
if (should_generate_activities) {
405-
enqueue_launched_kernel(correlationId);
505+
// Set up thread-local data for the fake CUDA API functions
506+
tls_correlation_id = correlationId;
507+
memset(&tls_cbdata, 0, sizeof(tls_cbdata));
508+
tls_should_generate_activities = should_generate_activities;
509+
510+
if (cbid == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch) {
511+
// Graph launch - call through cuGraphLaunch
512+
cuGraphLaunch(NULL, NULL);
513+
} else {
514+
// Kernel launch - call through cuLaunchKernel
515+
cuLaunchKernel(NULL, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL);
516+
// Enqueue for activity generation
517+
if (should_generate_activities) {
518+
enqueue_launched_kernel(correlationId);
519+
}
406520
}
407521
}
408522

@@ -551,7 +665,8 @@ void *cupti_thread(void *arg) {
551665
kernel->end = kernel->start + duration;
552666
kernel->graphId = graph_exec_id;
553667
kernel->graphNodeId = i; // Increment for each kernel in the graph
554-
kernel->name = "graph_kernel";
668+
kernel->name = get_next_kernel_name(args->kernel_names);
669+
if (!kernel->name) kernel->name = "mock_kernel";
555670

556671
offset += recordSize;
557672
}

0 commit comments

Comments
 (0)