|
15 | 15 | #include <unistd.h> |
16 | 16 | #include <stdint.h> |
17 | 17 | #include <cuda.h> |
| 18 | +#include <cuda_runtime.h> |
18 | 19 | #include <cupti.h> |
19 | 20 |
|
20 | 21 | //============================================================================= |
@@ -350,59 +351,172 @@ size_t get_queue_size(void) { |
350 | 351 | // Counter for simulating dropped graph launches (for testing fallback cleanup) |
351 | 352 | static atomic_uint_least32_t graph_launch_counter = ATOMIC_VAR_INIT(0); |
352 | 353 |
|
353 | | -void simulate_runtime_kernel_launch(uint32_t correlationId, CUpti_CallbackId cbid, bool should_generate_activities) { |
354 | | - CUpti_CallbackData cbdata = {0}; |
| 354 | +//----------------------------------------------------------------------------- |
| 355 | +// Fake CUDA API functions to make stack traces look realistic |
| 356 | +// These functions exist solely to appear in the call stack when the profiler |
| 357 | +// captures a stack sample during a kernel/graph launch callback. |
| 358 | +//----------------------------------------------------------------------------- |
| 359 | + |
| 360 | +// Prevent inlining so these functions appear in the stack |
| 361 | +#define NOINLINE __attribute__((noinline)) |
| 362 | + |
| 363 | +// Thread-local storage for passing data to fake CUDA API functions |
| 364 | +static __thread uint32_t tls_correlation_id; |
| 365 | +static __thread CUpti_CallbackData tls_cbdata; |
| 366 | +static __thread bool tls_should_generate_activities; |
| 367 | + |
| 368 | +// Driver API kernel launch - calls the callback with cuLaunchKernel in the stack |
| 369 | +NOINLINE CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, |
| 370 | + unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, |
| 371 | + unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, |
| 372 | + void **kernelParams, void **extra) { |
| 373 | + (void)f; (void)gridDimX; (void)gridDimY; (void)gridDimZ; |
| 374 | + (void)blockDimX; (void)blockDimY; (void)blockDimZ; |
| 375 | + (void)sharedMemBytes; (void)hStream; (void)kernelParams; (void)extra; |
355 | 376 |
|
356 | | - if (!parcagpuCuptiCallback) { |
357 | | - fprintf(stderr, "ERROR: parcagpuCuptiCallback is NULL!\n"); |
358 | | - return; |
| 377 | + // DRIVER ENTER callback |
| 378 | + tls_cbdata.callbackSite = CUPTI_API_ENTER; |
| 379 | + tls_cbdata.correlationId = tls_correlation_id; |
| 380 | + parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API, |
| 381 | + CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, &tls_cbdata); |
| 382 | + |
| 383 | + // DRIVER EXIT callback |
| 384 | + tls_cbdata.callbackSite = CUPTI_API_EXIT; |
| 385 | + parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API, |
| 386 | + CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, &tls_cbdata); |
| 387 | + |
| 388 | + return CUDA_SUCCESS; |
| 389 | +} |
| 390 | + |
| 391 | +// Driver API graph launch - calls the callback with cuGraphLaunch in the stack |
| 392 | +NOINLINE CUresult cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream) { |
| 393 | + (void)hGraphExec; (void)hStream; |
| 394 | + |
| 395 | + // DRIVER ENTER callback |
| 396 | + tls_cbdata.callbackSite = CUPTI_API_ENTER; |
| 397 | + tls_cbdata.correlationId = tls_correlation_id; |
| 398 | + parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API, |
| 399 | + CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, &tls_cbdata); |
| 400 | + |
| 401 | + // DRIVER EXIT callback |
| 402 | + tls_cbdata.callbackSite = CUPTI_API_EXIT; |
| 403 | + parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API, |
| 404 | + CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, &tls_cbdata); |
| 405 | + |
| 406 | + // Enqueue for activity generation |
| 407 | + if (tls_should_generate_activities) { |
| 408 | + enqueue_launched_kernel(tls_correlation_id); |
359 | 409 | } |
360 | 410 |
|
| 411 | + return CUDA_SUCCESS; |
| 412 | +} |
| 413 | + |
| 414 | +// Runtime API kernel launch - calls the callback with cudaLaunchKernel in the stack |
| 415 | +NOINLINE cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, |
| 416 | + void **args, size_t sharedMem, cudaStream_t stream) { |
| 417 | + (void)func; (void)gridDim; (void)blockDim; (void)args; (void)sharedMem; (void)stream; |
| 418 | + |
361 | 419 | // RUNTIME ENTER callback |
362 | | - cbdata.callbackSite = CUPTI_API_ENTER; |
363 | | - cbdata.correlationId = correlationId; |
364 | | - parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API, cbid, &cbdata); |
| 420 | + tls_cbdata.callbackSite = CUPTI_API_ENTER; |
| 421 | + tls_cbdata.correlationId = tls_correlation_id; |
| 422 | + parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API, |
| 423 | + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, &tls_cbdata); |
| 424 | + |
| 425 | + // Runtime internally calls driver - call through cuLaunchKernel so it appears in stack |
| 426 | + cuLaunchKernel(NULL, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL); |
| 427 | + |
| 428 | + // RUNTIME EXIT callback |
| 429 | + tls_cbdata.callbackSite = CUPTI_API_EXIT; |
| 430 | + parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API, |
| 431 | + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, &tls_cbdata); |
| 432 | + |
| 433 | + return cudaSuccess; |
| 434 | +} |
365 | 435 |
|
366 | | - // DRIVER ENTER callback (runtime internally calls driver) |
367 | | - cbdata.callbackSite = CUPTI_API_ENTER; |
| 436 | +// Runtime API graph launch - calls the callback with cudaGraphLaunch in the stack |
| 437 | +NOINLINE cudaError_t cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream) { |
| 438 | + (void)graphExec; (void)stream; |
| 439 | + |
| 440 | + // RUNTIME ENTER callback |
| 441 | + tls_cbdata.callbackSite = CUPTI_API_ENTER; |
| 442 | + tls_cbdata.correlationId = tls_correlation_id; |
| 443 | + parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API, |
| 444 | + CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000, &tls_cbdata); |
| 445 | + |
| 446 | + // Runtime internally calls driver - use inline driver callback with cuGraphLaunch cbid |
| 447 | + // (We don't call cuGraphLaunch here to avoid double-queueing) |
| 448 | + tls_cbdata.callbackSite = CUPTI_API_ENTER; |
368 | 449 | parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API, |
369 | | - CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, &cbdata); |
| 450 | + CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, &tls_cbdata); |
370 | 451 |
|
371 | | - // DRIVER EXIT callback |
372 | | - cbdata.callbackSite = CUPTI_API_EXIT; |
| 452 | + tls_cbdata.callbackSite = CUPTI_API_EXIT; |
373 | 453 | parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API, |
374 | | - CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, &cbdata); |
| 454 | + CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, &tls_cbdata); |
375 | 455 |
|
376 | 456 | // RUNTIME EXIT callback |
377 | | - cbdata.callbackSite = CUPTI_API_EXIT; |
378 | | - parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API, cbid, &cbdata); |
| 457 | + tls_cbdata.callbackSite = CUPTI_API_EXIT; |
| 458 | + parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API, |
| 459 | + CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000, &tls_cbdata); |
379 | 460 |
|
380 | | - // Enqueue this correlation ID for activity generation (unless simulating dropped activities) |
381 | | - if (should_generate_activities) { |
382 | | - enqueue_launched_kernel(correlationId); |
| 461 | + // Enqueue for activity generation |
| 462 | + if (tls_should_generate_activities) { |
| 463 | + enqueue_launched_kernel(tls_correlation_id); |
383 | 464 | } |
| 465 | + |
| 466 | + return cudaSuccess; |
384 | 467 | } |
385 | 468 |
|
386 | | -void simulate_driver_kernel_launch(uint32_t correlationId, CUpti_CallbackId cbid, bool should_generate_activities) { |
387 | | - CUpti_CallbackData cbdata = {0}; |
| 469 | +//----------------------------------------------------------------------------- |
| 470 | +// Simulation functions that call through the fake CUDA APIs |
| 471 | +//----------------------------------------------------------------------------- |
388 | 472 |
|
| 473 | +void simulate_runtime_kernel_launch(uint32_t correlationId, CUpti_CallbackId cbid, bool should_generate_activities) { |
389 | 474 | if (!parcagpuCuptiCallback) { |
390 | 475 | fprintf(stderr, "ERROR: parcagpuCuptiCallback is NULL!\n"); |
391 | 476 | return; |
392 | 477 | } |
393 | 478 |
|
394 | | - // DRIVER ENTER callback |
395 | | - cbdata.callbackSite = CUPTI_API_ENTER; |
396 | | - cbdata.correlationId = correlationId; |
397 | | - parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API, cbid, &cbdata); |
| 479 | + // Set up thread-local data for the fake CUDA API functions |
| 480 | + tls_correlation_id = correlationId; |
| 481 | + memset(&tls_cbdata, 0, sizeof(tls_cbdata)); |
| 482 | + tls_should_generate_activities = should_generate_activities; |
398 | 483 |
|
399 | | - // DRIVER EXIT callback |
400 | | - cbdata.callbackSite = CUPTI_API_EXIT; |
401 | | - parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API, cbid, &cbdata); |
| 484 | + if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000) { |
| 485 | + // Graph launch - call through cudaGraphLaunch |
| 486 | + cudaGraphLaunch(NULL, NULL); |
| 487 | + } else { |
| 488 | + // Kernel launch - call through cudaLaunchKernel |
| 489 | + dim3 grid = {1, 1, 1}; |
| 490 | + dim3 block = {1, 1, 1}; |
| 491 | + cudaLaunchKernel(NULL, grid, block, NULL, 0, NULL); |
| 492 | + // Enqueue for activity generation |
| 493 | + if (should_generate_activities) { |
| 494 | + enqueue_launched_kernel(correlationId); |
| 495 | + } |
| 496 | + } |
| 497 | +} |
| 498 | + |
| 499 | +void simulate_driver_kernel_launch(uint32_t correlationId, CUpti_CallbackId cbid, bool should_generate_activities) { |
| 500 | + if (!parcagpuCuptiCallback) { |
| 501 | + fprintf(stderr, "ERROR: parcagpuCuptiCallback is NULL!\n"); |
| 502 | + return; |
| 503 | + } |
402 | 504 |
|
403 | | - // Enqueue this correlation ID for activity generation (unless simulating dropped activities) |
404 | | - if (should_generate_activities) { |
405 | | - enqueue_launched_kernel(correlationId); |
| 505 | + // Set up thread-local data for the fake CUDA API functions |
| 506 | + tls_correlation_id = correlationId; |
| 507 | + memset(&tls_cbdata, 0, sizeof(tls_cbdata)); |
| 508 | + tls_should_generate_activities = should_generate_activities; |
| 509 | + |
| 510 | + if (cbid == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch) { |
| 511 | + // Graph launch - call through cuGraphLaunch |
| 512 | + cuGraphLaunch(NULL, NULL); |
| 513 | + } else { |
| 514 | + // Kernel launch - call through cuLaunchKernel |
| 515 | + cuLaunchKernel(NULL, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL); |
| 516 | + // Enqueue for activity generation |
| 517 | + if (should_generate_activities) { |
| 518 | + enqueue_launched_kernel(correlationId); |
| 519 | + } |
406 | 520 | } |
407 | 521 | } |
408 | 522 |
|
@@ -551,7 +665,8 @@ void *cupti_thread(void *arg) { |
551 | 665 | kernel->end = kernel->start + duration; |
552 | 666 | kernel->graphId = graph_exec_id; |
553 | 667 | kernel->graphNodeId = i; // Increment for each kernel in the graph |
554 | | - kernel->name = "graph_kernel"; |
| 668 | + kernel->name = get_next_kernel_name(args->kernel_names); |
| 669 | + if (!kernel->name) kernel->name = "mock_kernel"; |
555 | 670 |
|
556 | 671 | offset += recordSize; |
557 | 672 | } |
|
0 commit comments