@@ -16,31 +16,72 @@ typedef int (*InitializeInjectionFunc)(void);
1616// Global callback functions that will be registered by InitializeInjection
1717static void (* bufferRequestedCallback )(uint8_t * * buffer , size_t * size , size_t * maxNumRecords ) = NULL ;
1818static void (* bufferCompletedCallback )(CUcontext ctx , uint32_t streamId , uint8_t * buffer , size_t size , size_t validSize ) = NULL ;
19- static void (* runtimeApiCallback )(void * userdata , CUpti_CallbackDomain domain , CUpti_CallbackId cbid , const CUpti_CallbackData * cbdata ) = NULL ;
19+ static void (* parcagpuCuptiCallback )(void * userdata , CUpti_CallbackDomain domain , CUpti_CallbackId cbid , const CUpti_CallbackData * cbdata ) = NULL ;
2020
2121// Helper to create activity buffer with kernel records
2222static uint8_t * create_kernel_activity_buffer (size_t * validSize ,
2323 uint32_t correlationId , uint32_t deviceId ,
2424 uint32_t streamId , const char * kernelName ) {
2525 // Allocate buffer large enough for one kernel record
26- size_t bufferSize = sizeof (CUpti_ActivityKernel4 ) + 256 ;
26+ size_t bufferSize = sizeof (CUpti_ActivityKernel5 ) + 256 ;
2727 uint8_t * buffer = (uint8_t * )malloc (bufferSize );
2828 memset (buffer , 0 , bufferSize );
2929
30- CUpti_ActivityKernel4 * kernel = (CUpti_ActivityKernel4 * )buffer ;
30+ CUpti_ActivityKernel5 * kernel = (CUpti_ActivityKernel5 * )buffer ;
3131 kernel -> kind = CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL ;
3232 kernel -> correlationId = correlationId ;
3333 kernel -> deviceId = deviceId ;
3434 kernel -> streamId = streamId ;
35+ kernel -> graphId = 0 ; // Not a graph launch
36+ kernel -> graphNodeId = 0 ;
3537 kernel -> start = 1000000000UL + (correlationId * 1000000UL );
3638 kernel -> end = kernel -> start + 500000UL ;
3739
3840 // Copy kernel name
39- char * namePtr = (char * )(buffer + sizeof (CUpti_ActivityKernel4 ));
41+ char * namePtr = (char * )(buffer + sizeof (CUpti_ActivityKernel5 ));
4042 strncpy (namePtr , kernelName , 255 );
4143 kernel -> name = namePtr ;
4244
43- * validSize = sizeof (CUpti_ActivityKernel4 );
45+ * validSize = sizeof (CUpti_ActivityKernel5 );
46+ return buffer ;
47+ }
48+
49+ // Helper to create activity buffer with multiple kernel records for a graph launch
50+ // This simulates how real graph launches work: N kernel activities with the same
51+ // correlationId (matching the cudaGraphLaunch) and the same graphId
52+ static uint8_t * create_graph_kernel_activities_buffer (size_t * validSize ,
53+ uint32_t correlationId ,
54+ uint32_t deviceId ,
55+ uint32_t streamId ,
56+ uint32_t graphId ,
57+ int numKernels ) {
58+ // Allocate buffer large enough for multiple kernel records
59+ size_t recordSize = sizeof (CUpti_ActivityKernel5 );
60+ size_t namesSize = numKernels * 256 ;
61+ size_t bufferSize = (recordSize * numKernels ) + namesSize ;
62+ uint8_t * buffer = (uint8_t * )malloc (bufferSize );
63+ memset (buffer , 0 , bufferSize );
64+
65+ // Create multiple kernel records with the SAME correlationId but different graphNodeIds
66+ uint64_t baseTime = 1000000000UL + (correlationId * 1000000UL );
67+ for (int i = 0 ; i < numKernels ; i ++ ) {
68+ CUpti_ActivityKernel5 * kernel = (CUpti_ActivityKernel5 * )(buffer + (i * recordSize ));
69+ kernel -> kind = CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL ;
70+ kernel -> correlationId = correlationId ; // SAME correlationId for all kernels in the graph
71+ kernel -> deviceId = deviceId ;
72+ kernel -> streamId = streamId ;
73+ kernel -> graphId = graphId ; // Same graphId for all kernels in this graph
74+ kernel -> graphNodeId = 100 + i ; // Different node IDs
75+ kernel -> start = baseTime + (i * 100000UL ); // Slightly offset start times
76+ kernel -> end = kernel -> start + 50000UL ;
77+
78+ // Copy kernel name
79+ char * namePtr = (char * )(buffer + (numKernels * recordSize ) + (i * 256 ));
80+ snprintf (namePtr , 255 , "graph_kernel_%d" , i );
81+ kernel -> name = namePtr ;
82+ }
83+
84+ * validSize = recordSize * numKernels ;
4485 return buffer ;
4586}
4687
@@ -110,8 +151,8 @@ int main(int argc, char **argv) {
110151
111152 // Dereference to get the actual callback functions
112153 if (runtime_api_cb_ptr ) {
113- runtimeApiCallback = (void (* )(void * , CUpti_CallbackDomain , CUpti_CallbackId , const CUpti_CallbackData * ))* runtime_api_cb_ptr ;
114- fprintf (stderr , "Got runtime callback: %p\n" , (void * )runtimeApiCallback );
154+ parcagpuCuptiCallback = (void (* )(void * , CUpti_CallbackDomain , CUpti_CallbackId , const CUpti_CallbackData * ))* runtime_api_cb_ptr ;
155+ fprintf (stderr , "Got runtime callback: %p\n" , (void * )parcagpuCuptiCallback );
115156 }
116157 if (buffer_requested_cb_ptr ) {
117158 bufferRequestedCallback = (void (* )(uint8_t * * , size_t * , size_t * ))* buffer_requested_cb_ptr ;
@@ -122,7 +163,7 @@ int main(int argc, char **argv) {
122163 }
123164
124165 // Check if we have the callback pointers
125- if (!runtimeApiCallback || !bufferCompletedCallback ) {
166+ if (!parcagpuCuptiCallback || !bufferCompletedCallback ) {
126167 fprintf (stderr , "Warning: Could not get callback pointers from mock CUPTI.\n" );
127168 fprintf (stderr , "Test will run but won't be able to simulate full callback flow.\n" );
128169 fprintf (stderr , "The library is loaded and InitializeInjection was called successfully.\n" );
@@ -137,8 +178,10 @@ int main(int argc, char **argv) {
137178
138179 // Now simulate CUPTI callbacks
139180 fprintf (stderr , "\n=== Starting test simulation (1000 events/second) ===\n" );
181+ fprintf (stderr , "Graph launches will have 3 kernel activities each with the same correlationId\n" );
140182
141183 uint32_t correlationId = 1 ;
184+ uint32_t graphId = 1000 ; // Start with graphId 1000
142185 struct timespec sleep_time = {0 , 1000000 }; // 1ms sleep = 1000 events/second
143186
144187 for (int i = 0 ; run_forever || i < 100 ; i ++ ) {
@@ -149,13 +192,15 @@ int main(int argc, char **argv) {
149192 cbdata .correlationId = correlationId ;
150193
151194 // Every third correlationId is a graph launch; all others are regular kernel launches
152- if (correlationId % 2 == 0 ) {
153- runtimeApiCallback (NULL , CUPTI_CB_DOMAIN_RUNTIME_API ,
154- CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000 ,
195+ if (correlationId % 3 == 0 ) {
196+ // Graph launch - this will have multiple kernel activities with same correlationId
197+ parcagpuCuptiCallback (NULL , CUPTI_CB_DOMAIN_RUNTIME_API ,
198+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000 ,
155199 & cbdata );
156200 } else {
157- runtimeApiCallback (NULL , CUPTI_CB_DOMAIN_RUNTIME_API ,
158- CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000 ,
201+ // Regular kernel launch - this will have one kernel activity
202+ parcagpuCuptiCallback (NULL , CUPTI_CB_DOMAIN_RUNTIME_API ,
203+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000 ,
159204 & cbdata );
160205 }
161206
@@ -169,11 +214,14 @@ int main(int argc, char **argv) {
169214 uint8_t * buffer ;
170215 size_t validSize ;
171216
172- if (recCorrelationId % 2 == 0 ) {
173- buffer = create_kernel_activity_buffer (& validSize , recCorrelationId , 0 , 1 , "mock_cuda_kernel_name" );
217+ if (recCorrelationId % 3 == 0 ) {
218+ // Graph launch: create buffer with 3 kernel activities sharing the same correlationId
219+ buffer = create_graph_kernel_activities_buffer (& validSize , recCorrelationId , 0 , 1 , graphId , 3 );
174220 bufferCompletedCallback (NULL , 1 , buffer , 32 * 1024 , validSize );
221+ graphId ++ ; // Different graphId for next graph
175222 } else {
176- buffer = create_graph_activity_buffer (& validSize , recCorrelationId , 0 , 1 , recCorrelationId / 2 );
223+ // Regular kernel launch: single kernel activity
224+ buffer = create_kernel_activity_buffer (& validSize , recCorrelationId , 0 , 1 , "mock_cuda_kernel_name" );
177225 bufferCompletedCallback (NULL , 1 , buffer , 32 * 1024 , validSize );
178226 }
179227
0 commit comments