Skip to content

Commit 9dba2bf

Browse files
committed
Use better name for callback and fix mock
1 parent 2510145 commit 9dba2bf

File tree

3 files changed

+69
-20
lines changed

3 files changed

+69
-20
lines changed

cupti/cupti-prof.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ static void init_debug(void) {
5050
} while (0)
5151

5252
// Forward declarations
53-
static void runtimeApiCallback(void *userdata, CUpti_CallbackDomain domain,
53+
static void parcagpuCuptiCallback(void *userdata, CUpti_CallbackDomain domain,
5454
CUpti_CallbackId cbid,
5555
const CUpti_CallbackData *cbdata);
5656
static void bufferRequested(uint8_t **buffer, size_t *size,
@@ -79,7 +79,7 @@ int InitializeInjection(void) {
7979

8080
// Try to subscribe to callbacks
8181
result =
82-
cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)runtimeApiCallback, NULL);
82+
cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)parcagpuCuptiCallback, NULL);
8383
if (result != CUPTI_SUCCESS) {
8484
const char *errstr;
8585
cuptiGetResultString(result, &errstr);
@@ -219,7 +219,7 @@ static void print_backtrace(const char *prefix) {
219219
}
220220

221221
// Callback handler for both runtime and driver API
222-
static void runtimeApiCallback(void *userdata, CUpti_CallbackDomain domain,
222+
static void parcagpuCuptiCallback(void *userdata, CUpti_CallbackDomain domain,
223223
CUpti_CallbackId cbid,
224224
const CUpti_CallbackData *cbdata) {
225225
if (domain == CUPTI_CB_DOMAIN_RUNTIME_API) {

test/mock_cupti.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,8 @@ CUptiResult cuptiActivityGetNextRecord(uint8_t *buffer,
105105
switch (activity->kind) {
106106
case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL:
107107
case CUPTI_ACTIVITY_KIND_KERNEL:
108-
recordSize = sizeof(CUpti_ActivityKernel4);
108+
// Use CUpti_ActivityKernel5 which includes graphId and graphNodeId
109+
recordSize = sizeof(CUpti_ActivityKernel5);
109110
break;
110111
case CUPTI_ACTIVITY_KIND_GRAPH_TRACE:
111112
recordSize = sizeof(CUpti_ActivityGraphTrace);

test/test_cupti_prof.c

Lines changed: 64 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,31 +16,72 @@ typedef int (*InitializeInjectionFunc)(void);
1616
// Global callback functions that will be registered by InitializeInjection
1717
static void (*bufferRequestedCallback)(uint8_t **buffer, size_t *size, size_t *maxNumRecords) = NULL;
1818
static void (*bufferCompletedCallback)(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize) = NULL;
19-
static void (*runtimeApiCallback)(void *userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata) = NULL;
19+
static void (*parcagpuCuptiCallback)(void *userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata) = NULL;
2020

2121
// Helper to create activity buffer with kernel records
2222
static uint8_t *create_kernel_activity_buffer(size_t *validSize,
2323
uint32_t correlationId, uint32_t deviceId,
2424
uint32_t streamId, const char *kernelName) {
2525
// Allocate buffer large enough for one kernel record
26-
size_t bufferSize = sizeof(CUpti_ActivityKernel4) + 256;
26+
size_t bufferSize = sizeof(CUpti_ActivityKernel5) + 256;
2727
uint8_t *buffer = (uint8_t *)malloc(bufferSize);
2828
memset(buffer, 0, bufferSize);
2929

30-
CUpti_ActivityKernel4 *kernel = (CUpti_ActivityKernel4 *)buffer;
30+
CUpti_ActivityKernel5 *kernel = (CUpti_ActivityKernel5 *)buffer;
3131
kernel->kind = CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL;
3232
kernel->correlationId = correlationId;
3333
kernel->deviceId = deviceId;
3434
kernel->streamId = streamId;
35+
kernel->graphId = 0; // Not a graph launch
36+
kernel->graphNodeId = 0;
3537
kernel->start = 1000000000UL + (correlationId * 1000000UL);
3638
kernel->end = kernel->start + 500000UL;
3739

3840
// Copy kernel name
39-
char *namePtr = (char *)(buffer + sizeof(CUpti_ActivityKernel4));
41+
char *namePtr = (char *)(buffer + sizeof(CUpti_ActivityKernel5));
4042
strncpy(namePtr, kernelName, 255);
4143
kernel->name = namePtr;
4244

43-
*validSize = sizeof(CUpti_ActivityKernel4);
45+
*validSize = sizeof(CUpti_ActivityKernel5);
46+
return buffer;
47+
}
48+
49+
// Helper to create activity buffer with multiple kernel records for a graph launch
50+
// This simulates how real graph launches work: N kernel activities with the same
51+
// correlationId (matching the cudaGraphLaunch) and the same graphId
52+
static uint8_t *create_graph_kernel_activities_buffer(size_t *validSize,
53+
uint32_t correlationId,
54+
uint32_t deviceId,
55+
uint32_t streamId,
56+
uint32_t graphId,
57+
int numKernels) {
58+
// Allocate buffer large enough for multiple kernel records
59+
size_t recordSize = sizeof(CUpti_ActivityKernel5);
60+
size_t namesSize = numKernels * 256;
61+
size_t bufferSize = (recordSize * numKernels) + namesSize;
62+
uint8_t *buffer = (uint8_t *)malloc(bufferSize);
63+
memset(buffer, 0, bufferSize);
64+
65+
// Create multiple kernel records with the SAME correlationId but different graphNodeIds
66+
uint64_t baseTime = 1000000000UL + (correlationId * 1000000UL);
67+
for (int i = 0; i < numKernels; i++) {
68+
CUpti_ActivityKernel5 *kernel = (CUpti_ActivityKernel5 *)(buffer + (i * recordSize));
69+
kernel->kind = CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL;
70+
kernel->correlationId = correlationId; // SAME correlationId for all kernels in the graph
71+
kernel->deviceId = deviceId;
72+
kernel->streamId = streamId;
73+
kernel->graphId = graphId; // Same graphId for all kernels in this graph
74+
kernel->graphNodeId = 100 + i; // Different node IDs
75+
kernel->start = baseTime + (i * 100000UL); // Slightly offset start times
76+
kernel->end = kernel->start + 50000UL;
77+
78+
// Copy kernel name
79+
char *namePtr = (char *)(buffer + (numKernels * recordSize) + (i * 256));
80+
snprintf(namePtr, 255, "graph_kernel_%d", i);
81+
kernel->name = namePtr;
82+
}
83+
84+
*validSize = recordSize * numKernels;
4485
return buffer;
4586
}
4687

@@ -110,8 +151,8 @@ int main(int argc, char **argv) {
110151

111152
// Dereference to get the actual callback functions
112153
if (runtime_api_cb_ptr) {
113-
runtimeApiCallback = (void (*)(void *, CUpti_CallbackDomain, CUpti_CallbackId, const CUpti_CallbackData *))*runtime_api_cb_ptr;
114-
fprintf(stderr, "Got runtime callback: %p\n", (void *)runtimeApiCallback);
154+
parcagpuCuptiCallback = (void (*)(void *, CUpti_CallbackDomain, CUpti_CallbackId, const CUpti_CallbackData *))*runtime_api_cb_ptr;
155+
fprintf(stderr, "Got runtime callback: %p\n", (void *)parcagpuCuptiCallback);
115156
}
116157
if (buffer_requested_cb_ptr) {
117158
bufferRequestedCallback = (void (*)(uint8_t **, size_t *, size_t *))*buffer_requested_cb_ptr;
@@ -122,7 +163,7 @@ int main(int argc, char **argv) {
122163
}
123164

124165
// Check if we have the callback pointers
125-
if (!runtimeApiCallback || !bufferCompletedCallback) {
166+
if (!parcagpuCuptiCallback || !bufferCompletedCallback) {
126167
fprintf(stderr, "Warning: Could not get callback pointers from mock CUPTI.\n");
127168
fprintf(stderr, "Test will run but won't be able to simulate full callback flow.\n");
128169
fprintf(stderr, "The library is loaded and InitializeInjection was called successfully.\n");
@@ -137,8 +178,10 @@ int main(int argc, char **argv) {
137178

138179
// Now simulate CUPTI callbacks
139180
fprintf(stderr, "\n=== Starting test simulation (1000 events/second) ===\n");
181+
fprintf(stderr, "Graph launches will have 3 kernel activities each with the same correlationId\n");
140182

141183
uint32_t correlationId = 1;
184+
uint32_t graphId = 1000; // Start with graphId 1000
142185
struct timespec sleep_time = {0, 1000000}; // 1ms sleep = 1000 events/second
143186

144187
for (int i = 0; run_forever || i < 100; i++) {
@@ -149,13 +192,15 @@ int main(int argc, char **argv) {
149192
cbdata.correlationId = correlationId;
150193

151194
// Every third correlationId is a graph launch; all others are regular kernel launches
152-
if (correlationId % 2 == 0) {
153-
runtimeApiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API,
154-
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000,
195+
if (correlationId % 3 == 0) {
196+
// Graph launch - this will have multiple kernel activities with same correlationId
197+
parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API,
198+
CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000,
155199
&cbdata);
156200
} else {
157-
runtimeApiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API,
158-
CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000,
201+
// Regular kernel launch - this will have one kernel activity
202+
parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API,
203+
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000,
159204
&cbdata);
160205
}
161206

@@ -169,11 +214,14 @@ int main(int argc, char **argv) {
169214
uint8_t *buffer;
170215
size_t validSize;
171216

172-
if (recCorrelationId % 2 == 0) {
173-
buffer = create_kernel_activity_buffer(&validSize, recCorrelationId, 0, 1, "mock_cuda_kernel_name");
217+
if (recCorrelationId % 3 == 0) {
218+
// Graph launch: create buffer with 3 kernel activities sharing the same correlationId
219+
buffer = create_graph_kernel_activities_buffer(&validSize, recCorrelationId, 0, 1, graphId, 3);
174220
bufferCompletedCallback(NULL, 1, buffer, 32 * 1024, validSize);
221+
graphId++; // Different graphId for next graph
175222
} else {
176-
buffer = create_graph_activity_buffer(&validSize, recCorrelationId, 0, 1, recCorrelationId / 2);
223+
// Regular kernel launch: single kernel activity
224+
buffer = create_kernel_activity_buffer(&validSize, recCorrelationId, 0, 1, "mock_cuda_kernel_name");
177225
bufferCompletedCallback(NULL, 1, buffer, 32 * 1024, validSize);
178226
}
179227

0 commit comments

Comments
 (0)