Skip to content

Commit 0893cb9

Browse files
authored
Merge pull request #5 from parca-dev/trim
Go back to small buffer but let allocator worry about caching
2 parents 9bcd357 + 11ca3c2 commit 0893cb9

File tree

1 file changed

+65
-160
lines changed

1 file changed

+65
-160
lines changed

cupti/cupti-prof.c

Lines changed: 65 additions & 160 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,13 @@
1515
static bool debug_enabled = false;
1616

1717
// Activity buffer management
18-
static size_t activityBufferSize = 10 * 1024 * 1024;
18+
// A kernel activity is around 224 bytes, so a 128 KiB buffer
19+
// will hold ~500 activities. We want to flush regularly since
20+
// we are a continuous profiler so we don't need a huge buffer
21+
// like most CUPTI profilers. Also, a small size keeps malloc from
22+
// just going to mmap every time so the allocator should cache
23+
// and re-use these for us.
24+
static size_t activityBufferSize = 128 * 1024;
1925

2026
// Global variables
2127
static CUpti_SubscriberHandle subscriber = 0;
@@ -45,10 +51,11 @@ static void init_debug(void) {
4551
static void parcagpuCuptiCallback(void *userdata, CUpti_CallbackDomain domain,
4652
CUpti_CallbackId cbid,
4753
const CUpti_CallbackData *cbdata);
48-
static void bufferRequested(uint8_t **buffer, size_t *size,
49-
size_t *maxNumRecords);
50-
static void bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
51-
size_t size, size_t validSize);
54+
static void parcagpuBufferRequested(uint8_t **buffer, size_t *size,
55+
size_t *maxNumRecords);
56+
static void parcagpuBufferCompleted(CUcontext ctx, uint32_t streamId,
57+
uint8_t *buffer, size_t size,
58+
size_t validSize);
5259

5360
void cleanup(void);
5461

@@ -79,49 +86,35 @@ int InitializeInjection(void) {
7986
return 1; // Still return success to not break the injection
8087
}
8188

82-
// Try enabling driver API kernel launch callback like the example
83-
result = cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
84-
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel);
85-
if (result != CUPTI_SUCCESS) {
86-
const char *errstr;
87-
cuptiGetResultString(result, &errstr);
88-
fprintf(stderr, "[CUPTI] Failed to enable cuLaunchKernel callback: %s\n",
89-
errstr);
90-
}
91-
92-
// Enable runtime API callbacks for cudaLaunchKernel
93-
result = cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API,
94-
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000);
95-
if (result != CUPTI_SUCCESS) {
96-
const char *errstr;
97-
cuptiGetResultString(result, &errstr);
98-
fprintf(stderr, "[CUPTI] Failed to enable cudaLaunchKernel callback: %s\n",
99-
errstr);
100-
}
101-
102-
// Enable runtime API callbacks for cudaGraphLaunch
103-
result = cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API,
104-
CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000);
105-
if (result != CUPTI_SUCCESS) {
106-
const char *errstr;
107-
cuptiGetResultString(result, &errstr);
108-
fprintf(stderr, "[CUPTI] Failed to enable cudaGraphLaunch callback: %s\n",
109-
errstr);
110-
}
111-
112-
// Enable runtime API callbacks for cudaGraphLaunch
113-
result =
114-
cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API,
115-
CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000);
116-
if (result != CUPTI_SUCCESS) {
117-
const char *errstr;
118-
cuptiGetResultString(result, &errstr);
119-
fprintf(stderr, "[CUPTI] Failed to enable cudaGraphLaunch callback: %s\n",
120-
errstr);
89+
// Enable all runtime API kernel launch callbacks
90+
CUpti_CallbackId launchCallbacks[] = {
91+
CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020,
92+
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000,
93+
CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_ptsz_v7000,
94+
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_ptsz_v7000,
95+
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060,
96+
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_ptsz_v11060,
97+
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000,
98+
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_ptsz_v9000,
99+
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000,
100+
CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000,
101+
CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000,
102+
};
103+
for (size_t i = 0; i < sizeof(launchCallbacks) / sizeof(launchCallbacks[0]);
104+
i++) {
105+
result = cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API,
106+
launchCallbacks[i]);
107+
if (result != CUPTI_SUCCESS) {
108+
const char *errstr;
109+
cuptiGetResultString(result, &errstr);
110+
fprintf(stderr, "[CUPTI] Failed to enable runtime callback %d: %s\n",
111+
launchCallbacks[i], errstr);
112+
}
121113
}
122114

123115
// Register activity buffer callbacks
124-
result = cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted);
116+
result = cuptiActivityRegisterCallbacks(parcagpuBufferRequested,
117+
parcagpuBufferCompleted);
125118
if (result != CUPTI_SUCCESS) {
126119
const char *errstr;
127120
cuptiGetResultString(result, &errstr);
@@ -130,7 +123,6 @@ int InitializeInjection(void) {
130123
return 1; // Still return success to not break the injection
131124
}
132125

133-
// Enable multiple kernel activity recording types
134126
result = cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL);
135127
if (result != CUPTI_SUCCESS) {
136128
const char *errstr;
@@ -141,54 +133,11 @@ int InitializeInjection(void) {
141133
DEBUG_PRINTF("[CUPTI] Enabled CONCURRENT_KERNEL activity\n");
142134
}
143135

144-
// This activity kind serializes execution and gives me errors on a T4:
145-
// CUPTI_ERROR_NOT_COMPATIBLE But its not a fatal error so do it anyways
146-
// result = cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL);
147-
// if (result != CUPTI_SUCCESS) {
148-
// const char *errstr;
149-
// cuptiGetResultString(result, &errstr);
150-
// fprintf(stderr, "[CUPTI] Failed to enable kernel activity: %s\n",
151-
// errstr);
152-
// } else {
153-
// DEBUG_PRINTF("[CUPTI] Enabled KERNEL activity\n");
154-
// }
155-
156-
// Also try enabling runtime activities
157-
// result = cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME);
158-
// if (result != CUPTI_SUCCESS) {
159-
// const char *errstr;
160-
// cuptiGetResultString(result, &errstr);
161-
// fprintf(stderr, "[CUPTI] Failed to enable runtime activity: %s\n",
162-
// errstr);
163-
// } else {
164-
// DEBUG_PRINTF("[CUPTI] Enabled RUNTIME activity\n");
165-
// }
166-
167-
// Try enabling graph activities
168-
// result = cuptiActivityEnable(CUPTI_ACTIVITY_KIND_GRAPH_TRACE);
169-
// if (result != CUPTI_SUCCESS) {
170-
// const char *errstr;
171-
// cuptiGetResultString(result, &errstr);
172-
// fprintf(stderr, "[CUPTI] Failed to enable graph trace activity: %s\n",
173-
// errstr);
174-
// } else {
175-
// DEBUG_PRINTF("[CUPTI] Enabled GRAPH_TRACE activity\n");
176-
// }
177-
178136
atexit(cleanup);
179137

180138
DEBUG_PRINTF("[CUPTI] Successfully initialized CUPTI callbacks with external "
181139
"correlation and activity API\n");
182140

183-
// NOTE: If automatic flush still doesn't work, you can implement manual
184-
// periodic flushing:
185-
// 1. Create a background thread that calls cuptiActivityFlushAll(0)
186-
// periodically
187-
// 2. Or call cuptiActivityFlushAll(0) from your application at regular
188-
// intervals
189-
// 3. Or hook into CUDA synchronization points (cudaDeviceSynchronize, etc.)
190-
// to flush
191-
192141
return 1;
193142
}
194143

@@ -210,45 +159,27 @@ static void print_backtrace(const char *prefix) {
210159
}
211160
}
212161

213-
// Callback handler for both runtime and driver API
162+
// Callback handler for runtime API
214163
static void parcagpuCuptiCallback(void *userdata, CUpti_CallbackDomain domain,
215164
CUpti_CallbackId cbid,
216165
const CUpti_CallbackData *cbdata) {
217-
if (domain == CUPTI_CB_DOMAIN_RUNTIME_API) {
218-
// We hook on EXIT because that makes our probe overhead not add to GPU
219-
// launch latency and hopefully covers some of the overhead in the shadow of
220-
// GPU async work.
221-
if (cbdata->callbackSite == CUPTI_API_EXIT) {
222-
// Probablistic gate should go here.
223-
uint32_t correlationId = cbdata->correlationId;
224-
// Call stub functions for uprobe attachment
225-
const char *name = cbdata->functionName;
226-
switch (cbid) {
227-
case CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000:
228-
case CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060:
229-
if (cbdata->symbolName) {
230-
DEBUG_PRINTF("----------- %s\n", cbdata->symbolName);
231-
name = cbdata->symbolName;
232-
}
233-
case CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000:
234-
case CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000:
235-
DEBUG_PRINTF("[CUPTI] Runtime API callback: cbid=%d, correlationId=%u, "
236-
"func=%s\n",
237-
cbid, correlationId, cbdata->functionName);
238-
outstandingEvents++;
239-
DTRACE_PROBE3(parcagpu, cuda_correlation, correlationId, cbid, name);
240-
break;
241-
default:
242-
// Debug: print any other runtime API callback we see with backtrace
243-
DEBUG_PRINTF(
244-
"[CUPTI] Other Runtime API callback: cbid=%d, correlationId=%u\n",
245-
cbid, correlationId);
246-
// Print backtrace to see who's calling this
247-
if (debug_enabled) {
248-
print_backtrace("[CUPTI]");
249-
}
250-
}
251-
}
166+
if (domain != CUPTI_CB_DOMAIN_RUNTIME_API) {
167+
return;
168+
}
169+
170+
// We hook on EXIT because that makes our probe overhead not add to GPU
171+
// launch latency and hopefully covers some of the overhead in the shadow of
172+
// GPU async work.
173+
if (cbdata->callbackSite == CUPTI_API_EXIT) {
174+
uint32_t correlationId = cbdata->correlationId;
175+
const char *name =
176+
cbdata->symbolName ? cbdata->symbolName : cbdata->functionName;
177+
178+
DEBUG_PRINTF(
179+
"[CUPTI] Runtime API callback: cbid=%d, correlationId=%u, func=%s\n",
180+
cbid, correlationId, name);
181+
outstandingEvents++;
182+
DTRACE_PROBE3(parcagpu, cuda_correlation, correlationId, cbid, name);
252183
}
253184
// If we let too many events pile up it overwhelms the perf_event buffers,
254185
// just another reason to explore just passing the activity buffer through to
@@ -257,13 +188,13 @@ static void parcagpuCuptiCallback(void *userdata, CUpti_CallbackDomain domain,
257188
DEBUG_PRINTF("[CUPTI] Flushing: outstandingEvents=%zu\n",
258189
outstandingEvents);
259190
cuptiActivityFlushAll(0);
191+
outstandingEvents = 0;
260192
}
261193
}
262194

263195
// Buffer request callback
264-
static void bufferRequested(uint8_t **buffer, size_t *size,
265-
size_t *maxNumRecords) {
266-
// Allocate 64MB buffer aligned to 8 bytes
196+
static void parcagpuBufferRequested(uint8_t **buffer, size_t *size,
197+
size_t *maxNumRecords) {
267198
*buffer = (uint8_t *)aligned_alloc(8, activityBufferSize);
268199
*size = activityBufferSize;
269200
*maxNumRecords = 0; // Let CUPTI decide
@@ -273,8 +204,9 @@ static void bufferRequested(uint8_t **buffer, size_t *size,
273204
}
274205

275206
// Buffer completion callback
276-
static void bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
277-
size_t size, size_t validSize) {
207+
static void parcagpuBufferCompleted(CUcontext ctx, uint32_t streamId,
208+
uint8_t *buffer, size_t size,
209+
size_t validSize) {
278210
CUptiResult result;
279211
CUpti_Activity *record = NULL;
280212
int recordCount = 0;
@@ -295,15 +227,8 @@ static void bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
295227
}
296228

297229
recordCount++;
298-
switch (record->kind) {
299-
case CUPTI_ACTIVITY_KIND_RUNTIME: {
300-
CUpti_ActivityAPI *r = (CUpti_ActivityAPI *)record;
301-
DEBUG_PRINTF("[CUPTI] Runtime activity: correlationId=%u, cbid=%d,\n",
302-
r->correlationId, r->cbid);
303-
break;
304-
}
305-
case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL:
306-
case CUPTI_ACTIVITY_KIND_KERNEL: {
230+
if (record->kind == CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL ||
231+
record->kind == CUPTI_ACTIVITY_KIND_KERNEL) {
307232
CUpti_ActivityKernel5 *k = (CUpti_ActivityKernel5 *)record;
308233

309234
DEBUG_PRINTF("[CUPTI] Kernel activity: graphId=%u graphNodeId=%lu "
@@ -315,26 +240,6 @@ static void bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
315240
DTRACE_PROBE8(parcagpu, kernel_executed, k->start, k->end,
316241
k->correlationId, k->deviceId, k->streamId, k->graphId,
317242
k->graphNodeId, k->name);
318-
break;
319-
}
320-
// case CUPTI_ACTIVITY_KIND_GRAPH_TRACE: {
321-
// CUpti_ActivityGraphTrace *g = (CUpti_ActivityGraphTrace *)record;
322-
323-
// DEBUG_PRINTF(
324-
// "[CUPTI] Graph activity: graphId=%u, correlationId=%u, deviceId=%u,
325-
// " "streamId=%u, start=%lu, end=%lu, duration=%lu ns\n", g->graphId,
326-
// g->correlationId, g->deviceId, g->streamId, g->start, g->end,
327-
// g->end - g->start);
328-
// // Call stub function for uprobe attachment
329-
// uint64_t devCorrelationId =
330-
// g->correlationId | ((uint64_t)g->deviceId << 32);
331-
// DTRACE_PROBE5(parcagpu, graph_executed, g->start, g->end,
332-
// devCorrelationId, g->streamId, g->graphId);
333-
// break;
334-
// }
335-
default:
336-
DEBUG_PRINTF("[CUPTI] Activity record %d: kind=%d\n", recordCount,
337-
record->kind);
338243
}
339244
}
340245

@@ -347,7 +252,7 @@ static void bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
347252
outstandingEvents = 0;
348253

349254
// Free the buffer
350-
DEBUG_PRINTF("[CUPTI:bufferCompleted] Freeing buffer %p\n", buffer);
255+
DEBUG_PRINTF("[CUPTI] Freeing buffer %p\n", buffer);
351256
free(buffer);
352257

353258
// Report any records dropped due to buffer overflow

0 commit comments

Comments
 (0)