Skip to content

Commit 6a0be2a

Browse files
committed
Use the one-big-buffer policy that Proton uses; this should fix leaks
Don't print anything to stderr unless debugging is enabled. Fix test script.
1 parent ac7534c commit 6a0be2a

File tree

3 files changed

+33
-66
lines changed

3 files changed

+33
-66
lines changed

cupti/cupti-prof.c

Lines changed: 11 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,7 @@
1515
static bool debug_enabled = false;
1616

1717
// Activity buffer management
18-
#define NUM_BUFFERS 16
19-
typedef struct {
20-
uint8_t *buffer;
21-
int in_use; // 1 if CUPTI owns it, 0 if available
22-
} BufferInfo;
23-
24-
static BufferInfo activityBuffers[NUM_BUFFERS] = {{NULL, 0}};
25-
26-
static size_t activityBufferSize = 32 * 1024;
18+
static size_t activityBufferSize = 64 * 1024 * 1024; // 64MB
2719

2820
// Global variables
2921
static CUpti_SubscriberHandle subscriber = 0;
@@ -62,7 +54,7 @@ void cleanup(void);
6254

6355
// CUPTI initialization function required for CUDA_INJECTION64_PATH
6456
int InitializeInjection(void) {
65-
fprintf(stderr, "[CUPTI] InitializeInjection called\n");
57+
DEBUG_PRINTF("[CUPTI] InitializeInjection called\n");
6658
CUptiResult result;
6759

6860
// Set flush period BEFORE enabling activities (in milliseconds)
@@ -269,37 +261,13 @@ static void parcagpuCuptiCallback(void *userdata, CUpti_CallbackDomain domain,
269261
// Buffer request callback
270262
static void bufferRequested(uint8_t **buffer, size_t *size,
271263
size_t *maxNumRecords) {
272-
// Find an available buffer that's not in use
273-
for (int i = 0; i < NUM_BUFFERS; i++) {
274-
if (!activityBuffers[i].in_use) {
275-
// Allocate if needed
276-
if (activityBuffers[i].buffer == NULL) {
277-
activityBuffers[i].buffer = (uint8_t *)malloc(activityBufferSize);
278-
DEBUG_PRINTF("[CUPTI:bufferRequested] Allocated new buffer[%d] at %p\n",
279-
i, activityBuffers[i].buffer);
280-
}
281-
282-
// Mark as in use and return it
283-
activityBuffers[i].in_use = 1;
284-
*buffer = activityBuffers[i].buffer;
285-
*size = activityBufferSize;
286-
*maxNumRecords = 0; // Let CUPTI decide
287-
288-
DEBUG_PRINTF("[CUPTI:bufferRequested] Giving buffer[%d]=%p to CUPTI "
289-
"(marked in_use)\n",
290-
i, *buffer);
291-
return;
292-
}
293-
}
294-
295-
// All buffers are in use - this shouldn't happen with enough buffers
296-
// Allocate a temporary buffer that won't be reused
297-
DEBUG_PRINTF("[CUPTI:bufferRequested] ERROR: All %d buffers in use! "
298-
"Allocating temporary buffer\n",
299-
NUM_BUFFERS);
300-
*buffer = (uint8_t *)malloc(activityBufferSize);
264+
// Allocate 64MB buffer aligned to 8 bytes
265+
*buffer = (uint8_t *)aligned_alloc(8, activityBufferSize);
301266
*size = activityBufferSize;
302-
*maxNumRecords = 0;
267+
*maxNumRecords = 0; // Let CUPTI decide
268+
269+
DEBUG_PRINTF("[CUPTI:bufferRequested] Allocated buffer %p, size=%zu\n",
270+
*buffer, *size);
303271
}
304272

305273
// Buffer completion callback
@@ -373,16 +341,9 @@ static void bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
373341

374342
outstandingEvents -= recordCount;
375343

376-
// Mark the buffer as available for reuse
377-
for (int i = 0; i < NUM_BUFFERS; i++) {
378-
if (activityBuffers[i].buffer == buffer) {
379-
activityBuffers[i].in_use = 0;
380-
DEBUG_PRINTF("[CUPTI:bufferCompleted] Buffer[%d]=%p marked as available "
381-
"(not in_use)\n",
382-
i, buffer);
383-
break;
384-
}
385-
}
344+
// Free the buffer
345+
DEBUG_PRINTF("[CUPTI:bufferCompleted] Freeing buffer %p\n", buffer);
346+
free(buffer);
386347

387348
// Report any records dropped due to buffer overflow
388349
size_t dropped;
@@ -412,14 +373,5 @@ void cleanup(void) {
412373
subscriber = 0;
413374
}
414375

415-
// Free all activity buffers
416-
for (int i = 0; i < NUM_BUFFERS; i++) {
417-
if (activityBuffers[i].buffer) {
418-
free(activityBuffers[i].buffer);
419-
activityBuffers[i].buffer = NULL;
420-
activityBuffers[i].in_use = 0;
421-
}
422-
}
423-
424376
DEBUG_PRINTF("[CUPTI] Cleanup completed\n");
425377
}

test.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ cd "$(dirname "$0")"
77
# Parse arguments
88
USE_BPFTRACE=0
99
ARCH="${ARCH:-amd64}"
10+
CUDA_MAJOR="${CUDA_MAJOR:-12}"
1011
for arg in "$@"; do
1112
case $arg in
1213
--bpftrace)
@@ -51,7 +52,7 @@ echo "=== Running test program ==="
5152
export LD_LIBRARY_PATH="$(pwd)/zig-out/lib:$LD_LIBRARY_PATH"
5253
export PARCAGPU_DEBUG=1
5354
# Use the CMake-built library with real CUPTI
54-
zig-out/bin/test_cupti_prof build/$ARCH/libparcagpucupti.so "$@"
55+
zig-out/bin/test_cupti_prof build/$CUDA_MAJOR/$ARCH/libparcagpucupti.so "$@"
5556

5657
# If bpftrace was started, stop it and show results
5758
if [ "$USE_BPFTRACE" -eq 1 ]; then

test/test_cupti_prof.c

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -211,21 +211,35 @@ int main(int argc, char **argv) {
211211
if (i % 2 == 0 && i > 0) {
212212
for (int k = 0; k < 5; k++) {
213213
uint32_t recCorrelationId = correlationId - 10 + k;
214-
uint8_t *buffer;
214+
uint8_t *activityData;
215215
size_t validSize;
216216

217+
// Create activity data (this is temporary, not the actual buffer)
217218
if (recCorrelationId % 3 == 0) {
218219
// Graph launch: create buffer with 3 kernel activities sharing the same correlationId
219-
buffer = create_graph_kernel_activities_buffer(&validSize, recCorrelationId, 0, 1, graphId, 3);
220-
bufferCompletedCallback(NULL, 1, buffer, 32 * 1024, validSize);
220+
activityData = create_graph_kernel_activities_buffer(&validSize, recCorrelationId, 0, 1, graphId, 3);
221221
graphId++; // Different graphId for next graph
222222
} else {
223223
// Regular kernel launch: single kernel activity
224-
buffer = create_kernel_activity_buffer(&validSize, recCorrelationId, 0, 1, "mock_cuda_kernel_name");
225-
bufferCompletedCallback(NULL, 1, buffer, 32 * 1024, validSize);
224+
activityData = create_kernel_activity_buffer(&validSize, recCorrelationId, 0, 1, "mock_cuda_kernel_name");
225+
}
226+
227+
// Request a buffer from the callback (this will allocate it properly)
228+
uint8_t *buffer;
229+
size_t bufferSize;
230+
size_t maxNumRecords;
231+
if (bufferRequestedCallback) {
232+
bufferRequestedCallback(&buffer, &bufferSize, &maxNumRecords);
233+
234+
// Copy the activity data into the buffer
235+
memcpy(buffer, activityData, validSize);
236+
237+
// Now call bufferCompleted with the proper buffer (it will free it)
238+
bufferCompletedCallback(NULL, 1, buffer, bufferSize, validSize);
226239
}
227240

228-
free(buffer);
241+
// Free the temporary activity data
242+
free(activityData);
229243
}
230244
}
231245

0 commit comments

Comments
 (0)