@@ -29,13 +29,20 @@ static CUpti_SubscriberHandle subscriber = 0;
2929static size_t outstandingEvents = 0 ; // incremented just before each cuda_correlation probe fires
3030
3131// Thread-local tracking: store correlation ID from runtime ENTER
32- // so we can skip driver EXIT probe when it matches (driver calls happen under runtime calls)
32+ // so we can skip driver EXIT probe when it matches (driver calls happen under
33+ // runtime calls)
3334static __thread uint32_t runtimeEnterCorrelationId = 0 ; // starts at 0; driver-domain callbacks compare their correlationId against this
3435
36+ // Rate limiting: per thread, probes closer together than PROBE_MIN_INTERVAL_NS are dropped (graph launches are exempt)
37+ static __thread uint64_t lastProbeTimeNs = 0 ; // CLOCK_MONOTONIC timestamp (ns) of the last probe this thread let through the limiter
38+ static bool limiter_disabled = false; // set by init_debug() when PARCAGPU_LIMITER_DISABLE is present in the environment
39+ #define PROBE_MIN_INTERVAL_NS 500000 // 500μs minimum gap between rate-limited probes on one thread
40+
3541static void init_debug (void ) { // reads the environment switches on the first call only; later calls do nothing
3642 static bool initialized = false; // NOTE(review): plain flag, not atomic - confirm the first call cannot race across threads
3743 if (!initialized ) {
3844 debug_enabled = getenv ("PARCAGPU_DEBUG" ) != NULL ; // presence enables debug output; the value is not inspected
45+ limiter_disabled = getenv ("PARCAGPU_LIMITER_DISABLE" ) != NULL ; // likewise: any value, even empty, turns the rate limiter off
3946 initialized = true;
4047 }
4148}
@@ -217,9 +224,9 @@ static void parcagpuCuptiCallback(void *userdata, CUpti_CallbackDomain domain,
217224 if (domain == CUPTI_CB_DOMAIN_DRIVER_API ) {
218225 // Skip if this driver call is under a runtime call (same correlation ID)
219226 if (correlationId == runtimeEnterCorrelationId ) {
220- DEBUG_PRINTF (
221- "[CUPTI] Skipping driver EXIT correlationId=%u - runtime will handle\n" ,
222- correlationId );
227+ DEBUG_PRINTF ("[CUPTI] Skipping driver EXIT correlationId=%u - runtime "
228+ " will handle\n" ,
229+ correlationId );
223230 return ;
224231 }
225232 // Pure driver call (no runtime wrapper) - use negative cbid
@@ -237,6 +244,31 @@ static void parcagpuCuptiCallback(void *userdata, CUpti_CallbackDomain domain,
237244 return ;
238245 }
239246
247+ // Check if this is a graph launch (never rate limit these)
248+ bool isGraphLaunch = false;
249+ if (signedCbid < 0 ) {
250+ // Driver API: cuGraphLaunch = 514, cuGraphLaunch_ptsz = 515
251+ int driverCbid = - signedCbid ;
252+ isGraphLaunch = (driverCbid == 514 || driverCbid == 515 );
253+ } else {
254+ // Runtime API: cudaGraphLaunch = 311, cudaGraphLaunch_ptsz = 312
255+ isGraphLaunch = (signedCbid == 311 || signedCbid == 312 );
256+ }
257+
258+ // Rate limit probes (skip for graph launches)
259+ if (!limiter_disabled && !isGraphLaunch ) {
260+ struct timespec ts ;
261+ clock_gettime (CLOCK_MONOTONIC , & ts );
262+ uint64_t nowNs = (uint64_t )ts .tv_sec * 1000000000ULL + ts .tv_nsec ;
263+ if (nowNs - lastProbeTimeNs < PROBE_MIN_INTERVAL_NS ) {
264+ DEBUG_PRINTF (
265+ "[CUPTI] Rate limited: skipping probe for correlationId=%u\n" ,
266+ correlationId );
267+ return ;
268+ }
269+ lastProbeTimeNs = nowNs ;
270+ }
271+
240272 outstandingEvents ++ ;
241273 DTRACE_PROBE3 (parcagpu , cuda_correlation , correlationId , signedCbid , name );
242274 // If we let too many events pile up it overwhelms the perf_event buffers,
0 commit comments