@@ -33,16 +33,45 @@ static size_t outstandingEvents = 0;
3333// runtime calls)
3434static __thread uint32_t runtimeEnterCorrelationId = 0;
3535
36- // Rate limiting
37- static __thread uint64_t lastProbeTimeNs = 0;
38- static bool limiter_disabled = false;
39- #define PROBE_MIN_INTERVAL_NS 500000 // 500μs
// Rate limiting - token bucket algorithm (configurable via PARCAGPU_RATE_LIMIT).
// Refill rate in tokens per second; one token is consumed per emitted sample.
static double rateLimitPerSec = 100.0;

// Thread-local token bucket state: every thread refills and drains its own
// bucket, so the limit applies per thread.
static __thread uint64_t lastRefillNs = 0; // time of the last refill; 0 = never
static __thread double tokens = 0;         // tokens currently available

/*
 * Try to take one token from the calling thread's bucket.
 *
 * nowNs: current timestamp in nanoseconds. It is compared against the
 *        timestamp passed on the previous call from the same thread.
 *
 * Returns true if the sample should be emitted, false if rate limited.
 */
static bool rateLimiterTryAcquire(uint64_t nowNs) {
  // The bucket must be able to hold at least one whole token. Without this
  // floor, a configured rate below 1.0 would cap the bucket under 1.0 and
  // the check below could never succeed.
  const double capacity = rateLimitPerSec > 1.0 ? rateLimitPerSec : 1.0;

  if (lastRefillNs > 0) {
    // Refill tokens based on elapsed time. Skip the refill when the timestamp
    // did not advance: the unsigned subtraction would otherwise wrap around
    // and look like an enormous elapsed time.
    if (nowNs > lastRefillNs) {
      double elapsedSec = (double)(nowNs - lastRefillNs) / 1e9;
      tokens += elapsedSec * rateLimitPerSec;
    }
    if (tokens > capacity) {
      tokens = capacity;
    }
  } else {
    tokens = capacity; // First call on this thread: start with a full bucket
  }
  lastRefillNs = nowNs;

  if (tokens >= 1.0) {
    tokens -= 1.0;
    return true;
  }
  return false;
}
4063
4164static void init_debug(void) {
4265 static bool initialized = false;
4366 if (!initialized) {
4467 debug_enabled = getenv("PARCAGPU_DEBUG") != NULL;
45- limiter_disabled = getenv("PARCAGPU_LIMITER_DISABLE") != NULL;
68+ const char *rateEnv = getenv("PARCAGPU_RATE_LIMIT");
69+ if (rateEnv != NULL) {
70+ double rate = atof(rateEnv);
71+ if (rate > 0) {
72+ rateLimitPerSec = rate;
73+ }
74+ }
4675 initialized = true;
4776 }
4877}
@@ -244,29 +273,15 @@ static void parcagpuCuptiCallback(void *userdata, CUpti_CallbackDomain domain,
244273 return;
245274 }
246275
247- // Check if this is a graph launch (never rate limit these)
248- bool isGraphLaunch = false;
249- if (signedCbid < 0) {
250- // Driver API: cuGraphLaunch = 514, cuGraphLaunch_ptsz = 515
251- int driverCbid = -signedCbid;
252- isGraphLaunch = (driverCbid == 514 || driverCbid == 515);
253- } else {
254- // Runtime API: cudaGraphLaunch = 311, cudaGraphLaunch_ptsz = 312
255- isGraphLaunch = (signedCbid == 311 || signedCbid == 312);
256- }
257-
258- // Rate limit probes (skip for graph launches)
259- if (!limiter_disabled && !isGraphLaunch) {
260- struct timespec ts;
261- clock_gettime(CLOCK_MONOTONIC, &ts);
262- uint64_t nowNs = (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
263- if (nowNs - lastProbeTimeNs < PROBE_MIN_INTERVAL_NS) {
264- DEBUG_PRINTF(
265- "[CUPTI] Rate limited: skipping probe for correlationId=%u\n",
266- correlationId);
267- return;
268- }
269- lastProbeTimeNs = nowNs;
276+ // Rate limit probes
277+ struct timespec ts;
278+ clock_gettime(CLOCK_MONOTONIC, &ts);
279+ uint64_t nowNs = (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
280+ if (!rateLimiterTryAcquire(nowNs)) {
281+ DEBUG_PRINTF(
282+ "[CUPTI] Rate limited: skipping probe for correlationId=%u\n",
283+ correlationId);
284+ return;
270285 }
271286
272287 outstandingEvents++;
0 commit comments