Skip to content

Commit 1853a24

Browse files
nishitnshah (Nishit Shah)
authored and committed
Optimize seqlock and utilization watcher to prevent random 256MB allocation slowdowns
Root cause: Random 20x slowdowns (12.734ms vs 0.586ms) for 256MB allocations when all 8 processes allocate simultaneously. Two issues: 1. Seqlock retry storm: When all 8 processes write to their slots, readers see writers active (seqlock odd) and spin in tight loop, causing CPU contention. 2. Utilization watcher contention: The utilization_watcher thread held lock_shrreg() during slow NVML queries (nvmlDeviceGetComputeRunningProcesses, nvmlDeviceGetProcessUtilization), blocking shared memory operations. Fixes: 1. Seqlock exponential backoff: - Removed stale data fallback (memory checks require accurate data) - Progressive delays: CPU pause → 1μs → 10μs → 100μs - Prevents tight spinning while ensuring accurate reads 2. Utilization watcher optimization: - Moved NVML queries OUTSIDE lock_shrreg() - Lock now only held briefly to update shared memory - Reduces lock hold time from milliseconds to microseconds Impact: Should eliminate random 256MB allocation slowdowns by reducing seqlock contention and utilization watcher blocking. Signed-off-by: Nishit Shah <nishshah@linkedin.com>
1 parent 4cc123b commit 1853a24

File tree

2 files changed

+45
-31
lines changed

2 files changed

+45
-31
lines changed

src/multiprocess/multiprocess_memory_limit.c

Lines changed: 26 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -285,27 +285,40 @@ size_t get_gpu_memory_usage(const int dev) {
285285
uint64_t proc_usage;
286286
uint64_t seq1, seq2;
287287
int retry_count = 0;
288-
const int MAX_RETRIES = 100;
289288

290289
// Seqlock read protocol: retry until we get a consistent snapshot
290+
// CRITICAL: Memory checks require accurate data, cannot use stale reads
291291
do {
292292
// Read sequence number (must be even = no write in progress)
293293
seq1 = atomic_load_explicit(&slot->seqlock, memory_order_acquire);
294294

295-
// If odd, writer is in progress, spin briefly
295+
// If odd, writer is in progress, back off with exponential delay
296296
while (seq1 & 1) {
297-
// CPU pause instruction to avoid hammering cache
298-
#if defined(__x86_64__) || defined(__i386__)
299-
__asm__ __volatile__("pause" ::: "memory");
300-
#elif defined(__aarch64__)
301-
__asm__ __volatile__("yield" ::: "memory");
302-
#endif
303-
seq1 = atomic_load_explicit(&slot->seqlock, memory_order_acquire);
304-
305-
if (++retry_count > MAX_RETRIES) {
306-
LOG_WARN("Seqlock retry limit exceeded for slot %d, using best-effort read", i);
307-
goto best_effort_read;
297+
// Exponential backoff to reduce contention
298+
if (retry_count < 5) {
299+
// First 5 retries: just CPU pause (fast path)
300+
#if defined(__x86_64__) || defined(__i386__)
301+
__asm__ __volatile__("pause" ::: "memory");
302+
#elif defined(__aarch64__)
303+
__asm__ __volatile__("yield" ::: "memory");
304+
#endif
305+
} else if (retry_count < 20) {
306+
// Next 15 retries: 1μs delay
307+
usleep(1);
308+
} else if (retry_count < 100) {
309+
// Next 80 retries: 10μs delay
310+
usleep(10);
311+
} else {
312+
// After 100 retries: 100μs delay
313+
usleep(100);
314+
// Log if we're spinning for a very long time
315+
if (retry_count % 100 == 0) {
316+
LOG_DEBUG("Seqlock spinning for slot %d, retry %d (writer active)", i, retry_count);
317+
}
308318
}
319+
320+
retry_count++;
321+
seq1 = atomic_load_explicit(&slot->seqlock, memory_order_acquire);
309322
}
310323

311324
// Read the data with acquire semantics
@@ -326,15 +339,6 @@ size_t get_gpu_memory_usage(const int dev) {
326339

327340
LOG_INFO("dev=%d pid=%d host pid=%d i=%lu", dev, pid, hostpid, proc_usage);
328341
total+=proc_usage;
329-
continue;
330-
331-
best_effort_read:
332-
// Fallback: best-effort read if spinning too long
333-
proc_usage = atomic_load_explicit(&slot->used[dev].total, memory_order_acquire);
334-
pid = atomic_load_explicit(&slot->pid, memory_order_relaxed);
335-
hostpid = atomic_load_explicit(&slot->hostpid, memory_order_relaxed);
336-
LOG_WARN("dev=%d pid=%d host pid=%d i=%lu (best-effort)",dev,pid,hostpid,proc_usage);
337-
total+=proc_usage;
338342
}
339343

340344
total+=initial_offset;

src/multiprocess/multiprocess_utilization_watcher.c

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,6 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
128128

129129
unsigned int nvmlCounts;
130130
CHECK_NVML_API(nvmlDeviceGetCount(&nvmlCounts));
131-
lock_shrreg();
132131

133132
int devi,cudadev;
134133
for (devi=0;devi<nvmlCounts;devi++){
@@ -142,8 +141,22 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
142141
nvmlDevice_t device;
143142
CHECK_NVML_API(nvmlDeviceGetHandleByIndex(cudadev, &device));
144143

144+
// OPTIMIZATION: Do slow NVML queries WITHOUT holding lock
145+
// This prevents blocking memory allocation operations
146+
145147
//Get Memory for container
146148
nvmlReturn_t res = nvmlDeviceGetComputeRunningProcesses(device,&infcount,infos);
149+
150+
// Get SM util for container
151+
gettimeofday(&cur,NULL);
152+
microsec = (cur.tv_sec - 1) * 1000UL * 1000UL + cur.tv_usec;
153+
nvmlProcessUtilizationSample_t processes_sample[SHARED_REGION_MAX_PROCESS_NUM];
154+
unsigned int processes_num = SHARED_REGION_MAX_PROCESS_NUM;
155+
nvmlReturn_t res2 = nvmlDeviceGetProcessUtilization(device,processes_sample,&processes_num,microsec);
156+
157+
// Now acquire lock only for the brief period needed to update shared memory
158+
lock_shrreg();
159+
147160
if (res == NVML_SUCCESS) {
148161
for (i=0; i<infcount; i++){
149162
proc = find_proc_by_hostpid(infos[i].pid);
@@ -152,13 +165,8 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
152165
}
153166
}
154167
}
155-
// Get SM util for container
156-
gettimeofday(&cur,NULL);
157-
microsec = (cur.tv_sec - 1) * 1000UL * 1000UL + cur.tv_usec;
158-
nvmlProcessUtilizationSample_t processes_sample[SHARED_REGION_MAX_PROCESS_NUM];
159-
unsigned int processes_num = SHARED_REGION_MAX_PROCESS_NUM;
160-
res = nvmlDeviceGetProcessUtilization(device,processes_sample,&processes_num,microsec);
161-
if (res == NVML_SUCCESS) {
168+
169+
if (res2 == NVML_SUCCESS) {
162170
for (i=0; i<processes_num; i++){
163171
proc = find_proc_by_hostpid(processes_sample[i].pid);
164172
if (proc != NULL){
@@ -167,11 +175,13 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
167175
}
168176
}
169177
}
178+
179+
unlock_shrreg();
180+
170181
if (sum < 0)
171182
sum = 0;
172183
userutil[cudadev] = sum;
173184
}
174-
unlock_shrreg();
175185
return 0;
176186
}
177187

0 commit comments

Comments
 (0)