Skip to content

Commit 1853a24

Browse files
nishitnshah (Nishit Shah)
authored and committed
Optimize seqlock and utilization watcher to prevent random 256MB allocation slowdowns
Root cause: Random 20x slowdowns (12.734ms vs 0.586ms) for 256MB allocations when all 8 processes allocate simultaneously. Two issues: 1. Seqlock retry storm: When all 8 processes write to their slots, readers see writers active (seqlock odd) and spin in tight loop, causing CPU contention. 2. Utilization watcher contention: The utilization_watcher thread held lock_shrreg() during slow NVML queries (nvmlDeviceGetComputeRunningProcesses, nvmlDeviceGetProcessUtilization), blocking shared memory operations. Fixes: 1. Seqlock exponential backoff: - Removed stale data fallback (memory checks require accurate data) - Progressive delays: CPU pause → 1μs → 10μs → 100μs - Prevents tight spinning while ensuring accurate reads 2. Utilization watcher optimization: - Moved NVML queries OUTSIDE lock_shrreg() - Lock now only held briefly to update shared memory - Reduces lock hold time from milliseconds to microseconds Impact: Should eliminate random 256MB allocation slowdowns by reducing seqlock contention and utilization watcher blocking. Signed-off-by: Nishit Shah <nishshah@linkedin.com>
1 parent 4cc123b commit 1853a24

File tree

2 files changed

+45
-31
lines changed

2 files changed

+45
-31
lines changed

src/multiprocess/multiprocess_memory_limit.c

Lines changed: 26 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -285,27 +285,40 @@ size_t get_gpu_memory_usage(const int dev) {
285285
uint64_t proc_usage;
286286
uint64_t seq1, seq2;
287287
int retry_count = 0;
288-
const int MAX_RETRIES = 100;
289288

290289
// Seqlock read protocol: retry until we get a consistent snapshot
290+
// CRITICAL: Memory checks require accurate data, cannot use stale reads
291291
do {
292292
// Read sequence number (must be even = no write in progress)
293293
seq1 = atomic_load_explicit(&slot->seqlock, memory_order_acquire);
294294

295-
// If odd, writer is in progress, spin briefly
295+
// If odd, writer is in progress, back off with exponential delay
296296
while (seq1 & 1) {
297-
// CPU pause instruction to avoid hammering cache
298-
#if defined(__x86_64__) || defined(__i386__)
299-
__asm__ __volatile__("pause" ::: "memory");
300-
#elif defined(__aarch64__)
301-
__asm__ __volatile__("yield" ::: "memory");
302-
#endif
303-
seq1 = atomic_load_explicit(&slot->seqlock, memory_order_acquire);
304-
305-
if (++retry_count > MAX_RETRIES) {
306-
LOG_WARN("Seqlock retry limit exceeded for slot %d, using best-effort read", i);
307-
goto best_effort_read;
297+
// Exponential backoff to reduce contention
298+
if (retry_count < 5) {
299+
// First 5 retries: just CPU pause (fast path)
300+
#if defined(__x86_64__) || defined(__i386__)
301+
__asm__ __volatile__("pause" ::: "memory");
302+
#elif defined(__aarch64__)
303+
__asm__ __volatile__("yield" ::: "memory");
304+
#endif
305+
} else if (retry_count < 20) {
306+
// Next 15 retries: 1μs delay
307+
usleep(1);
308+
} else if (retry_count < 100) {
309+
// Next 80 retries: 10μs delay
310+
usleep(10);
311+
} else {
312+
// After 100 retries: 100μs delay
313+
usleep(100);
314+
// Log if we're spinning for a very long time
315+
if (retry_count % 100 == 0) {
316+
LOG_DEBUG("Seqlock spinning for slot %d, retry %d (writer active)", i, retry_count);
317+
}
308318
}
319+
320+
retry_count++;
321+
seq1 = atomic_load_explicit(&slot->seqlock, memory_order_acquire);
309322
}
310323

311324
// Read the data with acquire semantics
@@ -326,15 +339,6 @@ size_t get_gpu_memory_usage(const int dev) {
326339

327340
LOG_INFO("dev=%d pid=%d host pid=%d i=%lu", dev, pid, hostpid, proc_usage);
328341
total+=proc_usage;
329-
continue;
330-
331-
best_effort_read:
332-
// Fallback: best-effort read if spinning too long
333-
proc_usage = atomic_load_explicit(&slot->used[dev].total, memory_order_acquire);
334-
pid = atomic_load_explicit(&slot->pid, memory_order_relaxed);
335-
hostpid = atomic_load_explicit(&slot->hostpid, memory_order_relaxed);
336-
LOG_WARN("dev=%d pid=%d host pid=%d i=%lu (best-effort)",dev,pid,hostpid,proc_usage);
337-
total+=proc_usage;
338342
}
339343

340344
total+=initial_offset;

src/multiprocess/multiprocess_utilization_watcher.c

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,6 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
128128

129129
unsigned int nvmlCounts;
130130
CHECK_NVML_API(nvmlDeviceGetCount(&nvmlCounts));
131-
lock_shrreg();
132131

133132
int devi,cudadev;
134133
for (devi=0;devi<nvmlCounts;devi++){
@@ -142,8 +141,22 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
142141
nvmlDevice_t device;
143142
CHECK_NVML_API(nvmlDeviceGetHandleByIndex(cudadev, &device));
144143

144+
// OPTIMIZATION: Do slow NVML queries WITHOUT holding lock
145+
// This prevents blocking memory allocation operations
146+
145147
//Get Memory for container
146148
nvmlReturn_t res = nvmlDeviceGetComputeRunningProcesses(device,&infcount,infos);
149+
150+
// Get SM util for container
151+
gettimeofday(&cur,NULL);
152+
microsec = (cur.tv_sec - 1) * 1000UL * 1000UL + cur.tv_usec;
153+
nvmlProcessUtilizationSample_t processes_sample[SHARED_REGION_MAX_PROCESS_NUM];
154+
unsigned int processes_num = SHARED_REGION_MAX_PROCESS_NUM;
155+
nvmlReturn_t res2 = nvmlDeviceGetProcessUtilization(device,processes_sample,&processes_num,microsec);
156+
157+
// Now acquire lock only for the brief period needed to update shared memory
158+
lock_shrreg();
159+
147160
if (res == NVML_SUCCESS) {
148161
for (i=0; i<infcount; i++){
149162
proc = find_proc_by_hostpid(infos[i].pid);
@@ -152,13 +165,8 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
152165
}
153166
}
154167
}
155-
// Get SM util for container
156-
gettimeofday(&cur,NULL);
157-
microsec = (cur.tv_sec - 1) * 1000UL * 1000UL + cur.tv_usec;
158-
nvmlProcessUtilizationSample_t processes_sample[SHARED_REGION_MAX_PROCESS_NUM];
159-
unsigned int processes_num = SHARED_REGION_MAX_PROCESS_NUM;
160-
res = nvmlDeviceGetProcessUtilization(device,processes_sample,&processes_num,microsec);
161-
if (res == NVML_SUCCESS) {
168+
169+
if (res2 == NVML_SUCCESS) {
162170
for (i=0; i<processes_num; i++){
163171
proc = find_proc_by_hostpid(processes_sample[i].pid);
164172
if (proc != NULL){
@@ -167,11 +175,13 @@ int get_used_gpu_utilization(int *userutil,int *sysprocnum) {
167175
}
168176
}
169177
}
178+
179+
unlock_shrreg();
180+
170181
if (sum < 0)
171182
sum = 0;
172183
userutil[cudadev] = sum;
173184
}
174-
unlock_shrreg();
175185
return 0;
176186
}
177187

0 commit comments

Comments
 (0)