Skip to content

Commit 3273df8

Browse files
Merge pull request #163 from maverick123123/testpr155
Merge pr155 some important commits
2 parents 54b0ca3 + 9cb90f5 commit 3273df8

File tree

4 files changed

+335
-126
lines changed

4 files changed

+335
-126
lines changed

src/libvgpu.c

Lines changed: 21 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -872,21 +872,30 @@ void preInit(){
872 872
void postInit(){
873 873
allocator_init();
874 874
map_cuda_visible_devices();
875-
int lock_ret = try_lock_unified_lock();
876-
if (lock_ret != 0) {
877-
LOG_WARN("try_lock_unified_lock failed, skipping set_task_pid");
878-
pidfound=0;
875+
876+
// Use shared memory semaphore to serialize host PID detection
877+
// Returns 1 if lock acquired, 0 if timeout (skip detection)
878+
int lock_acquired = lock_postinit();
879+
nvmlReturn_t res = NVML_SUCCESS;
880+
881+
if (lock_acquired) {
882+
// Lock acquired - safe to call set_task_pid()
883+
res = set_task_pid();
884+
unlock_postinit();
879 885
} else {
880-
nvmlReturn_t res = set_task_pid();
881-
try_unlock_unified_lock();
882-
if (res != NVML_SUCCESS) {
883-
LOG_WARN("SET_TASK_PID FAILED.");
884-
pidfound = 0;
885-
} else {
886-
pidfound = 1;
887-
}
886+
// Timeout - another process likely crashed holding the lock
887+
// Skip host PID detection for this process
888+
LOG_WARN("Skipped host PID detection due to lock timeout");
889+
res = NVML_ERROR_TIMEOUT;
888 890
}
891+
889 892
LOG_MSG("Initialized");
893+
if (res != NVML_SUCCESS) {
894+
LOG_WARN("SET_TASK_PID FAILED - using container PID for accounting");
895+
pidfound = 0;
896+
} else {
897+
pidfound = 1;
898+
}
890 899

891 900
//add_gpu_device_memory_usage(getpid(),0,context_size,0);
892 901
env_utilization_switch = set_env_utilization_switch();

0 commit comments

Comments
 (0)