Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/allocator/allocator.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ int oom_check(const int dev, size_t addon) {
else
d=dev;
uint64_t limit = get_current_device_memory_limit(d);
size_t _usage = get_gpu_memory_usage(d);
// Use real NVML-reported memory usage instead of internally tracked value
// This ensures OOM is triggered based on actual GPU memory consumption
size_t _usage = get_gpu_memory_real_usage(d);

if (limit == 0) {
return 0;
Expand Down
13 changes: 9 additions & 4 deletions src/cuda/memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -493,17 +493,22 @@ CUresult cuMemGetInfo_v2(size_t* free, size_t* total) {
LOG_DEBUG("cuMemGetInfo_v2");
ENSURE_INITIALIZED();
CHECK_DRV_API(cuCtxGetDevice(&dev));
size_t usage = get_current_device_memory_usage(cuda_to_nvml_map(dev));
size_t limit = get_current_device_memory_limit(cuda_to_nvml_map(dev));
// Use real NVML-reported memory usage for accurate free memory calculation
size_t usage = get_gpu_memory_real_usage(dev);
size_t limit = get_current_device_memory_limit(dev);
if (limit == 0) {
CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemGetInfo_v2, free, total);
LOG_INFO("orig free=%ld total=%ld", *free, *total);
*free = *total - usage;
*free = (*total > usage) ? (*total - usage) : 0;
LOG_INFO("after free=%ld total=%ld", *free, *total);
return CUDA_SUCCESS;
} else if (limit < usage) {
LOG_WARN("limit < usage; usage=%ld, limit=%ld", usage, limit);
return CUDA_ERROR_INVALID_VALUE;
// Return 0 free memory instead of error when over limit
CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemGetInfo_v2, free, total);
*free = 0;
*total = limit;
return CUDA_SUCCESS;
} else {
CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemGetInfo_v2, free, total);
LOG_INFO("orig free=%ld total=%ld limit=%ld usage=%ld",
Expand Down
27 changes: 24 additions & 3 deletions src/multiprocess/multiprocess_memory_limit.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ int _record_kernel_interval = 1;

void do_init_device_memory_limits(uint64_t*, int);
void exit_withlock(int exitcode);
uint64_t nvml_get_device_memory_usage(const int dev);

void set_current_gpu_status(int status){
int i;
Expand Down Expand Up @@ -255,6 +256,20 @@ size_t get_gpu_memory_usage(const int dev) {
return total;
}

// Return the "real" memory usage of device `dev`, in bytes.
// Combines two sources and takes the maximum, so OOM decisions stay
// conservative even when one source lags behind the other:
//   - nvml_get_device_memory_usage(): per-process usage reported live by
//     NVML (ground truth from the driver),
//   - get_gpu_memory_usage(): the internally tracked allocation total.
size_t get_gpu_memory_real_usage(const int dev) {
    ensure_initialized();
    size_t nvml_usage = nvml_get_device_memory_usage(dev);
    size_t tracked_usage = get_gpu_memory_usage(dev);
    size_t real_usage = (nvml_usage > tracked_usage) ? nvml_usage : tracked_usage;
    // Demoted from LOG_INFO to LOG_DEBUG: this runs on every allocation /
    // OOM check and would flood the log at info level (matches the
    // LOG_DEBUG convention used by the other hot-path helpers here).
    // %zu is the portable format specifier for size_t; %lu is wrong on
    // platforms where size_t is not unsigned long.
    LOG_DEBUG("get_gpu_memory_real_usage dev=%d nvml_usage=%zu tracked_usage=%zu real_usage=%zu",
              dev, nvml_usage, tracked_usage, real_usage);
    return real_usage;
}

int set_gpu_device_memory_monitor(int32_t pid,int dev,size_t monitor){
//LOG_WARN("set_gpu_device_memory_monitor:%d %d %lu",pid,dev,monitor);
int i;
Expand Down Expand Up @@ -307,13 +322,15 @@ uint64_t nvml_get_device_memory_usage(const int dev) {
ret = nvmlDeviceGetHandleByIndex(dev, &ndev);
if (ret != NVML_SUCCESS) {
LOG_ERROR("NVML get device %d error, %s", dev, nvmlErrorString(ret));
return 0;
}
unsigned int pcnt = SHARED_REGION_MAX_PROCESS_NUM;
nvmlProcessInfo_v1_t infos[SHARED_REGION_MAX_PROCESS_NUM];
LOG_DEBUG("before nvmlDeviceGetComputeRunningProcesses");
ret = nvmlDeviceGetComputeRunningProcesses(ndev, &pcnt, infos);
if (ret != NVML_SUCCESS) {
LOG_ERROR("NVML get process error, %s", nvmlErrorString(ret));
return 0;
}
int i = 0;
uint64_t usage = 0;
Expand All @@ -322,9 +339,13 @@ uint64_t nvml_get_device_memory_usage(const int dev) {
for (; i < pcnt; i++) {
int slot = 0;
for (; slot < region->proc_num; slot++) {
if (infos[i].pid != region->procs[slot].pid)
// NVML returns host PIDs, so we need to compare with hostpid, not pid
// pid is the container PID (from getpid()), hostpid is the real host PID
if (infos[i].pid != region->procs[slot].hostpid)
continue;
usage += infos[i].usedGpuMemory;
LOG_DEBUG("nvml_get_device_memory_usage: matched hostpid=%d, usedGpuMemory=%lu",
infos[i].pid, infos[i].usedGpuMemory);
}
}
unlock_shrreg();
Expand Down Expand Up @@ -851,8 +872,8 @@ uint64_t get_current_device_memory_usage(const int dev) {
if (dev < 0 || dev >= CUDA_DEVICE_MAX_COUNT) {
LOG_ERROR("Illegal device id: %d", dev);
}
result = get_gpu_memory_usage(dev);
// result= nvml_get_device_memory_usage(dev);
// Use real NVML-reported memory usage for accurate memory tracking
result = get_gpu_memory_real_usage(dev);
finish=clock();
LOG_DEBUG("get_current_device_memory_usage:tick=%lu result=%lu\n",finish-start,result);
return result;
Expand Down
1 change: 1 addition & 0 deletions src/multiprocess/multiprocess_memory_limit.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ int set_host_pid(int hostpid);
uint64_t get_current_device_memory_monitor(const int dev);
uint64_t get_current_device_memory_usage(const int dev);
size_t get_gpu_memory_usage(const int dev);
size_t get_gpu_memory_real_usage(const int dev);

// Priority-related
int get_current_priority();
Expand Down
63 changes: 63 additions & 0 deletions src/multiprocess/multiprocess_utilization_watcher.c
Original file line number Diff line number Diff line change
Expand Up @@ -210,9 +210,72 @@ void* utilization_watcher() {
}
}

// update_monitorused queries NVML for per-process GPU memory and writes
// results into procs[].monitorused[]. This runs unconditionally so that
// the monitor can always read real NVML memory from the shared region.
// update_monitorused queries NVML for per-process GPU memory on every
// visible device and writes the results into procs[].monitorused[]
// (indexed by CUDA device), so an external monitor can always read real
// NVML memory from the shared region. Takes the shared-region lock for
// the duration of the update; called with no lock held.
static void update_monitorused(void) {
    unsigned int device_count;
    nvmlReturn_t ret = nvmlDeviceGetCount(&device_count);
    if (ret != NVML_SUCCESS) {
        LOG_ERROR("update_monitorused: nvmlDeviceGetCount failed: %s", nvmlErrorString(ret));
        return;
    }
    lock_shrreg();
    unsigned int devi;
    for (devi = 0; devi < device_count; devi++) {
        int cudadev = nvml_to_cuda_map(devi);
        if (cudadev < 0)
            continue;  // device not visible to this container
        // BUGFIX: nvmlDeviceGetHandleByIndex takes an NVML device index.
        // The original passed the mapped CUDA index, which queries the
        // wrong device whenever the NVML->CUDA mapping is not identity.
        nvmlDevice_t device;
        if (nvmlDeviceGetHandleByIndex(devi, &device) != NVML_SUCCESS)
            continue;
        unsigned int infcount = SHARED_REGION_MAX_PROCESS_NUM;
        nvmlProcessInfo_v1_t infos[SHARED_REGION_MAX_PROCESS_NUM];
        if (nvmlDeviceGetComputeRunningProcesses(device, &infcount, infos) != NVML_SUCCESS)
            continue;
        unsigned int i;
        for (i = 0; i < infcount; i++) {
            // NVML reports host PIDs, so match against the recorded host PID.
            shrreg_proc_slot_t *proc = find_proc_by_hostpid(infos[i].pid);
            if (proc != NULL) {
                proc->monitorused[cudadev] = infos[i].usedGpuMemory;
            }
        }
    }
    unlock_shrreg();
}

// memory_monitor_watcher is a lightweight thread that periodically queries
// NVML for per-process GPU memory and writes it into monitorused[].
// It runs every 1 second regardless of whether SM limits are configured.
static void* memory_monitor_watcher(void *arg) {
(void)arg;
nvmlInit();
ensure_initialized();
struct timespec sleep_interval = { .tv_sec = 1, .tv_nsec = 0 };
while (1) {
nanosleep(&sleep_interval, NULL);
if (pidfound == 0) {
update_host_pid();
if (pidfound == 0)
continue;
}
update_monitorused();
}
return NULL;
}

void init_utilization_watcher() {
LOG_INFO("set core utilization limit to %d",get_current_device_sm_limit(0));
setspec();

// Always start the memory monitor watcher to populate monitorused[]
// so the external monitor can read real NVML memory from shared region.
pthread_t mem_tid;
pthread_create(&mem_tid, NULL, memory_monitor_watcher, NULL);
LOG_INFO("Started memory_monitor_watcher thread for NVML memory tracking");

pthread_t tid;
if ((get_current_device_sm_limit(0)<=100) && (get_current_device_sm_limit(0)>0)){
pthread_create(&tid, NULL, utilization_watcher, NULL);
Expand Down
5 changes: 3 additions & 2 deletions src/nvml/hook.c
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,7 @@ nvmlReturn_t _nvmlDeviceGetMemoryInfo(nvmlDevice_t device,void* memory,int versi
if (cudadev < 0) {
return NVML_SUCCESS;
}
// get_current_device_memory_usage now returns real NVML-reported usage
size_t usage = get_current_device_memory_usage(cudadev);
size_t monitor = get_current_device_memory_monitor(cudadev);
size_t limit = get_current_device_memory_limit(cudadev);
Expand All @@ -354,12 +355,12 @@ nvmlReturn_t _nvmlDeviceGetMemoryInfo(nvmlDevice_t device,void* memory,int versi
} else {
switch (version) {
case 1:
((nvmlMemory_t*)memory)->free = (limit-usage);
((nvmlMemory_t*)memory)->free = (limit > usage) ? (limit - usage) : 0;
((nvmlMemory_t*)memory)->total = limit;
((nvmlMemory_t*)memory)->used = usage;
return NVML_SUCCESS;
case 2:
((nvmlMemory_v2_t *)memory)->free = (limit-usage);
((nvmlMemory_v2_t *)memory)->free = (limit > usage) ? (limit - usage) : 0;
((nvmlMemory_v2_t *)memory)->total = limit;
((nvmlMemory_v2_t *)memory)->used = usage;
return NVML_SUCCESS;
Expand Down