Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/allocator/allocator.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ int oom_check(const int dev, size_t addon) {
else
d=dev;
uint64_t limit = get_current_device_memory_limit(d);
size_t _usage = get_gpu_memory_usage(d);
// Use real NVML-reported memory usage instead of internally tracked value
// This ensures OOM is triggered based on actual GPU memory consumption
size_t _usage = get_gpu_memory_real_usage(d);

if (limit == 0) {
return 0;
Expand Down
13 changes: 9 additions & 4 deletions src/cuda/memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -493,17 +493,22 @@ CUresult cuMemGetInfo_v2(size_t* free, size_t* total) {
LOG_DEBUG("cuMemGetInfo_v2");
ENSURE_INITIALIZED();
CHECK_DRV_API(cuCtxGetDevice(&dev));
size_t usage = get_current_device_memory_usage(cuda_to_nvml_map(dev));
size_t limit = get_current_device_memory_limit(cuda_to_nvml_map(dev));
// Use real NVML-reported memory usage for accurate free memory calculation
size_t usage = get_gpu_memory_real_usage(dev);
size_t limit = get_current_device_memory_limit(dev);
if (limit == 0) {
CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemGetInfo_v2, free, total);
LOG_INFO("orig free=%ld total=%ld", *free, *total);
*free = *total - usage;
*free = (*total > usage) ? (*total - usage) : 0;
LOG_INFO("after free=%ld total=%ld", *free, *total);
return CUDA_SUCCESS;
} else if (limit < usage) {
LOG_WARN("limit < usage; usage=%ld, limit=%ld", usage, limit);
return CUDA_ERROR_INVALID_VALUE;
// Return 0 free memory instead of error when over limit
CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemGetInfo_v2, free, total);
*free = 0;
*total = limit;
return CUDA_SUCCESS;
} else {
CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemGetInfo_v2, free, total);
LOG_INFO("orig free=%ld total=%ld limit=%ld usage=%ld",
Expand Down
27 changes: 24 additions & 3 deletions src/multiprocess/multiprocess_memory_limit.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ int _record_kernel_interval = 1;

void do_init_device_memory_limits(uint64_t*, int);
void exit_withlock(int exitcode);
uint64_t nvml_get_device_memory_usage(const int dev);

void set_current_gpu_status(int status){
int i;
Expand Down Expand Up @@ -255,6 +256,20 @@ size_t get_gpu_memory_usage(const int dev) {
return total;
}

// Return the "real" memory usage of device `dev`, in bytes.
// Combines two sources and takes the maximum, so OOM decisions stay
// conservative even when one source lags behind the other:
//   - nvml_get_device_memory_usage(): per-process usage reported live by
//     NVML (ground truth from the driver),
//   - get_gpu_memory_usage(): the internally tracked allocation total.
size_t get_gpu_memory_real_usage(const int dev) {
    ensure_initialized();
    size_t nvml_usage = nvml_get_device_memory_usage(dev);
    size_t tracked_usage = get_gpu_memory_usage(dev);
    size_t real_usage = (nvml_usage > tracked_usage) ? nvml_usage : tracked_usage;
    // Demoted from LOG_INFO to LOG_DEBUG: this runs on every allocation /
    // OOM check and would flood the log at info level (matches the
    // LOG_DEBUG convention used by the other hot-path helpers here).
    // %zu is the portable format specifier for size_t; %lu is wrong on
    // platforms where size_t is not unsigned long.
    LOG_DEBUG("get_gpu_memory_real_usage dev=%d nvml_usage=%zu tracked_usage=%zu real_usage=%zu",
              dev, nvml_usage, tracked_usage, real_usage);
    return real_usage;
}

int set_gpu_device_memory_monitor(int32_t pid,int dev,size_t monitor){
//LOG_WARN("set_gpu_device_memory_monitor:%d %d %lu",pid,dev,monitor);
int i;
Expand Down Expand Up @@ -307,13 +322,15 @@ uint64_t nvml_get_device_memory_usage(const int dev) {
ret = nvmlDeviceGetHandleByIndex(dev, &ndev);
if (ret != NVML_SUCCESS) {
LOG_ERROR("NVML get device %d error, %s", dev, nvmlErrorString(ret));
return 0;
}
unsigned int pcnt = SHARED_REGION_MAX_PROCESS_NUM;
nvmlProcessInfo_v1_t infos[SHARED_REGION_MAX_PROCESS_NUM];
LOG_DEBUG("before nvmlDeviceGetComputeRunningProcesses");
ret = nvmlDeviceGetComputeRunningProcesses(ndev, &pcnt, infos);
if (ret != NVML_SUCCESS) {
LOG_ERROR("NVML get process error, %s", nvmlErrorString(ret));
return 0;
}
int i = 0;
uint64_t usage = 0;
Expand All @@ -322,9 +339,13 @@ uint64_t nvml_get_device_memory_usage(const int dev) {
for (; i < pcnt; i++) {
int slot = 0;
for (; slot < region->proc_num; slot++) {
if (infos[i].pid != region->procs[slot].pid)
// NVML returns host PIDs, so we need to compare with hostpid, not pid
// pid is the container PID (from getpid()), hostpid is the real host PID
if (infos[i].pid != region->procs[slot].hostpid)
continue;
usage += infos[i].usedGpuMemory;
LOG_DEBUG("nvml_get_device_memory_usage: matched hostpid=%d, usedGpuMemory=%lu",
infos[i].pid, infos[i].usedGpuMemory);
}
}
unlock_shrreg();
Expand Down Expand Up @@ -851,8 +872,8 @@ uint64_t get_current_device_memory_usage(const int dev) {
if (dev < 0 || dev >= CUDA_DEVICE_MAX_COUNT) {
LOG_ERROR("Illegal device id: %d", dev);
}
result = get_gpu_memory_usage(dev);
// result= nvml_get_device_memory_usage(dev);
// Use real NVML-reported memory usage for accurate memory tracking
result = get_gpu_memory_real_usage(dev);
finish=clock();
LOG_DEBUG("get_current_device_memory_usage:tick=%lu result=%lu\n",finish-start,result);
return result;
Expand Down
1 change: 1 addition & 0 deletions src/multiprocess/multiprocess_memory_limit.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ int set_host_pid(int hostpid);
uint64_t get_current_device_memory_monitor(const int dev);
uint64_t get_current_device_memory_usage(const int dev);
size_t get_gpu_memory_usage(const int dev);
size_t get_gpu_memory_real_usage(const int dev);

// Priority-related
int get_current_priority();
Expand Down
63 changes: 63 additions & 0 deletions src/multiprocess/multiprocess_utilization_watcher.c
Original file line number Diff line number Diff line change
Expand Up @@ -210,9 +210,72 @@ void* utilization_watcher() {
}
}

// update_monitorused queries NVML for per-process GPU memory and writes
// results into procs[].monitorused[]. This runs unconditionally so that
// the monitor can always read real NVML memory from the shared region.
// update_monitorused queries NVML for per-process GPU memory on every
// visible device and writes the results into procs[].monitorused[]
// (indexed by CUDA device), so an external monitor can always read real
// NVML memory from the shared region. Takes the shared-region lock for
// the duration of the update; called with no lock held.
static void update_monitorused(void) {
    unsigned int device_count;
    nvmlReturn_t ret = nvmlDeviceGetCount(&device_count);
    if (ret != NVML_SUCCESS) {
        LOG_ERROR("update_monitorused: nvmlDeviceGetCount failed: %s", nvmlErrorString(ret));
        return;
    }
    lock_shrreg();
    unsigned int devi;
    for (devi = 0; devi < device_count; devi++) {
        int cudadev = nvml_to_cuda_map(devi);
        if (cudadev < 0)
            continue;  // device not visible to this container
        // BUGFIX: nvmlDeviceGetHandleByIndex takes an NVML device index.
        // The original passed the mapped CUDA index, which queries the
        // wrong device whenever the NVML->CUDA mapping is not identity.
        nvmlDevice_t device;
        if (nvmlDeviceGetHandleByIndex(devi, &device) != NVML_SUCCESS)
            continue;
        unsigned int infcount = SHARED_REGION_MAX_PROCESS_NUM;
        nvmlProcessInfo_v1_t infos[SHARED_REGION_MAX_PROCESS_NUM];
        if (nvmlDeviceGetComputeRunningProcesses(device, &infcount, infos) != NVML_SUCCESS)
            continue;
        unsigned int i;
        for (i = 0; i < infcount; i++) {
            // NVML reports host PIDs, so match against the recorded host PID.
            shrreg_proc_slot_t *proc = find_proc_by_hostpid(infos[i].pid);
            if (proc != NULL) {
                proc->monitorused[cudadev] = infos[i].usedGpuMemory;
            }
        }
    }
    unlock_shrreg();
}

// memory_monitor_watcher is a lightweight thread that periodically queries
// NVML for per-process GPU memory and writes it into monitorused[].
// It runs every 1 second regardless of whether SM limits are configured.
static void* memory_monitor_watcher(void *arg) {
(void)arg;
nvmlInit();
ensure_initialized();
struct timespec sleep_interval = { .tv_sec = 1, .tv_nsec = 0 };
while (1) {
nanosleep(&sleep_interval, NULL);
if (pidfound == 0) {
update_host_pid();
if (pidfound == 0)
continue;
}
update_monitorused();
}
return NULL;
}

void init_utilization_watcher() {
LOG_INFO("set core utilization limit to %d",get_current_device_sm_limit(0));
setspec();

// Always start the memory monitor watcher to populate monitorused[]
// so the external monitor can read real NVML memory from shared region.
pthread_t mem_tid;
pthread_create(&mem_tid, NULL, memory_monitor_watcher, NULL);
LOG_INFO("Started memory_monitor_watcher thread for NVML memory tracking");

pthread_t tid;
if ((get_current_device_sm_limit(0)<=100) && (get_current_device_sm_limit(0)>0)){
pthread_create(&tid, NULL, utilization_watcher, NULL);
Expand Down
5 changes: 3 additions & 2 deletions src/nvml/hook.c
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,7 @@ nvmlReturn_t _nvmlDeviceGetMemoryInfo(nvmlDevice_t device,void* memory,int versi
if (cudadev < 0) {
return NVML_SUCCESS;
}
// get_current_device_memory_usage now returns real NVML-reported usage
size_t usage = get_current_device_memory_usage(cudadev);
size_t monitor = get_current_device_memory_monitor(cudadev);
size_t limit = get_current_device_memory_limit(cudadev);
Expand All @@ -354,12 +355,12 @@ nvmlReturn_t _nvmlDeviceGetMemoryInfo(nvmlDevice_t device,void* memory,int versi
} else {
switch (version) {
case 1:
((nvmlMemory_t*)memory)->free = (limit-usage);
((nvmlMemory_t*)memory)->free = (limit > usage) ? (limit - usage) : 0;
((nvmlMemory_t*)memory)->total = limit;
((nvmlMemory_t*)memory)->used = usage;
return NVML_SUCCESS;
case 2:
((nvmlMemory_v2_t *)memory)->free = (limit-usage);
((nvmlMemory_v2_t *)memory)->free = (limit > usage) ? (limit - usage) : 0;
((nvmlMemory_v2_t *)memory)->total = limit;
((nvmlMemory_v2_t *)memory)->used = usage;
return NVML_SUCCESS;
Expand Down