Skip to content

Commit f3c402c

Browse files
committed
Skip memory monitoring if not available
Signed-off-by: Chenjie Luo <[email protected]>
1 parent 0178562 commit f3c402c

File tree

1 file changed

+6
-1
lines changed

1 file changed

+6
-1
lines changed

modelopt/torch/utils/memory_monitor.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ def stop(self):
131131
nvmlShutdown()
132132

133133

134-
def launch_memory_monitor(monitor_interval: float = 1.0) -> GPUMemoryMonitor:
134+
def launch_memory_monitor(monitor_interval: float = 1.0) -> GPUMemoryMonitor | None:
135135
"""Launch a GPU memory monitor in a separate thread.
136136
137137
Args:
@@ -140,6 +140,11 @@ def launch_memory_monitor(monitor_interval: float = 1.0) -> GPUMemoryMonitor:
140140
Returns:
141141
GPUMemoryMonitor | None: The monitor instance that was launched, or None if GPU memory info could not be queried and monitoring was skipped
142142
"""
143+
try:
144+
nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(0))
145+
except Exception as e:
146+
print(f"Failed to get GPU memory info: {e}. Stopping GPU memory monitor.")
147+
return None
143148
monitor = GPUMemoryMonitor(monitor_interval)
144149
monitor.start()
145150
atexit.register(monitor.stop) # Ensure the monitor stops when the program exits

0 commit comments

Comments
 (0)