From f3c402ceab438e13ced43ce3c4956d8bdc30548f Mon Sep 17 00:00:00 2001 From: Chenjie Luo Date: Thu, 25 Sep 2025 17:16:29 +0000 Subject: [PATCH] Skip memory monitoring if not available Signed-off-by: Chenjie Luo --- modelopt/torch/utils/memory_monitor.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modelopt/torch/utils/memory_monitor.py b/modelopt/torch/utils/memory_monitor.py index 2b7558537..94ed1cab8 100644 --- a/modelopt/torch/utils/memory_monitor.py +++ b/modelopt/torch/utils/memory_monitor.py @@ -131,7 +131,7 @@ def stop(self): nvmlShutdown() -def launch_memory_monitor(monitor_interval: float = 1.0) -> GPUMemoryMonitor: +def launch_memory_monitor(monitor_interval: float = 1.0) -> GPUMemoryMonitor | None: """Launch a GPU memory monitor in a separate thread. Args: @@ -140,6 +140,11 @@ def launch_memory_monitor(monitor_interval: float = 1.0) -> GPUMemoryMonitor: Returns: GPUMemoryMonitor: The monitor instance that was launched """ + try: + nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(0)) + except Exception as e: + print(f"Failed to get GPU memory info: {e}. Stopping GPU memory monitor.") + return None monitor = GPUMemoryMonitor(monitor_interval) monitor.start() atexit.register(monitor.stop) # Ensure the monitor stops when the program exits