File tree Expand file tree Collapse file tree 1 file changed +6
-1
lines changed Expand file tree Collapse file tree 1 file changed +6
-1
lines changed Original file line number Diff line number Diff line change @@ -131,7 +131,7 @@ def stop(self):
131131 nvmlShutdown ()
132132
133133
134- def launch_memory_monitor (monitor_interval : float = 1.0 ) -> GPUMemoryMonitor :
134+ def launch_memory_monitor (monitor_interval : float = 1.0 ) -> GPUMemoryMonitor | None :
135135 """Launch a GPU memory monitor in a separate thread.
136136
137137 Args:
@@ -140,6 +140,11 @@ def launch_memory_monitor(monitor_interval: float = 1.0) -> GPUMemoryMonitor:
140140 Returns:
141141 GPUMemoryMonitor: The monitor instance that was launched
142142 """
143+ try :
144+ nvmlDeviceGetMemoryInfo (nvmlDeviceGetHandleByIndex (0 ))
145+ except Exception as e :
146+ print (f"Failed to get GPU memory info: { e } . Stopping GPU memory monitor." )
147+ return None
143148 monitor = GPUMemoryMonitor (monitor_interval )
144149 monitor .start ()
145150 atexit .register (monitor .stop ) # Ensure the monitor stops when the program exits
You can’t perform that action at this time.
0 commit comments