File tree Expand file tree Collapse file tree 1 file changed +6
-1
lines changed Expand file tree Collapse file tree 1 file changed +6
-1
lines changed Original file line number Diff line number Diff line change @@ -131,7 +131,7 @@ def stop(self):
131
131
nvmlShutdown ()
132
132
133
133
134
- def launch_memory_monitor (monitor_interval : float = 1.0 ) -> GPUMemoryMonitor :
134
+ def launch_memory_monitor (monitor_interval : float = 1.0 ) -> GPUMemoryMonitor | None :
135
135
"""Launch a GPU memory monitor in a separate thread.
136
136
137
137
Args:
@@ -140,6 +140,11 @@ def launch_memory_monitor(monitor_interval: float = 1.0) -> GPUMemoryMonitor:
140
140
Returns:
141
141
GPUMemoryMonitor: The monitor instance that was launched
142
142
"""
143
+ try :
144
+ nvmlDeviceGetMemoryInfo (nvmlDeviceGetHandleByIndex (0 ))
145
+ except Exception as e :
146
+ print (f"Failed to get GPU memory info: { e } . Stopping GPU memory monitor." )
147
+ return None
143
148
monitor = GPUMemoryMonitor (monitor_interval )
144
149
monitor .start ()
145
150
atexit .register (monitor .stop ) # Ensure the monitor stops when the program exits
You can’t perform that action at this time.
0 commit comments