
Commit 540cc8c

Add timeout and force-poll GPU usage to prevent race condition
1 parent e375b45 commit 540cc8c

File tree

2 files changed: 23 additions & 1 deletion


sdks/python/apache_beam/ml/inference/model_manager.py

Lines changed: 19 additions & 1 deletion
@@ -103,6 +103,21 @@ def get_stats(self) -> Tuple[float, float, float]:
     with self._lock:
       return self._current_usage, self._peak_usage, self._total_memory
 
+  def refresh(self):
+    """Forces an immediate poll of the GPU."""
+    usage = self._get_nvidia_smi_used()
+    now = time.time()
+    with self._lock:
+      self._current_usage = usage
+      self._memory_history.append((now, usage))
+      # Recalculate peak immediately
+      while self._memory_history and (now - self._memory_history[0][0]
+                                      > self._peak_window_seconds):
+        self._memory_history.popleft()
+      self._peak_usage = (
+          max(m for _, m in self._memory_history)
+          if self._memory_history else usage)
+
   def _get_nvidia_smi_used(self) -> float:
     try:
       cmd = "nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits"
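The peak-recalculation loop in refresh() is the heart of the forced poll: each sample is appended to a time-stamped history, entries older than the peak window are dropped, and the peak is recomputed from what remains. A standalone sketch of that sliding-window bookkeeping (simplified for illustration; not the Beam code itself):

    import time
    from collections import deque

    def record_sample(history: deque, usage_mb: float, window_s: float) -> float:
        """Append a sample, drop entries older than window_s, return the peak."""
        now = time.time()
        history.append((now, usage_mb))
        while history and now - history[0][0] > window_s:
            history.popleft()
        return max(m for _, m in history)

    history: deque = deque()
    print(record_sample(history, 1024.0, 60.0))  # -> 1024.0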
@@ -360,7 +375,7 @@ def acquire_model(self, tag: str, loader_func: Callable[[], Any]) -> Any:
         if self._evict_to_make_space(limit, est_cost, requesting_tag=tag):
           continue
 
-        self._cv.wait()
+        self._cv.wait(timeout=10.0)
 
     finally:
       if self._wait_queue and self._wait_queue[0][2] is my_id:
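Replacing the unbounded wait with self._cv.wait(timeout=10.0) bounds how long a waiter can sleep if a notification is missed, for instance when memory is freed between the capacity check and the wait. A general sketch of the pattern (not Beam's code): the predicate is rechecked on every wakeup, so a timeout wakeup is harmless and a missed notify can no longer block a thread forever.

    import threading

    cv = threading.Condition()
    have_capacity = False

    def wait_for_capacity():
        with cv:
            while not have_capacity:
                # Wakes on notify_all() or after 10s, whichever comes
                # first; the loop re-evaluates the predicate either way.
                cv.wait(timeout=10.0)

    def grant_capacity():
        global have_capacity
        with cv:
            have_capacity = True
            cv.notify_all()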
@@ -492,6 +507,7 @@ def _perform_eviction(self, key, tag, instance, score):
     del instance
     gc.collect()
     torch.cuda.empty_cache()
+    self._monitor.refresh()
     self._monitor.reset_peak()
 
   def _spawn_new_model(self, tag, loader_func, is_unknown, est_cost):
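Note that the new self._monitor.refresh() call runs before reset_peak(), so the peak is recomputed from a post-eviction sample rather than from stale background-poller data. An illustrative teardown sequence mirroring this hunk (assumes a CUDA build of PyTorch; monitor stands for any object exposing refresh() and reset_peak()):

    import gc
    import torch

    def release_and_resync(instance, monitor):
        del instance               # drop the last reference to the model
        gc.collect()               # collect any cycles holding GPU tensors
        torch.cuda.empty_cache()   # return cached blocks to the driver
        monitor.refresh()          # poll the GPU now, not on the next tick
        monitor.reset_peak()       # recompute the peak from fresh samples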
@@ -538,6 +554,8 @@ def _delete_all_models(self):
     self._active_counts.clear()
     gc.collect()
     torch.cuda.empty_cache()
+    self._monitor.refresh()
+    self._monitor.reset_peak()
 
   def _force_reset(self):
     logger.warning("Force Reset Triggered")

sdks/python/apache_beam/ml/inference/model_manager_test.py

Lines changed: 4 additions & 0 deletions
@@ -81,6 +81,10 @@ def free(self, amount_mb):
       self.history.pop(0)
     self._peak = max(self.history)
 
+  def refresh(self):
+    """Simulates a refresh of the monitor stats (no-op for mock)."""
+    pass
+
 
 class MockModel:
   def __init__(self, name, size, monitor):
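The mock gains refresh() because the manager now calls it during eviction and teardown; the monitor is duck-typed, so any test double must expose the same surface or those paths raise AttributeError. A minimal sketch of that implied interface (the Protocol below is illustrative, not part of the Beam codebase):

    from typing import Protocol, Tuple

    class MonitorLike(Protocol):
        def get_stats(self) -> Tuple[float, float, float]: ...
        def refresh(self) -> None: ...
        def reset_peak(self) -> None: ...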
