refactor(nvidia/thead): lock gpm detection

thxCode · thxCode · commit 7af37434f071 · 2026-03-13T16:23:32.000+08:00
Signed-off-by: thxCode <thxcode0824@gmail.com>
diff --git a/gpustack_runtime/detector/nvidia.py b/gpustack_runtime/detector/nvidia.py
@@ -4,6 +4,7 @@
 import logging
 import math
 import re
+import threading
 import time
 from _ctypes import byref
 from functools import lru_cache
@@ -573,6 +574,9 @@ def _get_gpm_metrics(
     return list(dev_gpm_metrics.metrics)
 
 
+_gpm_metrics_lock = threading.Lock()
+
+
 def _get_sm_util_from_gpm_metrics(
     dev: pynvml.c_nvmlDevice_t,
     gpu_instance_id: int | None = None,
@@ -593,12 +597,14 @@ def _get_sm_util_from_gpm_metrics(
         The SM utilization as an integer percentage, or None if failed.
 
     """
-    dev_gpm_metrics = _get_gpm_metrics(
-        metrics=[pynvml.NVML_GPM_METRIC_SM_UTIL],
-        dev=dev,
-        gpu_instance_id=gpu_instance_id,
-        interval=interval,
-    )
+    with _gpm_metrics_lock:
+        dev_gpm_metrics = _get_gpm_metrics(
+            metrics=[pynvml.NVML_GPM_METRIC_SM_UTIL],
+            dev=dev,
+            gpu_instance_id=gpu_instance_id,
+            interval=interval,
+        )
+
     if dev_gpm_metrics and not math.isnan(dev_gpm_metrics[0].value):
         return int(dev_gpm_metrics[0].value)
 
diff --git a/gpustack_runtime/detector/thead.py b/gpustack_runtime/detector/thead.py
@@ -3,6 +3,7 @@
 import contextlib
 import logging
 import math
+import threading
 import time
 from functools import lru_cache
 
@@ -544,6 +545,9 @@ def _get_gpm_metrics(
     return list(dev_gpm_metrics.metrics)
 
 
+_gpm_metrics_lock = threading.Lock()
+
+
 def _get_sm_util_from_gpm_metrics(
     dev: pyhgml.c_hgmlDevice_t,
     gpu_instance_id: int | None = None,
@@ -564,12 +568,14 @@ def _get_sm_util_from_gpm_metrics(
         The SM utilization as an integer percentage, or None if failed.
 
     """
-    dev_gpm_metrics = _get_gpm_metrics(
-        metrics=[pyhgml.HGML_GPM_METRIC_SM_UTIL],
-        dev=dev,
-        gpu_instance_id=gpu_instance_id,
-        interval=interval,
-    )
+    with _gpm_metrics_lock:
+        dev_gpm_metrics = _get_gpm_metrics(
+            metrics=[pyhgml.HGML_GPM_METRIC_SM_UTIL],
+            dev=dev,
+            gpu_instance_id=gpu_instance_id,
+            interval=interval,
+        )
+
     if dev_gpm_metrics and not math.isnan(dev_gpm_metrics[0].value):
         return int(dev_gpm_metrics[0].value)