Skip to content

Commit 7af3743

Browse files
committed
refactor(nvidia/thead): lock gpm detection
Signed-off-by: thxCode <thxcode0824@gmail.com>
1 parent: b3dc4b8 · commit: 7af3743

File tree

2 files changed

+24
-12
lines changed

2 files changed

+24
-12
lines changed

gpustack_runtime/detector/nvidia.py

Lines changed: 12 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import logging
55
import math
66
import re
7+
import threading
78
import time
89
from _ctypes import byref
910
from functools import lru_cache
@@ -573,6 +574,9 @@ def _get_gpm_metrics(
573574
return list(dev_gpm_metrics.metrics)
574575

575576

577+
_gpm_metrics_lock = threading.Lock()
578+
579+
576580
def _get_sm_util_from_gpm_metrics(
577581
dev: pynvml.c_nvmlDevice_t,
578582
gpu_instance_id: int | None = None,
@@ -593,12 +597,14 @@ def _get_sm_util_from_gpm_metrics(
593597
The SM utilization as an integer percentage, or None if failed.
594598
595599
"""
596-
dev_gpm_metrics = _get_gpm_metrics(
597-
metrics=[pynvml.NVML_GPM_METRIC_SM_UTIL],
598-
dev=dev,
599-
gpu_instance_id=gpu_instance_id,
600-
interval=interval,
601-
)
600+
with _gpm_metrics_lock:
601+
dev_gpm_metrics = _get_gpm_metrics(
602+
metrics=[pynvml.NVML_GPM_METRIC_SM_UTIL],
603+
dev=dev,
604+
gpu_instance_id=gpu_instance_id,
605+
interval=interval,
606+
)
607+
602608
if dev_gpm_metrics and not math.isnan(dev_gpm_metrics[0].value):
603609
return int(dev_gpm_metrics[0].value)
604610

gpustack_runtime/detector/thead.py

Lines changed: 12 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
import contextlib
44
import logging
55
import math
6+
import threading
67
import time
78
from functools import lru_cache
89

@@ -544,6 +545,9 @@ def _get_gpm_metrics(
544545
return list(dev_gpm_metrics.metrics)
545546

546547

548+
_gpm_metrics_lock = threading.Lock()
549+
550+
547551
def _get_sm_util_from_gpm_metrics(
548552
dev: pyhgml.c_hgmlDevice_t,
549553
gpu_instance_id: int | None = None,
@@ -564,12 +568,14 @@ def _get_sm_util_from_gpm_metrics(
564568
The SM utilization as an integer percentage, or None if failed.
565569
566570
"""
567-
dev_gpm_metrics = _get_gpm_metrics(
568-
metrics=[pyhgml.HGML_GPM_METRIC_SM_UTIL],
569-
dev=dev,
570-
gpu_instance_id=gpu_instance_id,
571-
interval=interval,
572-
)
571+
with _gpm_metrics_lock:
572+
dev_gpm_metrics = _get_gpm_metrics(
573+
metrics=[pyhgml.HGML_GPM_METRIC_SM_UTIL],
574+
dev=dev,
575+
gpu_instance_id=gpu_instance_id,
576+
interval=interval,
577+
)
578+
573579
if dev_gpm_metrics and not math.isnan(dev_gpm_metrics[0].value):
574580
return int(dev_gpm_metrics[0].value)
575581

0 commit comments

Comments
 (0)