Skip to content

Commit b0b6973

Browse files
committed
refactor: device fields
Signed-off-by: thxCode <thxcode0824@gmail.com>
1 parent ea9fa9c commit b0b6973

File tree

16 files changed

+258
-249
lines changed

16 files changed

+258
-249
lines changed

gpustack_runtime/detector/__types__.py

Lines changed: 6 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -159,29 +159,18 @@ class Device:
159159
"""
160160
Driver version of the device.
161161
"""
162-
driver_version_tuple: list[int | str] | None = None
163-
"""
164-
Driver version tuple of the device.
165-
None if `driver_version` is missed.
166-
"""
167162
runtime_version: str | None = None
168163
"""
169-
Runtime version of the device.
164+
Runtime version in major[.minor] of the device.
170165
"""
171-
runtime_version_tuple: list[int | str] | None = None
166+
runtime_version_original: str | None = None
172167
"""
173-
Runtime version tuple of the device.
174-
None if `runtime_version` is missed.
168+
Original runtime version string of the device.
175169
"""
176170
compute_capability: str | None = None
177171
"""
178172
Compute capability of the device.
179173
"""
180-
compute_capability_tuple: list[int | str] | None = None
181-
"""
182-
Compute capability tuple of the device.
183-
None if `compute_capability` is missed.
184-
"""
185174
cores: int | None = None
186175
"""
187176
Total cores of the device.
@@ -190,15 +179,15 @@ class Device:
190179
"""
191180
Core utilization of the device in percentage.
192181
"""
193-
memory: int | float = 0
182+
memory: int = 0
194183
"""
195184
Total memory of the device in MiB.
196185
"""
197-
memory_used: int | float = 0
186+
memory_used: int = 0
198187
"""
199188
Used memory of the device in MiB.
200189
"""
201-
memory_utilization: int | float = 0
190+
memory_utilization: float = 0
202191
"""
203192
Memory utilization of the device in percentage.
204193
"""

gpustack_runtime/detector/__utils__.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -368,3 +368,92 @@ def safe_str(value: Any, default: str = "") -> str:
368368
return str(value)
369369
except (ValueError, TypeError):
370370
return default
371+
372+
373+
def kibibyte_to_mebibyte(value: int | float | None) -> int:
    """
    Convert KiB to MiB.

    The result is floored, e.g., 2047 KiB -> 1 MiB.

    Args:
        value:
            The value in KiB.
            Floats are truncated to an integer before conversion.

    Returns:
        The value in MiB, or 0 if the input is None, negative,
        non-finite, or not a number.

    """
    if value is None:
        return 0

    try:
        # Keep the comparison inside the try block,
        # so that non-numeric input falls back to 0 instead of raising.
        if value < 0:
            return 0
        # int() raises ValueError on NaN and OverflowError on infinity,
        # both are reported as 0.
        return int(value) >> 10
    except (ValueError, TypeError, OverflowError):
        return 0
392+
393+
394+
def byte_to_mebibyte(value: int | float | None) -> int:
    """
    Convert bytes to MiB.

    The result is floored, e.g., 1048575 bytes -> 0 MiB.

    Args:
        value:
            The value in bytes.
            Floats are truncated to an integer before conversion.

    Returns:
        The value in MiB, or 0 if the input is None, negative,
        non-finite, or not a number.

    """
    if value is None:
        return 0

    try:
        # Keep the comparison inside the try block,
        # so that non-numeric input falls back to 0 instead of raising.
        if value < 0:
            return 0
        # int() raises ValueError on NaN and OverflowError on infinity,
        # both are reported as 0.
        return int(value) >> 20
    except (ValueError, TypeError, OverflowError):
        return 0
413+
414+
415+
def get_brief_version(version: str | None) -> str | None:
    """
    Shorten a version string to its major[.minor] part,
    e.g., "11.2.152" -> "11.2", "8" -> "8".

    Args:
        version:
            The full, dot-separated version string.

    Returns:
        The first two dot-separated components joined by a dot,
        the input itself if it contains no dot,
        or None if the input is None or empty.

    """
    if not version:
        return None

    major, dot, remainder = version.partition(".")
    if not dot:
        # No separator at all: the whole string is the major component.
        return major

    minor = remainder.partition(".")[0]
    return f"{major}.{minor}"
437+
438+
439+
def get_utilization(used: int | None, total: int | None) -> float:
    """
    Compute how much of `total` is taken by `used`, as a percentage.

    Args:
        used:
            The used value.
        total:
            The total value.

    Returns:
        The utilization percentage, rounded to two decimal places,
        or 0.0 if either input is None, `used` is negative,
        `total` is not positive, or the division fails.

    """
    if used is None or total is None:
        return 0.0
    if used < 0 or total <= 0:
        return 0.0

    try:
        percentage = (used / total) * 100
    except (OverflowError, ZeroDivisionError):
        # E.g., an integer ratio too large to be represented as a float.
        return 0.0
    else:
        return round(percentage, 2)

gpustack_runtime/detector/amd.py

Lines changed: 11 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,13 @@
77
from .. import envs
88
from . import pyamdgpu, pyamdsmi, pyrocmsmi
99
from .__types__ import Detector, Device, Devices, ManufacturerEnum
10-
from .__utils__ import PCIDevice, get_device_files, get_pci_devices
10+
from .__utils__ import (
11+
PCIDevice,
12+
get_brief_version,
13+
get_device_files,
14+
get_pci_devices,
15+
get_utilization,
16+
)
1117

1218
logger = logging.getLogger(__name__)
1319

@@ -79,12 +85,8 @@ def detect(self) -> Devices | None:
7985
try:
8086
pyamdsmi.amdsmi_init()
8187

82-
sys_runtime_ver = pyamdsmi.amdsmi_get_rocm_version_major_minor()
83-
sys_runtime_ver_t = (
84-
[int(v) if v.isdigit() else v for v in sys_runtime_ver.split(".")]
85-
if sys_runtime_ver
86-
else None
87-
)
88+
sys_runtime_ver_original = pyamdsmi.amdsmi_get_rocm_original_version()
89+
sys_runtime_ver = get_brief_version(sys_runtime_ver_original)
8890

8991
devs = pyamdsmi.amdsmi_get_processor_handles()
9092
dev_files = get_device_files(
@@ -108,17 +110,11 @@ def detect(self) -> Devices | None:
108110

109111
dev_gpu_driver_info = pyamdsmi.amdsmi_get_gpu_driver_info(dev)
110112
dev_driver_ver = dev_gpu_driver_info.get("driver_version")
111-
dev_driver_ver_t = (
112-
[int(v) if v.isdigit() else v for v in dev_driver_ver.split(".")]
113-
if dev_driver_ver
114-
else None
115-
)
116113

117114
dev_gpu_asic_info = pyamdsmi.amdsmi_get_gpu_asic_info(dev)
118115
dev_uuid = dev_gpu_asic_info.get("asic_serial")
119116
dev_name = dev_gpu_asic_info.get("market_name")
120117
dev_cc = None
121-
dev_cc_t = None
122118
if hasattr(dev_gpu_asic_info, "target_graphics_version"):
123119
dev_cc = dev_gpu_asic_info.target_graphics_version
124120
else:
@@ -127,7 +123,6 @@ def detect(self) -> Devices | None:
127123
dev_cc = pyrocmsmi.rsmi_dev_target_graphics_version_get(dev_idx)
128124
if dev_cc:
129125
dev_cc = dev_cc[3:] # Strip "gfx" prefix
130-
dev_cc_t = [int(v) if v.isdigit() else v for v in dev_cc.split(".")]
131126

132127
dev_gpu_metrics_info = pyamdsmi.amdsmi_get_gpu_metrics_info(dev)
133128
dev_cores = (
@@ -165,18 +160,14 @@ def detect(self) -> Devices | None:
165160
name=dev_name,
166161
uuid=dev_uuid,
167162
driver_version=dev_driver_ver,
168-
driver_version_tuple=dev_driver_ver_t,
169163
runtime_version=sys_runtime_ver,
170-
runtime_version_tuple=sys_runtime_ver_t,
164+
runtime_version_original=sys_runtime_ver_original,
171165
compute_capability=dev_cc,
172-
compute_capability_tuple=dev_cc_t,
173166
cores=dev_cores,
174167
cores_utilization=dev_cores_util,
175168
memory=dev_mem,
176169
memory_used=dev_mem_used,
177-
memory_utilization=(
178-
(dev_mem_used * 100 // dev_mem) if dev_mem > 0 else 0
179-
),
170+
memory_utilization=get_utilization(dev_mem_used, dev_mem),
180171
temperature=dev_temp,
181172
power=dev_power,
182173
power_used=dev_power_used,

gpustack_runtime/detector/ascend.py

Lines changed: 9 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from .. import envs
88
from . import pyacl, pydcmi
99
from .__types__ import Detector, Device, Devices, ManufacturerEnum
10-
from .__utils__ import PCIDevice, get_pci_devices
10+
from .__utils__ import PCIDevice, get_brief_version, get_pci_devices, get_utilization
1111

1212
logger = logging.getLogger(__name__)
1313

@@ -79,16 +79,9 @@ def detect(self) -> Devices | None:
7979
pydcmi.dcmi_init()
8080

8181
sys_driver_ver = pydcmi.dcmi_get_driver_version()
82-
sys_driver_ver_t = [
83-
int(v) if v.isdigit() else v for v in sys_driver_ver.split(".")
84-
]
85-
86-
sys_runtime_ver = pyacl.aclsysGetCANNVersion()
87-
sys_runtime_ver_t = (
88-
[int(v) if v.isdigit() else v for v in sys_runtime_ver.split(".")]
89-
if sys_runtime_ver
90-
else None
91-
)
82+
83+
sys_runtime_ver_original = pyacl.aclsysGetCANNVersion()
84+
sys_runtime_ver = get_brief_version(sys_runtime_ver_original)
9285

9386
_, card_list = pydcmi.dcmi_get_card_list()
9487
for dev_card_id in card_list:
@@ -150,6 +143,8 @@ def detect(self) -> Devices | None:
150143
dev_card_id,
151144
dev_device_id,
152145
)
146+
if dev_power_used:
147+
dev_power_used = dev_power_used / 10 # 0.1W to W
153148
dev_appendix = {
154149
"arch_family": (
155150
pyacl.aclrtGetSocName()
@@ -179,22 +174,15 @@ def detect(self) -> Devices | None:
179174
name=dev_name,
180175
uuid=dev_uuid.upper(),
181176
driver_version=sys_driver_ver,
182-
driver_version_tuple=sys_driver_ver_t,
183177
runtime_version=sys_runtime_ver,
184-
runtime_version_tuple=sys_runtime_ver_t,
178+
runtime_version_original=sys_runtime_ver_original,
185179
cores=dev_cores_aicore,
186180
cores_utilization=dev_util_aicore,
187181
memory=dev_mem,
188182
memory_used=dev_mem_used,
189-
memory_utilization=(
190-
(dev_mem_used / dev_mem) * 100 if dev_mem > 0 else 0
191-
),
183+
memory_utilization=get_utilization(dev_mem_used, dev_mem),
192184
temperature=dev_temp,
193-
power_used=(
194-
dev_power_used / 10 # Convert from 0.1W to W
195-
if dev_power_used
196-
else None
197-
),
185+
power_used=dev_power_used,
198186
appendix=dev_appendix,
199187
),
200188
)

gpustack_runtime/detector/cambricon.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
PCIDevice,
1111
execute_shell_command,
1212
get_pci_devices,
13+
get_utilization,
1314
safe_float,
1415
safe_int,
1516
support_command,
@@ -96,8 +97,8 @@ def detect(self) -> Devices | None:
9697
dev_cores_util = safe_float(dev_util_info.get("MLUAverage", 0))
9798

9899
dev_mem_usage_info = dev_info.get("PhysicalMemUsage", {})
99-
dev_mem = safe_int(dev_mem_usage_info.get("Total", 0)) << 20
100-
dev_mem_used = safe_int(dev_mem_usage_info.get("Used", 0)) << 20
100+
dev_mem = safe_int(dev_mem_usage_info.get("Total", 0))
101+
dev_mem_used = safe_int(dev_mem_usage_info.get("Used", 0))
101102

102103
dev_temp_info = dev_info.get("Temperature", {})
103104
dev_temp = safe_float(dev_temp_info.get("Chip", 0))
@@ -115,9 +116,7 @@ def detect(self) -> Devices | None:
115116
cores_utilization=dev_cores_util,
116117
memory=dev_mem,
117118
memory_used=dev_mem_used,
118-
memory_utilization=(
119-
(dev_mem_used * 100 // dev_mem) if dev_mem > 0 else 0
120-
),
119+
memory_utilization=get_utilization(dev_mem_used, dev_mem),
121120
temperature=dev_temp,
122121
appendix=dev_appendix,
123122
),

gpustack_runtime/detector/hygon.py

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from .. import envs
77
from . import pyrocmsmi
88
from .__types__ import Detector, Device, Devices, ManufacturerEnum
9-
from .__utils__ import PCIDevice, get_pci_devices
9+
from .__utils__ import PCIDevice, byte_to_mebibyte, get_pci_devices, get_utilization
1010

1111
logger = logging.getLogger(__name__)
1212

@@ -78,11 +78,6 @@ def detect(self) -> Devices | None:
7878
pyrocmsmi.rsmi_init()
7979

8080
sys_driver_ver = pyrocmsmi.rsmi_driver_version_get()
81-
sys_driver_ver_t = (
82-
[int(v) if v.isdigit() else v for v in sys_driver_ver.split(".")]
83-
if sys_driver_ver
84-
else None
85-
)
8681

8782
devs_count = pyrocmsmi.rsmi_num_monitor_devices()
8883
for dev_idx in range(devs_count):
@@ -91,15 +86,17 @@ def detect(self) -> Devices | None:
9186
dev_uuid = pyrocmsmi.rsmi_dev_unique_id_get(dev_idx)
9287
dev_name = pyrocmsmi.rsmi_dev_name_get(dev_idx)
9388
dev_cc = pyrocmsmi.rsmi_dev_target_graphics_version_get(dev_idx)
94-
dev_cc_t = None
9589
if dev_cc:
9690
dev_cc = dev_cc[3:] # Strip "gfx" prefix
97-
dev_cc_t = [int(v) if v.isdigit() else v for v in dev_cc.split(".")]
9891

9992
dev_cores = None
10093
dev_cores_util = pyrocmsmi.rsmi_dev_busy_percent_get(dev_idx)
101-
dev_mem = pyrocmsmi.rsmi_dev_memory_total_get(dev_idx)
102-
dev_mem_used = pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx)
94+
dev_mem = byte_to_mebibyte( # byte to MiB
95+
pyrocmsmi.rsmi_dev_memory_total_get(dev_idx),
96+
)
97+
dev_mem_used = byte_to_mebibyte( # byte to MiB
98+
pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx),
99+
)
103100
dev_temp = pyrocmsmi.rsmi_dev_temp_metric_get(dev_idx)
104101

105102
dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx)
@@ -116,16 +113,12 @@ def detect(self) -> Devices | None:
116113
name=dev_name,
117114
uuid=dev_uuid,
118115
driver_version=sys_driver_ver,
119-
driver_version_tuple=sys_driver_ver_t,
120116
compute_capability=dev_cc,
121-
compute_capability_tuple=dev_cc_t,
122117
cores=dev_cores,
123118
cores_utilization=dev_cores_util,
124-
memory=(dev_mem >> 20 if dev_mem > 0 else 0),
125-
memory_used=(dev_mem_used >> 20 if dev_mem_used > 0 else 0),
126-
memory_utilization=(
127-
(dev_mem_used * 100 // dev_mem) if dev_mem > 0 else 0
128-
),
119+
memory=dev_mem,
120+
memory_used=dev_mem_used,
121+
memory_utilization=get_utilization(dev_mem_used, dev_mem),
129122
temperature=dev_temp,
130123
power=dev_power,
131124
power_used=dev_power_used,

0 commit comments

Comments
 (0)