Skip to content

Commit a9b3ed3

Browse files
committed
pytest
1 parent 6406994 commit a9b3ed3

File tree

1 file changed

+315
-0
lines changed

1 file changed

+315
-0
lines changed
Lines changed: 315 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,315 @@
1+
import importlib
2+
import sys
3+
import types
4+
from typing import Tuple
5+
6+
import pytest
7+
8+
from nodescraper.enums.systeminteraction import SystemInteractionLevel
9+
from nodescraper.plugins.inband.amdsmi.amdsmi_collector import AmdSmiCollector
10+
11+
12+
class _BaseAmdSmiError(Exception):
13+
def __init__(self, ret_code: int, *args):
14+
super().__init__(ret_code, *args)
15+
self.ret_code = ret_code
16+
17+
18+
class AmdSmiLibraryError(_BaseAmdSmiError):
    """Fake error exposed as ``AmdSmiLibraryException`` by ``make_fake_amdsmi``."""
19+
20+
21+
class AmdSmiRetryError(_BaseAmdSmiError):
    """Fake error exposed as ``AmdSmiRetryException`` by ``make_fake_amdsmi``."""
22+
23+
24+
class AmdSmiParameterError(_BaseAmdSmiError):
    """Fake error exposed as ``AmdSmiParameterException`` by ``make_fake_amdsmi``."""
25+
26+
27+
class AmdSmiTimeoutError(_BaseAmdSmiError):
    """Fake error exposed as ``AmdSmiTimeoutException`` by ``make_fake_amdsmi``."""
28+
29+
30+
def make_fake_amdsmi(
    *,
    handles: Tuple[object, ...] | None = None,
    lib_version="1.2.3",
    rocm_version="6.1.0",
    pcie_static=True,
    raise_on_handles=False,
):
    """Build a namespace that stands in for the ``amdsmi`` module in tests.

    Args:
        handles: processor handles handed out by
            ``amdsmi_get_processor_handles``; a single opaque ``object()`` is
            used when omitted.
        lib_version: value returned by ``amdsmi_get_lib_version``.
        rocm_version: value returned by ``amdsmi_get_rocm_version``.
        pcie_static: when falsy, ``amdsmi_get_pcie_info`` is not defined on
            the returned namespace at all.
        raise_on_handles: when truthy, ``amdsmi_get_processor_handles`` raises
            ``AmdSmiLibraryError(5)`` instead of returning handles.

    Returns:
        A ``types.SimpleNamespace`` carrying the fake exception classes,
        enum-like classes and query functions.
    """
    gpu_handles = (object(),) if handles is None else handles
    fake = types.SimpleNamespace()

    def expose(obj):
        # Publish a nested function/class on the namespace under its own name.
        setattr(fake, obj.__name__, obj)
        return obj

    # Exception types, published under the names the fake module exposes.
    fake.AmdSmiException = _BaseAmdSmiError
    fake.AmdSmiLibraryException = AmdSmiLibraryError
    fake.AmdSmiRetryException = AmdSmiRetryError
    fake.AmdSmiParameterException = AmdSmiParameterError
    fake.AmdSmiTimeoutException = AmdSmiTimeoutError

    @expose
    class AmdSmiInitFlags:
        INIT_AMD_GPUS = 1

    @expose
    class AmdSmiMemoryType:
        VRAM = 0
        VIS_VRAM = 1
        GTT = 2

    @expose
    def amdsmi_init(_flags):
        return None

    @expose
    def amdsmi_shut_down():
        return None

    @expose
    def amdsmi_get_lib_version():
        return lib_version

    @expose
    def amdsmi_get_rocm_version():
        return rocm_version

    @expose
    def amdsmi_get_processor_handles():
        if raise_on_handles:
            raise AmdSmiLibraryError(5)
        # A new list per call so callers cannot mutate the shared handles.
        return list(gpu_handles)

    @expose
    def amdsmi_get_gpu_device_bdf(h):
        return "0000:0b:00.0"

    @expose
    def amdsmi_get_gpu_device_uuid(h):
        return "GPU-UUID-123"

    @expose
    def amdsmi_get_gpu_kfd_info(h):
        return {
            "kfd_id": 7,
            "node_id": 3,
            "cpu_affinity": 0xFF,
            "current_partition_id": 0,
        }

    @expose
    def amdsmi_get_gpu_board_info(h):
        return {
            "vbios_name": "vbiosA",
            "vbios_build_date": "2024-01-01",
            "vbios_part_number": "PN123",
            "vbios_version": "V1",
            "model_number": "Board-42",
            "product_serial": "SN0001",
            "fru_id": "FRU-1",
            "product_name": "ExampleBoard",
            "manufacturer_name": "ACME",
        }

    @expose
    def amdsmi_get_gpu_asic_info(h):
        return {
            "market_name": "SomeGPU",
            "vendor_id": "1002",
            "vendor_name": "AMD",
            "subvendor_id": "1ABC",
            "device_id": "0x1234",
            "subsystem_id": "0x5678",
            "rev_id": "A1",
            "asic_serial": "ASERIAL",
            "oam_id": 0,
            "num_compute_units": 224,
            "target_graphics_version": "GFX940",
            "vram_type": "HBM3",
            "vram_vendor": "Micron",
            "vram_bit_width": 4096,
            "vram_size_bytes": 64 * 1024 * 1024 * 1024,
        }

    @expose
    def amdsmi_get_gpu_driver_info(h):
        return {
            "driver_name": "amdgpu",
            "driver_version": "6.1.0",
        }

    if pcie_static:

        @expose
        def amdsmi_get_pcie_info(h):
            return {
                "pcie_static": {
                    "max_pcie_width": 16,
                    "max_pcie_speed": 16000,
                    "pcie_interface_version": "PCIe 5.0",
                    "slot_type": "PCIe",
                }
            }

    @expose
    def amdsmi_get_gpu_cache_info(h):
        return {
            "cache": [
                {
                    "cache_level": 1,
                    "max_num_cu_shared": 8,
                    "num_cache_instance": 32,
                    "cache_size": 256 * 1024,
                    "cache_properties": "PropertyA, PropertyB; PropertyC",
                }
            ]
        }

    @expose
    def amdsmi_get_clk_freq(h, clk_type):
        return {
            "frequency": [500_000_000, 1_500_000_000, 2_000_000_000],
            "current": 1,
        }

    @expose
    def amdsmi_get_fw_info(h):
        return {
            "fw_list": [
                {"fw_name": "SMU", "fw_version": "55.33"},
                {"fw_name": "VBIOS", "fw_version": "V1"},
            ]
        }

    @expose
    def amdsmi_get_gpu_process_list(h):
        # Second entry reports every numeric field as a string.
        return [
            {
                "name": "python",
                "pid": 4242,
                "mem": 1024,
                "engine_usage": {"gfx": 1_000_000, "enc": 0},
                "memory_usage": {"gtt_mem": 0, "cpu_mem": 4096, "vram_mem": 2048},
                "cu_occupancy": 12,
            },
            {
                "name": "N/A",
                "pid": "9999",
                "mem": "0",
                "engine_usage": {"gfx": "0", "enc": "0"},
                "memory_usage": {"gtt_mem": "0", "cpu_mem": "0", "vram_mem": "0"},
                "cu_occupancy": "0",
            },
        ]

    @expose
    def amdsmi_get_gpu_memory_partition(h):
        return {"partition_type": "NPS1"}

    @expose
    def amdsmi_get_gpu_compute_partition(h):
        return {"partition_type": "CPX_DISABLED"}

    return fake
184+
185+
186+
@pytest.fixture
def install_fake_amdsmi(monkeypatch):
    """Register a default fake ``amdsmi`` module in ``sys.modules`` and return it.

    The entry is installed through ``monkeypatch`` so pytest undoes it at
    teardown.
    """
    stub_module = types.ModuleType("amdsmi")
    # Copy every attribute of the fake namespace onto a real module object.
    vars(stub_module).update(vars(make_fake_amdsmi()))
    monkeypatch.setitem(sys.modules, "amdsmi", stub_module)
    return stub_module
194+
195+
196+
@pytest.fixture
def collector(install_fake_amdsmi, conn_mock, system_info):
    """Return a PASSIVE-level ``AmdSmiCollector`` already bound to the fake ``amdsmi``."""
    amdsmi_collector = AmdSmiCollector(
        connection=conn_mock,
        system_info=system_info,
        system_interaction_level=SystemInteractionLevel.PASSIVE,
    )
    # Binding must succeed because install_fake_amdsmi has put the stub in place.
    bound = amdsmi_collector._bind_amdsmi_or_log()
    assert bound is True
    return amdsmi_collector
205+
206+
207+
def test_collect_data(collector):
    """collect_data() populates version, gpu_list, process and static sections from the fake."""
    _result, data = collector.collect_data()
    assert data is not None

    version = data.version
    assert version is not None
    assert version.tool == "amdsmi"

    # gpu_list
    gpus = data.gpu_list
    assert gpus is not None
    assert len(gpus) == 1
    first_gpu = gpus[0]
    assert first_gpu.bdf == "0000:0b:00.0"
    assert first_gpu.uuid == "GPU-UUID-123"

    # processes
    per_gpu_processes = data.process
    assert per_gpu_processes is not None
    assert len(per_gpu_processes) == 1
    assert len(per_gpu_processes[0].process_list) == 2

    # static
    static_entries = data.static
    assert static_entries is not None
    assert len(static_entries) == 1
    bus = static_entries[0].bus
    assert bus is not None
    assert bus.max_pcie_speed is not None
    assert float(bus.max_pcie_speed.value) == pytest.approx(16.0)
224+
225+
226+
def test_bind_failure(monkeypatch, conn_mock, system_info):
    """collect_data() yields no data and a NOT_RAN status when ``amdsmi`` cannot be imported."""

    def _raise_import_error(name, package=None):
        # Same signature as importlib.import_module so keyword calls still
        # reach the ImportError instead of a TypeError.
        raise ImportError("nope")

    monkeypatch.setattr(importlib, "import_module", _raise_import_error)
    # Drop any cached module via monkeypatch so the removal is undone at
    # teardown and does not leak into later tests.
    monkeypatch.delitem(sys.modules, "amdsmi", raising=False)

    c = AmdSmiCollector(
        system_info=system_info,
        system_interaction_level=SystemInteractionLevel.PASSIVE,
        connection=conn_mock,
    )
    result, data = c.collect_data()
    assert data is None
    assert result.status.name == "NOT_RAN"
240+
241+
242+
def test_handles_exception(monkeypatch, collector):
    """Getters degrade to empty/None results when listing processor handles raises."""

    def is_none_or_empty_list(value):
        return value is None or value == []

    # Swap in a fake whose amdsmi_get_processor_handles always raises.
    failing_module = types.ModuleType("amdsmi")
    vars(failing_module).update(vars(make_fake_amdsmi(raise_on_handles=True)))
    monkeypatch.setitem(sys.modules, "amdsmi", failing_module)
    collector._amdsmi = failing_module

    gpu_list = collector.get_gpu_list()
    assert is_none_or_empty_list(gpu_list)

    processes = collector.get_process()
    assert is_none_or_empty_list(processes)

    # get_partition is only required to return something here.
    partition = collector.get_partition()
    assert partition is not None

    firmware = collector.get_firmware()
    assert is_none_or_empty_list(firmware)

    static = collector.get_static()
    assert is_none_or_empty_list(static)
264+
265+
266+
def test_partition(collector, install_fake_amdsmi):
    """get_partition() accepts partition queries that return plain strings."""

    def memory_partition(h):
        return "NPS2"

    def compute_partition(h):
        return "CPX_ENABLED"

    install_fake_amdsmi.amdsmi_get_gpu_memory_partition = memory_partition
    install_fake_amdsmi.amdsmi_get_gpu_compute_partition = compute_partition

    partition = collector.get_partition()
    assert partition is not None
    assert len(partition.memory_partition) == 1
    assert len(partition.compute_partition) == 1
    assert partition.memory_partition[0].partition_type == "NPS2"
    assert partition.compute_partition[0].partition_type == "CPX_ENABLED"
275+
276+
277+
def test_pcie(collector, install_fake_amdsmi):
    """get_static() still returns a bus entry when amdsmi_get_pcie_info is absent."""
    try:
        delattr(install_fake_amdsmi, "amdsmi_get_pcie_info")
    except AttributeError:
        # Already missing, which is the state this test wants.
        pass

    static_entries = collector.get_static()
    assert static_entries is not None
    assert len(static_entries) == 1

    bus = static_entries[0].bus
    assert bus is not None

    max_speed = bus.max_pcie_speed
    if max_speed is not None:
        assert max_speed.unit == "GT/s"
285+
286+
287+
def test_cache(collector):
    """Static cache info gets a ``Label_``-prefixed name and split-out property names."""
    first_cache = collector.get_static()[0].cache_info[0]

    label = first_cache.cache.value
    assert isinstance(label, str)
    assert label.startswith("Label_")

    assert first_cache.cache_properties
    expected_properties = {"PropertyA", "PropertyB", "PropertyC"}
    assert expected_properties <= set(first_cache.cache_properties)
293+
294+
295+
def test_process_list(collector):
    """get_process() parses both the numeric and the string-valued fake process entries."""
    procs = collector.get_process()
    assert procs
    assert procs[0].process_list

    entries = procs[0].process_list

    numeric_proc = entries[0].process_info
    assert numeric_proc.pid == 4242
    assert numeric_proc.mem is not None
    assert numeric_proc.mem.unit == "B"
    assert numeric_proc.usage.gfx is not None
    assert numeric_proc.usage.gfx.unit == "ns"

    # The second fake entry supplies its pid as the string "9999".
    string_proc = entries[1].process_info
    assert string_proc.name == "N/A"
    assert isinstance(string_proc.pid, int)
305+
306+
307+
def test_smi_try(monkeypatch, install_fake_amdsmi, collector):
    """get_partition() still yields one memory-partition entry when that query raises."""

    def raise_not_supported(*_args, **_kwargs):
        raise AmdSmiLibraryError(2)  # NOT_SUPPORTED

    install_fake_amdsmi.amdsmi_get_gpu_memory_partition = raise_not_supported

    partition = collector.get_partition()
    assert partition is not None
    assert len(partition.memory_partition) == 1

0 commit comments

Comments
 (0)