2424#
2525###############################################################################
2626import importlib
27- from typing import cast
2827
2928from pydantic import ValidationError
3029
3130from nodescraper .base .inbandcollectortask import InBandDataCollector
32- from nodescraper .connection .inband .inband import CommandArtifact
3331from nodescraper .enums import EventCategory , EventPriority , ExecutionStatus , OSFamily
3432from nodescraper .models import TaskResult
3533from nodescraper .plugins .inband .amdsmi .amdsmidata import (
6664class AmdSmiCollector (InBandDataCollector [AmdSmiDataModel , None ]):
6765 """class for collection of inband tool amd-smi data."""
6866
69- AMD_SMI_EXE = "amd-smi"
70-
7167 SUPPORTED_OS_FAMILY : set [OSFamily ] = {OSFamily .LINUX }
7268
7369 DATA_MODEL = AmdSmiDataModel
@@ -113,7 +109,7 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None:
113109 try :
114110 version = self ._get_amdsmi_version ()
115111 processes = self .get_process ()
116- partition = self ._get_partition ()
112+ partition = self .get_partition ()
117113 firmware = self .get_firmware ()
118114 gpu_list = self .get_gpu_list ()
119115 statics = self .get_static ()
@@ -168,26 +164,6 @@ def _get_amdsmi_version(self) -> AmdSmiVersion | None:
168164 rocm_version = rocm_ver ,
169165 )
170166
171- def _run_amd_smi (self , cmd : str , sudo : bool = False ) -> str | None :
172- """Run amd-smi command"""
173- cmd_ret : CommandArtifact = self ._run_sut_cmd (f"{ self .AMD_SMI_EXE } { cmd } " , sudo = sudo )
174-
175- if cmd_ret .exit_code != 0 :
176- self ._log_event (
177- category = EventCategory .APPLICATION ,
178- description = "Error running amd-smi command" ,
179- data = {
180- "command" : cmd ,
181- "exit_code" : cmd_ret .exit_code ,
182- "stderr" : cmd_ret .stderr ,
183- },
184- priority = EventPriority .ERROR ,
185- console_log = True ,
186- )
187- return None
188-
189- return cmd_ret .stdout or ""
190-
191167 def get_gpu_list (self ) -> list [AmdSmiListItem ] | None :
192168 devices = self ._get_handles ()
193169 out : list [AmdSmiListItem ] = []
@@ -238,48 +214,68 @@ def _to_int(x, default=0):
238214 def get_process (self ) -> list [Processes ] | None :
239215 devices = self ._get_handles ()
240216 out : list [Processes ] = []
217+
241218 for idx , h in enumerate (devices ):
242219 try :
243- pids = self ._amdsmi .amdsmi_get_gpu_process_list (h ) or []
220+ raw_list = (
221+ self ._smi_try (self ._amdsmi .amdsmi_get_gpu_process_list , h , default = []) or []
222+ )
244223 plist : list [ProcessListItem ] = []
245224
246- for pid in pids :
247- pinfo = self ._smi_try (
248- self ._amdsmi .amdsmi_get_gpu_compute_process_info , h , pid , default = None
249- )
250- if not isinstance (pinfo , dict ):
251- plist .append (ProcessListItem (process_info = str (pid )))
225+ for entry in raw_list :
226+ if not isinstance (entry , dict ):
227+ plist .append (ProcessListItem (process_info = str (entry )))
252228 continue
253229
254- plist .append (
255- ProcessListItem (
256- process_info = cast (
257- ProcessInfo ,
258- {
259- "name" : pinfo .get ("name" , str (pid )),
260- "pid" : int (pid ),
261- "memory_usage" : {
262- "gtt_mem" : ValueUnit (
263- value = pinfo .get ("gtt_mem" , 0 ), unit = "B"
264- ),
265- "cpu_mem" : ValueUnit (
266- value = pinfo .get ("cpu_mem" , 0 ), unit = "B"
267- ),
268- "vram_mem" : ValueUnit (
269- value = pinfo .get ("vram_mem" , 0 ), unit = "B"
270- ),
271- },
272- "mem_usage" : ValueUnit (
273- value = pinfo .get ("vram_mem" , 0 ), unit = "B"
274- ),
275- "usage" : {
276- "gfx" : ValueUnit (value = pinfo .get ("gfx" , 0 ), unit = "%" ),
277- "enc" : ValueUnit (value = pinfo .get ("enc" , 0 ), unit = "%" ),
278- },
279- },
230+ name = entry .get ("name" , "N/A" )
231+ pid_val = entry .get ("pid" , 0 )
232+ try :
233+ pid = int (pid_val ) if pid_val not in (None , "" ) else 0
234+ except Exception :
235+ pid = 0
236+
237+ mem_vu = self ._vu (entry .get ("mem" ), "B" )
238+ mu = entry .get ("memory_usage" ) or {}
239+ mem_usage = {
240+ "gtt_mem" : self ._vu (mu .get ("gtt_mem" ), "B" ),
241+ "cpu_mem" : self ._vu (mu .get ("cpu_mem" ), "B" ),
242+ "vram_mem" : self ._vu (mu .get ("vram_mem" ), "B" ),
243+ }
244+
245+ eu = entry .get ("engine_usage" ) or {}
246+ usage = {
247+ "gfx" : self ._vu (eu .get ("gfx" ), "ns" ),
248+ "enc" : self ._vu (eu .get ("enc" ), "ns" ),
249+ }
250+
251+ cu_occ = self ._vu (entry .get ("cu_occupancy" ), "" )
252+
253+ try :
254+ plist .append (
255+ ProcessListItem (
256+ process_info = ProcessInfo (
257+ name = str (name ),
258+ pid = pid ,
259+ mem = mem_vu ,
260+ memory_usage = mem_usage ,
261+ usage = usage ,
262+ cu_occupancy = cu_occ ,
263+ )
280264 )
281265 )
282- )
266+ except ValidationError as e :
267+ self ._log_event (
268+ category = EventCategory .APPLICATION ,
269+ description = "Failed to build ProcessListItem; skipping entry" ,
270+ data = {
271+ "exception" : get_exception_traceback (e ),
272+ "gpu_index" : idx ,
273+ "entry" : repr (entry ),
274+ },
275+ priority = EventPriority .WARNING ,
276+ )
277+ continue
278+
283279 try :
284280 out .append (Processes (gpu = idx , process_list = plist ))
285281 except ValidationError as e :
@@ -296,36 +292,71 @@ def get_process(self) -> list[Processes] | None:
296292 data = {"exception" : get_exception_traceback (e ), "gpu_index" : idx },
297293 priority = EventPriority .WARNING ,
298294 )
295+
299296 return out
300297
301- def _get_partition (self ) -> Partition | None :
298+ def get_partition (self ) -> Partition | None :
302299 devices = self ._get_handles ()
303300 current : list [PartitionCurrent ] = []
304301 memparts : list [PartitionMemory ] = []
305- resources : list [dict ] = [] # keep as-is if your model allows
302+ resources : list [dict ] = []
306303
307304 for idx , h in enumerate (devices ):
305+ # compute
308306 c = self ._smi_try (self ._amdsmi .amdsmi_get_gpu_compute_partition , h , default = {}) or {}
309- m = self ._smi_try (self ._amdsmi .amdsmi_get_gpu_memory_partition , h , default = {}) or {}
310307 c_dict = c if isinstance (c , dict ) else {}
308+
309+ # memory
310+ m = self ._smi_try (self ._amdsmi .amdsmi_get_gpu_memory_partition , h , default = {}) or {}
311311 m_dict = m if isinstance (m , dict ) else {}
312312
313- current .append (
314- PartitionCurrent (
315- gpu_id = idx ,
316- memory = c_dict .get ("memory" ),
317- accelerator_type = c_dict .get ("accelerator_type" ),
318- accelerator_profile_index = c_dict .get ("accelerator_profile_index" ),
319- partition_id = c_dict .get ("partition_id" ),
313+ prof_list : list [dict ] = (
314+ []
315+ ) # amdsmi_get_gpu_accelerator_partition_profile -> currently not supported
316+
317+ try :
318+ current .append (
319+ PartitionCurrent (
320+ gpu_id = idx ,
321+ memory = c_dict .get ("memory" ),
322+ accelerator_type = c_dict .get ("accelerator_type" ),
323+ accelerator_profile_index = c_dict .get ("accelerator_profile_index" ),
324+ partition_id = c_dict .get ("partition_id" ),
325+ )
320326 )
321- )
322- memparts .append (
323- PartitionMemory (
324- gpu_id = idx ,
325- memory_partition_caps = m_dict .get ("memory_partition_caps" ),
326- current_partition_id = m_dict .get ("current_partition_id" ),
327+ except ValidationError as e :
328+ self ._log_event (
329+ category = EventCategory .APPLICATION ,
330+ description = "Failed to build PartitionCurrent" ,
331+ data = {
332+ "exception" : get_exception_traceback (e ),
333+ "gpu_index" : idx ,
334+ "data" : c_dict ,
335+ },
336+ priority = EventPriority .WARNING ,
327337 )
328- )
338+
339+ try :
340+ memparts .append (
341+ PartitionMemory (
342+ gpu_id = idx ,
343+ memory_partition_caps = m_dict .get ("memory_partition_caps" ),
344+ current_partition_id = m_dict .get ("current_partition_id" ),
345+ )
346+ )
347+ except ValidationError as e :
348+ self ._log_event (
349+ category = EventCategory .APPLICATION ,
350+ description = "Failed to build PartitionMemory" ,
351+ data = {
352+ "exception" : get_exception_traceback (e ),
353+ "gpu_index" : idx ,
354+ "data" : m_dict ,
355+ },
356+ priority = EventPriority .WARNING ,
357+ )
358+
359+ resources .append ({"gpu_id" : idx , "profiles" : []})
329360
330361 try :
331362 return Partition (
@@ -461,21 +492,6 @@ def _nz(val: object, default: str = "unknown") -> str:
461492 s = str (val ).strip () if val is not None else ""
462493 return s if s and s .upper () != "N/A" else default
463494
464- def _vu (val : object , unit : str ) -> ValueUnit | None :
465- """Build ValueUnit from mixed numeric/string input, else None."""
466- if val in (None , "" , "N/A" ):
467- return None
468- try :
469- if isinstance (val , str ):
470- v = float (val ) if any (ch in val for ch in ".eE" ) else int (val )
471- elif isinstance (val , float ):
472- v = val
473- else :
474- v = int (val )
475- except Exception :
476- return None
477- return ValueUnit (value = v , unit = unit )
478-
479495 pcie_fn = getattr (self ._amdsmi , "amdsmi_get_pcie_info" , None )
480496
481497 out : list [AmdSmiStatic ] = []
@@ -496,8 +512,8 @@ def _vu(val: object, unit: str) -> ValueUnit | None:
496512 pcie_ver = p .get ("pcie_version" ) or p .get ("pcie_interface_version" )
497513 bus = StaticBus (
498514 bdf = bdf ,
499- max_pcie_width = _vu (max_w , "x" ),
500- max_pcie_speed = _vu (max_s , "GT/s" ),
515+ max_pcie_width = self . _vu (max_w , "x" ),
516+ max_pcie_speed = self . _vu (max_s , "GT/s" ),
501517 pcie_interface_version = _nz (pcie_ver ),
502518 slot_type = _nz (p .get ("slot_type" )),
503519 )
@@ -602,8 +618,8 @@ def _vu(val: object, unit: str) -> ValueUnit | None:
602618 vram_model = StaticVram (
603619 type = vram_type ,
604620 vendor = None if vram_vendor in (None , "" , "N/A" ) else str (vram_vendor ),
605- size = _vu (vram_size_b , "B" ),
606- bit_width = _vu (vram_bits , "bit" ),
621+ size = self . _vu (vram_size_b , "B" ),
622+ bit_width = self . _vu (vram_bits , "bit" ),
607623 max_bandwidth = None ,
608624 )
609625
@@ -757,28 +773,6 @@ def _get_cache_info(self, h) -> list[StaticCacheInfoItem]:
757773
758774 items = raw if isinstance (raw , list ) else [raw ]
759775
760- def _to_num (v ) -> float | int | None :
761- if isinstance (v , (int , float )):
762- return v
763- if isinstance (v , str ):
764- s = v .strip ()
765- try :
766- return int (s )
767- except Exception :
768- try :
769- return float (s )
770- except Exception :
771- return None
772- return None
773-
774- def _vu_req (v ) -> ValueUnit :
775- n = _to_num (v )
776- return ValueUnit (value = 0 if n is None else n , unit = "" )
777-
778- def _vu_opt (v ) -> ValueUnit | None :
779- n = _to_num (v )
780- return None if n is None else ValueUnit (value = n , unit = "" )
781-
782776 def _as_list_str (v ) -> list [str ]:
783777 if isinstance (v , list ):
784778 return [str (x ) for x in v ]
@@ -792,10 +786,10 @@ def _as_list_str(v) -> list[str]:
792786 if not isinstance (e , dict ):
793787 continue
794788
795- cache_level = _vu_req (e .get ("cache_level" ))
796- max_num_cu_shared = _vu_req (e .get ("max_num_cu_shared" ))
797- num_cache_instance = _vu_req (e .get ("num_cache_instance" ))
798- cache_size = _vu_opt (e .get ("cache_size" ))
789+ cache_level = self . _vu (e .get ("cache_level" ), "" , required = True )
790+ max_num_cu_shared = self . _vu (e .get ("max_num_cu_shared" ), "" , required = True )
791+ num_cache_instance = self . _vu (e .get ("num_cache_instance" ), "" , required = True )
792+ cache_size = self . _vu (e .get ("cache_size" ), "" , required = False )
799793 cache_props = _as_list_str (e .get ("cache_properties" ))
800794
801795 # AMDSMI doesn’t give a name , "Lable_<level>" as the label???
@@ -824,10 +818,8 @@ def _as_list_str(v) -> list[str]:
824818
825819 return out
826820
827-
828821 def _get_clock (self , h ) -> StaticClockData | None :
829- """
830- """
822+ """ """
831823 fn = getattr (self ._amdsmi , "amdsmi_get_clk_freq" , None )
832824 clk_type = getattr (self ._amdsmi , "AmdSmiClkType" , None )
833825 if not callable (fn ) or clk_type is None or not hasattr (clk_type , "SYS" ):
@@ -868,8 +860,6 @@ def _fmt(n: int | None) -> str | None:
868860 except ValidationError :
869861 return None
870862
871-
872-
873863 def collect_data (
874864 self ,
875865 args = None ,
@@ -902,3 +892,26 @@ def collect_data(
902892 self ._amdsmi .amdsmi_shut_down ()
903893 except Exception :
904894 pass
895+
896+ def _vu (self , v : object , unit : str , * , required : bool = False ) -> ValueUnit | None :
897+ """
898+ Build ValueUnit from mixed numeric/string input.
899+ Returns:
900+ None for None/''/'N/A' unless required=True, in which case ValueUnit(0, unit).
901+ """
902+ if v in (None , "" , "N/A" ):
903+ return ValueUnit (value = 0 , unit = unit ) if required else None
904+ try :
905+ if isinstance (v , str ):
906+ s = v .strip ()
907+ try :
908+ n = int (s )
909+ except Exception :
910+ n = float (s )
911+ elif isinstance (v , (int , float )):
912+ n = v
913+ else :
914+ n = int (v )
915+ except Exception :
916+ return ValueUnit (value = 0 , unit = unit ) if required else None
917+ return ValueUnit (value = n , unit = unit )
0 commit comments