@@ -100,15 +100,33 @@ def _run_amd_smi(self, cmd: str) -> Optional[str]:
100100 """
101101 cmd_ret = self ._run_sut_cmd (f"{ self .AMD_SMI_EXE } { cmd } " )
102102
103- # Check for known warnings that can be ignored
103+ # Check for known warnings and errors that can be handled
104104 is_group_warning = (
105105 "User is missing the following required groups" in cmd_ret .stderr
106106 or "User is missing the following required groups" in cmd_ret .stdout
107107 )
108108
109+ # Check for known amd-smi internal bugs
110+ is_amdsmi_internal_error = any (
111+ pattern in cmd_ret .stderr for pattern in ["KeyError:" , "AttributeError:" , "IndexError:" ]
112+ )
113+
109114 # Log warning if user is missing group
110115 if cmd_ret .stderr != "" or cmd_ret .exit_code != 0 :
111- if not is_group_warning :
116+ if is_amdsmi_internal_error :
117+ self ._log_event (
118+ category = EventCategory .SW_DRIVER ,
119+ description = "amd-smi internal error detected" ,
120+ data = {
121+ "command" : cmd ,
122+ "exit_code" : cmd_ret .exit_code ,
123+ "stderr" : cmd_ret .stderr ,
124+ },
125+ priority = EventPriority .WARNING ,
126+ console_log = True ,
127+ )
128+ return None
129+ elif not is_group_warning :
112130 self ._log_event (
113131 category = EventCategory .APPLICATION ,
114132 description = "Error running amd-smi command" ,
@@ -595,7 +613,23 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]:
595613 """
596614 ret = self ._run_amd_smi_dict ("static -g all" )
597615 if not ret :
598- return []
616+ self .logger .info ("Bulk static query failed, attempting per-GPU fallback" )
617+ gpu_list = self .get_gpu_list ()
618+ if gpu_list :
619+ fallback_data : list [dict ] = []
620+ for gpu in gpu_list :
621+ gpu_data = self ._run_amd_smi_dict (f"static -g { gpu .gpu } " )
622+ if gpu_data :
623+ if isinstance (gpu_data , dict ):
624+ fallback_data .append (gpu_data )
625+ elif isinstance (gpu_data , list ):
626+ fallback_data .extend (gpu_data )
627+ if fallback_data :
628+ ret = fallback_data
629+ else :
630+ return []
631+ else :
632+ return []
599633
600634 if isinstance (ret , dict ) and "gpu_data" in ret :
601635 ret = ret ["gpu_data" ]
0 commit comments