Skip to content

Commit c3f229c

Browse files
Merge pull request #68 from amd/alex_amdsmi3
AmdSmiPlugin: AmdSmiTstData, XgmiMetrics + Cper
2 parents e3f5563 + 3a703b3 commit c3f229c

File tree

10 files changed

+823
-43
lines changed

10 files changed

+823
-43
lines changed

nodescraper/cli/cli.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@ def process_args(
334334
plugin_arg_index = -1
335335

336336
plugin_arg_map = {}
337+
invalid_plugins = []
337338
if plugin_arg_index != -1 and plugin_arg_index != len(raw_arg_input) - 1:
338339
top_level_args = raw_arg_input[: plugin_arg_index + 1]
339340
plugin_args = raw_arg_input[plugin_arg_index + 1 :]
@@ -344,12 +345,26 @@ def process_args(
344345
else:
345346
cur_plugin = None
346347
for arg in plugin_args:
347-
if arg in plugin_names:
348+
# Handle comma-separated plugin names (but not arguments)
349+
if not arg.startswith("-") and "," in arg:
350+
# Split comma-separated plugin names
351+
for potential_plugin in arg.split(","):
352+
potential_plugin = potential_plugin.strip()
353+
if potential_plugin in plugin_names:
354+
plugin_arg_map[potential_plugin] = []
355+
cur_plugin = potential_plugin
356+
elif potential_plugin:
357+
# Track invalid plugin names to log event later
358+
invalid_plugins.append(potential_plugin)
359+
elif arg in plugin_names:
348360
plugin_arg_map[arg] = []
349361
cur_plugin = arg
350362
elif cur_plugin:
351363
plugin_arg_map[cur_plugin].append(arg)
352-
return (top_level_args, plugin_arg_map)
364+
elif not arg.startswith("-"):
365+
# Track invalid plugin names to log event later
366+
invalid_plugins.append(arg)
367+
return (top_level_args, plugin_arg_map, invalid_plugins)
353368

354369

355370
def main(arg_input: Optional[list[str]] = None):
@@ -367,7 +382,9 @@ def main(arg_input: Optional[list[str]] = None):
367382
parser, plugin_subparser_map = build_parser(plugin_reg, config_reg)
368383

369384
try:
370-
top_level_args, plugin_arg_map = process_args(arg_input, list(plugin_subparser_map.keys()))
385+
top_level_args, plugin_arg_map, invalid_plugins = process_args(
386+
arg_input, list(plugin_subparser_map.keys())
387+
)
371388

372389
parsed_args = parser.parse_args(top_level_args)
373390
system_info = get_system_info(parsed_args)
@@ -387,6 +404,13 @@ def main(arg_input: Optional[list[str]] = None):
387404
if log_path:
388405
logger.info("Log path: %s", log_path)
389406

407+
# Log warning if invalid plugin names were provided
408+
if invalid_plugins:
409+
logger.warning(
410+
"Invalid plugin name(s) ignored: %s. Use 'describe plugin' to list available plugins.",
411+
", ".join(invalid_plugins),
412+
)
413+
390414
if parsed_args.subcmd == "summary":
391415
generate_summary(parsed_args.search_path, parsed_args.output_path, logger)
392416
sys.exit(0)

nodescraper/pluginexecutor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,12 @@ def run_queue(self) -> list[PluginResult]:
173173
global_run_args = self.apply_global_args_to_plugin(
174174
plugin_inst, plugin_class, self.plugin_config.global_args
175175
)
176+
# Merge analysis_args and collection_args
177+
for args_key in ["analysis_args", "collection_args"]:
178+
if args_key in global_run_args and args_key in run_payload:
179+
# Merge: global args override plugin-specific args keys specified in both global and plugin-specific args
180+
run_payload[args_key].update(global_run_args[args_key])
181+
del global_run_args[args_key]
176182
run_payload.update(global_run_args)
177183
except ValueError as ve:
178184
self.logger.error(

nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py

Lines changed: 123 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@
2323
# SOFTWARE.
2424
#
2525
###############################################################################
26+
import io
2627
from collections import defaultdict
27-
from typing import Any, Dict, List, Optional, Union
28+
from typing import Any, Optional, Union
2829

2930
from nodescraper.enums import EventCategory, EventPriority
3031
from nodescraper.interfaces import DataAnalyzer
@@ -34,16 +35,19 @@
3435
AmdSmiDataModel,
3536
AmdSmiMetric,
3637
AmdSmiStatic,
38+
AmdSmiTstData,
3739
EccData,
3840
Fw,
3941
Partition,
4042
Processes,
43+
XgmiMetrics,
4144
)
4245
from .analyzer_args import AmdSmiAnalyzerArgs
46+
from .cper import CperAnalysisTaskMixin
4347

4448

45-
class AmdSmiAnalyzer(DataAnalyzer[AmdSmiDataModel, None]):
46-
""""""
49+
class AmdSmiAnalyzer(CperAnalysisTaskMixin, DataAnalyzer[AmdSmiDataModel, None]):
50+
"""Check AMD SMI Application data for PCIe, ECC errors, CPER data, and analyze amdsmitst metrics"""
4751

4852
DATA_MODEL = AmdSmiDataModel
4953

@@ -441,7 +445,7 @@ def check_static_data(
441445

442446
mismatches: list[tuple[int, str, str, str]] = []
443447

444-
expected_data: Dict[str, Optional[str]] = {
448+
expected_data: dict[str, Optional[str]] = {
445449
"vendor_id": vendor_id,
446450
"subvendor_id": subvendor_id,
447451
"vendor_name": "Advanced Micro Devices Inc",
@@ -500,24 +504,24 @@ def check_static_data(
500504

501505
def _format_static_mismatch_payload(
502506
self,
503-
mismatches: List[tuple[int, str, str, str]],
504-
) -> Dict[str, Any]:
507+
mismatches: list[tuple[int, str, str, str]],
508+
) -> dict[str, Any]:
505509
"""Helper function for pretty printing mismatch in expected data
506510
507511
Args:
508-
mismatches (List[tuple[int, str, str, str]]): mismatched data per GPU
512+
mismatches (list[tuple[int, str, str, str]]): mismatched data per GPU
509513
510514
Returns:
511-
Dict[str, Any]: dict of mismatched data per GPU
515+
dict[str, Any]: dict of mismatched data per GPU
512516
"""
513-
per_gpu: Dict[int, List[Dict[str, str]]] = defaultdict(list)
517+
per_gpu: dict[int, list[dict[str, str]]] = defaultdict(list)
514518
field_set: set[str] = set()
515519

516520
for gpu, field, expected, actual in mismatches:
517521
field_set.add(field)
518522
per_gpu[gpu].append({"field": field, "expected": expected, "actual": actual})
519523

520-
per_gpu_list: List[Dict[str, Any]] = [
524+
per_gpu_list: list[dict[str, Any]] = [
521525
{"gpu": gpu, "mismatches": entries}
522526
for gpu, entries in sorted(per_gpu.items(), key=lambda kv: kv[0])
523527
]
@@ -635,6 +639,97 @@ def check_expected_memory_partition_mode(
635639
},
636640
)
637641

642+
def check_expected_xgmi_link_speed(
    self,
    xgmi_metric: Optional[list[XgmiMetrics]],
    expected_xgmi_speed: Optional[list[float]] = None,
):
    """Check the XGMI link speed of every GPU against the expected speeds.

    A WARNING event is logged (and the check is skipped) when either the
    XGMI metrics or the expected speeds are missing or empty. Otherwise an
    ERROR event is logged for each GPU whose bit rate is missing, cannot be
    converted to a float, or is not one of the expected speeds.

    Args:
        xgmi_metric (Optional[list[XgmiMetrics]]): XGMI metrics data
        expected_xgmi_speed (Optional[list[float]]): List of expected XGMI speeds (GT/s)
    """
    if not xgmi_metric:
        self._log_event(
            category=EventCategory.IO,
            description="XGMI link speed data is not available and cannot be checked",
            priority=EventPriority.WARNING,
            data={"xgmi_metric": xgmi_metric},
        )
        return

    if not expected_xgmi_speed:
        self._log_event(
            category=EventCategory.IO,
            description="Expected XGMI speed not configured, skipping XGMI link speed check",
            priority=EventPriority.WARNING,
        )
        return

    for xgmi_data in xgmi_metric:
        bit_rate = xgmi_data.link_metrics.bit_rate

        if bit_rate is None or bit_rate.value is None:
            self._log_event(
                category=EventCategory.IO,
                description="XGMI link speed is not available",
                priority=EventPriority.ERROR,
                data={
                    "gpu": xgmi_data.gpu,
                    # No value exists here, so the unit (if any) is recorded instead.
                    "xgmi_bit_rate": bit_rate.unit if bit_rate else "N/A",
                },
            )
            continue

        # Only the conversion can fail; float() raises ValueError for
        # unparsable strings and TypeError for non-numeric objects, and both
        # must be reported as an event rather than aborting the analysis.
        try:
            xgmi_float = float(bit_rate.value)
        except (TypeError, ValueError):
            self._log_event(
                category=EventCategory.IO,
                description="XGMI link speed is not a valid number",
                priority=EventPriority.ERROR,
                data={
                    "gpu": xgmi_data.gpu,
                    "xgmi_bit_rate": bit_rate.value if bit_rate else "N/A",
                },
            )
            continue

        if xgmi_float not in expected_xgmi_speed:
            self._log_event(
                category=EventCategory.IO,
                description="XGMI link speed is not as expected",
                priority=EventPriority.ERROR,
                data={
                    "gpu": xgmi_data.gpu,
                    "xgmi_bit_rate": xgmi_float,
                    "expected_xgmi_speed": expected_xgmi_speed,
                },
                console_log=True,
            )
714+
715+
def check_amdsmitst(self, amdsmitst_data: AmdSmiTstData):
    """Report failed amdsmitst tests as an error event.

    Nothing is logged when the failed test count is zero.

    Args:
        amdsmitst_data (AmdSmiTstData): AMD SMI test data
    """
    failed_count = amdsmitst_data.failed_test_count
    if failed_count <= 0:
        # No failures recorded, so there is nothing to report.
        return

    failure_details = {
        "failed_test_count": failed_count,
        "failed_tests": amdsmitst_data.failed_tests,
    }
    self._log_event(
        category=EventCategory.APPLICATION,
        description=f"{failed_count} failed tests running amdsmitst",
        priority=EventPriority.ERROR,
        data=failure_details,
        console_log=True,
    )
732+
638733
def analyze_data(
639734
self, data: AmdSmiDataModel, args: Optional[AmdSmiAnalyzerArgs] = None
640735
) -> TaskResult:
@@ -705,4 +800,22 @@ def analyze_data(
705800
if args.expected_pldm_version:
706801
self.check_pldm_version(data.firmware, args.expected_pldm_version)
707802

803+
if data.cper_data:
804+
self.analyzer_cpers(
805+
{
806+
file_model_obj.file_name: io.BytesIO(file_model_obj.file_contents)
807+
for file_model_obj in data.cper_data
808+
},
809+
analysis_range_start=args.analysis_range_start,
810+
analysis_range_end=args.analysis_range_end,
811+
)
812+
813+
if data.xgmi_metric and len(data.xgmi_metric) > 0:
814+
self.check_expected_xgmi_link_speed(
815+
data.xgmi_metric, expected_xgmi_speed=args.expected_xgmi_speed
816+
)
817+
818+
if data.amdsmitst_data and data.amdsmitst_data.failed_test_count > 0:
819+
self.check_amdsmitst(data.amdsmitst_data)
820+
708821
return self.result

0 commit comments

Comments
 (0)