|
23 | 23 | # SOFTWARE. |
24 | 24 | # |
25 | 25 | ############################################################################### |
| 26 | +import io |
26 | 27 | from collections import defaultdict |
27 | | -from typing import Any, Dict, List, Optional, Union |
| 28 | +from typing import Any, Optional, Union |
28 | 29 |
|
29 | 30 | from nodescraper.enums import EventCategory, EventPriority |
30 | 31 | from nodescraper.interfaces import DataAnalyzer |
|
34 | 35 | AmdSmiDataModel, |
35 | 36 | AmdSmiMetric, |
36 | 37 | AmdSmiStatic, |
| 38 | + AmdSmiTstData, |
37 | 39 | EccData, |
38 | 40 | Fw, |
39 | 41 | Partition, |
40 | 42 | Processes, |
| 43 | + XgmiMetrics, |
41 | 44 | ) |
42 | 45 | from .analyzer_args import AmdSmiAnalyzerArgs |
| 46 | +from .cper import CperAnalysisTaskMixin |
43 | 47 |
|
44 | 48 |
|
45 | | -class AmdSmiAnalyzer(DataAnalyzer[AmdSmiDataModel, None]): |
46 | | - """""" |
| 49 | +class AmdSmiAnalyzer(CperAnalysisTaskMixin, DataAnalyzer[AmdSmiDataModel, None]): |
| 50 | + """Check AMD SMI Application data for PCIe, ECC errors, CPER data, and analyze amdsmitst metrics""" |
47 | 51 |
|
48 | 52 | DATA_MODEL = AmdSmiDataModel |
49 | 53 |
|
@@ -441,7 +445,7 @@ def check_static_data( |
441 | 445 |
|
442 | 446 | mismatches: list[tuple[int, str, str, str]] = [] |
443 | 447 |
|
444 | | - expected_data: Dict[str, Optional[str]] = { |
| 448 | + expected_data: dict[str, Optional[str]] = { |
445 | 449 | "vendor_id": vendor_id, |
446 | 450 | "subvendor_id": subvendor_id, |
447 | 451 | "vendor_name": "Advanced Micro Devices Inc", |
@@ -500,24 +504,24 @@ def check_static_data( |
500 | 504 |
|
501 | 505 | def _format_static_mismatch_payload( |
502 | 506 | self, |
503 | | - mismatches: List[tuple[int, str, str, str]], |
504 | | - ) -> Dict[str, Any]: |
| 507 | + mismatches: list[tuple[int, str, str, str]], |
| 508 | + ) -> dict[str, Any]: |
505 | 509 | """Helper function for pretty printing mismatch in expected data |
506 | 510 |
|
507 | 511 | Args: |
508 | | - mismatches (List[tuple[int, str, str, str]]): mismatched data per GPU |
| 512 | + mismatches (list[tuple[int, str, str, str]]): mismatched data per GPU |
509 | 513 |
|
510 | 514 | Returns: |
511 | | - Dict[str, Any]: dict of mismatched data per GPU |
| 515 | + dict[str, Any]: dict of mismatched data per GPU |
512 | 516 | """ |
513 | | - per_gpu: Dict[int, List[Dict[str, str]]] = defaultdict(list) |
| 517 | + per_gpu: dict[int, list[dict[str, str]]] = defaultdict(list) |
514 | 518 | field_set: set[str] = set() |
515 | 519 |
|
516 | 520 | for gpu, field, expected, actual in mismatches: |
517 | 521 | field_set.add(field) |
518 | 522 | per_gpu[gpu].append({"field": field, "expected": expected, "actual": actual}) |
519 | 523 |
|
520 | | - per_gpu_list: List[Dict[str, Any]] = [ |
| 524 | + per_gpu_list: list[dict[str, Any]] = [ |
521 | 525 | {"gpu": gpu, "mismatches": entries} |
522 | 526 | for gpu, entries in sorted(per_gpu.items(), key=lambda kv: kv[0]) |
523 | 527 | ] |
@@ -635,6 +639,97 @@ def check_expected_memory_partition_mode( |
635 | 639 | }, |
636 | 640 | ) |
637 | 641 |
|
def check_expected_xgmi_link_speed(
    self,
    xgmi_metric: Optional[list[XgmiMetrics]],
    expected_xgmi_speed: Optional[list[float]] = None,
) -> None:
    """Check the XGMI link speed reported for each GPU against the expected speeds.

    Logs a WARNING event and returns early when either the XGMI metrics or the
    expected speeds are missing or empty. Otherwise logs an ERROR event for every
    entry whose bit rate is missing, cannot be converted to ``float``, or is not
    one of the expected speeds.

    Args:
        xgmi_metric (Optional[list[XgmiMetrics]]): XGMI metrics data
        expected_xgmi_speed (Optional[list[float]]): List of expected XGMI speeds (GT/s)
    """
    if not xgmi_metric:
        self._log_event(
            category=EventCategory.IO,
            description="XGMI link speed data is not available and cannot be checked",
            priority=EventPriority.WARNING,
            data={"xgmi_metric": xgmi_metric},
        )
        return

    if not expected_xgmi_speed:
        self._log_event(
            category=EventCategory.IO,
            description="Expected XGMI speed not configured, skipping XGMI link speed check",
            priority=EventPriority.WARNING,
        )
        return

    for xgmi_data in xgmi_metric:
        bit_rate = xgmi_data.link_metrics.bit_rate

        if bit_rate is None or bit_rate.value is None:
            # NOTE(review): this payload reports the bit rate's *unit* under the
            # "xgmi_bit_rate" key; kept as-is - confirm that is intended.
            self._log_event(
                category=EventCategory.IO,
                description="XGMI link speed is not available",
                priority=EventPriority.ERROR,
                data={
                    "gpu": xgmi_data.gpu,
                    "xgmi_bit_rate": bit_rate.unit if bit_rate else "N/A",
                },
            )
            continue

        # Only the conversion can fail, so keep the try body to that one call.
        # float() raises ValueError for malformed strings and TypeError for
        # unsupported types; either way report it and move on to the next GPU
        # instead of aborting the whole check.
        try:
            xgmi_float = float(bit_rate.value)
        except (TypeError, ValueError):
            self._log_event(
                category=EventCategory.IO,
                description="XGMI link speed is not a valid number",
                priority=EventPriority.ERROR,
                data={
                    "gpu": xgmi_data.gpu,
                    "xgmi_bit_rate": bit_rate.value,
                },
            )
            continue

        if xgmi_float not in expected_xgmi_speed:
            self._log_event(
                category=EventCategory.IO,
                description="XGMI link speed is not as expected",
                priority=EventPriority.ERROR,
                data={
                    "gpu": xgmi_data.gpu,
                    "xgmi_bit_rate": xgmi_float,
                    "expected_xgmi_speed": expected_xgmi_speed,
                },
                console_log=True,
            )
| 714 | + |
def check_amdsmitst(self, amdsmitst_data: AmdSmiTstData):
    """Report failures recorded in the amdsmitst results.

    Logs a single ERROR event (also sent to the console log) carrying the
    failed test count and the failed tests when the count is greater than
    zero; does nothing otherwise.

    Args:
        amdsmitst_data (AmdSmiTstData): AMD SMI test data
    """
    failed_count = amdsmitst_data.failed_test_count
    if not failed_count > 0:
        # Nothing failed, so there is nothing to report.
        return

    failure_details = {
        "failed_test_count": failed_count,
        "failed_tests": amdsmitst_data.failed_tests,
    }
    self._log_event(
        category=EventCategory.APPLICATION,
        description=f"{failed_count} failed tests running amdsmitst",
        priority=EventPriority.ERROR,
        data=failure_details,
        console_log=True,
    )
| 732 | + |
638 | 733 | def analyze_data( |
639 | 734 | self, data: AmdSmiDataModel, args: Optional[AmdSmiAnalyzerArgs] = None |
640 | 735 | ) -> TaskResult: |
@@ -705,4 +800,22 @@ def analyze_data( |
705 | 800 | if args.expected_pldm_version: |
706 | 801 | self.check_pldm_version(data.firmware, args.expected_pldm_version) |
707 | 802 |
|
| 803 | + if data.cper_data: |
| 804 | + self.analyzer_cpers( |
| 805 | + { |
| 806 | + file_model_obj.file_name: io.BytesIO(file_model_obj.file_contents) |
| 807 | + for file_model_obj in data.cper_data |
| 808 | + }, |
| 809 | + analysis_range_start=args.analysis_range_start, |
| 810 | + analysis_range_end=args.analysis_range_end, |
| 811 | + ) |
| 812 | + |
| 813 | + if data.xgmi_metric and len(data.xgmi_metric) > 0: |
| 814 | + self.check_expected_xgmi_link_speed( |
| 815 | + data.xgmi_metric, expected_xgmi_speed=args.expected_xgmi_speed |
| 816 | + ) |
| 817 | + |
| 818 | + if data.amdsmitst_data and data.amdsmitst_data.failed_test_count > 0: |
| 819 | + self.check_amdsmitst(data.amdsmitst_data) |
| 820 | + |
708 | 821 | return self.result |
0 commit comments