|
30 | 30 | from nodescraper.interfaces import DataAnalyzer |
31 | 31 | from nodescraper.models import TaskResult |
32 | 32 |
|
33 | | -from .amdsmidata import AmdSmiDataModel, AmdSmiStatic, Fw, Partition, Processes |
| 33 | +from .amdsmidata import ( |
| 34 | + AmdSmiDataModel, |
| 35 | + AmdSmiMetric, |
| 36 | + AmdSmiStatic, |
| 37 | + EccData, |
| 38 | + Fw, |
| 39 | + Partition, |
| 40 | + Processes, |
| 41 | +) |
34 | 42 | from .analyzer_args import AmdSmiAnalyzerArgs |
35 | 43 |
|
36 | 44 |
|
@@ -122,6 +130,223 @@ def check_expected_driver_version( |
122 | 130 | }, |
123 | 131 | ) |
124 | 132 |
|
def check_amdsmi_metric_pcie(
    self,
    amdsmi_metric_data: list[AmdSmiMetric],
    l0_to_recovery_count_error_threshold: int,
    l0_to_recovery_count_warning_threshold: int,
    expected_pcie_width: int = 16,
    expected_pcie_speed: float = 32.0,
) -> None:
    """Check PCIe metrics for link errors

    For each entry, an event is logged through ``self._log_event`` (category IO) when:
      * the link width is set and differs from ``expected_pcie_width`` (ERROR)
      * the link speed value is numeric and differs from ``expected_pcie_speed`` (ERROR)
      * the replay count or replay rollover count is greater than zero (WARNING)
      * the L0-to-recovery count exceeds the error threshold (ERROR), or otherwise
        exceeds the warning threshold (WARNING)
      * the sent or received NAK count is greater than zero (WARNING)

    Fields that are None are skipped.

    Args:
        amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model
        l0_to_recovery_count_error_threshold (int): Threshold for error events
        l0_to_recovery_count_warning_threshold (int): Threshold for warning events
        expected_pcie_width (int, optional): Expected link width in lanes. Defaults to 16.
        expected_pcie_speed (float, optional): Expected link speed in GT/s. Defaults to 32.0.
    """
    # PCIe per-lane transfer rate (GT/s) -> generation; only used to label the speed event.
    pcie_generation_by_speed = {2.5: 1, 5.0: 2, 8.0: 3, 16.0: 4, 32.0: 5, 64.0: 6}
    expected_generation = pcie_generation_by_speed.get(expected_pcie_speed)
    if expected_generation is not None:
        expected_speed_label = f"Gen{expected_generation} ({expected_pcie_speed:g} GT/s)"
    else:
        expected_speed_label = f"{expected_pcie_speed:g} GT/s"

    for metric in amdsmi_metric_data:
        pcie_data = metric.pcie
        gpu = metric.gpu

        if pcie_data.width is not None and pcie_data.width != expected_pcie_width:
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} PCIe width is not x{expected_pcie_width}",
                priority=EventPriority.ERROR,
                data={
                    "gpu": gpu,
                    "pcie_width": pcie_data.width,
                    "expected": expected_pcie_width,
                },
                console_log=True,
            )

        if pcie_data.speed is not None and pcie_data.speed.value is not None:
            # Only the conversion is guarded: a value that cannot be parsed as a number
            # skips the speed check (best effort), while an exception raised by the
            # event logging itself is no longer swallowed.
            try:
                speed_val = float(pcie_data.speed.value)
            except (ValueError, TypeError):
                speed_val = None

            if speed_val is not None and speed_val != expected_pcie_speed:
                self._log_event(
                    category=EventCategory.IO,
                    description=f"GPU: {gpu} PCIe link speed is not {expected_speed_label}",
                    priority=EventPriority.ERROR,
                    data={
                        "gpu": gpu,
                        "pcie_speed": speed_val,
                        "expected": expected_pcie_speed,
                    },
                    console_log=True,
                )

        if pcie_data.replay_count is not None and pcie_data.replay_count > 0:
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} has PCIe replay count: {pcie_data.replay_count}",
                priority=EventPriority.WARNING,
                data={"gpu": gpu, "replay_count": pcie_data.replay_count},
                console_log=True,
            )

        if (
            pcie_data.replay_roll_over_count is not None
            and pcie_data.replay_roll_over_count > 0
        ):
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} has PCIe replay rollover count: {pcie_data.replay_roll_over_count}",
                priority=EventPriority.WARNING,
                data={"gpu": gpu, "replay_roll_over_count": pcie_data.replay_roll_over_count},
                console_log=True,
            )

        recoveries = pcie_data.l0_to_recovery_count
        if recoveries is not None:
            # The error threshold takes precedence; at most one event per GPU is logged.
            exceeded = None
            if recoveries > l0_to_recovery_count_error_threshold:
                exceeded = (
                    EventPriority.ERROR,
                    "error_threshold",
                    l0_to_recovery_count_error_threshold,
                )
            elif recoveries > l0_to_recovery_count_warning_threshold:
                exceeded = (
                    EventPriority.WARNING,
                    "warning_threshold",
                    l0_to_recovery_count_warning_threshold,
                )

            if exceeded is not None:
                priority, threshold_key, threshold = exceeded
                self._log_event(
                    category=EventCategory.IO,
                    description=f"GPU: {gpu} has {recoveries} L0 recoveries",
                    priority=priority,
                    data={
                        "gpu": gpu,
                        "l0_to_recovery_count": recoveries,
                        threshold_key: threshold,
                    },
                    console_log=True,
                )

        if pcie_data.nak_sent_count is not None and pcie_data.nak_sent_count > 0:
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} has sent {pcie_data.nak_sent_count} PCIe NAKs",
                priority=EventPriority.WARNING,
                data={"gpu": gpu, "nak_sent_count": pcie_data.nak_sent_count},
                console_log=True,
            )

        if pcie_data.nak_received_count is not None and pcie_data.nak_received_count > 0:
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} has received {pcie_data.nak_received_count} PCIe NAKs",
                priority=EventPriority.WARNING,
                data={"gpu": gpu, "nak_received_count": pcie_data.nak_received_count},
                console_log=True,
            )
| 240 | + |
def check_amdsmi_metric_ecc_totals(self, amdsmi_metric_data: list[AmdSmiMetric]):
    """Report non-zero aggregate ECC counters for every GPU

    Uncorrectable counters (total and cache) are logged with ERROR priority;
    correctable and deferred counters are logged with WARNING priority.
    Counters that are None or not greater than zero produce no event.

    Args:
        amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model
    """
    for entry in amdsmi_metric_data:
        totals = entry.ecc
        gpu_id = entry.gpu

        # (label used in the event, counter value, priority of the event)
        readings = (
            (
                "Total correctable ECC errors",
                totals.total_correctable_count,
                EventPriority.WARNING,
            ),
            (
                "Total uncorrectable ECC errors",
                totals.total_uncorrectable_count,
                EventPriority.ERROR,
            ),
            (
                "Total deferred ECC errors",
                totals.total_deferred_count,
                EventPriority.WARNING,
            ),
            (
                "Cache correctable ECC errors",
                totals.cache_correctable_count,
                EventPriority.WARNING,
            ),
            (
                "Cache uncorrectable ECC errors",
                totals.cache_uncorrectable_count,
                EventPriority.ERROR,
            ),
        )

        for label, value, severity in readings:
            # Nothing to report for missing or zero counters.
            if value is None or value <= 0:
                continue

            self._log_event(
                category=EventCategory.RAS,
                description=f"GPU: {gpu_id} has {label}: {value}",
                priority=severity,
                data={"gpu": gpu_id, "error_count": value, "error_type": label},
                console_log=True,
            )
| 290 | + |
def check_amdsmi_metric_ecc(self, amdsmi_metric_data: list[AmdSmiMetric]):
    """Report non-zero per-block ECC counters for every GPU

    For each ECC block of each GPU, logs a WARNING event for a positive
    correctable or deferred count and an ERROR event for a positive
    uncorrectable count. Entries whose ``ecc_blocks`` is a string or empty,
    and block values that are not ``EccData`` instances, are ignored.

    Args:
        amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model
    """
    for entry in amdsmi_metric_data:
        gpu_id = entry.gpu
        blocks = entry.ecc_blocks

        # A string (e.g., "N/A") or an empty value carries no per-block data
        if not blocks or isinstance(blocks, str):
            continue

        # Lazily keep only the blocks that hold EccData records
        typed_blocks = (
            (name, record) for name, record in blocks.items() if isinstance(record, EccData)
        )

        for name, record in typed_blocks:
            correctable = record.correctable_count
            if correctable is not None and correctable > 0:
                self._log_event(
                    category=EventCategory.RAS,
                    description=f"GPU: {gpu_id} has correctable ECC errors in block {name}",
                    priority=EventPriority.WARNING,
                    data={
                        "gpu": gpu_id,
                        "block": name,
                        "correctable_count": correctable,
                    },
                    console_log=True,
                )

            uncorrectable = record.uncorrectable_count
            if uncorrectable is not None and uncorrectable > 0:
                self._log_event(
                    category=EventCategory.RAS,
                    description=f"GPU: {gpu_id} has uncorrectable ECC errors in block {name}",
                    priority=EventPriority.ERROR,
                    data={
                        "gpu": gpu_id,
                        "block": name,
                        "uncorrectable_count": uncorrectable,
                    },
                    console_log=True,
                )

            deferred = record.deferred_count
            if deferred is not None and deferred > 0:
                self._log_event(
                    category=EventCategory.RAS,
                    description=f"GPU: {gpu_id} has deferred ECC errors in block {name}",
                    priority=EventPriority.WARNING,
                    data={
                        "gpu": gpu_id,
                        "block": name,
                        "deferred_count": deferred,
                    },
                    console_log=True,
                )
| 349 | + |
125 | 350 | def expected_gpu_processes( |
126 | 351 | self, processes_data: Optional[list[Processes]], max_num_processes: int |
127 | 352 | ): |
@@ -398,8 +623,6 @@ def check_expected_memory_partition_mode( |
398 | 623 | } |
399 | 624 | ) |
400 | 625 |
|
401 | | - # accelerator currently not avaialbe in API |
402 | | - |
403 | 626 | if bad_memory_partition_mode_gpus: |
404 | 627 | self._log_event( |
405 | 628 | category=EventCategory.PLATFORM, |
@@ -429,6 +652,16 @@ def analyze_data( |
429 | 652 | if args is None: |
430 | 653 | args = AmdSmiAnalyzerArgs() |
431 | 654 |
|
| 655 | + if data.metric is not None and len(data.metric) > 0: |
| 656 | + if args.l0_to_recovery_count_error_threshold is not None: |
| 657 | + self.check_amdsmi_metric_pcie( |
| 658 | + data.metric, |
| 659 | + args.l0_to_recovery_count_error_threshold, |
| 660 | + args.l0_to_recovery_count_warning_threshold or 1, |
| 661 | + ) |
| 662 | + self.check_amdsmi_metric_ecc_totals(data.metric) |
| 663 | + self.check_amdsmi_metric_ecc(data.metric) |
| 664 | + |
432 | 665 | if args.expected_gpu_processes: |
433 | 666 | self.expected_gpu_processes(data.process, args.expected_gpu_processes) |
434 | 667 |
|
|
0 commit comments