Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions oci/h100_health_checks/check_h100_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ def check_rdma_link_status():
vendor_serial_num = re.search(r'Vendor Serial Number.*', output).group().split(":")[1].strip()
nic_fw_version = re.search(r'Firmware Version.*', output).group().split(":")[1].strip()
cable_fw_version = re.search(r'FW Version.*', output).group().split(":")[1].strip()
physical_BER = re.search(r'Raw Physical BER.*', output).group().split(":")[1].strip()

# Remove hidden characters from the output
link_state = re.sub(color_pattern, '', link_state)
Expand All @@ -248,8 +249,12 @@ def check_rdma_link_status():
status = False
if recommendation != "No issue was observed":
logger.debug(f"{device}: {recommendation}")
link_issues.append(f"{device} - {vendor_serial_num} - {cable_fw_version} - {nic_fw_version}: {recommendation}")
status = False
if "Bad signal integrity" in recommendation and float(physical_BER) < 1e-07:
logger.debug(f"Recommandation is {recommendation} but the Physical error are low enough that it can be ignored")
else :
logger.debug(f"Recommandation is {recommendation} and the Physical error count is too high to be ignored: {physical_BER}")
link_issues.append(f"{device} - {vendor_serial_num} - {cable_fw_version} - {nic_fw_version}: {recommendation}")
status = False
else:
logger.debug(f"{device}: {recommendation}")

Expand Down Expand Up @@ -518,4 +523,4 @@ def slurm_reason(message):
logger.info(f"Finished GPU host setup check at: {datetime_str}")

if slurm_error_count > 0 and args.slurm:
print("Healthcheck:: "+slurm_drain_reason[:-1])
print("Healthcheck:: "+slurm_drain_reason[:-1])