Skip to content

Commit 6bdf4d4

Browse files
Check Physical Error in case of bad signal integrity
1 parent bf9af6e commit 6bdf4d4

File tree

1 file changed

+7
-3
lines changed

1 file changed

+7
-3
lines changed

playbooks/roles/healthchecks/files/check_gpu_setup.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -233,7 +233,7 @@ def check_rdma_link_status():
233233
vendor_serial_num = re.search(r'Vendor Serial Number.*', output).group().split(":")[1].strip()
234234
nic_fw_version = re.search(r'Firmware Version.*', output).group().split(":")[1].strip()
235235
cable_fw_version = re.search(r'FW Version.*', output).group().split(":")[1].strip()
236-
236+
physical_BER = re.search(r'Raw Physical BER.*', output).group().split(":")[1].strip()
237237
# Remove hidden characters from the output
238238
link_state = re.sub(color_pattern, '', link_state)
239239
nic_fw_version = re.sub(color_pattern, '', nic_fw_version)
@@ -248,8 +248,12 @@ def check_rdma_link_status():
248248
status = False
249249
if recommendation != "No issue was observed":
250250
logger.debug(f"{device}: {recommendation}")
251-
link_issues.append(f"{device} - {vendor_serial_num} - {cable_fw_version} - {nic_fw_version}: {recommendation}")
252-
status = False
251+
if "Bad signal integrity" in recommendation and float(physical_BER) < 1e-09:
252+
logger.debug(f"Recommandation is {recommendation} but the Physical error are low enough that it can be ignored")
253+
else :
254+
logger.debug(f"Recommandation is {recommendation} and the Physical error count is too high to be ignored: {physical_BER}")
255+
link_issues.append(f"{device} - {vendor_serial_num} - {cable_fw_version} - {nic_fw_version}: {recommendation}")
256+
status = False
253257
else:
254258
logger.debug(f"{device}: {recommendation}")
255259

0 commit comments

Comments
 (0)