Skip to content

Commit 9ad0b5b

Browse files
committed
add additional health check metrics
1 parent 126cc23 commit 9ad0b5b

File tree

1 file changed

+166
-1
lines changed

1 file changed

+166
-1
lines changed

playbooks/roles/metrics-exporter/templates/custom_metrics.py.j2

Lines changed: 166 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,165 @@ def check_nvidia_gpu_count():
423423

424424
return True
425425

426+
def check_ecc_errors():
    """Check each GPU for uncorrectable ECC errors and publish a pass/fail metric.

    Runs ``nvidia-smi -q`` and, for every ``GPU <id>`` section of the report,
    reads the volatile (first occurrence) and aggregate (second occurrence)
    "SRAM Uncorrectable" and "DRAM Uncorrectable" counters. The gauge
    ``gpu_ecc_error_check`` is written as 1 when no counter is non-zero and
    0 otherwise, then handed to ``copy_metric_file``.

    Returns:
        True once the metric file has been written, or an empty list when
        the ``nvidia-smi`` binary is not found and the check is skipped.
    """
    try:
        result = subprocess.run(['nvidia-smi', '-q'], stdout=subprocess.PIPE)
    except FileNotFoundError:
        logger.warning("Skipping SRAM/DRAM ECC Test: nvidia-smi command not found")
        return []

    # Decode the output from bytes to string
    output = result.stdout.decode('utf-8')

    ecc_issues = []

    # Parse the report one GPU section at a time. Matching the counters over
    # the whole report and indexing them by GPU position raises IndexError
    # (or attributes values to the wrong GPU) as soon as one GPU reports a
    # non-numeric counter such as "N/A".
    gpu_headers = list(re.finditer(r'\nGPU\s+(.*)\n', output))
    for idx, header in enumerate(gpu_headers):
        gpu = header.group(1)
        logger.debug(f"GPU: {gpu}")

        if idx + 1 < len(gpu_headers):
            section_end = gpu_headers[idx + 1].start()
        else:
            section_end = len(output)
        section = output[header.end():section_end]

        sram_matches = re.findall(r'SRAM Uncorrectable\s+:\s+(\d+)', section)
        if not sram_matches:
            # Fall back to the "SRAM Uncorrectable Parity" label when the
            # plain "SRAM Uncorrectable" label is absent from the report.
            sram_matches = re.findall(r'SRAM Uncorrectable Parity\s+:\s+(\d+)', section)
        dram_matches = re.findall(r'DRAM Uncorrectable\s+:\s+(\d+)', section)

        # (label, matches, position): position 0 is the Volatile counter,
        # position 1 the Aggregate counter of the same kind.
        counters = (
            ("Volatile SRAM Uncorrectable", sram_matches, 0),
            ("Volatile DRAM Uncorrectable", dram_matches, 0),
            ("Aggregate SRAM Uncorrectable", sram_matches, 1),
            ("Aggregate DRAM Uncorrectable", dram_matches, 1),
        )
        for label, matches, position in counters:
            if position >= len(matches):
                # Counter missing or not numeric for this GPU: nothing to check.
                logger.debug(f"{label}: no numeric value reported for {gpu}")
                continue
            value = matches[position]
            if value != "0":
                logger.debug(f"{label}: {value}")
                ecc_issues.append(f"{gpu} - {label}: {value}")

    # Textfile name for metrics
    tf_name = 'gpu_ecc_error_check.prom'
    tf_path = os.path.join(textfile_dir_path, tf_name)

    # Stage the metric in a per-process temporary file first.
    # NOTE(review): presumably so the exporter never reads a half-written
    # file; confirm against copy_metric_file.
    tmp_tf_path = os.path.join('/tmp', tf_name) + "." + str(os.getpid())

    # Write ECC Error Check metric file (1 = pass, 0 = fail)
    help_text = "# HELP gpu_ecc_error_check Pass or Fail based on ECC errors found in a GPU"
    type_text = "# TYPE gpu_ecc_error_check gauge"
    metric_text = "gpu_ecc_error_check {}\n".format(0 if ecc_issues else 1)
    with open(tmp_tf_path, "w") as tmp_tf:
        tmp_tf.write('{}\n{}\n'.format(help_text, type_text))
        print(metric_text)
        tmp_tf.write(metric_text)

    copy_metric_file(tmp_tf_path, tf_path, node_exporter_user, node_exporter_group)

    return True
491+
492+
def check_row_remap_errors():
    """Check each GPU for row remapping problems and publish a pass/fail metric.

    Queries ``remapped_rows.pending``, ``remapped_rows.failure`` and
    ``remapped_rows.uncorrectable`` through ``nvidia-smi`` (CSV, no header,
    one line per GPU). A GPU is counted as failing when its pending value or
    its uncorrectable value is non-zero; a non-zero failure value is only
    logged. The gauge ``gpu_row_remap_error_check`` is written as 1 when no
    GPU fails and 0 otherwise, then handed to ``copy_metric_file``.

    Returns:
        True once the metric file has been written, or an empty list when
        the ``nvidia-smi`` binary is not found and the check is skipped.
    """

    def is_reported(value):
        # nvidia-smi prints bracketed placeholders (e.g. "[N/A]") for fields
        # it cannot report; those are neither counts nor errors.
        return not (value.startswith("[") and value.endswith("]"))

    try:
        result = subprocess.run(
            [
                'nvidia-smi',
                '--query-remapped-rows=remapped_rows.pending,remapped_rows.failure,remapped_rows.uncorrectable',
                '--format=csv,noheader',
            ],
            stdout=subprocess.PIPE,
        )
    except FileNotFoundError:
        logger.warning("Skipping Row Remap Test: nvidia-smi command not found")
        return []

    if result.returncode != 0:
        logger.debug(f"Check row remap command exited with error code: {result.returncode}")

    # Decode the output from bytes to string
    output = result.stdout.decode('utf-8')
    logger.debug("Output: {}".format(output))

    remap_issues = []
    for i, line in enumerate(output.split('\n')):
        if not line.strip():
            continue

        fields = [x.strip() for x in line.split(",")]
        if len(fields) < 3:
            # E.g. an error message printed after a non-zero exit: not a
            # "pending, failure, uncorrectable" record, so it cannot be judged.
            logger.warning(f"GPU: {i} - Skipping unparsable row remap line: {line!r}")
            continue
        pending, failure, uncorrectable = fields[:3]

        if is_reported(pending) and pending != "0":
            logger.debug(f"GPU: {i} - Row Remap Pending: {pending}")
            remap_issues.append(f"GPU: {i} Row Remap Pending: {pending}")

        if is_reported(failure) and failure != "0":
            # Logged only: a remap failure is deliberately not added to
            # remap_issues, so it does not flip the metric to 0.
            logger.debug(f"GPU: {i} - Row Remap Failure: {failure}")

        if is_reported(uncorrectable) and uncorrectable != "0":
            logger.debug(f"GPU: {i} - Row Remap Uncorrectable: {uncorrectable}")
            try:
                uncorrectable_count = int(uncorrectable)
            except ValueError:
                logger.warning(f"GPU: {i} - Row Remap Uncorrectable is not a number: {uncorrectable!r}")
                continue
            if uncorrectable_count > 512:
                remap_issues.append(f"GPU: {i} - Row Remap Uncorrectable >512: {uncorrectable}")
            else:
                remap_issues.append(f"GPU: {i} - Row Remap Uncorrectable <512: {uncorrectable}")

    # Textfile name for metrics
    tf_name = 'gpu_row_remap_error_check.prom'
    tf_path = os.path.join(textfile_dir_path, tf_name)

    # Stage the metric in a per-process temporary file first.
    # NOTE(review): presumably so the exporter never reads a half-written
    # file; confirm against copy_metric_file.
    tmp_tf_path = os.path.join('/tmp', tf_name) + "." + str(os.getpid())

    # Write Row Remap Error Check metric file (1 = pass, 0 = fail)
    help_text = "# HELP gpu_row_remap_error_check Pass or Fail based on row remap errors found in a GPU"
    type_text = "# TYPE gpu_row_remap_error_check gauge"
    metric_text = "gpu_row_remap_error_check {}\n".format(0 if remap_issues else 1)
    with open(tmp_tf_path, "w") as tmp_tf:
        tmp_tf.write('{}\n{}\n'.format(help_text, type_text))
        print(metric_text)
        tmp_tf.write(metric_text)

    copy_metric_file(tmp_tf_path, tf_path, node_exporter_user, node_exporter_group)

    return True
552+
553+
def xid_check():
    """Run the Xid checker script and publish a pass/fail metric.

    Executes ``sudo python3 xid_checker.py`` (the script path is relative,
    so it is resolved against the current working directory) and inspects
    what the script wrote to stderr. The gauge ``xid_error_check`` is
    written as 1 when that text contains "Passed" and 0 otherwise, then
    handed to ``copy_metric_file``.

    Returns:
        True once the metric file has been written.
    """
    result = subprocess.run(
        ['sudo', 'python3', 'xid_checker.py'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Only stderr is examined; stdout is captured so it is not echoed.
    output = result.stderr.decode('utf-8')

    # Textfile name for metrics
    tf_name = 'xid_error_check.prom'
    tf_path = os.path.join(textfile_dir_path, tf_name)

    # Stage the metric in a per-process temporary file first.
    # NOTE(review): presumably so the exporter never reads a half-written
    # file; confirm against copy_metric_file.
    tmp_tf_path = os.path.join('/tmp', tf_name) + "." + str(os.getpid())

    # Membership test rather than find() > 0, which would report a failure
    # when "Passed" is the very first text in the output (offset 0).
    passed = "Passed" in output

    # Write Xid Error Check metric file (1 = pass, 0 = fail)
    help_text = "# HELP xid_error_check Pass or Fail based on xid errors thrown by GPU on a PCI Device "
    type_text = "# TYPE xid_error_check gauge"
    metric_text = "xid_error_check {}\n".format(1 if passed else 0)
    with open(tmp_tf_path, "w") as tmp_tf:
        tmp_tf.write('{}\n{}\n'.format(help_text, type_text))
        print(metric_text)
        tmp_tf.write(metric_text)

    copy_metric_file(tmp_tf_path, tf_path, node_exporter_user, node_exporter_group)

    return True
583+
584+
426585
if __name__ == '__main__':
427586

428587
# User and group under which node exporter service is being run
@@ -477,10 +636,16 @@ if __name__ == '__main__':
477636
# Check if devices have fallen off the bus
478637
check_bus_metric()
479638

480-
# Check if nvidia-smi command exists and get the count of GPUs
639+
# Check if Xid check Passed
640+
xid_check()
641+
642+
# Check if nvidia-smi command exists and run health checks
481643
try:
482644
subprocess.call(["nvidia-smi"])
483645
check_nvidia_gpu_count()
646+
check_ecc_errors()
647+
check_row_remap_errors()
648+
484649
except FileNotFoundError:
485650
logger.debug(f'Shape does not support nvidia-smi command')
486651

0 commit comments

Comments
 (0)