@@ -423,6 +423,165 @@ def check_nvidia_gpu_count():
423423
424424 return True
425425
def _parse_ecc_issues(output):
    """Return one message per non-zero uncorrectable ECC counter found in ``nvidia-smi -q`` text.

    The text is cut into per-GPU sections at every line starting with ``GPU``
    so that a GPU whose counters are missing or non-numeric cannot shift the
    counters of the GPUs that follow it.
    """
    gpu_headers = list(re.finditer(r'\nGPU\s+(.*)\n', output))
    issues = []

    for idx, header in enumerate(gpu_headers):
        gpu = header.group(1)
        logger.debug(f"GPU: {gpu}")

        # A section runs from the end of this GPU header to the start of the next one.
        if idx + 1 < len(gpu_headers):
            section_end = gpu_headers[idx + 1].start()
        else:
            section_end = len(output)
        section = output[header.end():section_end]

        sram = re.findall(r'SRAM Uncorrectable\s+:\s+(\d+)', section)
        if not sram:
            # Fall back to the "SRAM Uncorrectable Parity" label when the plain label is absent.
            sram = re.findall(r'SRAM Uncorrectable Parity\s+:\s+(\d+)', section)
        dram = re.findall(r'DRAM Uncorrectable\s+:\s+(\d+)', section)

        # The first match in a section is reported as the volatile counter and
        # the second as the aggregate counter.
        # NOTE(review): assumes nvidia-smi lists volatile before aggregate - confirm.
        counters = (
            ("Volatile SRAM Uncorrectable", sram[0:1]),
            ("Volatile DRAM Uncorrectable", dram[0:1]),
            ("Aggregate SRAM Uncorrectable", sram[1:2]),
            ("Aggregate DRAM Uncorrectable", dram[1:2]),
        )
        for label, found in counters:
            # An empty slice means no numeric counter was found for this GPU;
            # skip it instead of raising IndexError.
            if found and found[0] != "0":
                logger.debug(f"{label}: {found[0]}")
                issues.append(f"{gpu} - {label}: {found[0]}")

    return issues


def check_ecc_errors():
    """Check GPUs for uncorrectable SRAM/DRAM ECC errors and publish a pass/fail metric.

    Runs ``nvidia-smi -q``, looks for non-zero uncorrectable ECC counters and
    writes the ``gpu_ecc_error_check`` gauge (1 = no issues found, 0 = at least
    one non-zero counter) to a temporary file under ``/tmp``. That file is then
    handed to ``copy_metric_file`` together with the destination path under
    ``textfile_dir_path``.

    Returns:
        ``[]`` when ``nvidia-smi`` is not installed (the check is skipped and no
        metric file is written); ``True`` once the metric file has been handed
        to ``copy_metric_file``.
    """
    try:
        # Run the nvidia-smi -q command
        result = subprocess.run(['nvidia-smi', '-q'], stdout=subprocess.PIPE)
    except FileNotFoundError:
        logger.warning("Skipping SRAM/DRAM ECC Test: nvidia-smi command not found")
        return []

    if result.returncode != 0:
        logger.debug(f"ECC check command exited with error code: {result.returncode}")

    # Decode the output from bytes to string; undecodable bytes must not abort the check
    output = result.stdout.decode('utf-8', errors='replace')
    ecc_issues = _parse_ecc_issues(output)

    # Textfile name for metrics
    tf_name = 'gpu_ecc_error_check.prom'
    tf_path = os.path.join(textfile_dir_path, tf_name)

    # The temporary textfile name is suffixed with the current process id
    tmp_tf_path = os.path.join('/tmp', tf_name) + "." + str(os.getpid())

    help_text = "# HELP gpu_ecc_error_check Pass or Fail based on uncorrectable ECC errors found in a GPU"
    type_text = "# TYPE gpu_ecc_error_check gauge"
    metric_text = "gpu_ecc_error_check {}\n".format(0 if ecc_issues else 1)

    # Write ECC Error Check metric file; the with-block closes it
    with open(tmp_tf_path, "w") as tmp_tf:
        tmp_tf.write('{}\n{}\n'.format(help_text, type_text))
        print(metric_text)
        tmp_tf.write(metric_text)

    copy_metric_file(tmp_tf_path, tf_path, node_exporter_user, node_exporter_group)

    return True
491+
def check_row_remap_errors():
    """Check GPUs for row remapping problems and publish a pass/fail metric.

    Queries ``nvidia-smi`` for the pending, failure and uncorrectable remapped
    row fields in CSV form, one row per line. A row counts as an issue when
    its pending field or its uncorrectable field is not ``"0"``; a non-zero
    failure field is only logged. The ``gpu_row_remap_error_check`` gauge
    (1 = no issues, 0 = at least one issue) is written to a temporary file
    under ``/tmp`` and handed to ``copy_metric_file`` together with the
    destination path under ``textfile_dir_path``.

    Returns:
        ``[]`` when ``nvidia-smi`` is not installed (the check is skipped and no
        metric file is written); ``True`` once the metric file has been handed
        to ``copy_metric_file``.
    """
    try:
        # Run the nvidia-smi remapped rows query
        result = subprocess.run(
            ['nvidia-smi',
             '--query-remapped-rows=remapped_rows.pending,remapped_rows.failure,remapped_rows.uncorrectable',
             '--format=csv,noheader'],
            stdout=subprocess.PIPE)
    except FileNotFoundError:
        logger.warning("Skipping Row Remap Test: nvidia-smi command not found")
        return []

    if result.returncode != 0:
        logger.debug(f"Check row remap command exited with error code: {result.returncode}")

    # Decode the output from bytes to string; undecodable bytes must not abort the check
    output = result.stdout.decode('utf-8', errors='replace')
    logger.debug("Output: {}".format(output))

    remap_issues = []
    for i, line in enumerate(output.splitlines()):
        if line.strip() == "":
            continue

        fields = [x.strip() for x in line.split(",")]
        if len(fields) != 3:
            # Not a "pending, failure, uncorrectable" row (for example an error
            # message printed by the command); indexing it would raise IndexError.
            logger.warning(f"GPU: {i} - Unexpected row remap output, skipping: {line}")
            continue

        if any(value.startswith("[") for value in fields):
            # NOTE(review): bracketed values such as "[N/A]" are presumed to mean
            # the field is not reported for this GPU - confirm against nvidia-smi docs.
            logger.debug(f"GPU: {i} - Row remap data not reported, skipping: {line}")
            continue

        pending, failure, uncorrectable = fields

        if pending != "0":
            logger.debug(f"GPU: {i} - Row Remap Pending: {pending}")
            remap_issues.append(f"GPU: {i} Row Remap Pending: {pending}")

        if failure != "0":
            # Failures are logged only; they do not fail the check.
            logger.debug(f"GPU: {i} - Row Remap Failure: {failure}")

        if uncorrectable != "0":
            logger.debug(f"GPU: {i} - Row Remap Uncorrectable: {uncorrectable}")
            try:
                above_limit = int(uncorrectable) > 512
            except ValueError:
                # Non-numeric but not "0": still report it rather than crash.
                remap_issues.append(f"GPU: {i} - Row Remap Uncorrectable: {uncorrectable}")
            else:
                bound = ">512" if above_limit else "<=512"
                remap_issues.append(f"GPU: {i} - Row Remap Uncorrectable {bound}: {uncorrectable}")

    # Textfile name for metrics
    tf_name = 'gpu_row_remap_error_check.prom'
    tf_path = os.path.join(textfile_dir_path, tf_name)

    # The temporary textfile name is suffixed with the current process id
    tmp_tf_path = os.path.join('/tmp', tf_name) + "." + str(os.getpid())

    help_text = "# HELP gpu_row_remap_error_check Pass or Fail based on row remap errors found in a GPU"
    type_text = "# TYPE gpu_row_remap_error_check gauge"
    metric_text = "gpu_row_remap_error_check {}\n".format(0 if remap_issues else 1)

    # Write Row Remap Error Check metric file; the with-block closes it
    with open(tmp_tf_path, "w") as tmp_tf:
        tmp_tf.write('{}\n{}\n'.format(help_text, type_text))
        print(metric_text)
        tmp_tf.write(metric_text)

    copy_metric_file(tmp_tf_path, tf_path, node_exporter_user, node_exporter_group)

    return True
552+
def xid_check():
    """Run ``xid_checker.py`` and publish a pass/fail metric from its result.

    Executes ``sudo python3 xid_checker.py`` (the script path is relative, so it
    is resolved against the current working directory) and looks for the word
    ``Passed`` in the text the script wrote to stderr. The ``xid_error_check``
    gauge (1 = ``Passed`` seen, 0 = otherwise) is written to a temporary file
    under ``/tmp`` and handed to ``copy_metric_file`` together with the
    destination path under ``textfile_dir_path``.

    Returns:
        ``[]`` when the ``sudo`` executable cannot be found (the check is
        skipped and no metric file is written); ``True`` once the metric file
        has been handed to ``copy_metric_file``.
    """
    try:
        result = subprocess.run(['sudo', 'python3', 'xid_checker.py'],
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except FileNotFoundError:
        # Same skip behaviour as the nvidia-smi based checks: do not abort the
        # whole health check run because one tool is missing.
        logger.warning("Skipping Xid Test: sudo command not found")
        return []

    # Only stderr is inspected.
    # NOTE(review): presumably xid_checker.py reports its verdict on stderr - confirm.
    output = result.stderr.decode('utf-8', errors='replace')

    # Membership test instead of find() > 0, which missed "Passed" at index 0.
    passed = "Passed" in output

    # Textfile name for metrics
    tf_name = 'xid_error_check.prom'
    tf_path = os.path.join(textfile_dir_path, tf_name)

    # The temporary textfile name is suffixed with the current process id
    tmp_tf_path = os.path.join('/tmp', tf_name) + "." + str(os.getpid())

    help_text = "# HELP xid_error_check Pass or Fail based on xid errors thrown by GPU on a PCI Device"
    type_text = "# TYPE xid_error_check gauge"
    metric_text = "xid_error_check {}\n".format(1 if passed else 0)

    # Write Xid Error Check metric file; the with-block closes it
    with open(tmp_tf_path, "w") as tmp_tf:
        tmp_tf.write('{}\n{}\n'.format(help_text, type_text))
        print(metric_text)
        tmp_tf.write(metric_text)

    copy_metric_file(tmp_tf_path, tf_path, node_exporter_user, node_exporter_group)

    return True
583+
584+
426585if __name__ == '__main__' :
427586
428587 # User and group under which node exporter service is being run
@@ -477,10 +636,16 @@ if __name__ == '__main__':
477636 # Check if devices have fallen off the bus
478637 check_bus_metric ()
479638
480- # Check if nvidia-smi command exists and get the count of GPUs
639+ # Check if Xid check Passed
640+ xid_check ()
641+
642+ # Check if nvidia-smi command exists and run health checks
481643 try :
482644 subprocess .call (["nvidia-smi" ])
483645 check_nvidia_gpu_count ()
646+ check_ecc_errors ()
647+ check_row_remap_errors ()
648+
484649 except FileNotFoundError :
485650 logger .debug (f'Shape does not support nvidia-smi command' )
486651
0 commit comments