@@ -28,7 +28,7 @@ detect_amd_gpus() {
2828 echo " Error: lspci command not found. Aborting."
2929 exit 1
3030 fi
31- # Count AMD GPUs .
31+ # Count AMD/ATI GPU controllers .
3232 local count
3333 count=$( rocm-smi | grep -E ' ^Device' -A 1000 | awk ' $1 ~ /^[0-9]+$/ {count++} END {print count}' )
3434 echo " $count "
@@ -73,17 +73,36 @@ run_tests() {
7373 echo " Running multi-GPU test: $test_file "
7474
7575 # Define file paths for abort detection (files created by conftest.py)
76+ last_running_file=" ${LOG_DIR} /${test_name} _last_running.json"
7677 json_log_file=" ${LOG_DIR} /multi_gpu_${test_name} _log.json"
7778 html_log_file=" ${LOG_DIR} /multi_gpu_${test_name} _log.html"
7879
79- # Run the test
80+ # Run the test (conftest.py will create the last_running_file automatically)
8081 python3 -m pytest \
8182 --html=" $html_log_file " \
8283 --json-report \
8384 --json-report-file=" $json_log_file " \
8485 --reruns 3 \
8586 " $test_file "
86-
87+
88+ # Check for aborted test and handle it
89+ if [[ -f " $last_running_file " ]]; then
90+ echo " Abort detected for test: $test_name "
91+ # Get the absolute path of the script directory
92+ script_dir=" $( cd " $( dirname " $0 " ) " && pwd) "
93+ # Convert relative paths to absolute paths
94+ abs_json_log_file=" $( realpath " $json_log_file " ) "
95+ abs_html_log_file=" $( realpath " $html_log_file " ) "
96+ abs_last_running_file=" $( realpath " $last_running_file " ) "
97+
98+ cd " $script_dir "
99+ python3 -c "
100+ from run_single_gpu import handle_abort
101+ import sys
102+ success = handle_abort('$abs_json_log_file ', '$abs_html_log_file ', '$abs_last_running_file ', 'multi_gpu_$test_name ')
103+ sys.exit(0 if success else 1)
104+ "
105+ fi
87106 done
88107
89108 # Merge individual HTML reports into one.
0 commit comments