@@ -924,7 +924,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
924924 def scriptInstallPathNode = " ${ jobWorkspace} /${ jobUID} -slurm_install.sh"
925925 def testListPathNode = " ${ jobWorkspace} /${ testList} .txt"
926926 def waivesListPathNode = " ${ jobWorkspace} /waives.txt"
927- def outputPath = " ${ jobWorkspace} /job-output- ${ UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6) } .log"
927+ def outputPath = " ${ jobWorkspace} /job-output.log"
928928 def scriptLaunchPathLocal = Utils . createTempLocation(pipeline, " ./slurm_launch.sh" )
929929 def scriptLaunchPathNode = " ${ jobWorkspace} /${ jobUID} -slurm_launch.sh"
930930 def scriptExecPathLocal = Utils . createTempLocation(pipeline, " ./slurm_exec.sh" )
@@ -1122,11 +1122,14 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11221122 ${ taskArgs.collect { "#SBATCH $it" }.join('\n')}
11231123 #SBATCH ${ partition.additionalArgs}
11241124 ${ (partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
1125- echo "Starting job \$ SLURM_JOB_ID on \$ SLURM_NODELIST"
1126- echo "\$ SLURM_JOB_ID > $jobWorkspace /slurm_job_id.txt
11271125
1126+ # SBATCH directives must appear before any executable commands.
11281127 set -xEeuo pipefail
11291128 trap 'rc=\$ ?; echo "Error in file \$ {BASH_SOURCE[0]} on line \$ LINENO: \$ BASH_COMMAND (exit \$ rc)"; exit \$ rc' ERR
1129+
1130+ echo "Starting job \$ SLURM_JOB_ID on \$ SLURM_NODELIST"
1131+ echo \$ SLURM_JOB_ID > "$jobWorkspace /slurm_job_id.txt"
1132+
11301133 export jobWorkspace=$jobWorkspace
11311134 export tarName=$tarName
11321135 export llmTarfile=$llmTarfile
@@ -1198,8 +1201,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11981201 def scriptExec = """ #!/bin/bash
11991202 set -xEeuo pipefail
12001203 trap 'rc=\$ ?; echo "Error in file \$ {BASH_SOURCE[0]} on line \$ LINENO: \$ BASH_COMMAND (exit \$ rc)"; exit \$ rc' ERR
1201- rm -rf ${ outputPath}
1202- touch ${ outputPath}
1204+
1205+ # Clean up previous job intermediate files so that retry can work
1206+ if [ -f "${ jobWorkspace} /slurm_job_id.txt" ]; then
1207+ previous_job_id=\$ (cat "${ jobWorkspace} /slurm_job_id.txt")
1208+ echo "Found previous Slurm job ID: \$ {previous_job_id}"
1209+ scancel "\$ {previous_job_id}" || true
1210+ rm -rf "${ jobWorkspace} /slurm_job_id.txt"
1211+ # Wait for 60 seconds to ensure the previous job is canceled
1212+ sleep 60
1213+ fi
1214+ rm -rf "${ jobWorkspace} /results.xml"
1215+ rm -rf "${ jobWorkspace} /report.csv"
1216+ rm -rf "${ jobWorkspace} /unfinished_test.txt"
1217+ rm -rf "${ outputPath} "
1218+
1219+ touch "${ outputPath} "
12031220 jobId=\$ (sbatch ${ scriptLaunchPathNode} | awk '{print \$ 4}')
12041221 if [ -z "\$ jobId" ]; then
12051222 echo "Error: Job submission failed, no job ID returned."
0 commit comments