Skip to content

Commit 9e2783d

Browse files
committed
Further fixes
Signed-off-by: Yanchao Lu <[email protected]>
1 parent 5722ebb commit 9e2783d

File tree

1 file changed

+22
-5
lines changed

1 file changed

+22
-5
lines changed

jenkins/L0_Test.groovy

Lines changed: 22 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -924,7 +924,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
924924
def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh"
925925
def testListPathNode = "${jobWorkspace}/${testList}.txt"
926926
def waivesListPathNode = "${jobWorkspace}/waives.txt"
927-
def outputPath = "${jobWorkspace}/job-output-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}.log"
927+
def outputPath = "${jobWorkspace}/job-output.log"
928928
def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
929929
def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
930930
def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
@@ -1122,11 +1122,14 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11221122
${taskArgs.collect { "#SBATCH $it" }.join('\n')}
11231123
#SBATCH ${partition.additionalArgs}
11241124
${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
1125-
echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
1126-
echo "\$SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
11271125
1126+
# SBATCH directives must appear before any executable commands.
11281127
set -xEeuo pipefail
11291128
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
1129+
1130+
echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
1131+
echo \$SLURM_JOB_ID > "$jobWorkspace/slurm_job_id.txt"
1132+
11301133
export jobWorkspace=$jobWorkspace
11311134
export tarName=$tarName
11321135
export llmTarfile=$llmTarfile
@@ -1198,8 +1201,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11981201
def scriptExec = """#!/bin/bash
11991202
set -xEeuo pipefail
12001203
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
1201-
rm -rf ${outputPath}
1202-
touch ${outputPath}
1204+
1205+
# Clean up previous job intermediate files so that retry can work
1206+
if [ -f "${jobWorkspace}/slurm_job_id.txt" ]; then
1207+
previous_job_id=\$(cat "${jobWorkspace}/slurm_job_id.txt")
1208+
echo "Found previous Slurm job ID: \${previous_job_id}"
1209+
scancel "\${previous_job_id}" || true
1210+
rm -rf "${jobWorkspace}/slurm_job_id.txt"
1211+
# Wait for 60 seconds to ensure the previous job is canceled
1212+
sleep 60
1213+
fi
1214+
rm -rf "${jobWorkspace}/results.xml"
1215+
rm -rf "${jobWorkspace}/report.csv"
1216+
rm -rf "${jobWorkspace}/unfinished_test.txt"
1217+
rm -rf "${outputPath}"
1218+
1219+
touch "${outputPath}"
12031220
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
12041221
if [ -z "\$jobId" ]; then
12051222
echo "Error: Job submission failed, no job ID returned."

0 commit comments

Comments
 (0)