Skip to content

Commit 89f95d0

Browse files
committed
Further fixes
Signed-off-by: Yanchao Lu <[email protected]>
1 parent 5722ebb commit 89f95d0

File tree

1 file changed

+24
-7
lines changed

1 file changed

+24
-7
lines changed

jenkins/L0_Test.groovy

Lines changed: 24 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -461,7 +461,7 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
461461
def cleanupCommands = [
462462
"rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
463463
"rm -rf ${jobWorkspace} || true",
464-
].join(" && ")
464+
].join(" ; ")
465465
Utils.exec(
466466
pipeline,
467467
script: Utils.sshUserCmd(
@@ -511,7 +511,7 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
511511
def cleanupCommands = [
512512
"rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-${entrypoint} || true",
513513
"rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
514-
].join(" && ")
514+
].join(" ; ")
515515
Utils.exec(
516516
pipeline,
517517
script: Utils.sshUserCmd(
@@ -924,7 +924,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
924924
def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh"
925925
def testListPathNode = "${jobWorkspace}/${testList}.txt"
926926
def waivesListPathNode = "${jobWorkspace}/waives.txt"
927-
def outputPath = "${jobWorkspace}/job-output-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}.log"
927+
def outputPath = "${jobWorkspace}/job-output.log"
928928
def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
929929
def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
930930
def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
@@ -1122,11 +1122,14 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11221122
${taskArgs.collect { "#SBATCH $it" }.join('\n')}
11231123
#SBATCH ${partition.additionalArgs}
11241124
${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
1125-
echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
1126-
echo "\$SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
11271125
1126+
# SBATCH directives must appear before any executable commands.
11281127
set -xEeuo pipefail
11291128
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
1129+
1130+
echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
1131+
echo \$SLURM_JOB_ID > "$jobWorkspace/slurm_job_id.txt"
1132+
11301133
export jobWorkspace=$jobWorkspace
11311134
export tarName=$tarName
11321135
export llmTarfile=$llmTarfile
@@ -1198,8 +1201,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11981201
def scriptExec = """#!/bin/bash
11991202
set -xEeuo pipefail
12001203
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
1201-
rm -rf ${outputPath}
1202-
touch ${outputPath}
1204+
1205+
# Clean up previous job intermediate files so that retry can work
1206+
if [ -f "${jobWorkspace}/slurm_job_id.txt" ]; then
1207+
previous_job_id=\$(cat "${jobWorkspace}/slurm_job_id.txt")
1208+
echo "Found previous Slurm job ID: \${previous_job_id}"
1209+
scancel "\${previous_job_id}" || true
1210+
rm -rf "${jobWorkspace}/slurm_job_id.txt"
1211+
# Wait for 60 seconds to ensure the previous job is canceled
1212+
sleep 60
1213+
fi
1214+
rm -rf "${jobWorkspace}/results.xml"
1215+
rm -rf "${jobWorkspace}/report.csv"
1216+
rm -rf "${jobWorkspace}/unfinished_test.txt"
1217+
rm -rf "${outputPath}"
1218+
1219+
touch "${outputPath}"
12031220
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
12041221
if [ -z "\$jobId" ]; then
12051222
echo "Error: Job submission failed, no job ID returned."

0 commit comments

Comments
 (0)