Skip to content

Commit c5f522d

Browse files
committed
Add retry logic to fetch the Slurm sbatch job log when the SSH connection drops
Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com>
1 parent b51258a commit c5f522d

File tree

2 files changed

+60
-21
lines changed

2 files changed

+60
-21
lines changed

jenkins/L0_Test.groovy

Lines changed: 60 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -832,8 +832,10 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
832832
def outputPath = "${jobWorkspace}/job-output.log"
833833
def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
834834
def scriptLaunchPathNode = "${jobWorkspace}/slurm_launch.sh"
835-
def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
836-
def scriptExecPathNode = "${jobWorkspace}/slurm_exec.sh"
835+
def scriptSubmitPathLocal = Utils.createTempLocation(pipeline, "./slurm_submit.sh")
836+
def scriptSubmitPathNode = "${jobWorkspace}/slurm_submit.sh"
837+
def scriptTrackPathLocal = Utils.createTempLocation(pipeline, "./slurm_track.sh")
838+
def scriptTrackPathNode = "${jobWorkspace}/slurm_track.sh"
837839
def isAarch64 = config.contains("aarch64")
838840
def coverageConfigFile = "${jobWorkspace}/.coveragerc"
839841

@@ -962,15 +964,41 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
962964
scriptLaunchPathNode,
963965
true
964966
)
965-
def scriptExec = """
967+
def scriptSubmit = """#!/bin/bash
968+
set -Eeuo pipefail
969+
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
966970
touch ${outputPath}
967971
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
968972
if [ -z "\$jobId" ]; then
969973
echo "Error: Job submission failed, no job ID returned."
970974
exit 1
971975
fi
972976
echo "Submitted job \$jobId"
973-
tail -f ${outputPath} &
977+
# save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
978+
echo \$jobId > $jobWorkspace/slurm_job_id.txt
979+
""".replaceAll("(?m)^\\s*", "").trim()
980+
pipeline.writeFile(file: scriptSubmitPathLocal, text: scriptSubmit)
981+
Utils.copyFileToRemoteHost(
982+
pipeline,
983+
remote,
984+
scriptSubmitPathLocal,
985+
scriptSubmitPathNode,
986+
true
987+
)
988+
}
989+
stage("[${stageName}] Run Pytest") {
990+
// Submit the sbatch job
991+
Utils.exec(
992+
pipeline,
993+
timeout: false,
994+
script: Utils.sshUserCmd(
995+
remote,
996+
scriptSubmitPathNode
997+
)
998+
)
999+
def scriptTrack = """#!/bin/bash
1000+
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
1001+
tail -f $outputPath &
9741002
tailPid=\$!
9751003
# Wait until sbatch job is done.
9761004
while squeue -j \$jobId -o %T >/dev/null 2>&1; do
@@ -984,27 +1012,41 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
9841012
echo "Pytest failed in Slurm job \$jobId with exit code \$EXIT_CODE"
9851013
exit \$EXIT_CODE
9861014
fi
987-
""".replaceAll("(?m)^\\s*", "").trim()
988-
pipeline.writeFile(file: scriptExecPathLocal, text: scriptExec)
1015+
"""
1016+
pipeline.writeFile(file: scriptTrackPathLocal, text: scriptTrack)
9891017
Utils.copyFileToRemoteHost(
9901018
pipeline,
9911019
remote,
992-
scriptExecPathLocal,
993-
scriptExecPathNode,
1020+
scriptTrackPathLocal,
1021+
scriptTrackPathNode,
9941022
true
9951023
)
996-
}
997-
stage("[${stageName}] Run Pytest") {
998-
Utils.exec(
999-
pipeline,
1000-
timeout: false,
1001-
script: Utils.sshUserCmd(
1002-
remote,
1003-
scriptExecPathNode
1024+
while (true) {
1025+
// Check if the job is done by running squeue via SSH
1026+
def result = Utils.exec(
1027+
pipeline,
1028+
returnStdout: true,
1029+
script: Utils.sshUserCmd(
1030+
remote,
1031+
"'set jobId = `cat $jobWorkspace/slurm_job_id.txt`; squeue -j $jobId -h'"
1032+
)
10041033
)
1005-
)
1034+
if (result == "") {
1035+
echo "Job $jobId is done."
1036+
break
1037+
} else {
1038+
echo "Job $jobId is still running, pulling the job log."
1039+
Utils.exec(
1040+
pipeline,
1041+
timeout: false,
1042+
script: Utils.sshUserCmd(
1043+
remote,
1044+
scriptTrackPathNode
1045+
)
1046+
)
1047+
}
1048+
}
10061049
}
1007-
10081050
echo "Finished test stage execution."
10091051
}
10101052
} finally {

jenkins/scripts/slurm_run.sh

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,6 @@ set_value_in_command() {
3030
}
3131

3232
if [ $SLURM_LOCALID -eq 0 ]; then
33-
# save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
34-
echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
35-
3633
wget -nv $llmTarfile
3734
tar -zxf $tarName
3835
which python3

0 commit comments

Comments
 (0)