Skip to content

Commit 821fb59

Browse files
committed
Add retry logic to fetch the Slurm sbatch job log when the SSH connection drops
Signed-off-by: Yuanjing Xue <[email protected]>
1 parent b51258a commit 821fb59

File tree

2 files changed

+64
-14
lines changed

2 files changed

+64
-14
lines changed

jenkins/L0_Test.groovy

Lines changed: 64 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -832,8 +832,10 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
832832
def outputPath = "${jobWorkspace}/job-output.log"
833833
def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
834834
def scriptLaunchPathNode = "${jobWorkspace}/slurm_launch.sh"
835-
def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
836-
def scriptExecPathNode = "${jobWorkspace}/slurm_exec.sh"
835+
def scriptSubmitPathLocal = Utils.createTempLocation(pipeline, "./slurm_submit.sh")
836+
def scriptSubmitPathNode = "${jobWorkspace}/slurm_submit.sh"
837+
def scriptTrackPathLocal = Utils.createTempLocation(pipeline, "./slurm_track.sh")
838+
def scriptTrackPathNode = "${jobWorkspace}/slurm_track.sh"
837839
def isAarch64 = config.contains("aarch64")
838840
def coverageConfigFile = "${jobWorkspace}/.coveragerc"
839841

@@ -962,15 +964,43 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
962964
scriptLaunchPathNode,
963965
true
964966
)
965-
def scriptExec = """
967+
def scriptSubmit = """
968+
set -Eeuo pipefail
969+
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
966970
touch ${outputPath}
967971
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
968972
if [ -z "\$jobId" ]; then
969973
echo "Error: Job submission failed, no job ID returned."
970974
exit 1
971975
fi
972976
echo "Submitted job \$jobId"
973-
tail -f ${outputPath} &
977+
# save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
978+
echo \$jobId > \$jobWorkspace/slurm_job_id.txt
979+
""".replaceAll("(?m)^\\s*", "").trim()
980+
pipeline.writeFile(file: scriptSubmitPathLocal, text: scriptSubmit)
981+
Utils.copyFileToRemoteHost(
982+
pipeline,
983+
remote,
984+
scriptSubmitPathLocal,
985+
scriptSubmitPathNode,
986+
true
987+
)
988+
}
989+
stage("[${stageName}] Run Pytest") {
990+
// Submit the sbatch job
991+
Utils.exec(
992+
pipeline,
993+
timeout: false,
994+
script: Utils.sshUserCmd(
995+
remote,
996+
scriptSubmitPathNode
997+
)
998+
)
999+
def scriptTrack = """
1000+
set -Eeuo pipefail
1001+
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
1002+
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
1003+
tail -f $outputPath &
9741004
tailPid=\$!
9751005
# Wait until sbatch job is done.
9761006
while squeue -j \$jobId -o %T >/dev/null 2>&1; do
@@ -984,17 +1014,15 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
9841014
echo "Pytest failed in Slurm job \$jobId with exit code \$EXIT_CODE"
9851015
exit \$EXIT_CODE
9861016
fi
987-
""".replaceAll("(?m)^\\s*", "").trim()
988-
pipeline.writeFile(file: scriptExecPathLocal, text: scriptExec)
1017+
"""
1018+
pipeline.writeFile(file: scriptTrackPathLocal, text: scriptTrack)
9891019
Utils.copyFileToRemoteHost(
9901020
pipeline,
9911021
remote,
992-
scriptExecPathLocal,
993-
scriptExecPathNode,
1022+
scriptTrackPathLocal,
1023+
scriptTrackPathNode,
9941024
true
9951025
)
996-
}
997-
stage("[${stageName}] Run Pytest") {
9981026
Utils.exec(
9991027
pipeline,
10001028
timeout: false,
@@ -1003,8 +1031,33 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
10031031
scriptExecPathNode
10041032
)
10051033
)
1034+
while (true) {
1035+
// Check if the job is done by running squeue via SSH
1036+
def result = Utils.exec(
1037+
pipeline,
1038+
returnStdout: true,
1039+
script: Utils.sshUserCmd(
1040+
remote,
1041+
"""
1042+
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
1043+
squeue -j \$jobId -h
1044+
"""
1045+
)
1046+
)
1047+
if (result == "") {
1048+
break
1049+
} else {
1050+
Utils.exec(
1051+
pipeline,
1052+
timeout: false,
1053+
script: Utils.sshUserCmd(
1054+
remote,
1055+
scriptExecPathNode
1056+
)
1057+
)
1058+
}
1059+
}
10061060
}
1007-
10081061
echo "Finished test stage execution."
10091062
}
10101063
} finally {

jenkins/scripts/slurm_run.sh

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,6 @@ set_value_in_command() {
3030
}
3131

3232
if [ $SLURM_LOCALID -eq 0 ]; then
33-
# save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
34-
echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
35-
3633
wget -nv $llmTarfile
3734
tar -zxf $tarName
3835
which python3

0 commit comments

Comments
 (0)