
Commit a6ca621

yuanjingx87 authored and videodanchik committed
[None][infra] add retry logic to get slurm sbatch job log when ssh dropped (NVIDIA#9167)
Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com>
Signed-off-by: Daniil Kulko <kulkodaniil@gmail.com>
1 parent f5f6df8 commit a6ca621


jenkins/L0_Test.groovy

Lines changed: 97 additions & 28 deletions
@@ -933,11 +933,16 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
 def testListPathNode = "${jobWorkspace}/${testList}.txt"
 def waivesListPathNode = "${jobWorkspace}/waives.txt"
-def outputPath = "${jobWorkspace}/job-output.log"
+def sbatchLogPath = "${jobWorkspace}/job-output.log"
 def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
 def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
-def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
-def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
+def scriptSubmitPathLocal = Utils.createTempLocation(pipeline, "./slurm_submit.sh")
+def scriptSubmitPathNode = "${jobWorkspace}/${jobUID}-slurm_submit.sh"
+def scriptTrackPathLocal = Utils.createTempLocation(pipeline, "./slurm_track.sh")
+def scriptTrackPathNode = "${jobWorkspace}/${jobUID}-slurm_track.sh"
+def scriptStatusPathLocal = Utils.createTempLocation(pipeline, "./slurm_status.sh")
+def scriptStatusPathNode = "${jobWorkspace}/${jobUID}-slurm_status.sh"
+def isAarch64 = config.contains("aarch64")
 def coverageConfigFile = "${jobWorkspace}/.coveragerc"
 
 stage("[${stageName}] Initializing Test") {
@@ -1133,10 +1138,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 "export ${varName}=\"${escapedValue}\""
 }.join('\n')
 
-// Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
 def scriptLaunchPrefix = """#!/bin/bash
 #SBATCH ${exemptionComment}
-#SBATCH --output=${outputPath}
+#SBATCH --output=${sbatchLogPath}
 ${taskArgs.collect { "#SBATCH $it" }.join('\n')}
 #SBATCH ${partition.additionalArgs}
 ${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"}
@@ -1214,9 +1218,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 scriptLaunchPathNode,
 true
 )
-
-def scriptExec = """#!/bin/bash
-set -xEeuo pipefail
+def scriptSubmit = """#!/bin/bash
+set -Eeuo pipefail
 trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
 
 # Clean up previous job intermediate files so that retry can work
@@ -1231,21 +1234,60 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 rm -rf "${jobWorkspace}/results.xml"
 rm -rf "${jobWorkspace}/report.csv"
 rm -rf "${jobWorkspace}/unfinished_test.txt"
-rm -rf "${outputPath}"
+rm -rf "${sbatchLogPath}"
 
-touch "${outputPath}"
+touch ${sbatchLogPath}
 jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
 if [ -z "\$jobId" ]; then
 echo "Error: Slurm job submission failed, no job ID returned."
 exit 1
 fi
 echo "Submitted Slurm job \$jobId"
-echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt"
-tail -f ${outputPath} &
+# save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
+echo \$jobId > $jobWorkspace/slurm_job_id.txt
+""".replaceAll("(?m)^\\s*", "").trim()
+pipeline.writeFile(file: scriptSubmitPathLocal, text: scriptSubmit)
+Utils.copyFileToRemoteHost(
+pipeline,
+remote,
+scriptSubmitPathLocal,
+scriptSubmitPathNode,
+true
+)
+}
+stage("[${stageName}] Run Pytest") {
+// Submit the sbatch job
+Utils.exec(
+pipeline,
+timeout: false,
+script: Utils.sshUserCmd(
+remote,
+scriptSubmitPathNode
+),
+numRetries: 3
+)
+def sbatchJobId = Utils.exec(
+pipeline,
+returnStdout: true,
+script: Utils.sshUserCmd(
+remote,
+"cat $jobWorkspace/slurm_job_id.txt"
+)
+).trim()
+def scriptTrack = """#!/bin/bash
+jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
+tail -f ${sbatchLogPath} &
 tailPid=\$!
 # Wait until sbatch job is done.
-while squeue -j \$jobId -o %T >/dev/null 2>&1; do
-sleep 300
+while true; do
+state=\$(sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}')
+if [[ -z \$state || \$state == "RUNNING" || \$state == "PENDING" || \$state == "CONFIGURING" ]]; then
+echo "job is still running"
+sleep 300
+else
+echo "Job \$jobId finished with state: \$state"
+break
+fi
 done
 # Kill tail -f process
 kill \$tailPid
@@ -1282,28 +1324,55 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 exit 1
 fi
 """.replaceAll("(?m)^\\s*", "").trim()
-pipeline.writeFile(file: scriptExecPathLocal, text: scriptExec)
-Utils.exec(pipeline, script: "echo \"Script to trigger Slurm submission job: \" && cat ${scriptExecPathLocal}")
+pipeline.writeFile(file: scriptTrackPathLocal, text: scriptTrack)
 Utils.copyFileToRemoteHost(
 pipeline,
 remote,
-scriptExecPathLocal,
-scriptExecPathNode,
+scriptTrackPathLocal,
+scriptTrackPathNode,
 true
 )
-}
-stage("[${stageName}] Run Pytest") {
-Utils.exec(
+def scriptStatus = """#!/bin/bash
+jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
+sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}'
+"""
+pipeline.writeFile(file: scriptStatusPathLocal, text: scriptStatus)
+Utils.copyFileToRemoteHost(
 pipeline,
-timeout: false,
-script: Utils.sshUserCmd(
-remote,
-"\"${scriptExecPathNode}\""
-),
-numRetries: 3
+remote,
+scriptStatusPathLocal,
+scriptStatusPathNode,
+true
 )
-}
 
+sh "cat $scriptStatusPathLocal"
+while (true) {
+// Check if the job is done by running sacct via SSH
+def result = Utils.exec(
+pipeline,
+returnStdout: true,
+script: Utils.sshUserCmd(
+remote,
+scriptStatusPathNode
+)
+).trim()
+if (!result || result == "RUNNING" || result == "PENDING" || result == "CONFIGURING") {
+echo "Slurm job $sbatchJobId is still running, pulling the job log."
+// Pulling the sbatch output log
+Utils.exec(
+pipeline,
+timeout: false,
+script: Utils.sshUserCmd(
+remote,
+scriptTrackPathNode
+)
+)
+} else {
+echo "Slurm job $sbatchJobId is done."
+break
+}
+}
+}
 echo "Finished test stage execution."
 }
 } finally {
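
In short, the change splits the old slurm_exec.sh into three scripts (slurm_submit.sh, slurm_track.sh, slurm_status.sh) and moves the wait loop onto the Jenkins controller, so a dropped SSH session only interrupts one status or track call instead of failing the whole stage. Below is a minimal standalone sketch of that control flow in plain bash; it is not code from the commit: REMOTE, WORKSPACE, and the bare ssh calls are placeholders standing in for the pipeline's remote configuration and its Utils.exec / Utils.sshUserCmd helpers, and the script names omit the ${jobUID} prefix used in L0_Test.groovy.

#!/bin/bash
# Hedged sketch of the submit / poll-status / re-attach-log flow.
# REMOTE and WORKSPACE are placeholders, not values from the commit.
set -Eeuo pipefail

REMOTE="user@slurm-login-node"      # placeholder for the pipeline's `remote`
WORKSPACE="/path/to/job-workspace"  # placeholder for ${jobWorkspace}

# 1) Submit once; slurm_submit.sh runs sbatch and stores the job ID in
#    $WORKSPACE/slurm_job_id.txt for the later steps.
ssh "$REMOTE" "bash $WORKSPACE/slurm_submit.sh"

# 2) Poll the job state and stream the log until the job leaves the
#    RUNNING/PENDING/CONFIGURING states. If an ssh session drops, the next
#    iteration simply re-runs slurm_track.sh, which re-attaches `tail -f`
#    to the same sbatch log, so streaming resumes instead of the stage failing.
while true; do
    state=$(ssh "$REMOTE" "bash $WORKSPACE/slurm_status.sh" || true)
    if [[ -z $state || $state == "RUNNING" || $state == "PENDING" || $state == "CONFIGURING" ]]; then
        ssh "$REMOTE" "bash $WORKSPACE/slurm_track.sh" || true   # tolerate dropped connections
    else
        echo "Slurm job finished with state: $state"
        break
    fi
done

Keeping the status check (slurm_status.sh) separate from the log streaming (slurm_track.sh) is what lets the controller-side loop in the Groovy code treat a failed or empty status result as "still running, try again" rather than aborting the stage.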
