@@ -933,11 +933,16 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
     def testListPathNode = "${jobWorkspace}/${testList}.txt"
     def waivesListPathNode = "${jobWorkspace}/waives.txt"
-    def outputPath = "${jobWorkspace}/job-output.log"
+    def sbatchLogPath = "${jobWorkspace}/job-output.log"
     def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
     def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
-    def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
-    def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
+    def scriptSubmitPathLocal = Utils.createTempLocation(pipeline, "./slurm_submit.sh")
+    def scriptSubmitPathNode = "${jobWorkspace}/${jobUID}-slurm_submit.sh"
+    def scriptTrackPathLocal = Utils.createTempLocation(pipeline, "./slurm_track.sh")
+    def scriptTrackPathNode = "${jobWorkspace}/${jobUID}-slurm_track.sh"
+    def scriptStatusPathLocal = Utils.createTempLocation(pipeline, "./slurm_status.sh")
+    def scriptStatusPathNode = "${jobWorkspace}/${jobUID}-slurm_status.sh"
+    def isAarch64 = config.contains("aarch64")
     def coverageConfigFile = "${jobWorkspace}/.coveragerc"

     stage("[${stageName}] Initializing Test") {
@@ -1133,10 +1138,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11331138 " export ${ varName} =\" ${ escapedValue} \" "
11341139 }. join(' \n ' )
11351140
1136- // Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
11371141 def scriptLaunchPrefix = """ #!/bin/bash
11381142 #SBATCH ${ exemptionComment}
1139- #SBATCH --output=${ outputPath }
1143+ #SBATCH --output=${ sbatchLogPath }
11401144 ${ taskArgs.collect { "#SBATCH $it" }.join('\n')}
11411145 #SBATCH ${ partition.additionalArgs}
11421146 ${ partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"}
@@ -1214,9 +1218,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
             scriptLaunchPathNode,
             true
         )
-
-        def scriptExec = """#!/bin/bash
-            set -xEeuo pipefail
+        def scriptSubmit = """#!/bin/bash
+            set -Eeuo pipefail
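+            # -E (errtrace) lets the ERR trap below fire inside functions and subshells as well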
             trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR

             # Clean up previous job intermediate files so that retry can work
@@ -1231,21 +1234,60 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
             rm -rf "${jobWorkspace}/results.xml"
             rm -rf "${jobWorkspace}/report.csv"
             rm -rf "${jobWorkspace}/unfinished_test.txt"
-            rm -rf "${outputPath}"
+            rm -rf "${sbatchLogPath}"

-            touch "${outputPath}"
+            touch ${sbatchLogPath}
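+            # sbatch prints "Submitted batch job <id>" on success; awk field 4 extracts the ID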
             jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
             if [ -z "\$jobId" ]; then
                 echo "Error: Slurm job submission failed, no job ID returned."
                 exit 1
             fi
             echo "Submitted Slurm job \$jobId"
-            echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt"
-            tail -f ${outputPath} &
+            # Save job ID in $jobWorkspace/slurm_job_id.txt for a later job to retrieve
+            echo \$jobId > $jobWorkspace/slurm_job_id.txt
+        """.replaceAll("(?m)^\\s*", "").trim()
+        pipeline.writeFile(file: scriptSubmitPathLocal, text: scriptSubmit)
+        Utils.copyFileToRemoteHost(
+            pipeline,
+            remote,
+            scriptSubmitPathLocal,
+            scriptSubmitPathNode,
+            true
+        )
+    }
+    stage("[${stageName}] Run Pytest") {
+        // Submit the sbatch job
+        Utils.exec(
+            pipeline,
+            timeout: false,
+            script: Utils.sshUserCmd(
+                remote,
+                scriptSubmitPathNode
+            ),
+            numRetries: 3
+        )
+        def sbatchJobId = Utils.exec(
+            pipeline,
+            returnStdout: true,
+            script: Utils.sshUserCmd(
+                remote,
+                "cat $jobWorkspace/slurm_job_id.txt"
+            )
+        ).trim()
+        def scriptTrack = """#!/bin/bash
+            jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
+            tail -f ${sbatchLogPath} &
             tailPid=\$!
             # Wait until sbatch job is done.
-            while squeue -j \$jobId -o %T >/dev/null 2>&1; do
-                sleep 300
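+            # squeue only lists queued/running jobs; sacct also reports terminal states (COMPLETED, FAILED, CANCELLED, TIMEOUT)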
+            while true; do
+                state=\$(sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}')
+                if [[ -z \$state || \$state == "RUNNING" || \$state == "PENDING" || \$state == "CONFIGURING" ]]; then
+                    echo "job is still running"
+                    sleep 300
+                else
+                    echo "Job \$jobId finished with state: \$state"
+                    break
+                fi
             done
             # Kill tail -f process
             kill \$tailPid
@@ -1282,28 +1324,55 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
                 exit 1
             fi
         """.replaceAll("(?m)^\\s*", "").trim()
-        pipeline.writeFile(file: scriptExecPathLocal, text: scriptExec)
-        Utils.exec(pipeline, script: "echo \"Script to trigger Slurm submission job: \" && cat ${scriptExecPathLocal}")
+        pipeline.writeFile(file: scriptTrackPathLocal, text: scriptTrack)
         Utils.copyFileToRemoteHost(
             pipeline,
             remote,
-            scriptExecPathLocal,
-            scriptExecPathNode,
+            scriptTrackPathLocal,
+            scriptTrackPathNode,
             true
         )
-    }
-    stage("[${stageName}] Run Pytest") {
-        Utils.exec(
+        def scriptStatus = """#!/bin/bash
+            jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
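+            # Match the bare JobIDRaw so step rows like <id>.batch are skipped; prints only this job's state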
+            sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}'
+        """
+        pipeline.writeFile(file: scriptStatusPathLocal, text: scriptStatus)
+        Utils.copyFileToRemoteHost(
             pipeline,
-            timeout: false,
-            script: Utils.sshUserCmd(
-                remote,
-                "\"${scriptExecPathNode}\""
-            ),
-            numRetries: 3
+            remote,
+            scriptStatusPathLocal,
+            scriptStatusPathNode,
+            true
         )
-    }

+        sh "cat $scriptStatusPathLocal"
+        while (true) {
+            // Check if the job is done by running sacct via SSH
+            def result = Utils.exec(
+                pipeline,
+                returnStdout: true,
+                script: Utils.sshUserCmd(
+                    remote,
+                    scriptStatusPathNode
+                )
+            ).trim()
+            if (!result || result == "RUNNING" || result == "PENDING" || result == "CONFIGURING") {
+                echo "Slurm job $sbatchJobId is still running, pulling the job log."
+                // Pull the sbatch output log
+                Utils.exec(
+                    pipeline,
+                    timeout: false,
+                    script: Utils.sshUserCmd(
+                        remote,
+                        scriptTrackPathNode
+                    )
+                )
+            } else {
+                echo "Slurm job $sbatchJobId is done."
+                break
+            }
+        }
+    }
     echo "Finished test stage execution."
 }
 } finally {