@@ -933,15 +933,15 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
933933 def scriptBashUtilsPathNode = " ${ jobWorkspace} /${ jobUID} -bash_utils.sh"
934934 def testListPathNode = " ${ jobWorkspace} /${ testList} .txt"
935935 def waivesListPathNode = " ${ jobWorkspace} /waives.txt"
936- def outputPath = " ${ jobWorkspace} /job-output.log"
936+ def sbatchLogPath = " ${ jobWorkspace} /job-output.log"
937937 def scriptLaunchPathLocal = Utils . createTempLocation(pipeline, " ./slurm_launch.sh" )
938- def scriptLaunchPathNode = " ${ jobWorkspace} /slurm_launch.sh"
938+ def scriptLaunchPathNode = " ${ jobWorkspace} /${ jobUID } - slurm_launch.sh"
939939 def scriptSubmitPathLocal = Utils . createTempLocation(pipeline, " ./slurm_submit.sh" )
940- def scriptSubmitPathNode = " ${ jobWorkspace} /slurm_submit.sh"
940+ def scriptSubmitPathNode = " ${ jobWorkspace} /${ jobUID } - slurm_submit.sh"
941941 def scriptTrackPathLocal = Utils . createTempLocation(pipeline, " ./slurm_track.sh" )
942- def scriptTrackPathNode = " ${ jobWorkspace} /slurm_track.sh"
942+ def scriptTrackPathNode = " ${ jobWorkspace} /${ jobUID } - slurm_track.sh"
943943 def scriptStatusPathLocal = Utils . createTempLocation(pipeline, " ./slurm_status.sh" )
944- def scriptStatusPathNode = " ${ jobWorkspace} /slurm_status.sh"
944+ def scriptStatusPathNode = " ${ jobWorkspace} /${ jobUID } - slurm_status.sh"
945945 def isAarch64 = config. contains(" aarch64" )
946946 def coverageConfigFile = " ${ jobWorkspace} /.coveragerc"
947947
@@ -1138,8 +1138,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11381138 " export ${ varName} =\" ${ escapedValue} \" "
11391139 }. join(' \n ' )
11401140
1141- // Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
1142- def scriptLaunchPrefix = """ #!/bin/bash
1141+ def scriptContent = """ #!/bin/bash
11431142 #SBATCH ${ exemptionComment}
11441143 #SBATCH --output=${ outputPath}
11451144 ${ taskArgs.collect { "#SBATCH $it" }.join('\n')}
@@ -1235,9 +1234,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
12351234 rm -rf "${ jobWorkspace} /results.xml"
12361235 rm -rf "${ jobWorkspace} /report.csv"
12371236 rm -rf "${ jobWorkspace} /unfinished_test.txt"
1238- rm -rf "${ outputPath } "
1237+ rm -rf "${ sbatchLogPath } "
12391238
1240- touch " ${ outputPath } "
1239+ touch ${ sbatchLogPath }
12411240 jobId=\$ (sbatch ${ scriptLaunchPathNode} | awk '{print \$ 4}')
12421241 if [ -z "\$ jobId" ]; then
12431242 echo "Error: Slurm job submission failed, no job ID returned."
@@ -1269,14 +1268,13 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
12691268 )
12701269 def scriptTrack = """ #!/bin/bash
12711270 jobId=\$ (cat $jobWorkspace /slurm_job_id.txt)
1272- tail -f $o utputPath &
1271+ tail -f ${ sbatchLogPath } &
12731272 tailPid=\$ !
12741273 # Wait until sbatch job is done.
12751274 while true; do
1276- state=\$ (sacct -j \$ jobId --format=JobIDRaw,State --noheader | \
1277- awk -v jobId=\$ jobId '""\$ 1"" == jobId {print \$ 2}')
1278- if [[ -z "\$ state" || "\$ state" == "RUNNING" || \
1279- "\$ state" == "PENDING"]]; then
1275+ state=\$ (sacct -j \$ jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$ jobId '""\$ 1"" == jobId {print \$ 2}')
1276+ if [[ -z \$ state || \$ state == "RUNNING" || \$ state == "PENDING" ]]; then
1277+ echo "job is still running"
12801278 sleep 300
12811279 else
12821280 echo "Job \$ jobId finished with state: \$ state"
@@ -1328,8 +1326,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13281326 )
13291327 def scriptStatus = """ #!/bin/bash
13301328 jobId=\$ (cat $jobWorkspace /slurm_job_id.txt)
1331- sacct -j \$ jobId --format=JobIDRaw,State --noheader |\
1332- awk -v jobId=\$ jobId '""\$ 1"" == jobId {print \$ 2}'
1329+ sacct -j \$ jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$ jobId '""\$ 1"" == jobId {print \$ 2}'
13331330 """
13341331 pipeline. writeFile(file : scriptStatusPathLocal, text : scriptStatus)
13351332 Utils . copyFileToRemoteHost(
@@ -1366,13 +1363,10 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13661363 remote,
13671364 scriptStatusPathNode
13681365 )
1369- )
1370- println (result)
1371- if (result == " " ) {
1372- echo " Job is done."
1373- break
1374- } else {
1366+ ). trim()
1367+ if (! result || result == " RUNNING" || result == " PENDING" ) {
13751368 echo " Job is still running, pulling the job log."
1369+ // Pulling the sbatch output log
13761370 Utils . exec(
13771371 pipeline,
13781372 timeout : false ,
@@ -1381,6 +1375,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13811375 scriptTrackPathNode
13821376 )
13831377 )
1378+ } else {
1379+ echo " Job is done."
1380+ break
13841381 }
13851382 }
13861383 }
0 commit comments