@@ -933,15 +933,15 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
933933 def scriptBashUtilsPathNode = " ${ jobWorkspace} /${ jobUID} -bash_utils.sh"
934934 def testListPathNode = " ${ jobWorkspace} /${ testList} .txt"
935935 def waivesListPathNode = " ${ jobWorkspace} /waives.txt"
936- def outputPath = " ${ jobWorkspace} /job-output.log"
936+ def sbatchLogPath = " ${ jobWorkspace} /job-output.log"
937937 def scriptLaunchPathLocal = Utils . createTempLocation(pipeline, " ./slurm_launch.sh" )
938- def scriptLaunchPathNode = " ${ jobWorkspace} /slurm_launch.sh"
938+ def scriptLaunchPathNode = " ${ jobWorkspace} /${ jobUID } - slurm_launch.sh"
939939 def scriptSubmitPathLocal = Utils . createTempLocation(pipeline, " ./slurm_submit.sh" )
940- def scriptSubmitPathNode = " ${ jobWorkspace} /slurm_submit.sh"
940+ def scriptSubmitPathNode = " ${ jobWorkspace} /${ jobUID } - slurm_submit.sh"
941941 def scriptTrackPathLocal = Utils . createTempLocation(pipeline, " ./slurm_track.sh" )
942- def scriptTrackPathNode = " ${ jobWorkspace} /slurm_track.sh"
942+ def scriptTrackPathNode = " ${ jobWorkspace} /${ jobUID } - slurm_track.sh"
943943 def scriptStatusPathLocal = Utils . createTempLocation(pipeline, " ./slurm_status.sh" )
944- def scriptStatusPathNode = " ${ jobWorkspace} /slurm_status.sh"
944+ def scriptStatusPathNode = " ${ jobWorkspace} /${ jobUID } - slurm_status.sh"
945945 def isAarch64 = config. contains(" aarch64" )
946946 def coverageConfigFile = " ${ jobWorkspace} /.coveragerc"
947947 def perfCheckScriptLocal = " ${ llmSrcLocal} /tests/integration/defs/perf/perf_regression_check.py"
@@ -1150,8 +1150,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11501150 " export ${ varName} =\" ${ escapedValue} \" "
11511151 }. join(' \n ' )
11521152
1153- // Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
1154- def scriptLaunchPrefix = """ #!/bin/bash
1153+ def scriptContent = """ #!/bin/bash
11551154 #SBATCH ${ exemptionComment}
11561155 #SBATCH --output=${ outputPath}
11571156 ${ taskArgs.collect { "#SBATCH $it" }.join('\n')}
@@ -1247,9 +1246,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
12471246 rm -rf "${ jobWorkspace} /results.xml"
12481247 rm -rf "${ jobWorkspace} /report.csv"
12491248 rm -rf "${ jobWorkspace} /unfinished_test.txt"
1250- rm -rf "${ outputPath } "
1249+ rm -rf "${ sbatchLogPath } "
12511250
1252- touch " ${ outputPath } "
1251+ touch ${ sbatchLogPath }
12531252 jobId=\$ (sbatch ${ scriptLaunchPathNode} | awk '{print \$ 4}')
12541253 if [ -z "\$ jobId" ]; then
12551254 echo "Error: Slurm job submission failed, no job ID returned."
@@ -1281,14 +1280,13 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
12811280 )
12821281 def scriptTrack = """ #!/bin/bash
12831282 jobId=\$ (cat $jobWorkspace /slurm_job_id.txt)
1284- tail -f $o utputPath &
1283+ tail -f ${ sbatchLogPath } &
12851284 tailPid=\$ !
12861285 # Wait until sbatch job is done.
12871286 while true; do
1288- state=\$ (sacct -j \$ jobId --format=JobIDRaw,State --noheader | \
1289- awk -v jobId=\$ jobId '""\$ 1"" == jobId {print \$ 2}')
1290- if [[ -z "\$ state" || "\$ state" == "RUNNING" || \
1291- "\$ state" == "PENDING"]]; then
1287+ state=\$ (sacct -j \$ jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$ jobId '""\$ 1"" == jobId {print \$ 2}')
1288+ if [[ -z \$ state || \$ state == "RUNNING" || \$ state == "PENDING" ]]; then
1289+ echo "job is still running"
12921290 sleep 300
12931291 else
12941292 echo "Job \$ jobId finished with state: \$ state"
@@ -1340,8 +1338,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13401338 )
13411339 def scriptStatus = """ #!/bin/bash
13421340 jobId=\$ (cat $jobWorkspace /slurm_job_id.txt)
1343- sacct -j \$ jobId --format=JobIDRaw,State --noheader |\
1344- awk -v jobId=\$ jobId '""\$ 1"" == jobId {print \$ 2}'
1341+ sacct -j \$ jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$ jobId '""\$ 1"" == jobId {print \$ 2}'
13451342 """
13461343 pipeline. writeFile(file : scriptStatusPathLocal, text : scriptStatus)
13471344 Utils . copyFileToRemoteHost(
@@ -1378,13 +1375,10 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13781375 remote,
13791376 scriptStatusPathNode
13801377 )
1381- )
1382- println (result)
1383- if (result == " " ) {
1384- echo " Job is done."
1385- break
1386- } else {
1378+ ). trim()
1379+ if (! result || result == " RUNNING" || result == " PENDING" ) {
13871380 echo " Job is still running, pulling the job log."
1381+ // Pulling the sbatch output log
13881382 Utils . exec(
13891383 pipeline,
13901384 timeout : false ,
@@ -1393,6 +1387,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13931387 scriptTrackPathNode
13941388 )
13951389 )
1390+ } else {
1391+ echo " Job is done."
1392+ break
13961393 }
13971394 }
13981395 }
0 commit comments