@@ -1150,7 +1150,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11501150 " export ${ varName} =\" ${ escapedValue} \" "
11511151 }. join(' \n ' )
11521152
1153- def scriptContent = """ #!/bin/bash
1153+ def scriptLaunchPrefix = """ #!/bin/bash
11541154 #SBATCH ${ exemptionComment}
11551155 #SBATCH --output=${ outputPath}
11561156 ${ taskArgs.collect { "#SBATCH $it" }.join('\n')}
@@ -1278,14 +1278,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
12781278 ),
12791279 numRetries : 3
12801280 )
1281+ def sbatchJobId = Utils . exec(
1282+ pipeline,
1283+ returnStdout : true ,
1284+ script : Utils . sshUserCmd(
1285+ remote,
1286+ " cat $jobWorkspace /slurm_job_id.txt"
1287+ )
1288+ ). trim()
12811289 def scriptTrack = """ #!/bin/bash
12821290 jobId=\$ (cat $jobWorkspace /slurm_job_id.txt)
12831291 tail -f ${ sbatchLogPath} &
12841292 tailPid=\$ !
12851293 # Wait until sbatch job is done.
12861294 while true; do
12871295 state=\$ (sacct -j \$ jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$ jobId '""\$ 1"" == jobId {print \$ 2}')
1288- if [[ -z \$ state || \$ state == "RUNNING" || \$ state == "PENDING" ]]; then
1296+ if [[ -z \$ state || \$ state == "RUNNING" || \$ state == "PENDING" || \$ state == "CONFIGURING" ]]; then
12891297 echo "job is still running"
12901298 sleep 300
12911299 else
@@ -1376,8 +1384,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13761384 scriptStatusPathNode
13771385 )
13781386 ). trim()
1379- if (! result || result == " RUNNING" || result == " PENDING" ) {
1380- echo " Job is still running, pulling the job log."
1387+ if (! result || result == " RUNNING" || result == " PENDING" || result == " CONFIGURING " ) {
1388+ echo " Slurm job $s batchJobId is still running, pulling the job log."
13811389 // Pulling the sbatch output log
13821390 Utils . exec(
13831391 pipeline,
@@ -1388,7 +1396,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13881396 )
13891397 )
13901398 } else {
1391- echo " Job is done."
1399+ echo " Slurm job $s batchJobId is done."
13921400 break
13931401 }
13941402 }
0 commit comments