@@ -1138,9 +1138,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11381138 " export ${ varName} =\" ${ escapedValue} \" "
11391139 }. join(' \n ' )
11401140
1141- def scriptContent = """ #!/bin/bash
1141+ def scriptLaunchPrefix = """ #!/bin/bash
11421142 #SBATCH ${ exemptionComment}
1143- #SBATCH --output=${ outputPath }
1143+ #SBATCH --output=${ sbatchLogPath }
11441144 ${ taskArgs.collect { "#SBATCH $it" }.join('\n')}
11451145 #SBATCH ${ partition.additionalArgs}
11461146 ${ partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"}
@@ -1266,14 +1266,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
12661266 ),
12671267 numRetries : 3
12681268 )
1269+ def sbatchJobId = Utils . exec(
1270+ pipeline,
1271+ returnStdout : true ,
1272+ script : Utils . sshUserCmd(
1273+ remote,
1274+ " cat $jobWorkspace /slurm_job_id.txt"
1275+ )
1276+ ). trim()
12691277 def scriptTrack = """ #!/bin/bash
12701278 jobId=\$ (cat $jobWorkspace /slurm_job_id.txt)
12711279 tail -f ${ sbatchLogPath} &
12721280 tailPid=\$ !
12731281 # Wait until sbatch job is done.
12741282 while true; do
12751283 state=\$ (sacct -j \$ jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$ jobId '""\$ 1"" == jobId {print \$ 2}')
1276- if [[ -z \$ state || \$ state == "RUNNING" || \$ state == "PENDING" ]]; then
1284+ if [[ -z \$ state || \$ state == "RUNNING" || \$ state == "PENDING" || \$ state == "CONFIGURING" ]]; then
12771285 echo "job is still running"
12781286 sleep 300
12791287 else
@@ -1364,8 +1372,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13641372 scriptStatusPathNode
13651373 )
13661374 ). trim()
1367- if (! result || result == " RUNNING" || result == " PENDING" ) {
1368- echo " Job is still running, pulling the job log."
1375+ if (! result || result == " RUNNING" || result == " PENDING" || result == " CONFIGURING " ) {
1376+ echo " Slurm job $s batchJobId is still running, pulling the job log."
13691377 // Pulling the sbatch output log
13701378 Utils . exec(
13711379 pipeline,
@@ -1376,7 +1384,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13761384 )
13771385 )
13781386 } else {
1379- echo " Job is done."
1387+ echo " Slurm job $s batchJobId is done."
13801388 break
13811389 }
13821390 }
0 commit comments