@@ -1272,8 +1272,16 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
12721272 tail -f $outputPath &
12731273 tailPid=\$ !
12741274 # Wait until sbatch job is done.
1275- while squeue -j \$ jobId -o %T >/dev/null 2>&1; do
1276- sleep 300
1275+ while true; do
1276+ state=\$ (sacct -j \$ jobId --format=JobIDRaw,State --noheader | \
1277+ awk -v jobId=\$ jobId '""\$ 1"" == jobId {print \$ 2}')
1278+ if [[ -z "\$ state" || "\$ state" == "RUNNING" || \
1279+ "\$ state" == "PENDING"]]; then
1280+ sleep 300
1281+ else
1282+ echo "Job \$ jobId finished with state: \$ state"
1283+ break
1284+ fi
12771285 done
12781286 # Kill tail -f process
12791287 kill \$ tailPid
@@ -1320,7 +1328,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13201328 )
13211329 def scriptStatus = """ #!/bin/bash
13221330 jobId=\$ (cat $jobWorkspace /slurm_job_id.txt)
1323- squeue -j \$ jobId -h
1331+ sacct -j \$ jobId --format=JobIDRaw,State --noheader |\
1332+ awk -v jobId=\$ jobId '""\$ 1"" == jobId {print \$ 2}'
13241333 """
13251334 pipeline. writeFile(file : scriptStatusPathLocal, text : scriptStatus)
13261335 Utils . copyFileToRemoteHost(
@@ -1349,7 +1358,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13491358
13501359 sh " cat $scriptStatusPathLocal "
13511360 while (true ) {
1352- // Check if the job is done by running squeue via SSH
1361+ // Check if the job is done by running sacct via SSH
13531362 def result = Utils . exec(
13541363 pipeline,
13551364 returnStdout : true ,
0 commit comments