@@ -1284,8 +1284,16 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
12841284 tail -f $outputPath &
12851285 tailPid=\$ !
12861286 # Wait until sbatch job is done.
1287- while squeue -j \$ jobId -o %T >/dev/null 2>&1; do
1288- sleep 300
1287+ while true; do
1288+ state=\$ (sacct -j \$ jobId --format=JobIDRaw,State --noheader | \
1289+ awk -v jobId=\$ jobId '""\$ 1"" == jobId {print \$ 2}')
1290+ if [[ -z "\$ state" || "\$ state" == "RUNNING" || \
1291+ "\$ state" == "PENDING"]]; then
1292+ sleep 300
1293+ else
1294+ echo "Job \$ jobId finished with state: \$ state"
1295+ break
1296+ fi
12891297 done
12901298 # Kill tail -f process
12911299 kill \$ tailPid
@@ -1332,7 +1340,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13321340 )
13331341 def scriptStatus = """ #!/bin/bash
13341342 jobId=\$ (cat $jobWorkspace /slurm_job_id.txt)
1335- squeue -j \$ jobId -h
1343+ sacct -j \$ jobId --format=JobIDRaw,State --noheader |\
1344+ awk -v jobId=\$ jobId '""\$ 1"" == jobId {print \$ 2}'
13361345 """
13371346 pipeline. writeFile(file : scriptStatusPathLocal, text : scriptStatus)
13381347 Utils . copyFileToRemoteHost(
@@ -1361,7 +1370,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
13611370
13621371 sh " cat $scriptStatusPathLocal "
13631372 while (true ) {
1364- // Check if the job is done by running squeue via SSH
1373+ // Check if the job is done by running sacct via SSH
13651374 def result = Utils . exec(
13661375 pipeline,
13671376 returnStdout : true ,
0 commit comments