@@ -1138,9 +1138,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         "export ${varName}=\"${escapedValue}\""
     }.join('\n')

-    def scriptContent = """#!/bin/bash
+    def scriptLaunchPrefix = """#!/bin/bash
 #SBATCH ${exemptionComment}
-#SBATCH --output=${outputPath}
+#SBATCH --output=${sbatchLogPath}
 ${taskArgs.collect { "#SBATCH $it" }.join('\n')}
 #SBATCH ${partition.additionalArgs}
 ${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"}
@@ -1266,14 +1266,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         ),
         numRetries: 3
     )
+    def sbatchJobId = Utils.exec(
+        pipeline,
+        returnStdout: true,
+        script: Utils.sshUserCmd(
+            remote,
+            "cat $jobWorkspace/slurm_job_id.txt"
+        )
+    ).trim()
     def scriptTrack = """#!/bin/bash
 jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
 tail -f ${sbatchLogPath} &
 tailPid=\$!
 # Wait until sbatch job is done.
 while true; do
     state=\$(sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}')
-    if [[ -z \$state || \$state == "RUNNING" || \$state == "PENDING" ]]; then
+    if [[ -z \$state || \$state == "RUNNING" || \$state == "PENDING" || \$state == "CONFIGURING" ]]; then
         echo "job is still running"
         sleep 300
     else
@@ -1337,22 +1345,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         true
     )

-    if (perfSanityMode) {
-        stage("[${stageName}] Check perf result") {
-            def perfCheckResult = Utils.exec(
-                pipeline,
-                script: Utils.sshUserCmd(
-                    remote,
-                    "python3 ${perfCheckScriptNode} ${jobWorkspace}"
-                ),
-                returnStatus: true
-            )
-            if (perfCheckResult != 0) {
-                error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
-            }
-        }
-    }
-
     sh "cat $scriptStatusPathLocal"
     while (true) {
         // Check if the job is done by running sacct via SSH
@@ -1364,8 +1356,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
                 scriptStatusPathNode
             )
         ).trim()
-        if (!result || result == "RUNNING" || result == "PENDING") {
-            echo "Job is still running, pulling the job log."
+        if (!result || result == "RUNNING" || result == "PENDING" || result == "CONFIGURING") {
+            echo "Slurm job $sbatchJobId is still running, pulling the job log."
             // Pulling the sbatch output log
             Utils.exec(
                 pipeline,
@@ -1376,7 +1368,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
                 )
             )
         } else {
-            echo "Job is done."
+            echo "Slurm job $sbatchJobId is done."
             break
         }
     }