@@ -832,8 +832,10 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
832832 def outputPath = " ${ jobWorkspace} /job-output.log"
833833 def scriptLaunchPathLocal = Utils . createTempLocation(pipeline, " ./slurm_launch.sh" )
834834 def scriptLaunchPathNode = " ${ jobWorkspace} /slurm_launch.sh"
835- def scriptExecPathLocal = Utils . createTempLocation(pipeline, " ./slurm_exec.sh" )
836- def scriptExecPathNode = " ${ jobWorkspace} /slurm_exec.sh"
835+ def scriptSubmitPathLocal = Utils . createTempLocation(pipeline, " ./slurm_submit.sh" )
836+ def scriptSubmitPathNode = " ${ jobWorkspace} /slurm_submit.sh"
837+ def scriptTrackPathLocal = Utils . createTempLocation(pipeline, " ./slurm_track.sh" )
838+ def scriptTrackPathNode = " ${ jobWorkspace} /slurm_track.sh"
837839 def isAarch64 = config. contains(" aarch64" )
838840 def coverageConfigFile = " ${ jobWorkspace} /.coveragerc"
839841
@@ -962,15 +964,41 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
962964 scriptLaunchPathNode,
963965 true
964966 )
965- def scriptExec = """
967+ def scriptSubmit = """ #!/bin/bash
968+ set -Eeuo pipefail
969+ trap 'rc=\$ ?; echo "Error in file \$ {BASH_SOURCE[0]} on line \$ LINENO: \$ BASH_COMMAND (exit \$ rc)"; exit \$ rc' ERR
966970 touch ${ outputPath}
967971 jobId=\$ (sbatch ${ scriptLaunchPathNode} | awk '{print \$ 4}')
968972 if [ -z "\$ jobId" ]; then
969973 echo "Error: Job submission failed, no job ID returned."
970974 exit 1
971975 fi
972976 echo "Submitted job \$ jobId"
973- tail -f ${ outputPath} &
977+ # save job ID in $jobWorkspace /slurm_job_id.txt for later job to retrieve
978+ echo \$ jobId > $jobWorkspace /slurm_job_id.txt
979+ """ . replaceAll(" (?m)^\\ s*" , " " ). trim()
980+ pipeline. writeFile(file : scriptSubmitPathLocal, text : scriptSubmit)
981+ Utils . copyFileToRemoteHost(
982+ pipeline,
983+ remote,
984+ scriptSubmitPathLocal,
985+ scriptSubmitPathNode,
986+ true
987+ )
988+ }
989+ stage(" [${ stageName} ] Run Pytest" ) {
990+ // Submit the sbatch job
991+ Utils . exec(
992+ pipeline,
993+ timeout : false ,
994+ script : Utils . sshUserCmd(
995+ remote,
996+ scriptSubmitPathNode
997+ )
998+ )
999+ def scriptTrack = """ #!/bin/bash
1000+ jobId=\$ (cat $jobWorkspace /slurm_job_id.txt)
1001+ tail -f $outputPath &
9741002 tailPid=\$ !
9751003 # Wait until sbatch job is done.
9761004 while squeue -j \$ jobId -o %T >/dev/null 2>&1; do
@@ -984,27 +1012,41 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
9841012 echo "Pytest failed in Slurm job \$ jobId with exit code \$ EXIT_CODE"
9851013 exit \$ EXIT_CODE
9861014 fi
987- """ . replaceAll( " (?m)^ \\ s* " , " " ) . trim()
988- pipeline. writeFile(file : scriptExecPathLocal , text : scriptExec )
1015+ """
1016+ pipeline. writeFile(file : scriptTrackPathLocal , text : scriptTrack )
9891017 Utils . copyFileToRemoteHost(
9901018 pipeline,
9911019 remote,
992- scriptExecPathLocal ,
993- scriptExecPathNode ,
1020+ scriptTrackPathLocal ,
1021+ scriptTrackPathNode ,
9941022 true
9951023 )
996- }
997- stage(" [${ stageName} ] Run Pytest" ) {
998- Utils . exec(
999- pipeline,
1000- timeout : false ,
1001- script : Utils . sshUserCmd(
1002- remote,
1003- scriptExecPathNode
1024+ while (true ) {
1025+ // Check if the job is done by running squeue via SSH
1026+ def result = Utils . exec(
1027+ pipeline,
1028+ returnStdout : true ,
1029+ script : Utils . sshUserCmd(
1030+ remote,
1031+ " 'set jobId = `cat $jobWorkspace /slurm_job_id.txt`; squeue -j $jobId -h'"
1032+ )
10041033 )
1005- )
1034+ if (result == " " ) {
1035+ echo " Job $jobId is done."
1036+ break
1037+ } else {
1038+ echo " Job $jobId is still running, pulling the job log."
1039+ Utils . exec(
1040+ pipeline,
1041+ timeout : false ,
1042+ script : Utils . sshUserCmd(
1043+ remote,
1044+ scriptTrackPathNode
1045+ )
1046+ )
1047+ }
1048+ }
10061049 }
1007-
10081050 echo " Finished test stage execution."
10091051 }
10101052 } finally {
0 commit comments