@@ -832,8 +832,10 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
832832 def outputPath = " ${ jobWorkspace} /job-output.log"
833833 def scriptLaunchPathLocal = Utils . createTempLocation(pipeline, " ./slurm_launch.sh" )
834834 def scriptLaunchPathNode = " ${ jobWorkspace} /slurm_launch.sh"
835- def scriptExecPathLocal = Utils . createTempLocation(pipeline, " ./slurm_exec.sh" )
836- def scriptExecPathNode = " ${ jobWorkspace} /slurm_exec.sh"
835+ def scriptSubmitPathLocal = Utils . createTempLocation(pipeline, " ./slurm_submit.sh" )
836+ def scriptSubmitPathNode = " ${ jobWorkspace} /slurm_submit.sh"
837+ def scriptTrackPathLocal = Utils . createTempLocation(pipeline, " ./slurm_track.sh" )
838+ def scriptTrackPathNode = " ${ jobWorkspace} /slurm_track.sh"
837839 def isAarch64 = config. contains(" aarch64" )
838840 def coverageConfigFile = " ${ jobWorkspace} /.coveragerc"
839841
@@ -962,15 +964,43 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
962964 scriptLaunchPathNode,
963965 true
964966 )
965- def scriptExec = """
967+ def scriptSubmit = """
968+ set -Eeuo pipefail
969+ trap 'rc=\$ ?; echo "Error in file \$ {BASH_SOURCE[0]} on line \$ LINENO: \$ BASH_COMMAND (exit \$ rc)"; exit \$ rc' ERR
966970 touch ${ outputPath}
967971 jobId=\$ (sbatch ${ scriptLaunchPathNode} | awk '{print \$ 4}')
968972 if [ -z "\$ jobId" ]; then
969973 echo "Error: Job submission failed, no job ID returned."
970974 exit 1
971975 fi
972976 echo "Submitted job \$ jobId"
973- tail -f ${ outputPath} &
977+ # save job ID in $jobWorkspace /slurm_job_id.txt for later job to retrieve
978+ echo \$ jobId > \$ jobWorkspace/slurm_job_id.txt
979+ """ . replaceAll(" (?m)^\\ s*" , " " ). trim()
980+ pipeline. writeFile(file : scriptSubmitPathLocal, text : scriptSubmit)
981+ Utils . copyFileToRemoteHost(
982+ pipeline,
983+ remote,
984+ scriptSubmitPathLocal,
985+ scriptSubmitPathNode,
986+ true
987+ )
988+ }
989+ stage(" [${ stageName} ] Run Pytest" ) {
990+ // Submit the sbatch job
991+ Utils . exec(
992+ pipeline,
993+ timeout : false ,
994+ script : Utils . sshUserCmd(
995+ remote,
996+ scriptSubmitPathNode
997+ )
998+ )
999+ def scriptTrack = """
1000+ set -Eeuo pipefail
1001+ trap 'rc=\$ ?; echo "Error in file \$ {BASH_SOURCE[0]} on line \$ LINENO: \$ BASH_COMMAND (exit \$ rc)"; exit \$ rc' ERR
1002+ jobId=\$ (cat $jobWorkspace /slurm_job_id.txt)
1003+ tail -f $outputPath &
9741004 tailPid=\$ !
9751005 # Wait until sbatch job is done.
9761006 while squeue -j \$ jobId -o %T >/dev/null 2>&1; do
@@ -984,17 +1014,15 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
9841014 echo "Pytest failed in Slurm job \$ jobId with exit code \$ EXIT_CODE"
9851015 exit \$ EXIT_CODE
9861016 fi
987- """ . replaceAll( " (?m)^ \\ s* " , " " ) . trim()
988- pipeline. writeFile(file : scriptExecPathLocal , text : scriptExec )
1017+ """
1018+ pipeline. writeFile(file : scriptTrackPathLocal , text : scriptTrack )
9891019 Utils . copyFileToRemoteHost(
9901020 pipeline,
9911021 remote,
992- scriptExecPathLocal ,
993- scriptExecPathNode ,
1022+ scriptTrackPathLocal ,
1023+ scriptTrackPathNode ,
9941024 true
9951025 )
996- }
997- stage(" [${ stageName} ] Run Pytest" ) {
9981026 Utils . exec(
9991027 pipeline,
10001028 timeout : false ,
@@ -1003,8 +1031,33 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
10031031 scriptExecPathNode
10041032 )
10051033 )
1034+ while (true ) {
1035+ // Check if the job is done by running squeue via SSH
1036+ def result = Utils . exec(
1037+ pipeline,
1038+ returnStdout : true ,
1039+ script : Utils . sshUserCmd(
1040+ remote,
1041+ """
1042+ jobId=\$ (cat $jobWorkspace /slurm_job_id.txt)
1043+ squeue -j \$ jobId -h
1044+ """
1045+ )
1046+ )
1047+ if (result == " " ) {
1048+ break
1049+ } else {
1050+ Utils . exec(
1051+ pipeline,
1052+ timeout : false ,
1053+ script : Utils . sshUserCmd(
1054+ remote,
1055+ scriptExecPathNode
1056+ )
1057+ )
1058+ }
1059+ }
10061060 }
1007-
10081061 echo " Finished test stage execution."
10091062 }
10101063 } finally {
0 commit comments