@@ -832,8 +832,10 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
832832 def outputPath = " ${ jobWorkspace} /job-output.log"
833833 def scriptLaunchPathLocal = Utils . createTempLocation(pipeline, " ./slurm_launch.sh" )
834834 def scriptLaunchPathNode = " ${ jobWorkspace} /slurm_launch.sh"
835- def scriptExecPathLocal = Utils . createTempLocation(pipeline, " ./slurm_exec.sh" )
836- def scriptExecPathNode = " ${ jobWorkspace} /slurm_exec.sh"
835+ def scriptSubmitPathLocal = Utils . createTempLocation(pipeline, " ./slurm_submit.sh" )
836+ def scriptSubmitPathNode = " ${ jobWorkspace} /slurm_submit.sh"
837+ def scriptTrackPathLocal = Utils . createTempLocation(pipeline, " ./slurm_track.sh" )
838+ def scriptTrackPathNode = " ${ jobWorkspace} /slurm_track.sh"
837839 def isAarch64 = config. contains(" aarch64" )
838840 def coverageConfigFile = " ${ jobWorkspace} /.coveragerc"
839841
@@ -962,15 +964,41 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
962964 scriptLaunchPathNode,
963965 true
964966 )
965- def scriptExec = """
967+ def scriptSubmit = """ #!/bin/bash
968+ set -Eeuo pipefail
969+ trap 'rc=\$ ?; echo "Error in file \$ {BASH_SOURCE[0]} on line \$ LINENO: \$ BASH_COMMAND (exit \$ rc)"; exit \$ rc' ERR
966970 touch ${ outputPath}
967971 jobId=\$ (sbatch ${ scriptLaunchPathNode} | awk '{print \$ 4}')
968972 if [ -z "\$ jobId" ]; then
969973 echo "Error: Job submission failed, no job ID returned."
970974 exit 1
971975 fi
972976 echo "Submitted job \$ jobId"
973- tail -f ${ outputPath} &
977+ # save job ID in $jobWorkspace /slurm_job_id.txt for later job to retrieve
978+ echo \$ jobId > $jobWorkspace /slurm_job_id.txt
979+ """ . replaceAll(" (?m)^\\ s*" , " " ). trim()
980+ pipeline. writeFile(file : scriptSubmitPathLocal, text : scriptSubmit)
981+ Utils . copyFileToRemoteHost(
982+ pipeline,
983+ remote,
984+ scriptSubmitPathLocal,
985+ scriptSubmitPathNode,
986+ true
987+ )
988+ }
989+ stage(" [${ stageName} ] Run Pytest" ) {
990+ // Submit the sbatch job
991+ Utils . exec(
992+ pipeline,
993+ timeout : false ,
994+ script : Utils . sshUserCmd(
995+ remote,
996+ scriptSubmitPathNode
997+ )
998+ )
999+ def scriptTrack = """ #!/bin/bash
1000+ jobId=\$ (cat $jobWorkspace /slurm_job_id.txt)
1001+ tail -f $outputPath &
9741002 tailPid=\$ !
9751003 # Wait until sbatch job is done.
9761004 while squeue -j \$ jobId -o %T >/dev/null 2>&1; do
@@ -984,27 +1012,52 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
9841012 echo "Pytest failed in Slurm job \$ jobId with exit code \$ EXIT_CODE"
9851013 exit \$ EXIT_CODE
9861014 fi
987- """ . replaceAll( " (?m)^ \\ s* " , " " ) . trim()
988- pipeline. writeFile(file : scriptExecPathLocal , text : scriptExec )
1015+ """
1016+ pipeline. writeFile(file : scriptTrackPathLocal , text : scriptTrack )
9891017 Utils . copyFileToRemoteHost(
9901018 pipeline,
9911019 remote,
992- scriptExecPathLocal ,
993- scriptExecPathNode ,
1020+ scriptTrackPathLocal ,
1021+ scriptTrackPathNode ,
9941022 true
9951023 )
996- }
997- stage(" [${ stageName} ] Run Pytest" ) {
9981024 Utils . exec(
9991025 pipeline,
10001026 timeout : false ,
10011027 script : Utils . sshUserCmd(
10021028 remote,
1003- scriptExecPathNode
1029+ scriptTrackPathNode
10041030 )
10051031 )
// Poll Slurm until the submitted job no longer shows up in the queue. While
// it is still listed, run the tracking script again so the job log keeps
// being pulled into this build's console.
while (true) {
    // `squeue -h` suppresses the header row, so blank output means the job
    // whose ID the submit script stored in slurm_job_id.txt is not queued.
    def queueListing = Utils.exec(
        pipeline,
        returnStdout: true,
        script: Utils.sshUserCmd(
            remote,
            """
            jobId=\$(cat ${jobWorkspace}/slurm_job_id.txt)
            squeue -j \$jobId -h
            """
        )
    )
    // Captured stdout may carry a trailing newline or be null; normalise it
    // before testing for emptiness so the loop can actually terminate.
    def jobStillQueued = !(queueListing ?: "").trim().isEmpty()
    if (!jobStillQueued) {
        // The job ID only exists as a shell variable on the remote side, so
        // the message points at the file that holds it instead of
        // interpolating a Groovy variable that is not defined here.
        echo "Slurm job recorded in ${jobWorkspace}/slurm_job_id.txt is done."
        break
    }
    echo "Slurm job recorded in ${jobWorkspace}/slurm_job_id.txt is still running, pulling the job log."
    // Re-run the tracking script. The former slurm_exec.sh path variable was
    // removed when the script was split into submit and track parts.
    // NOTE(review): there is no delay between iterations; if the tracker can
    // return immediately while the job is still queued this will spin -
    // confirm whether a sleep is wanted.
    Utils.exec(
        pipeline,
        timeout: false,
        script: Utils.sshUserCmd(
            remote,
            scriptTrackPathNode
        )
    )
}
10061060 }
1007-
10081061 echo " Finished test stage execution."
10091062 }
10101063 } finally {
0 commit comments