@@ -694,9 +694,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
     }
 
     slurmRunner = null
-    if (cluster.containerRuntime == ContainerRuntime.DOCKER) {
+    if (cluster.containerRuntime.toString() == "DOCKER") {
         slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
-    } else if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
+    } else if (cluster.containerRuntime.toString() == "ENROOT") {
         slurmRunner = runInEnrootOnNode(nodeName)
     } else {
         throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}")
@@ -799,7 +799,7 @@ def getPytestBaseCommandLine(
799799 " LLM_BACKEND_ROOT=${ llmSrc} /triton_backend" ,
800800 " LLM_MODELS_ROOT=${ MODEL_CACHE_DIR} " ,
801801 " MODEL_CACHE_DIR=${ MODEL_CACHE_DIR} " ,
802- " COLUMNS=200 " ,
802+ " COLUMNS=400 " ,
803803 extraInternalEnv,
804804 portEnvVars,
805805 pytestUtil,
@@ -860,11 +860,11 @@ def getMountListForSlurmTest(SlurmCluster cluster, boolean useSbatch = false)
     }
 
     // data/cache mounts
-    if (cluster.containerRuntime == ContainerRuntime.DOCKER) {
+    if (cluster.containerRuntime.toString() == "DOCKER") {
         mounts += [
             "/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro",
         ]
-    } else if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
+    } else if (cluster.containerRuntime.toString() == "ENROOT") {
         if (!cluster.scratchPath) {
             throw new Exception("Scratch path is not set for cluster: ${cluster.name}")
         }
@@ -922,6 +922,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
     def scriptInstallLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_install.sh"
     def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh"
+    def scriptBashUtilsLocalPath = "${llmSrcLocal}/jenkins/scripts/bash_utils.sh"
+    def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
     def testListPathNode = "${jobWorkspace}/${testList}.txt"
     def waivesListPathNode = "${jobWorkspace}/waives.txt"
     def outputPath = "${jobWorkspace}/job-output.log"
@@ -956,6 +958,14 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         scriptInstallPathNode,
         true
     )
+    Utils.exec(pipeline, script: "echo \"Script for Bash utilities:\" && cat ${scriptBashUtilsLocalPath}")
+    Utils.copyFileToRemoteHost(
+        pipeline,
+        remote,
+        scriptBashUtilsLocalPath,
+        scriptBashUtilsPathNode,
+        true
+    )
 
     // Generate Test List and Upload to Frontend Node
     def makoArgs = getMakoArgsFromStageName(stageName, true)
@@ -1040,7 +1050,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 
     def containerImageArg = container
     def srunPrologue = ""
-    if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
+    if (cluster.containerRuntime.toString() == "ENROOT") {
         def enrootImagePath = "${cluster.scratchPath}/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh"
         containerImageArg = enrootImagePath
 
@@ -1127,9 +1137,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         set -xEeuo pipefail
         trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
 
-        echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
-        echo \$SLURM_JOB_ID > "$jobWorkspace/slurm_job_id.txt"
-
+        echo "Starting Slurm job \$SLURM_JOB_ID on \$SLURM_NODELIST"
         export jobWorkspace=$jobWorkspace
         export tarName=$tarName
         export llmTarfile=$llmTarfile
@@ -1219,10 +1227,11 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         touch "${outputPath}"
         jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
         if [ -z "\$jobId" ]; then
-            echo "Error: Job submission failed, no job ID returned."
+            echo "Error: Slurm job submission failed, no job ID returned."
             exit 1
         fi
-        echo "Submitted job \$jobId"
+        echo "Submitted Slurm job \$jobId"
+        echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt"
         tail -f ${outputPath} &
         tailPid=\$!
         # Wait until sbatch job is done.
@@ -1232,9 +1241,28 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         # Kill tail -f process
         kill \$tailPid
         # Check if the job failed or not
-        sleep 5
-        STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}')
-        EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
+        sleep 10
+        # Retry getting status and exit code as sacct might be delayed
+        for i in {1..3}; do
+            STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}')
+            EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
+
+            if [ -n "\$STATUS" ] && [ -n "\$EXIT_CODE" ]; then
+                break
+            fi
+            echo "Waiting for sacct to update... attempt \$i"
+            sleep 10
+        done
+
+        if [ -z "\$EXIT_CODE" ]; then
+            echo "Error: Failed to get exit code from sacct after retries, defaulting to 1."
+            EXIT_CODE=1
+        fi
+        if [ -z "\$STATUS" ]; then
+            echo "Error: Failed to get status from sacct after retries, defaulting to UNKNOWN."
+            STATUS="UNKNOWN"
+        fi
+
         if [[ "\$STATUS" == "COMPLETED" && \$EXIT_CODE -eq 0 ]]; then
             echo "Pytest succeed in Slurm job \$jobId"
             echo "Status: \$STATUS | Exit_code \$EXIT_CODE"