@@ -461,7 +461,7 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
     def cleanupCommands = [
         "rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
         "rm -rf ${jobWorkspace} || true",
-    ].join(" && ")
+    ].join(" ; ")
     Utils.exec(
         pipeline,
         script: Utils.sshUserCmd(
@@ -511,7 +511,7 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
     def cleanupCommands = [
         "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-${entrypoint} || true",
         "rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
-    ].join(" && ")
+    ].join(" ; ")
     Utils.exec(
         pipeline,
         script: Utils.sshUserCmd(
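Both cleanup hunks above swap the shell separator from "&&" to ";". With "&&" the list short-circuits, so one failing cleanup command would skip the remaining ones; with ";" every command runs unconditionally. Since each command already ends in "|| true", the switch is mainly defensive. A minimal bash sketch of the two separators (placeholder commands, not from this pipeline):

    false && echo "never runs"   # "&&" stops at the first failing command
    false ;  echo "still runs"   # ";" executes each command regardless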
@@ -939,7 +939,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
         sh "cd ${llmPath} && tar -zxf ${BUILD_CONFIGS[config][TARNAME]}"

-        Utils.exec(pipeline, script: "echo \"Script to trigger Slurm srun job:\" && cat ${scriptRunLocalPath}")
+        Utils.exec(pipeline, script: "echo \"Script for Slurm srun job to submit:\" && cat ${scriptRunLocalPath}")
         Utils.copyFileToRemoteHost(
             pipeline,
             remote,
@@ -948,7 +948,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
948948 true
949949 )
950950
951- Utils . exec(pipeline, script : " echo \" Script to install environment : \" && cat ${ scriptInstallLocalPath} " )
951+ Utils . exec(pipeline, script : " echo \" Script to install TensorRT LLM dependencies : \" && cat ${ scriptInstallLocalPath} " )
952952 Utils . copyFileToRemoteHost(
953953 pipeline,
954954 remote,
@@ -1093,7 +1093,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         srunArgs = [
             "--container-name=multi_node_test-\${SLURM_JOB_ID}",
             "--container-image=$containerImageArg",
-            "--container-workdir=/home/svc_tensorrt/bloom/scripts",
+            "--container-workdir=$jobWorkspace",
             "--container-mounts=$mounts",
             "--container-env=NVIDIA_IMEX_CHANNELS"
         ]
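The --container-* flags are consumed by the pyxis/enroot Slurm plugin; the hunk above repoints the container's working directory from the shared scripts directory to the per-job workspace. As a rough sketch, the args assembled here end up on an srun command line shaped like this (all values are placeholders, not the pipeline's real ones):

    srun --container-name=multi_node_test-$SLURM_JOB_ID \
         --container-image=/scratch/containers/container-1234.sqsh \
         --container-workdir=/scratch/users/svc_tensorrt/job-1234 \
         --container-mounts=/scratch:/scratch \
         --container-env=NVIDIA_IMEX_CHANNELS \
         bash slurm_run.sh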
@@ -1115,16 +1115,21 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11151115 " export ${ varName} =\" ${ escapedValue} \" "
11161116 }. join(' \n ' )
11171117
1118+ // Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
11181119 def scriptLaunchPrefix = """ #!/bin/bash
11191120 #SBATCH ${ exemptionComment}
11201121 #SBATCH --output=${ outputPath}
11211122 ${ taskArgs.collect { "#SBATCH $it" }.join('\n')}
11221123 #SBATCH ${ partition.additionalArgs}
11231124 ${ (partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
1124- echo "Starting job \$ SLURM_JOB_ID on \$ SLURM_NODELIST"
11251125
1126- set -Eeuo pipefail
1126+ # SBATCH directives must appear before any executable commands.
1127+ set -xEeuo pipefail
11271128 trap 'rc=\$ ?; echo "Error in file \$ {BASH_SOURCE[0]} on line \$ LINENO: \$ BASH_COMMAND (exit \$ rc)"; exit \$ rc' ERR
1129+
1130+ echo "Starting job \$ SLURM_JOB_ID on \$ SLURM_NODELIST"
1131+ echo \$ SLURM_JOB_ID > "$jobWorkspace /slurm_job_id.txt"
1132+
11281133 export jobWorkspace=$jobWorkspace
11291134 export tarName=$tarName
11301135 export llmTarfile=$llmTarfile
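Two things change in the sbatch prologue above: the set/trap error handling now precedes any other command (and gains -x tracing), and the script records $SLURM_JOB_ID into slurm_job_id.txt so the submission wrapper can cancel a leftover job on retry. A minimal sketch of the resulting prologue shape, with placeholder paths:

    #!/bin/bash
    #SBATCH --output=/scratch/job-%j.log    # directives must precede executable lines
    set -xEeuo pipefail                     # trace every command and fail fast
    trap 'echo "failed: $BASH_COMMAND"' ERR
    echo "Starting job $SLURM_JOB_ID on $SLURM_NODELIST"
    echo "$SLURM_JOB_ID" > /scratch/job_workspace/slurm_job_id.txt   # breadcrumb for retries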
@@ -1156,8 +1161,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11561161
11571162 pipeline. writeFile(file : scriptLaunchPrefixPathLocal, text : scriptLaunchPrefix)
11581163 pipeline. writeFile(file : scriptLaunchSrunArgsPathLocal, text : srunArgs. join(" " ))
1159- Utils . exec(pipeline, script : " echo \" Script launch prefix: \" && cat ${ scriptLaunchPrefixPathLocal} " )
1160- Utils . exec(pipeline, script : " echo \" Srun args content : \" && cat ${ scriptLaunchSrunArgsPathLocal} " )
1164+ Utils . exec(pipeline, script : " echo \" Script for Slurm sbatch job prefix: \" && cat ${ scriptLaunchPrefixPathLocal} " )
1165+ Utils . exec(pipeline, script : " echo \" Script for Slurm srun job args : \" && cat ${ scriptLaunchSrunArgsPathLocal} " )
11611166
11621167 // Output is the corresponding scriptLaunchPathLocal script under the disaggMode
11631168 sh """
@@ -1184,7 +1189,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
             pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
         }

-        Utils.exec(pipeline, script: "echo \"Script to trigger Slurm sbatch job:\" && cat ${scriptLaunchPathLocal}")
+        Utils.exec(pipeline, script: "echo \"Script for Slurm sbatch job to submit:\" && cat ${scriptLaunchPathLocal}")
         Utils.copyFileToRemoteHost(
             pipeline,
             remote,
@@ -1194,9 +1199,24 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         )

         def scriptExec = """#!/bin/bash
-set -Eeuo pipefail
+set -xEeuo pipefail
 trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
-touch ${outputPath}
+
+# Clean up the previous job's intermediate files so that a retry can work
+if [ -f "${jobWorkspace}/slurm_job_id.txt" ]; then
+    previous_job_id=\$(cat "${jobWorkspace}/slurm_job_id.txt")
+    echo "Found previous Slurm job ID: \${previous_job_id}"
+    scancel "\${previous_job_id}" || true
+    rm -rf "${jobWorkspace}/slurm_job_id.txt"
+    # Wait 60 seconds to ensure the previous job is fully canceled
+    sleep 60
+fi
+rm -rf "${jobWorkspace}/results.xml"
+rm -rf "${jobWorkspace}/report.csv"
+rm -rf "${jobWorkspace}/unfinished_test.txt"
+rm -rf "${outputPath}"
+
+touch "${outputPath}"
 jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
 if [ -z "\$jobId" ]; then
     echo "Error: Job submission failed, no job ID returned."
@@ -1460,7 +1480,8 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
         if (stageIsInterrupted) {
             echo "Stage is interrupted, skip to upload test result."
         } else {
-            sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'
+            // Temporarily disabled to reduce the log size
+            // sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'
             if (noResultIfSuccess && !stageIsFailed) {
                 // Clean up the workspace
                 sh """
@@ -2603,7 +2624,8 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
         def containerPortNum = GlobalState.PORT_SECTION_SIZE

         // Some clusters do not allow dmesg -C so we add || true
-        sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'
+        // Temporarily disabled to reduce the log size
+        // sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'
         def pytestCommand = getPytestBaseCommandLine(
             llmSrc,
             stageName,
@@ -3124,11 +3146,11 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
-        "DGX_B200-4_GPUs-PyTorch-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4],
+        "DGX_B200-4_GPUs-PyTorch-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
         "DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
         "DGX_B200-8_GPUs-PyTorch-1": ["b200-x8-lbd", "l0_dgx_b200", 1, 1, 8, 1, true],
-        "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 2, 4, 1, true],
-        "DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-trtllm", "l0_dgx_b200", 2, 2, 4, 1, true],
+        "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 2, 4, 1, true],
+        "DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true],
         "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
         "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
         // Perf sanity post merge test