@@ -924,7 +924,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
924924 def scriptInstallPathNode = " ${ jobWorkspace} /${ jobUID} -slurm_install.sh"
925925 def testListPathNode = " ${ jobWorkspace} /${ testList} .txt"
926926 def waivesListPathNode = " ${ jobWorkspace} /waives.txt"
927- def outputPath = " ${ jobWorkspace} /job-output.log"
927+ def outputPath = " ${ jobWorkspace} /job-output- ${ UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6) } .log"
928928 def scriptLaunchPathLocal = Utils . createTempLocation(pipeline, " ./slurm_launch.sh" )
929929 def scriptLaunchPathNode = " ${ jobWorkspace} /${ jobUID} -slurm_launch.sh"
930930 def scriptExecPathLocal = Utils . createTempLocation(pipeline, " ./slurm_exec.sh" )
@@ -939,7 +939,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
939939 trtllm_utils. llmExecStepWithRetry(pipeline, script : " cd ${ llmPath} && wget -nv ${ llmTarfile} " )
940940 sh " cd ${ llmPath} && tar -zxf ${ BUILD_CONFIGS[config][TARNAME]} "
941941
942- Utils . exec(pipeline, script : " echo \" Script to trigger Slurm srun job: \" && cat ${ scriptRunLocalPath} " )
942+ Utils . exec(pipeline, script : " echo \" Script for Slurm srun job to submit : \" && cat ${ scriptRunLocalPath} " )
943943 Utils . copyFileToRemoteHost(
944944 pipeline,
945945 remote,
@@ -948,7 +948,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
948948 true
949949 )
950950
951- Utils . exec(pipeline, script : " echo \" Script to install environment : \" && cat ${ scriptInstallLocalPath} " )
951+ Utils . exec(pipeline, script : " echo \" Script to install TensorRT LLM dependencies : \" && cat ${ scriptInstallLocalPath} " )
952952 Utils . copyFileToRemoteHost(
953953 pipeline,
954954 remote,
@@ -1093,7 +1093,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
10931093 srunArgs = [
10941094 " --container-name=multi_node_test-\$ {SLURM_JOB_ID}" ,
10951095 " --container-image=$containerImageArg " ,
1096- " --container-workdir=/home/svc_tensorrt/bloom/scripts " ,
1096+ " --container-workdir=$j obWorkspace " ,
10971097 " --container-mounts=$mounts " ,
10981098 " --container-env=NVIDIA_IMEX_CHANNELS"
10991099 ]
@@ -1115,15 +1115,17 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11151115 " export ${ varName} =\" ${ escapedValue} \" "
11161116 }. join(' \n ' )
11171117
1118+ // Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
11181119 def scriptLaunchPrefix = """ #!/bin/bash
11191120 #SBATCH ${ exemptionComment}
11201121 #SBATCH --output=${ outputPath}
11211122 ${ taskArgs.collect { "#SBATCH $it" }.join('\n')}
11221123 #SBATCH ${ partition.additionalArgs}
11231124 ${ (partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
11241125 echo "Starting job \$ SLURM_JOB_ID on \$ SLURM_NODELIST"
1126+ echo "\$ SLURM_JOB_ID > $jobWorkspace /slurm_job_id.txt
11251127
1126- set -Eeuo pipefail
1128+ set -xEeuo pipefail
11271129 trap 'rc=\$ ?; echo "Error in file \$ {BASH_SOURCE[0]} on line \$ LINENO: \$ BASH_COMMAND (exit \$ rc)"; exit \$ rc' ERR
11281130 export jobWorkspace=$jobWorkspace
11291131 export tarName=$tarName
@@ -1156,8 +1158,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11561158
11571159 pipeline. writeFile(file : scriptLaunchPrefixPathLocal, text : scriptLaunchPrefix)
11581160 pipeline. writeFile(file : scriptLaunchSrunArgsPathLocal, text : srunArgs. join(" " ))
1159- Utils . exec(pipeline, script : " echo \" Script launch prefix: \" && cat ${ scriptLaunchPrefixPathLocal} " )
1160- Utils . exec(pipeline, script : " echo \" Srun args content : \" && cat ${ scriptLaunchSrunArgsPathLocal} " )
1161+ Utils . exec(pipeline, script : " echo \" Script for Slurm sbatch job prefix: \" && cat ${ scriptLaunchPrefixPathLocal} " )
1162+ Utils . exec(pipeline, script : " echo \" Script for Slurm srun job args : \" && cat ${ scriptLaunchSrunArgsPathLocal} " )
11611163
11621164 // Output is the corresponding scriptLaunchPathLocal script under the disaggMode
11631165 sh """
@@ -1184,7 +1186,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11841186 pipeline. writeFile(file : scriptLaunchPathLocal, text : scriptContent)
11851187 }
11861188
1187- Utils . exec(pipeline, script : " echo \" Script to trigger Slurm sbatch job: \" && cat ${ scriptLaunchPathLocal} " )
1189+ Utils . exec(pipeline, script : " echo \" Script for Slurm sbatch job to submit : \" && cat ${ scriptLaunchPathLocal} " )
11881190 Utils . copyFileToRemoteHost(
11891191 pipeline,
11901192 remote,
@@ -1194,8 +1196,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
11941196 )
11951197
11961198 def scriptExec = """ #!/bin/bash
1197- set -Eeuo pipefail
1199+ set -xEeuo pipefail
11981200 trap 'rc=\$ ?; echo "Error in file \$ {BASH_SOURCE[0]} on line \$ LINENO: \$ BASH_COMMAND (exit \$ rc)"; exit \$ rc' ERR
1201+ rm -rf ${ outputPath}
11991202 touch ${ outputPath}
12001203 jobId=\$ (sbatch ${ scriptLaunchPathNode} | awk '{print \$ 4}')
12011204 if [ -z "\$ jobId" ]; then
@@ -3124,11 +3127,11 @@ def launchTestJobs(pipeline, testFilter)
31243127 " DGX_H100-4_GPUs-PyTorch-Others-1" : [" dgx-h100-x4-oci" , " l0_dgx_h100" , 1 , 1 , 4 ],
31253128 " DGX_H100-4_GPUs-PyTorch-Ray-1" : [" dgx-h100-x4-oci" , " l0_dgx_h100" , 1 , 1 , 4 ],
31263129 " B300-PyTorch-1" : [" b300-single" , " l0_b300" , 1 , 1 ],
3127- " DGX_B200-4_GPUs-PyTorch-1" : [" b200-x4" , " l0_dgx_b200" , 1 , 1 , 4 ],
3130+ " DGX_B200-4_GPUs-PyTorch-1" : [" b200-x4-lbd " , " l0_dgx_b200" , 1 , 1 , 4 , 1 , true ],
31283131 " DGX_B200-4_GPUs-PyTorch-Ray-1" : [" b200-x4-lbd" , " l0_dgx_b200" , 1 , 1 , 4 , 1 , true ],
31293132 " DGX_B200-8_GPUs-PyTorch-1" : [" b200-x8-lbd" , " l0_dgx_b200" , 1 , 1 , 8 , 1 , true ],
3130- " DGX_B200-4_GPUs-PyTorch-Post-Merge-1" : [" b200-trtllm " , " l0_dgx_b200" , 1 , 2 , 4 , 1 , true ],
3131- " DGX_B200-4_GPUs-PyTorch-Post-Merge-2" : [" b200-trtllm " , " l0_dgx_b200" , 2 , 2 , 4 , 1 , true ],
3133+ " DGX_B200-4_GPUs-PyTorch-Post-Merge-1" : [" b200-x4-lbd " , " l0_dgx_b200" , 1 , 2 , 4 , 1 , true ],
3134+ " DGX_B200-4_GPUs-PyTorch-Post-Merge-2" : [" b200-x4-lbd " , " l0_dgx_b200" , 2 , 2 , 4 , 1 , true ],
31323135 " DGX_B300-4_GPUs-PyTorch-Post-Merge-1" : [" b300-x4" , " l0_dgx_b300" , 1 , 2 , 4 ],
31333136 " DGX_B300-4_GPUs-PyTorch-Post-Merge-2" : [" b300-x4" , " l0_dgx_b300" , 2 , 2 , 4 ],
31343137 // Perf sanity post merge test
0 commit comments