@@ -1022,10 +1022,222 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
10221022 }
10231023}
10241024
// Runs a multi-node disaggregated-serving ("disagg") perf-sanity test list on a
// SLURM cluster using native sbatch, orchestrated over SSH from the Jenkins agent.
//
// High-level flow:
//   1. Resolve the SLURM partition/cluster for `platform` and build a unique job
//      UID from the Jenkins build tag plus a random 6-char suffix.
//   2. Over SSH (credentials 'svc_tensorrt'), create a per-job workspace on a
//      random login node; download/unpack the build tarball on the agent; copy
//      the disaggregated helper scripts (slurm_run.sh, slurm_launch.sh,
//      slurm_exec.py, install.sh), the rendered test list, and a generated
//      .coveragerc to the remote workspace.
//   3. Invoke slurm_exec.py remotely; it submits the sbatch job using the
//      sbatch parameters built here plus tests/scripts/perf-sanity/<testList>.yaml.
//   4. Always upload results and clean up SLURM resources in the finally block.
//
// Parameters (interface mirrors runLLMTestlistWithSbatch):
//   pipeline          - Jenkins pipeline context (sh/stage/withCredentials/...).
//   platform          - key into SlurmConfig.partitionConfig.
//   testList          - test-DB list name; also selects the perf-sanity yaml.
//   config            - key into BUILD_CONFIGS (tarball name lookup).
//   perfMode          - forwarded to the generated pytest command and slurm_exec.py.
//   stageName         - stage label; drives mako args and coverage file naming.
//   splitId / splits  - pytest shard index ("--group") and shard count ("--splits").
//   gpuCount          - accepted for signature parity; not referenced in this body.
//   nodeCount         - when > 1, wraps pytest with trtllm-llmapi-launch.
//   skipInstallWheel / cpver - accepted for signature parity; not referenced in
//                       this body (TODO confirm whether install.sh consumes them).
def runLLMMultiNodeDisaggTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=1, skipInstallWheel=false, cpver="cp312")
{
    SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
    SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]

    // Create a unique suffix for the job name so concurrent builds cannot
    // collide on the shared remote workspace path.
    String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
    def jobUID = "${cluster.host}-multi_node_disagg_test-${customSuffix}"

    // Log the agent environment up front for debugging.
    Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")

    // NOTE(review): assigned below but never read in this function — presumably
    // kept for parity with runLLMTestlistWithSbatch; confirm before removing.
    def slurmOutputFile = null

    try {
        // Run ssh command to start node in desired cluster via SLURM
        withCredentials([
            usernamePassword(
                credentialsId: 'svc_tensorrt',
                usernameVariable: 'USERNAME',
                passwordVariable: 'PASSWORD'
            )
        ]) {
            def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
            def remote = [
                ip: randomLoginNode,
                host: randomLoginNode,
                user: "${pipeline.USERNAME}",
                passwd: "${pipeline.PASSWORD}",
                allowAnyHosts: true,
            ]
            // Tooling required on the agent for the remote copy/exec helpers below.
            Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client python3")
            def tarName = BUILD_CONFIGS[config][TARNAME]
            def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${tarName}"
            def llmPath = sh(script: "realpath .", returnStdout: true).trim()
            // Remote (login-node) layout for this job.
            def jobWorkspace = "/home/svc_tensorrt/bloom/scripts/${jobUID}"
            def resourcePathNode = "/tmp"
            def llmSrcNode = "${resourcePathNode}/TensorRT-LLM/src"
            // Local (agent) source tree, unpacked from the build tarball below.
            def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
            def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
            slurmOutputFile = SlurmConfig.getOutputFilePath("/home/svc_tensorrt/slurm-logs", jobUID)
            def testListPathNode = "${jobWorkspace}/${testList}.txt"
            // NOTE(review): waivesListPathNode and isAarch64 are computed but not
            // used in this function — confirm whether they are still needed.
            def waivesListPathNode = "${jobWorkspace}/waives.txt"
            def outputPath = "${jobWorkspace}/job-output.log"
            def scriptLaunchPathNode = "${jobWorkspace}/slurm_launch.sh"
            def scriptExecPathNode = "${jobWorkspace}/slurm_exec.py"
            def scriptInstallPathNode = "${jobWorkspace}/install.sh"
            def isAarch64 = config.contains("aarch64")
            def coverageConfigFile = "${jobWorkspace}/.coveragerc"

            // Determine the config yaml file path based on testList
            def configYamlFile = "${llmSrcLocal}/tests/scripts/perf-sanity/${testList}.yaml"

            // Declared here (not inside the stage closure below) so that the
            // "Run Pytest" stage can see it: a `def` local inside one stage
            // closure is not visible from a sibling stage closure.
            def pytestCommand = null

            stage("[${stageName}] Initializing Test") {
                // Create Job Workspace folder in Frontend Node
                Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mkdir -p ${jobWorkspace}\""), numRetries: 3)

                // Download and Unzip Tar File
                trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
                sh "cd ${llmPath} && tar -zxf ${BUILD_CONFIGS[config][TARNAME]}"

                // Copy script files from repository to remote host
                // Copy slurm_run.sh
                def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/slurm_run.sh"
                Utils.copyFileToRemoteHost(
                    pipeline,
                    remote,
                    scriptRunLocalPath,
                    scriptRunNode,
                    true
                )

                // Copy slurm_launch.sh
                def scriptLaunchLocalPath = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/slurm_launch.sh"
                Utils.copyFileToRemoteHost(
                    pipeline,
                    remote,
                    scriptLaunchLocalPath,
                    scriptLaunchPathNode,
                    true
                )

                // Copy slurm_exec.py
                def scriptExecLocalPath = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/slurm_exec.py"
                Utils.copyFileToRemoteHost(
                    pipeline,
                    remote,
                    scriptExecLocalPath,
                    scriptExecPathNode,
                    true
                )

                // Copy install.sh
                def scriptInstallLocalPath = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/install.sh"
                Utils.copyFileToRemoteHost(
                    pipeline,
                    remote,
                    scriptInstallLocalPath,
                    scriptInstallPathNode,
                    true
                )

                // Generate Test List and Upload to Frontend Node
                def makoArgs = getMakoArgsFromStageName(stageName, true)
                def makoOptsJson = transformMakoArgsToJson(["Mako options:"] + makoArgs)
                def testListPathLocal = renderTestDB(testList, llmSrcLocal, stageName, makoOptsJson)
                Utils.copyFileToRemoteHost(
                    pipeline,
                    remote,
                    testListPathLocal,
                    testListPathNode
                )

                // generate .coveragerc in workspace and add file path to pytest command
                sh """
                    touch ./.coveragerc
                    echo '[run]' > ./.coveragerc
                    echo 'branch = True' >> ./.coveragerc
                    echo 'data_file = ${jobWorkspace}/.coverage.${stageName}' >> ./.coveragerc
                    echo '[paths]' >> ./.coveragerc
                    echo 'source =\n ${llmSrcNode}/tensorrt_llm/\n ---wheel_path---/tensorrt_llm//tensorrt_llm/' >> ./.coveragerc
                    cat ./.coveragerc
                """

                Utils.copyFileToRemoteHost(
                    pipeline,
                    remote,
                    "./.coveragerc",
                    coverageConfigFile
                )

                // Generate Pytest command. For multi-node runs, prefix pytest with
                // trtllm-llmapi-launch so the LLM API processes can span nodes.
                String pytestUtil = ""
                if (nodeCount > 1) {
                    pytestUtil = "$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
                }

                pytestCommand = getPytestBaseCommandLine(
                    llmSrcNode,
                    stageName,
                    perfMode,
                    jobWorkspace,
                    "__PLACEHOLDER_TRTLLM_WHL_PATH__",
                    "$jobWorkspace/.coveragerc",
                    pytestUtil,
                    [
                        "--test-list=$testListPathNode",
                        "--splitting-algorithm least_duration",
                        "--splits $splits",
                        "--group $splitId"
                    ]
                ).join(" ")
            }

            stage("[${stageName}] Run Pytest") {
                // Define container and mounts for slurm_exec.py ('#' is the
                // enroot/pyxis form of the registry image URI separator).
                def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
                def mounts = "/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro,/home/svc_tensorrt/bloom/scripts:/home/svc_tensorrt/bloom/scripts"

                // Generate sbatch parameter string (without --nodes, --ntasks, etc.
                // which will come from config yaml)
                def sbatchParams = """--output=${outputPath}
                    ${partition.additionalArgs}
                    ${(partition?.name && partition.name != "unspecified") ? "--partition=${partition.name}" : ""}"""

                // Each argument is single-quoted for the remote shell; embedded
                // single quotes are escaped as '\'' by the replaceAll calls.
                def python_exec_cmd = """python3 ${scriptExecPathNode} \\
                    --sbatch-params '${sbatchParams.replaceAll("'", "'\\\\''")}' \\
                    --config-yaml '${configYamlFile}' \\
                    --slurm-launch-sh '${scriptLaunchPathNode}' \\
                    --job-workspace '${jobWorkspace}' \\
                    --llm-tarfile '${llmTarfile}' \\
                    --tar-name '${tarName}' \\
                    --llm-src-node '${llmSrcNode}' \\
                    --stage-name '${stageName}' \\
                    --perf-mode '${perfMode}' \\
                    --resource-path-node '${resourcePathNode}' \\
                    --pytest-command '${pytestCommand.replaceAll("'", "'\\\\''")}' \\
                    --coverage-config-file '${coverageConfigFile}' \\
                    --container '${container}' \\
                    --mounts '${mounts}' \\
                    --script-run-node '${scriptRunNode}' \\
                    --script-install-node '${scriptInstallPathNode}' \\
                    --test-list-path-node '${testListPathNode}' \\
                    --output-path '${outputPath}'"""

                // No Jenkins-side timeout: runtime is governed by the sbatch job itself.
                Utils.exec(
                    pipeline,
                    timeout: false,
                    script: Utils.sshUserCmd(
                        remote,
                        "\"${python_exec_cmd}\""
                    )
                )
            }

            echo "Finished test stage execution."
        }
    } finally {
        // Always collect whatever results exist and release SLURM resources,
        // even when the test stages above failed or were interrupted.
        uploadResults(pipeline, cluster, jobUID, stageName)
        stage("Clean up SLURM Resources") {
            // Workaround to handle the interruption during clean up SLURM resources
            retry(3) {
                try {
                    cleanUpSlurmResources(pipeline, cluster, jobUID)
                } catch (Exception e) {
                    error "Error during clean up SLURM resources: ${e.getMessage()} and retrying."
                }
            }
        }
    }
}
1234+
10251235def runLLMTestlistOnSlurm (pipeline , platform , testList , config = VANILLA_CONFIG , perfMode = false , stageName = " Undefined" , splitId = 1 , splits = 1 , gpuCount = 1 , nodeCount = 1 , runWithSbatch = false , skipInstallWheel = false , cpver = " cp312" )
10261236{
10271237 echo " Run Slurm job with native sbatch: $runWithSbatch "
1028- if (nodeCount > 1 || runWithSbatch) {
1238+ if (nodeCount > 1 && stageName. contains(" Perf-Sanity-Disagg" )) {
1239+ runLLMMultiNodeDisaggTestlistWithSbatch(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, gpuCount, nodeCount, skipInstallWheel, cpver)
1240+ } else if (nodeCount > 1 || runWithSbatch) {
10291241 runLLMTestlistWithSbatch(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, gpuCount, nodeCount, skipInstallWheel, cpver)
10301242 } else {
10311243 runLLMTestlistWithAgent(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, gpuCount, skipInstallWheel, cpver)
@@ -2277,7 +2489,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
22772489 error " Some tests still failed after rerun attempts, please check the test report."
22782490 }
22792491
2280- if (perfMode) {
2492+ if (perfMode && ! stageName . contains( " Perf-Sanity " ) ) {
22812493 basePerfFilename = stageName. contains(" PyTorch" ) ? " base_perf_pytorch.csv" : " base_perf.csv"
22822494 basePerfPath = " ${ llmSrc} /tests/integration/defs/perf/${ basePerfFilename} "
22832495 stage(" Check perf result" ) {
@@ -2688,8 +2900,8 @@ def launchTestJobs(pipeline, testFilter)
26882900 " DGX_B200-4_GPUs-PyTorch-Post-Merge-1" : [" b200-trtllm" , " l0_dgx_b200" , 1 , 1 , 4 , 1 , true ],
26892901 " DGX_B300-4_GPUs-PyTorch-Post-Merge-1" : [" b300-x4" , " l0_dgx_b300" , 1 , 1 , 4 ],
26902902 // Perf sanity post merge test
2691- // Disable perf stages due to https://nvbugs/5643646
2692- // "DGX_B200-4_GPUs -PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4 ", "perf_sanity_l0_dgx_b200", 1, 1, 4 ],
2903+ " DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1 " : [ " b200-x4 " , " perf_sanity_l0_dgx_b200 " , 1 , 1 , 4 ],
2904+ // "DGX_B200-8_GPUs -PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x8 ", "perf_sanity_l0_dgx_b200", 1, 1, 8 ],
26932905 // "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "perf_sanity_l0_dgx_b300", 1, 1, 4],
26942906 ]
26952907 fullSet + = x86SlurmTestConfigs. keySet()
@@ -2732,7 +2944,10 @@ def launchTestJobs(pipeline, testFilter)
27322944 // "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-trtllm", "l0_gb200_multi_nodes", 4, 5, 8, 2],
27332945 // "GB200-8_GPUs-2_Nodes-PyTorch-5": ["gb200-trtllm", "l0_gb200_multi_nodes", 5, 5, 8, 2],
27342946 // ]
2735- multiNodesSBSAConfigs = [:]
2947+ multiNodesSBSAConfigs = [
2948+ " GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1" : [" gb200-trtllm" , " perf_sanity_l0_gb200_multi_nodes" , 1 , 1 , 8 , 2 ],
2949+ " GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1" : [" gb200-trtllm" , " perf_sanity_l0_gb200_multi_nodes_disagg" , 1 , 1 , 8 , 2 ],
2950+ ]
27362951 def numMultiNodeTests = 3
27372952 multiNodesSBSAConfigs + = (1 .. numMultiNodeTests). collectEntries { i ->
27382953 [" GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${ i} " . toString(), [" gb200-trtllm" , " l0_gb200_multi_nodes" , i, numMultiNodeTests, 8 , 2 ]]
0 commit comments