@@ -748,9 +748,9 @@ def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 }
 // End of Methods to run Slurm job with Jenkins Agent
 
-def getNodeArgs(int nodeCount, int gpuCount) {
+def getNodeArgs(int nodeCount, int gpuCount, boolean setSegment = false) {
     int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue()
-    return nodeCount == 1 ? [
+    def args = nodeCount == 1 ? [
         "--nodes=${nodeCount}",
         "--gpus=${gpuCount}"
     ] : [
@@ -759,6 +759,10 @@ def getNodeArgs(int nodeCount, int gpuCount) {
         "--ntasks-per-node=${gpusPerNode}",
         "--gpus-per-node=${gpusPerNode}",
     ]
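+    // When requested, keep the whole allocation in one contiguous segment
+    // (Slurm block-topology scheduling); assumes the cluster's Slurm supports --segment.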
+    if (setSegment && gpuCount > 1) {
+        args += ["--segment=${nodeCount}"]
+    }
+    return args
 }
 
 def getPytestBaseCommandLine(
@@ -883,6 +887,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     // Create a unique suffix for the job name
     String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
     def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
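+    // Disaggregated perf-sanity stages request segment-aware node placement (see getNodeArgs).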
+    def disaggMode = stageName.contains("Perf-Sanity-Disagg")
+    def setSegment = disaggMode
 
     Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
 
@@ -914,6 +920,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
     def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
     def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
+    def scriptInstallLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_install.sh"
+    def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh"
     def testListPathNode = "${jobWorkspace}/${testList}.txt"
     def waivesListPathNode = "${jobWorkspace}/waives.txt"
     def outputPath = "${jobWorkspace}/job-output.log"
@@ -940,6 +948,15 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         true
     )
 
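+    // Upload the environment install script to the frontend node, mirroring the slurm_run.sh upload above.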
+    Utils.exec(pipeline, script: "echo \"Script to install environment: \" && cat ${scriptInstallLocalPath}")
+    Utils.copyFileToRemoteHost(
+        pipeline,
+        remote,
+        scriptInstallLocalPath,
+        scriptInstallPathNode,
+        true
+    )
+
     // Generate Test List and Upload to Frontend Node
     def makoArgs = getMakoArgsFromStageName(stageName, true)
     // TODO: currently the options will only be processed if the first
@@ -1013,7 +1030,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     // Generate Job Launch Script
     def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
     def mounts = getMountListForSlurmTest(cluster, true).join(",")
-    String[] taskArgs = getNodeArgs(nodeCount, gpuCount)
+    String[] taskArgs = getNodeArgs(nodeCount, gpuCount, setSegment)
     if (taskArgs == null) {
         error "Invalid Slurm test stage name is set"
     }
@@ -1083,10 +1100,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     envVarsToExport.each { varName, varValue ->
         srunArgs.add("--container-env=${varName}")
     }
-    if (nodeCount > 1) {
-        srunArgs.add("--mpi=pmi2")
-    }
-
     def exemptionComment = ""
     if (cluster.host.contains("oci-nrt") || cluster.host.contains("oci-hsg") || cluster.host.contains("lbd-lax")) {
         exemptionComment = """--comment='{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"90","reason":"other","description":"Long data and model loading time and disaggregated serving tests"}}'"""
@@ -1102,8 +1115,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
             "export ${varName}=\"${escapedValue}\""
         }.join('\n')
 
-    def scriptContent = """#!/bin/bash
-    #SBATCH ${exemptionComment} --output=${outputPath}
+    def scriptLaunchPrefix = """#!/bin/bash
+    #SBATCH ${exemptionComment}
+    #SBATCH --output=${outputPath}
     ${taskArgs.collect { "#SBATCH $it" }.join('\n')}
     #SBATCH ${partition.additionalArgs}
     ${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
@@ -1128,10 +1142,48 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     echo "Env NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES"
 
     ${srunPrologue}
-
-    srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode}
     """.replaceAll("(?m)^\\s*", "")
-    pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
+
+    if (disaggMode) {
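+        // Note: disaggregated jobs launch via PMIx here, while the default path below keeps PMI2.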
+        if (nodeCount > 1) {
+            srunArgs.add("--mpi=pmix")
+        }
+
+        def scriptLaunchPrefixPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch_prefix.sh")
+        def scriptLaunchSrunArgsPathLocal = Utils.createTempLocation(pipeline, "./slurm_srun_args.txt")
+        def scriptLaunchDraftPathLocal = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh"
+        def scriptSubmitLocalPath = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/submit.py"
+
+        pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix)
+        pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" "))
+        Utils.exec(pipeline, script: "echo \"Script launch prefix: \" && cat ${scriptLaunchPrefixPathLocal}")
+        Utils.exec(pipeline, script: "echo \"Srun args content: \" && cat ${scriptLaunchSrunArgsPathLocal}")
+
+        // In disaggregated mode, submit.py generates the corresponding scriptLaunchPathLocal launch script
+        sh """
+            python3 ${scriptSubmitLocalPath} \\
+                --run-ci \\
+                --llm-src ${llmSrcLocal} \\
+                --test-list ${testListPathLocal} \\
+                --draft-launch-sh ${scriptLaunchDraftPathLocal} \\
+                --launch-sh ${scriptLaunchPathLocal} \\
+                --run-sh ${scriptRunPathNode} \\
+                --install-sh ${scriptInstallPathNode} \\
+                --script-prefix ${scriptLaunchPrefixPathLocal} \\
+                --srun-args ${scriptLaunchSrunArgsPathLocal}
+        """
+    } else {
+        if (nodeCount > 1) {
+            srunArgs.add("--mpi=pmi2")
+        }
+
+        def scriptContent = """
+            ${scriptLaunchPrefix}
+            srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode}
+        """.replaceAll("(?m)^\\s*", "")
+        pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
+    }
+
     Utils.exec(pipeline, script: "echo \"Script to trigger Slurm sbatch job: \" && cat ${scriptLaunchPathLocal}")
     Utils.copyFileToRemoteHost(
         pipeline,
@@ -2634,7 +2686,6 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
             if (noRegularTests && noIsolateTests) {
                 error "No tests were executed for stage ${stageName}, please check the test list and test-db rendering result."
             }
-
         }
     }
 
@@ -2653,7 +2704,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
         stage("Check perf result") {
             def perfCheckResult = sh(
                 script: """
-                    python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \
+                    python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \
                     ${stageName}/perf_script_test_results.csv \
                     ${basePerfPath}
                 """,
@@ -2672,6 +2723,22 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
                 """
             }
         }
+
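+        // Additional perf regression check for Perf-Sanity stages; non-blocking until the TODO below is resolved.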
+        if (perfMode && stageName.contains("Perf-Sanity")) {
+            stage("Check perf result") {
+                def perfCheckResult = sh(
+                    script: """
+                        python3 ${llmSrc}/tests/integration/defs/perf/perf_regression_check.py \
+                        ${WORKSPACE}/${stageName}
+                    """,
+                    returnStatus: true
+                )
+                // TODO: Enable this when perf regression check is stable
+                // if (perfCheckResult != 0) {
+                //     error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
+                // }
+            }
+        }
     }
 }
 
@@ -3111,8 +3178,13 @@ def launchTestJobs(pipeline, testFilter)
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
-        // Perf sanity post merge test
-        "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_perf_sanity", 1, 1, 8, 2],
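+        // Each entry: [cluster, test list, split id, split count, GPU count, node count] (format follows the existing entries above).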
+        // Perf sanity post merge aggr tests
+        "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001", 1, 1, 8, 2],
+        // Perf sanity post merge disagg tests
+        "GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001", 1, 1, 12, 3],
+        // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001", 1, 1, 24, 6],
+        // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002", 1, 1, 24, 6],
+        // "GB200-32_GPUs-8_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001", 1, 1, 32, 8],
     ]
     fullSet += multiNodesSBSAConfigs.keySet()
 