Commit d70aedd

[TRTLLM-8952][feat] Support Multi-Node Disagg Perf Test in CI (#9138)
Signed-off-by: Chenfei Zhang <[email protected]>
1 parent 684b37d commit d70aedd

File tree

43 files changed: +1679 -898 lines changed


jenkins/L0_Test.groovy

Lines changed: 88 additions & 16 deletions
@@ -748,9 +748,9 @@ def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 }
 // End of Methods to run Slurm job with Jenkins Agent

-def getNodeArgs(int nodeCount, int gpuCount) {
+def getNodeArgs(int nodeCount, int gpuCount, boolean setSegment = false) {
     int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue()
-    return nodeCount == 1 ? [
+    def args = nodeCount == 1 ? [
         "--nodes=${nodeCount}",
         "--gpus=${gpuCount}"
     ] : [

@@ -759,6 +759,10 @@ def getNodeArgs(int nodeCount, int gpuCount) {
         "--ntasks-per-node=${gpusPerNode}",
         "--gpus-per-node=${gpusPerNode}",
     ]
+    if (setSegment && gpuCount > 1) {
+        args += ["--segment=${nodeCount}"]
+    }
+    return args
 }

 def getPytestBaseCommandLine(
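
As a quick sanity check on the arithmetic above (illustrative values, not taken from a real run): the new 3-node, 12-GPU disagg stage gives gpusPerNode = ceil(12 / 3) = 4, so with setSegment enabled the multi-node branch would contribute flags along these lines (flags hidden by the truncated hunk are omitted):

    --ntasks-per-node=4
    --gpus-per-node=4
    --segment=3        # appended because setSegment && gpuCount > 1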
@@ -883,6 +887,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     // Create a unique suffix for the job name
     String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
     def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
+    def disaggMode = stageName.contains("Perf-Sanity-Disagg")
+    def setSegment = disaggMode

     Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")

@@ -914,6 +920,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
     def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
     def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
+    def scriptInstallLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_install.sh"
+    def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh"
     def testListPathNode = "${jobWorkspace}/${testList}.txt"
     def waivesListPathNode = "${jobWorkspace}/waives.txt"
     def outputPath = "${jobWorkspace}/job-output.log"
@@ -940,6 +948,15 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         true
     )

+    Utils.exec(pipeline, script: "echo \"Script to install environment: \" && cat ${scriptInstallLocalPath}")
+    Utils.copyFileToRemoteHost(
+        pipeline,
+        remote,
+        scriptInstallLocalPath,
+        scriptInstallPathNode,
+        true
+    )
+
     // Generate Test List and Upload to Frontend Node
     def makoArgs = getMakoArgsFromStageName(stageName, true)
     // TODO: currently the options will only be processed if the first
@@ -1013,7 +1030,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     // Generate Job Launch Script
     def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
     def mounts = getMountListForSlurmTest(cluster, true).join(",")
-    String[] taskArgs = getNodeArgs(nodeCount, gpuCount)
+    String[] taskArgs = getNodeArgs(nodeCount, gpuCount, setSegment)
     if (taskArgs == null) {
         error "Invalid Slurm test stage name is set"
     }
@@ -1083,10 +1100,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     envVarsToExport.each { varName, varValue ->
         srunArgs.add("--container-env=${varName}")
     }
-    if(nodeCount > 1) {
-        srunArgs.add("--mpi=pmi2")
-    }
-
     def exemptionComment = ""
     if (cluster.host.contains("oci-nrt") || cluster.host.contains("oci-hsg") || cluster.host.contains("lbd-lax")) {
         exemptionComment = """--comment='{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"90","reason":"other","description":"Long data and model loading time and disaggregated serving tests"}}'"""
@@ -1102,8 +1115,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         "export ${varName}=\"${escapedValue}\""
     }.join('\n')

-    def scriptContent = """#!/bin/bash
-    #SBATCH ${exemptionComment} --output=${outputPath}
+    def scriptLaunchPrefix = """#!/bin/bash
+    #SBATCH ${exemptionComment}
+    #SBATCH --output=${outputPath}
     ${taskArgs.collect { "#SBATCH $it" }.join('\n')}
     #SBATCH ${partition.additionalArgs}
     ${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
@@ -1128,10 +1142,48 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     echo "Env NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES"

     ${srunPrologue}
-
-    srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode}
     """.replaceAll("(?m)^\\s*", "")
-    pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
+
+    if (disaggMode) {
+        if(nodeCount > 1) {
+            srunArgs.add("--mpi=pmix")
+        }
+
+        def scriptLaunchPrefixPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch_prefix.sh")
+        def scriptLaunchSrunArgsPathLocal = Utils.createTempLocation(pipeline, "./slurm_srun_args.txt")
+        def scriptLaunchDraftPathLocal = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh"
+        def scriptSubmitLocalPath = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/submit.py"
+
+        pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix)
+        pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" "))
+        Utils.exec(pipeline, script: "echo \"Script launch prefix: \" && cat ${scriptLaunchPrefixPathLocal}")
+        Utils.exec(pipeline, script: "echo \"Srun args content: \" && cat ${scriptLaunchSrunArgsPathLocal}")
+
+        // Output is the corresponding scriptLaunchPathLocal script under the disaggMode
+        sh """
+            python3 ${scriptSubmitLocalPath} \\
+                --run-ci \\
+                --llm-src ${llmSrcLocal} \\
+                --test-list ${testListPathLocal} \\
+                --draft-launch-sh ${scriptLaunchDraftPathLocal} \\
+                --launch-sh ${scriptLaunchPathLocal} \\
+                --run-sh ${scriptRunPathNode} \\
+                --install-sh ${scriptInstallPathNode} \\
+                --script-prefix ${scriptLaunchPrefixPathLocal} \\
+                --srun-args ${scriptLaunchSrunArgsPathLocal}
+        """
+    } else {
+        if(nodeCount > 1) {
+            srunArgs.add("--mpi=pmi2")
+        }
+
+        def scriptContent = """
+        ${scriptLaunchPrefix}
+        srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode}
+        """.replaceAll("(?m)^\\s*", "")
+        pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
+    }

     Utils.exec(pipeline, script: "echo \"Script to trigger Slurm sbatch job: \" && cat ${scriptLaunchPathLocal}")
     Utils.copyFileToRemoteHost(
         pipeline,
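
For orientation only: in the non-disagg branch the final launch script is scriptLaunchPrefix plus a single srun line, so after replaceAll strips the leading whitespace it should look roughly like the sketch below. The concrete values are placeholders, not output from a real stage:

    #!/bin/bash
    #SBATCH --comment='{"OccupiedIdleGPUsJobReaper": ... }'    # exemptionComment, only set on some clusters
    #SBATCH --output=<jobWorkspace>/job-output.log
    #SBATCH --ntasks-per-node=4                                # taskArgs from getNodeArgs(...); other node flags omitted
    #SBATCH --gpus-per-node=4
    #SBATCH <partition.additionalArgs>
    #SBATCH --partition=<partition name, when specified>
    # ... container setup, exported env vars and ${srunPrologue} ...
    srun --kill-on-bad-exit=1 <srunArgs, including --mpi=pmi2 for multi-node> <jobWorkspace>/<jobUID>-slurm_run.sh

In the disagg branch the same prefix and srun args are written to temp files and handed to submit.py, which fills in the launch draft and produces scriptLaunchPathLocal instead.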
@@ -2634,7 +2686,6 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
         if (noRegularTests && noIsolateTests) {
             error "No tests were executed for stage ${stageName}, please check the test list and test-db rendering result."
         }
-
     }
 }

@@ -2653,7 +2704,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
     stage("Check perf result") {
         def perfCheckResult = sh(
             script: """
-                python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \
+                python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \
                 ${stageName}/perf_script_test_results.csv \
                 ${basePerfPath}
             """,
@@ -2672,6 +2723,22 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
             """
         }
     }
+
+    if (perfMode && stageName.contains("Perf-Sanity")) {
+        stage ("Check perf result") {
+            def perfCheckResult = sh(
+                script: """
+                    python3 ${llmSrc}/tests/integration/defs/perf/perf_regression_check.py \
+                    ${WORKSPACE}/${stageName}
+                """,
+                returnStatus: true
+            )
+            // TODO: Enable this when perf regression check is stable
+            // if (perfCheckResult != 0) {
+            //     error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
+            // }
+        }
+    }
 }
 }
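Hedged illustration of the new check: perf_regression_check.py is pointed at the stage's workspace directory and, for now, its exit code is only recorded (the failure path stays commented out until the check is stable). A local reproduction would presumably look like this, with placeholder paths:

    # hypothetical local invocation; paths are placeholders
    llmSrc=/path/to/TensorRT-LLM
    python3 $llmSrc/tests/integration/defs/perf/perf_regression_check.py \
        /path/to/workspace/GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1
    echo "perf_regression_check exit code: $?"   # ignored by the pipeline until the TODO is enabled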

@@ -3111,8 +3178,13 @@ def launchTestJobs(pipeline, testFilter)
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
-        // Perf sanity post merge test
-        "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_perf_sanity", 1, 1, 8, 2],
+        // Perf sanity post merge aggr tests
+        "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001", 1, 1, 8, 2],
+        // Perf sanity post merge disagg tests
+        "GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001", 1, 1, 12, 3],
+        // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001", 1, 1, 24, 6],
+        // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002", 1, 1, 24, 6],
+        // "GB200-32_GPUs-8_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001", 1, 1, 32, 8],
     ]
     fullSet += multiNodesSBSAConfigs.keySet()
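
Presumed layout of each entry, inferred from the stage names rather than stated in this diff:

    # [ cluster/queue, test-db list, split index, split count, total GPUs, node count ]
    # e.g. the new disagg stage requests 12 GPUs across 3 nodes and runs
    # l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001 as split 1 of 1.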

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
+
+cleanup_on_failure() {
+    echo "Error: $1"
+    scancel ${SLURM_JOB_ID}
+    exit 1
+}
+
+mkdir -p $jobWorkspace
+chmod +x $runScript
+chmod +x $installScript
+
+# Run installation on all nodes
+echo "Running installation on all nodes..."
+if ! srun "${srunArgs[@]}" $installScript &> $jobWorkspace/install.log; then
+    cleanup_on_failure "Failed to run installation. Check $jobWorkspace/install.log"
+fi
+echo "Installation completed on all nodes"
+
+# Start gen servers
+echo "Starting gen servers..."
+for i in $(seq 0 $((numGenServers - 1))); do
+    gen_world_size=$((nodesPerGenServer * gpusPerNode))
+    export DISAGG_SERVING_TYPE="GEN_$i"
+    export pytestCommand="$pytestCommandWorker"
+    srun "${srunArgs[@]}" --kill-on-bad-exit=1 \
+        -N $nodesPerGenServer \
+        --ntasks=$gen_world_size \
+        --ntasks-per-node=$gpusPerNode \
+        $runScript &> $jobWorkspace/gen_server_$i.log &
+    echo "Started gen server $i"
+done
+
+# Start ctx servers (skip if gen_only mode)
+if [ "${TRTLLM_DISAGG_BENCHMARK_GEN_ONLY:-0}" != "1" ]; then
+    echo "Starting ctx servers..."
+    for i in $(seq 0 $((numCtxServers - 1))); do
+        ctx_world_size=$((nodesPerCtxServer * gpusPerNode))
+        export DISAGG_SERVING_TYPE="CTX_$i"
+        export pytestCommand="$pytestCommandWorker"
+        srun "${srunArgs[@]}" --kill-on-bad-exit=1 \
+            -N $nodesPerCtxServer \
+            --ntasks=$ctx_world_size \
+            --ntasks-per-node=$gpusPerNode \
+            $runScript &> $jobWorkspace/ctx_server_$i.log &
+        echo "Started ctx server $i"
+    done
+else
+    echo "Skipping ctx servers (gen_only mode)"
+fi
+
+
+# Start disagg server
+echo "Starting disagg server..."
+export DISAGG_SERVING_TYPE="DISAGG_SERVER"
+export pytestCommand="$pytestCommandDisaggServer"
+srun "${srunArgs[@]}" --kill-on-bad-exit=1 --overlap \
+    -N 1 \
+    --ntasks=1 \
+    --ntasks-per-node=1 \
+    $runScript &> $jobWorkspace/disagg_server.log &
+echo "Started disagg server"
+
+# Start benchmark
+echo "Starting benchmark..."
+export DISAGG_SERVING_TYPE="BENCHMARK"
+export pytestCommand="$pytestCommandBenchmark"
+if ! srun "${srunArgs[@]}" --kill-on-bad-exit=1 --overlap \
+    -N 1 \
+    --ntasks=1 \
+    --ntasks-per-node=1 \
+    $runScript; then
+    cleanup_on_failure "Benchmark failed. Check logs in ${jobWorkspace} for details"
+fi
+
+echo "Disagg server and benchmark completed successfully"
+echo "Total runtime: $SECONDS seconds"
