Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 88 additions & 16 deletions jenkins/L0_Test.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -748,9 +748,9 @@ def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
}
// End of Methods to run Slurm job with Jenkins Agent

def getNodeArgs(int nodeCount, int gpuCount) {
def getNodeArgs(int nodeCount, int gpuCount, boolean setSegment = false) {
int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue()
return nodeCount == 1 ? [
def args = nodeCount == 1 ? [
"--nodes=${nodeCount}",
"--gpus=${gpuCount}"
] : [
Expand All @@ -759,6 +759,10 @@ def getNodeArgs(int nodeCount, int gpuCount) {
"--ntasks-per-node=${gpusPerNode}",
"--gpus-per-node=${gpusPerNode}",
]
if (setSegment && gpuCount > 1) {
args += ["--segment=${nodeCount}"]
}
return args
}

def getPytestBaseCommandLine(
Expand Down Expand Up @@ -883,6 +887,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Create a unique suffix for the job name
String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
def disaggMode = stageName.contains("Perf-Sanity-Disagg")
def setSegment = disaggMode

Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")

Expand Down Expand Up @@ -914,6 +920,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
def scriptInstallLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_install.sh"
def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh"
def testListPathNode = "${jobWorkspace}/${testList}.txt"
def waivesListPathNode = "${jobWorkspace}/waives.txt"
def outputPath = "${jobWorkspace}/job-output.log"
Expand All @@ -940,6 +948,15 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
true
)

Utils.exec(pipeline, script: "echo \"Script to install environment: \" && cat ${scriptInstallLocalPath}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
scriptInstallLocalPath,
scriptInstallPathNode,
true
)

// Generate Test List and Upload to Frontend Node
def makoArgs = getMakoArgsFromStageName(stageName, true)
// TODO: currently the options will only be processed if the first
Expand Down Expand Up @@ -1013,7 +1030,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Generate Job Launch Script
def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
def mounts = getMountListForSlurmTest(cluster, true).join(",")
String[] taskArgs = getNodeArgs(nodeCount, gpuCount)
String[] taskArgs = getNodeArgs(nodeCount, gpuCount, setSegment)
if (taskArgs == null) {
error "Invalid Slurm test stage name is set"
}
Expand Down Expand Up @@ -1083,10 +1100,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
envVarsToExport.each { varName, varValue ->
srunArgs.add("--container-env=${varName}")
}
if(nodeCount > 1) {
srunArgs.add("--mpi=pmi2")
}

def exemptionComment = ""
if (cluster.host.contains("oci-nrt") || cluster.host.contains("oci-hsg") || cluster.host.contains("lbd-lax")) {
exemptionComment = """--comment='{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"90","reason":"other","description":"Long data and model loading time and disaggregated serving tests"}}'"""
Expand All @@ -1102,8 +1115,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
"export ${varName}=\"${escapedValue}\""
}.join('\n')

def scriptContent = """#!/bin/bash
#SBATCH ${exemptionComment} --output=${outputPath}
def scriptLaunchPrefix = """#!/bin/bash
#SBATCH ${exemptionComment}
#SBATCH --output=${outputPath}
${taskArgs.collect { "#SBATCH $it" }.join('\n')}
#SBATCH ${partition.additionalArgs}
${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
Expand All @@ -1128,10 +1142,48 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
echo "Env NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES"

${srunPrologue}

srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode}
""".replaceAll("(?m)^\\s*", "")
pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)

if (disaggMode) {
if(nodeCount > 1) {
srunArgs.add("--mpi=pmix")
}

def scriptLaunchPrefixPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch_prefix.sh")
def scriptLaunchSrunArgsPathLocal = Utils.createTempLocation(pipeline, "./slurm_srun_args.txt")
def scriptLaunchDraftPathLocal = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh"
def scriptSubmitLocalPath = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/submit.py"

pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix)
pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" "))
Utils.exec(pipeline, script: "echo \"Script launch prefix: \" && cat ${scriptLaunchPrefixPathLocal}")
Utils.exec(pipeline, script: "echo \"Srun args content: \" && cat ${scriptLaunchSrunArgsPathLocal}")

// In disagg mode, submit.py generates the final launch script at scriptLaunchPathLocal
sh """
python3 ${scriptSubmitLocalPath} \\
--run-ci \\
--llm-src ${llmSrcLocal} \\
--test-list ${testListPathLocal} \\
--draft-launch-sh ${scriptLaunchDraftPathLocal} \\
--launch-sh ${scriptLaunchPathLocal} \\
--run-sh ${scriptRunPathNode} \\
--install-sh ${scriptInstallPathNode} \\
--script-prefix ${scriptLaunchPrefixPathLocal} \\
--srun-args ${scriptLaunchSrunArgsPathLocal}
"""
} else {
if(nodeCount > 1) {
srunArgs.add("--mpi=pmi2")
}

def scriptContent = """
${scriptLaunchPrefix}
srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode}
""".replaceAll("(?m)^\\s*", "")
pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
}

Utils.exec(pipeline, script: "echo \"Script to trigger Slurm sbatch job: \" && cat ${scriptLaunchPathLocal}")
Utils.copyFileToRemoteHost(
pipeline,
Expand Down Expand Up @@ -2634,7 +2686,6 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
if (noRegularTests && noIsolateTests) {
error "No tests were executed for stage ${stageName}, please check the test list and test-db rendering result."
}

}
}

Expand All @@ -2653,7 +2704,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
stage("Check perf result") {
def perfCheckResult = sh(
script: """
python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \
python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \
${stageName}/perf_script_test_results.csv \
${basePerfPath}
""",
Expand All @@ -2672,6 +2723,22 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
"""
}
}

if (perfMode && stageName.contains("Perf-Sanity")) {
stage ("Check perf result") {
def perfCheckResult = sh(
script: """
python3 ${llmSrc}/tests/integration/defs/perf/perf_regression_check.py \
${WORKSPACE}/${stageName}
""",
returnStatus: true
)
// TODO: Enable this when perf regression check is stable
// if (perfCheckResult != 0) {
// error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
// }
}
}
}
}

Expand Down Expand Up @@ -3111,8 +3178,13 @@ def launchTestJobs(pipeline, testFilter)
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
// Perf sanity post merge test
"GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_perf_sanity", 1, 1, 8, 2],
// Perf sanity post merge aggr tests
"GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001", 1, 1, 8, 2],
// Perf sanity post merge disagg tests
"GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001", 1, 1, 12, 3],
// "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001", 1, 1, 24, 6],
// "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002", 1, 1, 24, 6],
// "GB200-32_GPUs-8_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001", 1, 1, 32, 8],
]
fullSet += multiNodesSBSAConfigs.keySet()

Expand Down
76 changes: 76 additions & 0 deletions jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@

# Abort the whole Slurm allocation when a setup or benchmark step fails:
# report the reason, cancel this job (tearing down the background server
# steps with it), and exit non-zero so the pipeline marks the stage failed.
cleanup_on_failure() {
    local reason="$1"
    echo "Error: ${reason}"
    scancel "${SLURM_JOB_ID}"
    exit 1
}

# NOTE(review): this file is a launch-script *draft* — placeholders such as
# $jobWorkspace, $runScript, $installScript, $srunArgs, $numGenServers,
# $nodesPerGenServer, $gpusPerNode, $numCtxServers, $nodesPerCtxServer and
# the $pytestCommand* values appear to be substituted by
# jenkins/scripts/perf/disaggregated/submit.py before sbatch submission —
# confirm against submit.py.
mkdir -p $jobWorkspace
# The run/install scripts were copied to the node by the Jenkins pipeline;
# make them executable before srun invokes them.
chmod +x $runScript
chmod +x $installScript

# Run installation on all nodes
echo "Running installation on all nodes..."
# Install step runs synchronously across the allocation; any failure cancels
# the whole job via cleanup_on_failure.
if ! srun "${srunArgs[@]}" $installScript &> $jobWorkspace/install.log; then
    cleanup_on_failure "Failed to run installation. Check $jobWorkspace/install.log"
fi
echo "Installation completed on all nodes"

# Start gen servers
# Each generation server is launched as a background srun step (&) on its own
# node subset; DISAGG_SERVING_TYPE and pytestCommand are exported so the
# per-rank run script can pick its role — presumably consumed inside
# $runScript; verify there.
echo "Starting gen servers..."
for i in $(seq 0 $((numGenServers - 1))); do
    # world size = nodes per server * GPUs per node (one task per GPU)
    gen_world_size=$((nodesPerGenServer * gpusPerNode))
    export DISAGG_SERVING_TYPE="GEN_$i"
    export pytestCommand="$pytestCommandWorker"
    srun "${srunArgs[@]}" --kill-on-bad-exit=1 \
        -N $nodesPerGenServer \
        --ntasks=$gen_world_size \
        --ntasks-per-node=$gpusPerNode \
        $runScript &> $jobWorkspace/gen_server_$i.log &
    echo "Started gen server $i"
done

# Start ctx servers (skip if gen_only mode)
# Context servers mirror the gen-server launch; TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1
# skips them for gen-only benchmarking.
if [ "${TRTLLM_DISAGG_BENCHMARK_GEN_ONLY:-0}" != "1" ]; then
    echo "Starting ctx servers..."
    for i in $(seq 0 $((numCtxServers - 1))); do
        ctx_world_size=$((nodesPerCtxServer * gpusPerNode))
        export DISAGG_SERVING_TYPE="CTX_$i"
        export pytestCommand="$pytestCommandWorker"
        srun "${srunArgs[@]}" --kill-on-bad-exit=1 \
            -N $nodesPerCtxServer \
            --ntasks=$ctx_world_size \
            --ntasks-per-node=$gpusPerNode \
            $runScript &> $jobWorkspace/ctx_server_$i.log &
        echo "Started ctx server $i"
    done
else
    echo "Skipping ctx servers (gen_only mode)"
fi


# Start disagg server
# Single-task background step; --overlap lets it share nodes already occupied
# by the worker steps above.
echo "Starting disagg server..."
export DISAGG_SERVING_TYPE="DISAGG_SERVER"
export pytestCommand="$pytestCommandDisaggServer"
srun "${srunArgs[@]}" --kill-on-bad-exit=1 --overlap \
    -N 1 \
    --ntasks=1 \
    --ntasks-per-node=1 \
    $runScript &> $jobWorkspace/disagg_server.log &
echo "Started disagg server"

# Start benchmark
# The benchmark is the only foreground step: its exit status gates the whole
# job. On failure, cleanup_on_failure scancels the allocation, which also
# kills the background server steps. NOTE(review): on success the background
# steps are never explicitly waited on or cancelled here — presumably the
# sbatch job ending tears them down; confirm intended.
echo "Starting benchmark..."
export DISAGG_SERVING_TYPE="BENCHMARK"
export pytestCommand="$pytestCommandBenchmark"
if ! srun "${srunArgs[@]}" --kill-on-bad-exit=1 --overlap \
    -N 1 \
    --ntasks=1 \
    --ntasks-per-node=1 \
    $runScript; then
    cleanup_on_failure "Benchmark failed. Check logs in ${jobWorkspace} for details"
fi

echo "Disagg server and benchmark completed successfully"
# $SECONDS is bash's built-in elapsed-time counter since shell start.
echo "Total runtime: $SECONDS seconds"
Loading