
Commit 270be80

[None][ci] Move remaining DGX-B200 tests to LBD (#9876)
Signed-off-by: Yanchao Lu <[email protected]>
1 parent c59aa8b commit 270be80

File tree (6 files changed, +69 / -46 lines)

jenkins/BuildDockerImage.groovy
jenkins/L0_MergeRequest.groovy
jenkins/L0_Test.groovy
jenkins/scripts/slurm_install.sh
jenkins/scripts/slurm_run.sh
tests/integration/test_lists/test-db/l0_dgx_b200.yml

jenkins/BuildDockerImage.groovy

Lines changed: 2 additions & 2 deletions

@@ -372,7 +372,7 @@ def buildImage(config, imageKeyToTag)
 IMAGE_WITH_TAG=${imageWithTag} \
 STAGE=${dockerfileStage} \
 BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} ${buildWheelArgs}
-""", sleepInSecs: randomSleep, numRetries: 2, shortCommondRunTimeMax: 7200)
+""", sleepInSecs: randomSleep, numRetries: 6, shortCommondRunTimeMax: 7200)
 }
 if (target == "ngc-release") {
 imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag

@@ -726,7 +726,7 @@ pipeline {
 cmd += "--image "
 cmd += imageKeyToTag.values().join(" ")
 withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) {
-trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 6, shortCommondRunTimeMax: 7200)
+trtllm_utils.llmExecStepWithRetry(this, script: cmd, sleepInSecs: 600, numRetries: 6, shortCommondRunTimeMax: 7200)
 }
 }
 }
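
Both hunks raise the retry budget of the same helper (numRetries 2 -> 6 for the wheel build, and a 600-second sleep between attempts for the NSPECT scan). The helper's internals live in trtllm_utils and are not part of this diff; the following is a minimal Bash sketch of the retry-with-sleep pattern these parameters imply, purely for illustration:

    #!/bin/bash
    # Hypothetical sketch, not the repo's implementation: retry "$@" up to
    # num_retries times, sleeping sleep_in_secs between failed attempts.
    num_retries=6      # mirrors numRetries: 6
    sleep_in_secs=600  # mirrors sleepInSecs: 600

    for attempt in $(seq 1 "$num_retries"); do
        if "$@"; then
            exit 0
        fi
        echo "Attempt ${attempt}/${num_retries} failed; sleeping ${sleep_in_secs}s" >&2
        sleep "$sleep_in_secs"
    done
    echo "Command failed after ${num_retries} attempts" >&2
    exit 1

Invoked as, say, "./retry.sh <build command>", it exits 0 on the first passing attempt and 1 once the budget is exhausted.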

jenkins/L0_MergeRequest.groovy

Lines changed: 2 additions & 1 deletion

@@ -1241,7 +1241,8 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
 def dockerBuildJob = [
 "Build-Docker-Images": {
 script {
-stage("[Build-Docker-Images] Remote Run") {
+def testStageName = "[Build-Docker-Images] ${env.localJobCredentials ? "Remote Run" : "Run"}"
+stage(testStageName) {
 def branch = env.gitlabBranch ? env.gitlabBranch : "main"
 if (globalVars[GITHUB_PR_API_URL]) {
 branch = "github-pr-" + globalVars[GITHUB_PR_API_URL].split('/').last()

jenkins/L0_Test.groovy

Lines changed: 39 additions & 17 deletions

@@ -461,7 +461,7 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
 def cleanupCommands = [
 "rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
 "rm -rf ${jobWorkspace} || true",
-].join(" && ")
+].join(" ; ")
 Utils.exec(
 pipeline,
 script: Utils.sshUserCmd(
@@ -511,7 +511,7 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
 def cleanupCommands = [
 "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-${entrypoint} || true",
 "rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
-].join(" && ")
+].join(" ; ")
 Utils.exec(
 pipeline,
 script: Utils.sshUserCmd(
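
The separator change in both cleanup helpers is a shell-semantics fix: with "&&", a non-zero exit from any earlier command skips everything after it, whereas ";" runs each command unconditionally, so one failing cleanup step (or one missing its "|| true" guard) no longer prevents the remaining cleanups from running over SSH. A standalone illustration of the difference, independent of the pipeline code:

    # "&&" short-circuits on failure; ";" does not.
    false && echo "never printed: && stops at the first failing command"
    false ;  echo "printed anyway: ; ignores the previous exit status"

    # Joined the way cleanupCommands now is, every rm gets its chance to run.
    cleanup='rm -rf /tmp/example-a || true ; rm -rf /tmp/example-b || true'
    bash -c "$cleanup"   # in the pipeline this string is sent through Utils.sshUserCmd
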
@@ -939,7 +939,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
 sh "cd ${llmPath} && tar -zxf ${BUILD_CONFIGS[config][TARNAME]}"
 
-Utils.exec(pipeline, script: "echo \"Script to trigger Slurm srun job: \" && cat ${scriptRunLocalPath}")
+Utils.exec(pipeline, script: "echo \"Script for Slurm srun job to submit: \" && cat ${scriptRunLocalPath}")
 Utils.copyFileToRemoteHost(
 pipeline,
 remote,

@@ -948,7 +948,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 true
 )
 
-Utils.exec(pipeline, script: "echo \"Script to install environment: \" && cat ${scriptInstallLocalPath}")
+Utils.exec(pipeline, script: "echo \"Script to install TensorRT LLM dependencies: \" && cat ${scriptInstallLocalPath}")
 Utils.copyFileToRemoteHost(
 pipeline,
 remote,

@@ -1093,7 +1093,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 srunArgs = [
 "--container-name=multi_node_test-\${SLURM_JOB_ID}",
 "--container-image=$containerImageArg",
-"--container-workdir=/home/svc_tensorrt/bloom/scripts",
+"--container-workdir=$jobWorkspace",
 "--container-mounts=$mounts",
 "--container-env=NVIDIA_IMEX_CHANNELS"
 ]
@@ -1115,16 +1115,21 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 "export ${varName}=\"${escapedValue}\""
 }.join('\n')
 
+// Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
 def scriptLaunchPrefix = """#!/bin/bash
 #SBATCH ${exemptionComment}
 #SBATCH --output=${outputPath}
 ${taskArgs.collect { "#SBATCH $it" }.join('\n')}
 #SBATCH ${partition.additionalArgs}
 ${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
-echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
 
-set -Eeuo pipefail
+# SBATCH directives must appear before any executable commands.
+set -xEeuo pipefail
 trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
+
+echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
+echo \$SLURM_JOB_ID > "$jobWorkspace/slurm_job_id.txt"
+
 export jobWorkspace=$jobWorkspace
 export tarName=$tarName
 export llmTarfile=$llmTarfile
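
The reordering follows from how sbatch parses batch scripts: #SBATCH directives are only honored before the first executable command, so the tracing/error-handling setup and the new job-ID bookkeeping have to come after the directive block. A hypothetical rendering of the generated prefix, with every interpolated value (comment, output path, task args, partition, workspace) invented for illustration:

    #!/bin/bash
    #SBATCH --comment=example-exemption        # stand-in for ${exemptionComment}
    #SBATCH --output=/scratch/example/job.log  # stand-in for ${outputPath}
    #SBATCH --nodes=2                          # stand-in for one taskArgs entry
    #SBATCH --partition=batch                  # stand-in for ${partition.name}

    # SBATCH directives must appear before any executable commands.
    set -xEeuo pipefail
    trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR

    echo "Starting job $SLURM_JOB_ID on $SLURM_NODELIST"
    # Written so a retried pipeline run can find and scancel this job later.
    echo "$SLURM_JOB_ID" > /scratch/example/workspace/slurm_job_id.txt   # stand-in for $jobWorkspace
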
@@ -1156,8 +1161,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 
 pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix)
 pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" "))
-Utils.exec(pipeline, script: "echo \"Script launch prefix: \" && cat ${scriptLaunchPrefixPathLocal}")
-Utils.exec(pipeline, script: "echo \"Srun args content: \" && cat ${scriptLaunchSrunArgsPathLocal}")
+Utils.exec(pipeline, script: "echo \"Script for Slurm sbatch job prefix: \" && cat ${scriptLaunchPrefixPathLocal}")
+Utils.exec(pipeline, script: "echo \"Script for Slurm srun job args: \" && cat ${scriptLaunchSrunArgsPathLocal}")
 
 // Output is the corresponding scriptLaunchPathLocal script under the disaggMode
 sh """

@@ -1184,7 +1189,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
 }
 
-Utils.exec(pipeline, script: "echo \"Script to trigger Slurm sbatch job: \" && cat ${scriptLaunchPathLocal}")
+Utils.exec(pipeline, script: "echo \"Script for Slurm sbatch job to submit: \" && cat ${scriptLaunchPathLocal}")
 Utils.copyFileToRemoteHost(
 pipeline,
 remote,
@@ -1194,9 +1199,24 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 )
 
 def scriptExec = """#!/bin/bash
-set -Eeuo pipefail
+set -xEeuo pipefail
 trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
-touch ${outputPath}
+
+# Clean up previous job intermediate files so that retry can work
+if [ -f "${jobWorkspace}/slurm_job_id.txt" ]; then
+    previous_job_id=\$(cat "${jobWorkspace}/slurm_job_id.txt")
+    echo "Found previous Slurm job ID: \${previous_job_id}"
+    scancel "\${previous_job_id}" || true
+    rm -rf "${jobWorkspace}/slurm_job_id.txt"
+    # Wait for 60 seconds to ensure the previous job is canceled
+    sleep 60
+fi
+rm -rf "${jobWorkspace}/results.xml"
+rm -rf "${jobWorkspace}/report.csv"
+rm -rf "${jobWorkspace}/unfinished_test.txt"
+rm -rf "${outputPath}"
+
+touch "${outputPath}"
 jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
 if [ -z "\$jobId" ]; then
 echo "Error: Job submission failed, no job ID returned."
@@ -1460,7 +1480,8 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
 if (stageIsInterrupted) {
 echo "Stage is interrupted, skip to upload test result."
 } else {
-sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'
+// Temporarily disable to reduce the log size
+// sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'
 if (noResultIfSuccess && !stageIsFailed) {
 // Clean up the workspace
 sh """

@@ -2603,7 +2624,8 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
 def containerPortNum = GlobalState.PORT_SECTION_SIZE
 
 // Some clusters do not allow dmesg -C so we add || true
-sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'
+// Temporarily disable to reduce the log size
+// sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'
 def pytestCommand = getPytestBaseCommandLine(
 llmSrc,
 stageName,

@@ -3124,11 +3146,11 @@ def launchTestJobs(pipeline, testFilter)
 "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
 "DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
 "B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
-"DGX_B200-4_GPUs-PyTorch-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4],
+"DGX_B200-4_GPUs-PyTorch-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
 "DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
 "DGX_B200-8_GPUs-PyTorch-1": ["b200-x8-lbd", "l0_dgx_b200", 1, 1, 8, 1, true],
-"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 2, 4, 1, true],
-"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-trtllm", "l0_dgx_b200", 2, 2, 4, 1, true],
+"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 2, 4, 1, true],
+"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true],
 "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
 "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
 // Perf sanity post merge test

jenkins/scripts/slurm_install.sh

Lines changed: 3 additions & 1 deletion

@@ -1,7 +1,7 @@
 #!/bin/bash
 
 # Set up error handling
-set -Eeuo pipefail
+set -xEeuo pipefail
 trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR
 
 slurm_install_setup() {
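
The only functional change in this first hunk (repeated in slurm_run.sh and in the generated Slurm scripts above) is the added -x flag, which makes bash print each command, with its expansions, to stderr before executing it, so the Slurm logs record exactly what ran. A tiny standalone example of the tracing it enables:

    #!/bin/bash
    set -xEeuo pipefail
    name="world"
    echo "hello $name"
    # stderr shows the trace (default PS4 prefix "+ "):
    # + name=world
    # + echo 'hello world'
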
@@ -23,8 +23,10 @@ slurm_install_setup() {
 gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
 hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
 echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
+echo "(Writing install lock) Current directory: $(pwd)"
 touch install_lock.lock
 else
+echo "(Waiting for install lock) Current directory: $(pwd)"
 while [ ! -f install_lock.lock ]; do
 sleep 5
 done

jenkins/scripts/slurm_run.sh

Lines changed: 2 additions & 4 deletions

@@ -1,7 +1,7 @@
 #!/bin/bash
 
 # Set up error handling
-set -Eeuo pipefail
+set -xEeuo pipefail
 trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR
 
 cd $resourcePathNode

@@ -29,10 +29,8 @@ set_value_in_command() {
 echo "$result"
 }
 
-# Only the first process will save the job ID and set the git config
+# Only the first process will set the git config
 if [ $SLURM_PROCID -eq 0 ]; then
-# Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
-echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
 # Update HOME/.gitconfig
 if ! git config --global --get-all safe.directory | grep -Fxq "*"; then
 git config --global --add safe.directory "*"
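
The surviving guard is idempotent: git config --global --get-all safe.directory lists every configured safe.directory entry (exiting non-zero when there are none), and grep -Fxq "*" matches a literal "*" as a whole line, so repeated runs add the wildcard entry at most once. Run on its own it behaves like:

    # Add safe.directory = "*" only if no existing entry is exactly "*".
    if ! git config --global --get-all safe.directory | grep -Fxq "*"; then
        git config --global --add safe.directory "*"
    fi
    git config --global --get-all safe.directory   # now includes a "*" entry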

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 21 additions & 21 deletions

@@ -24,7 +24,7 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] ISOLATION
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp_tp4]

@@ -66,17 +66,17 @@ l0_dgx_b200:
 backend: pytorch
 orchestrator: mpi
 tests:
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_bs8_mtp] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_mtp1] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_mtp1] TIMEOUT (180)
-  - accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] TIMEOUT (360)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_bs8_mtp] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_mtp1] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_mtp1] TIMEOUT (60)
+  - accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] TIMEOUT (60)
 - condition:
 ranges:
 system_gpu_count:

@@ -92,15 +92,15 @@ l0_dgx_b200:
 backend: pytorch
 orchestrator: mpi
 tests:
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_corner_case TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_fp8kv] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[skip_indexer] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[skip_indexer] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_fp8[latency_moe_deepgemm] TIMEOUT (90)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_corner_case TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_fp8kv] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[skip_indexer] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[skip_indexer] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_fp8[latency_moe_deepgemm] TIMEOUT (60)
 - condition:
 ranges:
 system_gpu_count:
