Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions jenkins/BuildDockerImage.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ def buildImage(config, imageKeyToTag)
IMAGE_WITH_TAG=${imageWithTag} \
STAGE=${dockerfileStage} \
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} ${buildWheelArgs}
""", sleepInSecs: randomSleep, numRetries: 2, shortCommondRunTimeMax: 7200)
""", sleepInSecs: randomSleep, numRetries: 6, shortCommondRunTimeMax: 7200)
}
if (target == "ngc-release") {
imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag
Expand Down Expand Up @@ -726,7 +726,7 @@ pipeline {
cmd += "--image "
cmd += imageKeyToTag.values().join(" ")
withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) {
trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 6, shortCommondRunTimeMax: 7200)
trtllm_utils.llmExecStepWithRetry(this, script: cmd, sleepInSecs: 600, numRetries: 6, shortCommondRunTimeMax: 7200)
}
}
}
Expand Down
3 changes: 2 additions & 1 deletion jenkins/L0_MergeRequest.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -1241,7 +1241,8 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
def dockerBuildJob = [
"Build-Docker-Images": {
script {
stage("[Build-Docker-Images] Remote Run") {
def testStageName = "[Build-Docker-Images] ${env.localJobCredentials ? "Remote Run" : "Run"}"
stage(testStageName) {
def branch = env.gitlabBranch ? env.gitlabBranch : "main"
if (globalVars[GITHUB_PR_API_URL]) {
branch = "github-pr-" + globalVars[GITHUB_PR_API_URL].split('/').last()
Expand Down
56 changes: 39 additions & 17 deletions jenkins/L0_Test.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
def cleanupCommands = [
"rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
"rm -rf ${jobWorkspace} || true",
].join(" && ")
].join(" ; ")
Utils.exec(
pipeline,
script: Utils.sshUserCmd(
Expand Down Expand Up @@ -511,7 +511,7 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
def cleanupCommands = [
"rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-${entrypoint} || true",
"rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
].join(" && ")
].join(" ; ")
Utils.exec(
pipeline,
script: Utils.sshUserCmd(
Expand Down Expand Up @@ -939,7 +939,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
sh "cd ${llmPath} && tar -zxf ${BUILD_CONFIGS[config][TARNAME]}"

Utils.exec(pipeline, script: "echo \"Script to trigger Slurm srun job: \" && cat ${scriptRunLocalPath}")
Utils.exec(pipeline, script: "echo \"Script for Slurm srun job to submit: \" && cat ${scriptRunLocalPath}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
Expand All @@ -948,7 +948,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
true
)

Utils.exec(pipeline, script: "echo \"Script to install environment: \" && cat ${scriptInstallLocalPath}")
Utils.exec(pipeline, script: "echo \"Script to install TensorRT LLM dependencies: \" && cat ${scriptInstallLocalPath}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
Expand Down Expand Up @@ -1093,7 +1093,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
srunArgs = [
"--container-name=multi_node_test-\${SLURM_JOB_ID}",
"--container-image=$containerImageArg",
"--container-workdir=/home/svc_tensorrt/bloom/scripts",
"--container-workdir=$jobWorkspace",
"--container-mounts=$mounts",
"--container-env=NVIDIA_IMEX_CHANNELS"
]
Expand All @@ -1115,16 +1115,21 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
"export ${varName}=\"${escapedValue}\""
}.join('\n')

// Save the job ID in $jobWorkspace/slurm_job_id.txt so that a later job can retrieve it
def scriptLaunchPrefix = """#!/bin/bash
#SBATCH ${exemptionComment}
#SBATCH --output=${outputPath}
${taskArgs.collect { "#SBATCH $it" }.join('\n')}
#SBATCH ${partition.additionalArgs}
${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"

set -Eeuo pipefail
# SBATCH directives must appear before any executable commands.
set -xEeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR

echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
echo \$SLURM_JOB_ID > "$jobWorkspace/slurm_job_id.txt"

export jobWorkspace=$jobWorkspace
export tarName=$tarName
export llmTarfile=$llmTarfile
Expand Down Expand Up @@ -1156,8 +1161,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG

pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix)
pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" "))
Utils.exec(pipeline, script: "echo \"Script launch prefix: \" && cat ${scriptLaunchPrefixPathLocal}")
Utils.exec(pipeline, script: "echo \"Srun args content: \" && cat ${scriptLaunchSrunArgsPathLocal}")
Utils.exec(pipeline, script: "echo \"Script for Slurm sbatch job prefix: \" && cat ${scriptLaunchPrefixPathLocal}")
Utils.exec(pipeline, script: "echo \"Script for Slurm srun job args: \" && cat ${scriptLaunchSrunArgsPathLocal}")

// Output is the corresponding scriptLaunchPathLocal script under the disaggMode
sh """
Expand All @@ -1184,7 +1189,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
}

Utils.exec(pipeline, script: "echo \"Script to trigger Slurm sbatch job: \" && cat ${scriptLaunchPathLocal}")
Utils.exec(pipeline, script: "echo \"Script for Slurm sbatch job to submit: \" && cat ${scriptLaunchPathLocal}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
Expand All @@ -1194,9 +1199,24 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
)

def scriptExec = """#!/bin/bash
set -Eeuo pipefail
set -xEeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
touch ${outputPath}

# Clean up previous job intermediate files so that retry can work
if [ -f "${jobWorkspace}/slurm_job_id.txt" ]; then
previous_job_id=\$(cat "${jobWorkspace}/slurm_job_id.txt")
echo "Found previous Slurm job ID: \${previous_job_id}"
scancel "\${previous_job_id}" || true
rm -rf "${jobWorkspace}/slurm_job_id.txt"
# Wait for 60 seconds to ensure the previous job is canceled
sleep 60
fi
rm -rf "${jobWorkspace}/results.xml"
rm -rf "${jobWorkspace}/report.csv"
rm -rf "${jobWorkspace}/unfinished_test.txt"
rm -rf "${outputPath}"

touch "${outputPath}"
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
if [ -z "\$jobId" ]; then
echo "Error: Job submission failed, no job ID returned."
Expand Down Expand Up @@ -1460,7 +1480,8 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
if (stageIsInterrupted) {
echo "Stage is interrupted, skip to upload test result."
} else {
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'
// Temporarily disabled to reduce the log size
// sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'
if (noResultIfSuccess && !stageIsFailed) {
// Clean up the workspace
sh """
Expand Down Expand Up @@ -2603,7 +2624,8 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
def containerPortNum = GlobalState.PORT_SECTION_SIZE

// Some clusters do not allow dmesg -C so we add || true
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'
// Temporarily disabled to reduce the log size
// sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'
def pytestCommand = getPytestBaseCommandLine(
llmSrc,
stageName,
Expand Down Expand Up @@ -3124,11 +3146,11 @@ def launchTestJobs(pipeline, testFilter)
"DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
"DGX_B200-4_GPUs-PyTorch-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4],
"DGX_B200-4_GPUs-PyTorch-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
"DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
"DGX_B200-8_GPUs-PyTorch-1": ["b200-x8-lbd", "l0_dgx_b200", 1, 1, 8, 1, true],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 2, 4, 1, true],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-trtllm", "l0_dgx_b200", 2, 2, 4, 1, true],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 2, 4, 1, true],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
// Perf sanity post merge test
Expand Down
4 changes: 3 additions & 1 deletion jenkins/scripts/slurm_install.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

# Set up error handling
set -Eeuo pipefail
set -xEeuo pipefail
trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR

slurm_install_setup() {
Expand All @@ -23,8 +23,10 @@ slurm_install_setup() {
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
echo "(Writing install lock) Current directory: $(pwd)"
touch install_lock.lock
else
echo "(Waiting for install lock) Current directory: $(pwd)"
while [ ! -f install_lock.lock ]; do
sleep 5
done
Expand Down
6 changes: 2 additions & 4 deletions jenkins/scripts/slurm_run.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

# Set up error handling
set -Eeuo pipefail
set -xEeuo pipefail
trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR

cd $resourcePathNode
Expand Down Expand Up @@ -29,10 +29,8 @@ set_value_in_command() {
echo "$result"
}

# Only the first process will save the job ID and set the git config
# Only the first process will set the git config
if [ $SLURM_PROCID -eq 0 ]; then
# Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
# Update HOME/.gitconfig
if ! git config --global --get-all safe.directory | grep -Fxq "*"; then
git config --global --add safe.directory "*"
Expand Down
42 changes: 21 additions & 21 deletions tests/integration/test_lists/test-db/l0_dgx_b200.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ l0_dgx_b200:
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] ISOLATION
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True]
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp_tp4]
Expand Down Expand Up @@ -66,17 +66,17 @@ l0_dgx_b200:
backend: pytorch
orchestrator: mpi
tests:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_bs8_mtp] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_mtp1] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_mtp1] TIMEOUT (180)
- accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] TIMEOUT (360)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_bs8_mtp] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_mtp1] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_mtp1] TIMEOUT (60)
- accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] TIMEOUT (60)
- condition:
ranges:
system_gpu_count:
Expand All @@ -92,15 +92,15 @@ l0_dgx_b200:
backend: pytorch
orchestrator: mpi
tests:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_corner_case TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_fp8kv] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[skip_indexer] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[skip_indexer] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_fp8[latency_moe_deepgemm] TIMEOUT (90)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_corner_case TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_fp8kv] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[skip_indexer] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[skip_indexer] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_fp8[latency_moe_deepgemm] TIMEOUT (60)
- condition:
ranges:
system_gpu_count:
Expand Down