From 5722ebbbfeb84befa43c941b27d872210b52ca3f Mon Sep 17 00:00:00 2001
From: Yanchao Lu
Date: Sat, 27 Dec 2025 20:53:09 +0800
Subject: [PATCH 1/3] [None][ci] Move remaining DGX-B200 tests to LBD

Signed-off-by: Yanchao Lu
---
 jenkins/BuildDockerImage.groovy                 |  4 +-
 jenkins/L0_MergeRequest.groovy                  |  3 +-
 jenkins/L0_Test.groovy                          | 27 ++++++------
 jenkins/scripts/slurm_install.sh                |  4 +-
 jenkins/scripts/slurm_run.sh                    |  6 +--
 .../test_lists/test-db/l0_dgx_b200.yml          | 42 +++++++++----------
 6 files changed, 45 insertions(+), 41 deletions(-)

diff --git a/jenkins/BuildDockerImage.groovy b/jenkins/BuildDockerImage.groovy
index 5049965f5e2..c3fb2fac3c5 100644
--- a/jenkins/BuildDockerImage.groovy
+++ b/jenkins/BuildDockerImage.groovy
@@ -372,7 +372,7 @@ def buildImage(config, imageKeyToTag)
                 IMAGE_WITH_TAG=${imageWithTag} \
                 STAGE=${dockerfileStage} \
                 BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} ${buildWheelArgs}
-            """, sleepInSecs: randomSleep, numRetries: 2, shortCommondRunTimeMax: 7200)
+            """, sleepInSecs: randomSleep, numRetries: 6, shortCommondRunTimeMax: 7200)
         }
         if (target == "ngc-release") {
             imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag
@@ -726,7 +726,7 @@ pipeline {
                     cmd += "--image "
                     cmd += imageKeyToTag.values().join(" ")
                     withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) {
-                        trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 6, shortCommondRunTimeMax: 7200)
+                        trtllm_utils.llmExecStepWithRetry(this, script: cmd, sleepInSecs: 600, numRetries: 6, shortCommondRunTimeMax: 7200)
                     }
                 }
             }
diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy
index a8e5789589e..3e81b22a099 100644
--- a/jenkins/L0_MergeRequest.groovy
+++ b/jenkins/L0_MergeRequest.groovy
@@ -1241,7 +1241,8 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
     def dockerBuildJob = [
         "Build-Docker-Images": {
             script {
-                stage("[Build-Docker-Images] Remote Run") {
+                def testStageName = "[Build-Docker-Images] ${env.localJobCredentials ? "Remote Run" : "Run"}"
+                stage(testStageName) {
                     def branch = env.gitlabBranch ? env.gitlabBranch : "main"
                     if (globalVars[GITHUB_PR_API_URL]) {
                         branch = "github-pr-" + globalVars[GITHUB_PR_API_URL].split('/').last()
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index ce5842d7c21..44fb21bae59 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -924,7 +924,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh"
     def testListPathNode = "${jobWorkspace}/${testList}.txt"
     def waivesListPathNode = "${jobWorkspace}/waives.txt"
-    def outputPath = "${jobWorkspace}/job-output.log"
+    def outputPath = "${jobWorkspace}/job-output-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}.log"
     def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
     def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
     def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
@@ -939,7 +939,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
         sh "cd ${llmPath} && tar -zxf ${BUILD_CONFIGS[config][TARNAME]}"
 
-        Utils.exec(pipeline, script: "echo \"Script to trigger Slurm srun job: \" && cat ${scriptRunLocalPath}")
+        Utils.exec(pipeline, script: "echo \"Script for Slurm srun job to submit: \" && cat ${scriptRunLocalPath}")
         Utils.copyFileToRemoteHost(
             pipeline,
             remote,
@@ -948,7 +948,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
             true
         )
 
-        Utils.exec(pipeline, script: "echo \"Script to install environment: \" && cat ${scriptInstallLocalPath}")
+        Utils.exec(pipeline, script: "echo \"Script to install TensorRT LLM dependencies: \" && cat ${scriptInstallLocalPath}")
         Utils.copyFileToRemoteHost(
             pipeline,
             remote,
@@ -1093,7 +1093,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         srunArgs = [
             "--container-name=multi_node_test-\${SLURM_JOB_ID}",
             "--container-image=$containerImageArg",
-            "--container-workdir=/home/svc_tensorrt/bloom/scripts",
+            "--container-workdir=$jobWorkspace",
             "--container-mounts=$mounts",
             "--container-env=NVIDIA_IMEX_CHANNELS"
         ]
@@ -1115,6 +1115,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
             "export ${varName}=\"${escapedValue}\""
         }.join('\n')
 
+        // Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
         def scriptLaunchPrefix = """#!/bin/bash
         #SBATCH ${exemptionComment}
         #SBATCH --output=${outputPath}
@@ -1122,8 +1123,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         ${taskArgs.collect { "#SBATCH $it" }.join('\n')}
         #SBATCH ${partition.additionalArgs}
         ${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
         echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
+        echo "\$SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
-        set -Eeuo pipefail
+        set -xEeuo pipefail
         trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
         export jobWorkspace=$jobWorkspace
         export tarName=$tarName
@@ -1156,8 +1158,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix)
         pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" "))
 
-        Utils.exec(pipeline, script: "echo \"Script launch prefix: \" && cat ${scriptLaunchPrefixPathLocal}")
-        Utils.exec(pipeline, script: "echo \"Srun args content: \" && cat ${scriptLaunchSrunArgsPathLocal}")
+        Utils.exec(pipeline, script: "echo \"Script for Slurm sbatch job prefix: \" && cat ${scriptLaunchPrefixPathLocal}")
+        Utils.exec(pipeline, script: "echo \"Script for Slurm srun job args: \" && cat ${scriptLaunchSrunArgsPathLocal}")
 
         // Output is the corresponding scriptLaunchPathLocal script under the disaggMode
         sh """
@@ -1184,7 +1186,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
             pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
         }
 
-        Utils.exec(pipeline, script: "echo \"Script to trigger Slurm sbatch job: \" && cat ${scriptLaunchPathLocal}")
+        Utils.exec(pipeline, script: "echo \"Script for Slurm sbatch job to submit: \" && cat ${scriptLaunchPathLocal}")
         Utils.copyFileToRemoteHost(
             pipeline,
             remote,
@@ -1194,8 +1196,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         )
         def scriptExec = """#!/bin/bash
-        set -Eeuo pipefail
+        set -xEeuo pipefail
         trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
+        rm -rf ${outputPath}
         touch ${outputPath}
         jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
         if [ -z "\$jobId" ]; then
             echo "Error: Job submission failed, no job ID returned."
@@ -3124,11 +3127,11 @@ def launchTestJobs(pipeline, testFilter)
        "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
        "DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
        "B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
-       "DGX_B200-4_GPUs-PyTorch-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4],
+       "DGX_B200-4_GPUs-PyTorch-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
        "DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
        "DGX_B200-8_GPUs-PyTorch-1": ["b200-x8-lbd", "l0_dgx_b200", 1, 1, 8, 1, true],
-       "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 2, 4, 1, true],
-       "DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-trtllm", "l0_dgx_b200", 2, 2, 4, 1, true],
+       "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 2, 4, 1, true],
+       "DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true],
        "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
        "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
        // Perf sanity post merge test
diff --git a/jenkins/scripts/slurm_install.sh b/jenkins/scripts/slurm_install.sh
index 00fcd2b0935..c77d6a9220d 100644
--- a/jenkins/scripts/slurm_install.sh
+++ b/jenkins/scripts/slurm_install.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 # Set up error handling
-set -Eeuo pipefail
+set -xEeuo pipefail
 trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR
 
 slurm_install_setup() {
@@ -23,8 +23,10 @@ slurm_install_setup() {
         gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
         hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
         echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
+        echo "(Writing install lock) Current directory: $(pwd)"
         touch install_lock.lock
     else
+        echo "(Waiting for install lock) Current directory: $(pwd)"
         while [ ! -f install_lock.lock ]; do
             sleep 5
         done
diff --git a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh
index e86092b7ea2..e75826ced73 100755
--- a/jenkins/scripts/slurm_run.sh
+++ b/jenkins/scripts/slurm_run.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 # Set up error handling
-set -Eeuo pipefail
+set -xEeuo pipefail
 trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR
 
 cd $resourcePathNode
@@ -29,10 +29,8 @@ set_value_in_command() {
     echo "$result"
 }
 
-# Only the first process will save the job ID and set the git config
+# Only the first process will set the git config
 if [ $SLURM_PROCID -eq 0 ]; then
-    # Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
-    echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
     # Update HOME/.gitconfig
     if ! git config --global --get-all safe.directory | grep -Fxq "*"; then
         git config --global --add safe.directory "*"
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
index fb7f7acaec5..c691acc1fef 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -24,7 +24,7 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] ISOLATION
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp_tp4]
@@ -66,17 +66,17 @@ l0_dgx_b200:
       backend: pytorch
       orchestrator: mpi
   tests:
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_bs8_mtp] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_mtp1] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_mtp1] TIMEOUT (180)
-  - accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] TIMEOUT (360)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_bs8_mtp] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_mtp1] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_mtp1] TIMEOUT (60)
+  - accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] TIMEOUT (60)
 - condition:
     ranges:
       system_gpu_count:
@@ -92,15 +92,15 @@ l0_dgx_b200:
       backend: pytorch
      orchestrator: mpi
   tests:
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_corner_case TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_fp8kv] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[skip_indexer] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[skip_indexer] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_fp8[latency_moe_deepgemm] TIMEOUT (90)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_corner_case TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_fp8kv] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[skip_indexer] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[skip_indexer] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv] TIMEOUT (60)
+  - accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_fp8[latency_moe_deepgemm] TIMEOUT (60)
 - condition:
     ranges:
       system_gpu_count:

From 89f95d09c4779a0c30450fa49a81df2ddcf59ecf Mon Sep 17 00:00:00 2001
From: Yanchao Lu
Date: Sat, 27 Dec 2025 21:34:07 +0800
Subject: [PATCH 2/3] Further fixes

Signed-off-by: Yanchao Lu
---
 jenkins/L0_Test.groovy | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 44fb21bae59..e6579762a8b 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -461,7 +461,7 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
     def cleanupCommands = [
         "rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
         "rm -rf ${jobWorkspace} || true",
-    ].join(" && ")
+    ].join(" ; ")
     Utils.exec(
         pipeline,
         script: Utils.sshUserCmd(
@@ -511,7 +511,7 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
     def cleanupCommands = [
         "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-${entrypoint} || true",
         "rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
-    ].join(" && ")
+    ].join(" ; ")
     Utils.exec(
         pipeline,
         script: Utils.sshUserCmd(
@@ -924,7 +924,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh"
     def testListPathNode = "${jobWorkspace}/${testList}.txt"
     def waivesListPathNode = "${jobWorkspace}/waives.txt"
-    def outputPath = "${jobWorkspace}/job-output-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}.log"
+    def outputPath = "${jobWorkspace}/job-output.log"
     def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
     def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
     def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
@@ -1122,11 +1122,14 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         ${taskArgs.collect { "#SBATCH $it" }.join('\n')}
         #SBATCH ${partition.additionalArgs}
         ${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
-        echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
-        echo "\$SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
+        # SBATCH directives must appear before any executable commands.
         set -xEeuo pipefail
         trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
+
+        echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
+        echo \$SLURM_JOB_ID > "$jobWorkspace/slurm_job_id.txt"
+
         export jobWorkspace=$jobWorkspace
         export tarName=$tarName
         export llmTarfile=$llmTarfile
@@ -1198,8 +1201,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         def scriptExec = """#!/bin/bash
         set -xEeuo pipefail
         trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
-        rm -rf ${outputPath}
-        touch ${outputPath}
+
+        # Clean up previous job intermediate files so that retry can work
+        if [ -f "${jobWorkspace}/slurm_job_id.txt" ]; then
+            previous_job_id=\$(cat "${jobWorkspace}/slurm_job_id.txt")
+            echo "Found previous Slurm job ID: \${previous_job_id}"
+            scancel "\${previous_job_id}" || true
+            rm -rf "${jobWorkspace}/slurm_job_id.txt"
+            # Wait for 60 seconds to ensure the previous job is canceled
+            sleep 60
+        fi
+        rm -rf "${jobWorkspace}/results.xml"
+        rm -rf "${jobWorkspace}/report.csv"
+        rm -rf "${jobWorkspace}/unfinished_test.txt"
+        rm -rf "${outputPath}"
+
+        touch "${outputPath}"
         jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
         if [ -z "\$jobId" ]; then
             echo "Error: Job submission failed, no job ID returned."

From c97a9d2c36f38323570dbdee24f2394c6d38f4dc Mon Sep 17 00:00:00 2001
From: Yanchao Lu
Date: Sun, 28 Dec 2025 12:49:19 +0800
Subject: [PATCH 3/3] Temporarily disable dmesg to reduce the log size

Signed-off-by: Yanchao Lu
---
 jenkins/L0_Test.groovy | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index e6579762a8b..74a3c92ed82 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -1480,7 +1480,8 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
         if (stageIsInterrupted) {
             echo "Stage is interrupted, skip to upload test result."
         } else {
-            sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'
+            // Temporarily disable to reduce the log size
+            // sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'
             if (noResultIfSuccess && !stageIsFailed) {
                 // Clean up the workspace
                 sh """
@@ -2623,7 +2624,8 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
         def containerPortNum = GlobalState.PORT_SECTION_SIZE
 
         // Some clusters do not allow dmesg -C so we add || true
-        sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'
+        // Temporarily disable to reduce the log size
+        // sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'
         def pytestCommand = getPytestBaseCommandLine(
             llmSrc,
             stageName,