@@ -694,9 +694,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
     }
 
     slurmRunner = null
-    if (cluster.containerRuntime == ContainerRuntime.DOCKER) {
+    if (cluster.containerRuntime.toString() == "DOCKER") {
         slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
-    } else if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
+    } else if (cluster.containerRuntime.toString() == "ENROOT") {
         slurmRunner = runInEnrootOnNode(nodeName)
     } else {
         throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}")
@@ -799,7 +799,7 @@ def getPytestBaseCommandLine(
799799 " LLM_BACKEND_ROOT=${ llmSrc} /triton_backend" ,
800800 " LLM_MODELS_ROOT=${ MODEL_CACHE_DIR} " ,
801801 " MODEL_CACHE_DIR=${ MODEL_CACHE_DIR} " ,
802- " COLUMNS=200 " ,
802+ " COLUMNS=400 " ,
803803 extraInternalEnv,
804804 portEnvVars,
805805 pytestUtil,
@@ -860,11 +860,11 @@ def getMountListForSlurmTest(SlurmCluster cluster, boolean useSbatch = false)
     }
 
     // data/cache mounts
-    if (cluster.containerRuntime == ContainerRuntime.DOCKER) {
+    if (cluster.containerRuntime.toString() == "DOCKER") {
         mounts += [
             "/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro",
         ]
-    } else if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
+    } else if (cluster.containerRuntime.toString() == "ENROOT") {
         if (!cluster.scratchPath) {
             throw new Exception("Scratch path is not set for cluster: ${cluster.name}")
         }
@@ -922,6 +922,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
     def scriptInstallLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_install.sh"
     def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh"
+    def scriptBashUtilsLocalPath = "${llmSrcLocal}/jenkins/scripts/bash_utils.sh"
+    def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
     def testListPathNode = "${jobWorkspace}/${testList}.txt"
     def waivesListPathNode = "${jobWorkspace}/waives.txt"
     def outputPath = "${jobWorkspace}/job-output.log"
@@ -956,6 +958,14 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         scriptInstallPathNode,
         true
     )
+    Utils.exec(pipeline, script: "echo \"Script for Bash utilities:\" && cat ${scriptBashUtilsLocalPath}")
+    Utils.copyFileToRemoteHost(
+        pipeline,
+        remote,
+        scriptBashUtilsLocalPath,
+        scriptBashUtilsPathNode,
+        true
+    )
 
     // Generate Test List and Upload to Frontend Node
     def makoArgs = getMakoArgsFromStageName(stageName, true)
@@ -1040,7 +1050,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 
     def containerImageArg = container
     def srunPrologue = ""
-    if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
+    if (cluster.containerRuntime.toString() == "ENROOT") {
         def enrootImagePath = "${cluster.scratchPath}/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh"
         containerImageArg = enrootImagePath
 
@@ -1127,9 +1137,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         set -xEeuo pipefail
         trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
 
-        echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
-        echo \$SLURM_JOB_ID > "$jobWorkspace/slurm_job_id.txt"
-
+        echo "Starting Slurm job \$SLURM_JOB_ID on \$SLURM_NODELIST"
         export jobWorkspace=$jobWorkspace
         export tarName=$tarName
         export llmTarfile=$llmTarfile
@@ -1219,10 +1227,11 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         touch "${outputPath}"
         jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
         if [ -z "\$jobId" ]; then
-            echo "Error: Job submission failed, no job ID returned."
+            echo "Error: Slurm job submission failed, no job ID returned."
             exit 1
         fi
-        echo "Submitted job \$jobId"
+        echo "Submitted Slurm job \$jobId"
+        echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt"
         tail -f ${outputPath} &
         tailPid=\$!
         # Wait until sbatch job is done.
@@ -1232,9 +1241,28 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         # Kill tail -f process
         kill \$tailPid
         # Check if the job failed or not
-        sleep 5
-        STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}')
-        EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
+        sleep 10
+        # Retry getting status and exit code as sacct might be delayed
+        for i in {1..3}; do
+            STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}')
+            EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
+
+            if [ -n "\$STATUS" ] && [ -n "\$EXIT_CODE" ]; then
+                break
+            fi
+            echo "Waiting for sacct to update... attempt \$i"
+            sleep 10
+        done
+
+        if [ -z "\$EXIT_CODE" ]; then
+            echo "Error: Failed to get exit code from sacct after retries, defaulting to 1."
+            EXIT_CODE=1
+        fi
+        if [ -z "\$STATUS" ]; then
+            echo "Error: Failed to get status from sacct after retries, defaulting to UNKNOWN."
+            STATUS="UNKNOWN"
+        fi
+
         if [[ "\$STATUS" == "COMPLETED" && \$EXIT_CODE -eq 0 ]]; then
             echo "Pytest succeed in Slurm job \$jobId"
             echo "Status: \$STATUS | Exit_code \$EXIT_CODE"