38 changes: 23 additions & 15 deletions jenkins/L0_Test.groovy
@@ -696,13 +696,11 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
}

slurmRunner = null
echo "${stageName} Slurm partition timeout: ${partition.time}"
def partitionTimeout = partition?.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
if (cluster.containerRuntime.toString() == "DOCKER") {
echo "${stageName} partitionTimeout: ${partition.time}"
def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, partitionTimeout, true)
} else if (cluster.containerRuntime.toString() == "ENROOT") {
echo "${stageName} partitionTimeout: ${partition.time}"
def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
slurmRunner = runInEnrootOnNode(nodeName, partitionTimeout)
} else {
throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}")
@@ -940,6 +938,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def coverageConfigFile = "${jobWorkspace}/.coveragerc"

stage("[${stageName}] Initializing Test") {
println("Selected Cluster: ${cluster.name}")
// Create Job Workspace folder in Frontend Node
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mkdir -p ${jobWorkspace}\""), numRetries: 3)

Expand Down Expand Up @@ -1213,6 +1212,18 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
true
)

+ def filesToKeepWhenRetry = [
+ scriptRunPathNode,
+ scriptInstallPathNode,
+ scriptBashUtilsPathNode,
+ scriptLaunchPathNode,
+ scriptExecPathNode,
+ testListPathNode,
+ waivesListPathNode,
+ coverageConfigFile
+ ]
+ def findKeepWhenRetryArgs = filesToKeepWhenRetry.collect { " ! -name \"\$(basename \"${it}\")\"" }.join("")

def scriptExec = """#!/bin/bash
set -xEeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
@@ -1222,14 +1233,12 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
previous_job_id=\$(cat "${jobWorkspace}/slurm_job_id.txt")
echo "Found previous Slurm job ID: \${previous_job_id}"
scancel "\${previous_job_id}" || true
rm -rf "${jobWorkspace}/slurm_job_id.txt"
# Wait for 60 seconds to ensure the previous job is canceled
sleep 60
# Wait for 120 seconds to ensure the previous job is canceled
sleep 120
fi
rm -rf "${jobWorkspace}/results.xml"
rm -rf "${jobWorkspace}/report.csv"
rm -rf "${jobWorkspace}/unfinished_test.txt"
rm -rf "${outputPath}"

+ # Clean up workspace: remove all files/dirs not in the keep list
+ find "${jobWorkspace}" -maxdepth 1 -mindepth 1 ${findKeepWhenRetryArgs} -exec rm -rf {} +

touch "${outputPath}"
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
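
The retry path above swaps the individual rm -rf calls for a single find sweep that deletes everything at the top level of the job workspace except the files named in filesToKeepWhenRetry. A minimal standalone sketch of that keep-list pattern, with a hypothetical workspace path and file names standing in for the pipeline's interpolated values (the real script also resolves each path with basename at run time):

#!/bin/bash
# Illustrative sketch only: workspace path and kept file names are placeholders,
# not the values interpolated by the Jenkins pipeline.
set -euo pipefail

jobWorkspace="/tmp/job-workspace"   # hypothetical

# Files a retried job still needs: scripts, test lists, coverage config.
keep=(slurm_run.sh slurm_install.sh bash_utils.sh slurm_launch.sh slurm_exec.sh test_list.txt waives.txt .coveragerc)

# Build one "! -name <file>" predicate per kept file, mirroring the Groovy
# filesToKeepWhenRetry.collect { ... }.join("") expression.
findKeepArgs=()
for f in "${keep[@]}"; do
    findKeepArgs+=(! -name "$f")
done

# Remove every direct child of the workspace that is not in the keep list.
# -maxdepth 1 -mindepth 1 restricts the sweep to top-level entries.
find "$jobWorkspace" -maxdepth 1 -mindepth 1 "${findKeepArgs[@]}" -exec rm -rf {} +

Keeping the sweep to one find invocation means a retried job starts from a clean workspace without re-uploading the scripts it is about to run.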
@@ -1665,7 +1674,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
targetCloud = "kubernetes"
// DGX Spark requires a special setting for accessing the device.
// It has 128GB unified memory as per spec. Use half of the memory at the CPU side.
- if (type == "gb10x") {
+ if (type.contains("gb10x")) {
targetCloud = "nvks-sparks-cloud"
memorySize = "64Gi"
tolerations = """
@@ -1684,7 +1693,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod

// The following GPU types doesn't support dynamic driver flashing.
if (REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }) {
- if (type == "gb10x") {
+ if (type.contains("gb10x")) {
selectors = """
kubernetes.io/arch: ${arch}
kubernetes.io/os: linux
@@ -3238,8 +3247,7 @@ def launchTestJobs(pipeline, testFilter)

parallelJobs += parallelSlurmJobs

- // Try to match what are being tested on x86 H100_PCIe.
- // SBSA machines from the Blossom machine pool
+ // SBSA machines from the Blossom machine pool
SBSATestConfigs = [
"GH200-TensorRT-Post-Merge-1": ["gh200", "l0_gh200", 1, 1],
// DGX Spark is also named as GB10 Grace Blackwell Superchip.
13 changes: 10 additions & 3 deletions jenkins/scripts/slurm_install.sh
@@ -12,7 +12,14 @@ slurm_install_setup() {
cd $resourcePathNode
llmSrcNode=$resourcePathNode/TensorRT-LLM/src

+ # Use unique lock file for this job ID
+ lock_file="install_lock_job_${SLURM_JOB_ID:-local}_node_${SLURM_NODEID:-0}.lock"

if [ $SLURM_LOCALID -eq 0 ]; then
+ if [ -f "$lock_file" ]; then
+ rm -f "$lock_file"
+ fi

retry_command bash -c "wget -nv $llmTarfile && tar -zxf $tarName"
which python3
python3 --version
@@ -27,11 +34,11 @@ slurm_install_setup() {
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
echo "(Writing install lock) Current directory: $(pwd)"
- touch install_lock.lock
+ touch "$lock_file"
else
echo "(Waiting for install lock) Current directory: $(pwd)"
- while [ ! -f install_lock.lock ]; do
- sleep 5
+ while [ ! -f "$lock_file" ]; do
+ sleep 10
done
fi
}
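The install script now derives its lock name from the Slurm job and node IDs, so retried or co-located jobs sharing a working directory no longer collide on a single install_lock.lock, and any stale lock is cleared before setup starts. A minimal sketch of the same local-rank-0-installs, everyone-else-waits pattern, assuming the usual SLURM_* variables (the setup step itself is a placeholder):

#!/bin/bash
# Sketch of a per-job install lock; the echo below stands in for the real
# one-time setup (downloading and unpacking the source tarball).
set -euo pipefail

lock_file="install_lock_job_${SLURM_JOB_ID:-local}_node_${SLURM_NODEID:-0}.lock"

if [ "${SLURM_LOCALID:-0}" -eq 0 ]; then
    # Clear a stale lock left behind by an earlier attempt in the same directory.
    rm -f "$lock_file"

    echo "local rank 0: running one-time setup"   # placeholder for the real work

    # Publish completion; the other local ranks are polling for this file.
    touch "$lock_file"
else
    # Non-zero local ranks block until rank 0 signals that setup is done.
    while [ ! -f "$lock_file" ]; do
        sleep 10
    done
fi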
23 changes: 21 additions & 2 deletions jenkins/scripts/slurm_run.sh
@@ -64,8 +64,8 @@ pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytest
if [ $SLURM_PROCID -eq 0 ]; then
sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
else
- # Sleep 10 seconds to wait for the coverage config file to be saved
- sleep 10
+ # Sleep 30 seconds to wait for the coverage config file to be saved
+ sleep 30
fi

containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
@@ -108,6 +108,25 @@ eval $pytestCommand
pytest_exit_code=$?
echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code"

+ # DEBUG: Diagnose intermittent "unrecognized arguments" failure (Exit Code 4)
+ # Remove this after the issue is resolved
+ if [ $pytest_exit_code -eq 4 ]; then
+ echo "DEBUG: Pytest failed with usage error (exit code 4)"
+ echo "DEBUG: Directory state at $(pwd):"
+ ls -l
+ echo "DEBUG: Directory state at $llmSrcNode/tests/integration/defs:"
+ ls -l $llmSrcNode/tests/integration/defs
+
+ echo "DEBUG: conftest.py content:"
+ md5sum $llmSrcNode/tests/integration/defs/conftest.py
+
+ echo "DEBUG: pytest.ini content:"
+ md5sum $llmSrcNode/tests/integration/defs/pytest.ini
+
+ echo "DEBUG: Check importability of conftest.py"
+ python3 -c "import sys; sys.path.insert(0, '.'); import conftest; print('DEBUG: conftest imported successfully')"
+ fi

if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then
if [[ "$stageName" == *PyTorch* ]]; then
basePerfFilename="base_perf_pytorch.csv"
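The new debug block fires only when pytest exits with status 4, which in pytest's exit-code scheme is a usage error such as unrecognized command-line arguments, distinct from ordinary test failures. A small reference sketch of those codes (based on pytest's documented ExitCode values, not code from this repository):

#!/bin/bash
# Reference sketch: map a pytest exit status to its documented meaning.
describe_pytest_exit() {
    case "$1" in
        0) echo "all tests passed" ;;
        1) echo "some tests failed" ;;
        2) echo "execution interrupted by the user" ;;
        3) echo "internal pytest error" ;;
        4) echo "usage error (bad or unrecognized command-line arguments)" ;;
        5) echo "no tests were collected" ;;
        *) echo "unexpected exit code: $1" ;;
    esac
}

# Example usage with a hypothetical test directory; capture the status even
# though the command may fail.
rc=0
pytest tests/ || rc=$?
echo "pytest exited with $rc: $(describe_pytest_exit "$rc")"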
3 changes: 1 addition & 2 deletions tests/integration/defs/conftest.py
@@ -2689,8 +2689,7 @@ def get_gpu_memory_wo_pynvml():
import psutil

logger.warning(
f"\nWarning: pynvml not available, using fallback commands for memory monitoring"
)
f"pynvml not available, using fallback commands for memory monitoring")

gpu_memory = {}
system_total_mb = 0