diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 42322bdb05c..09682457efa 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -696,13 +696,11 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
     }
     slurmRunner = null
+    echo "${stageName} Slurm partition timeout: ${partition.time}"
+    def partitionTimeout = partition?.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
     if (cluster.containerRuntime.toString() == "DOCKER") {
-        echo "${stageName} partitionTimeout: ${partition.time}"
-        def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
         slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, partitionTimeout, true)
     } else if (cluster.containerRuntime.toString() == "ENROOT") {
-        echo "${stageName} partitionTimeout: ${partition.time}"
-        def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
         slurmRunner = runInEnrootOnNode(nodeName, partitionTimeout)
     } else {
         throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}")
     }
@@ -940,6 +938,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     def coverageConfigFile = "${jobWorkspace}/.coveragerc"
 
     stage("[${stageName}] Initializing Test") {
+        println("Selected Cluster: ${cluster.name}")
         // Create Job Workspace folder in Frontend Node
         Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mkdir -p ${jobWorkspace}\""), numRetries: 3)
 
@@ -1213,6 +1212,18 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         true
     )
 
+    def filesToKeepWhenRetry = [
+        scriptRunPathNode,
+        scriptInstallPathNode,
+        scriptBashUtilsPathNode,
+        scriptLaunchPathNode,
+        scriptExecPathNode,
+        testListPathNode,
+        waivesListPathNode,
+        coverageConfigFile
+    ]
+    def findKeepWhenRetryArgs = filesToKeepWhenRetry.collect { " ! -name \"\$(basename \"${it}\")\"" }.join("")
+
     def scriptExec = """#!/bin/bash
         set -xEeuo pipefail
         trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
@@ -1222,14 +1233,12 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
            previous_job_id=\$(cat "${jobWorkspace}/slurm_job_id.txt")
            echo "Found previous Slurm job ID: \${previous_job_id}"
            scancel "\${previous_job_id}" || true
-           rm -rf "${jobWorkspace}/slurm_job_id.txt"
-           # Wait for 60 seconds to ensure the previous job is canceled
-           sleep 60
+           # Wait for 120 seconds to ensure the previous job is canceled
+           sleep 120
         fi
-        rm -rf "${jobWorkspace}/results.xml"
-        rm -rf "${jobWorkspace}/report.csv"
-        rm -rf "${jobWorkspace}/unfinished_test.txt"
-        rm -rf "${outputPath}"
+
+        # Clean up workspace: remove all files/dirs not in the keep list
+        find "${jobWorkspace}" -maxdepth 1 -mindepth 1 ${findKeepWhenRetryArgs} -exec rm -rf {} +
         touch "${outputPath}"
         jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
 
@@ -1665,7 +1674,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
         targetCloud = "kubernetes"
         // DGX Spark requires a special setting for accessing the device.
         // It has 128GB unified memory as per spec. Use half of the memory at the CPU side.
-        if (type == "gb10x") {
+        if (type.contains("gb10x")) {
             targetCloud = "nvks-sparks-cloud"
             memorySize = "64Gi"
             tolerations = """
@@ -1684,7 +1693,7 @@
 
     // The following GPU types doesn't support dynamic driver flashing.
     if (REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }) {
-        if (type == "gb10x") {
+        if (type.contains("gb10x")) {
            selectors = """
                kubernetes.io/arch: ${arch}
                kubernetes.io/os: linux
@@ -3238,8 +3247,7 @@ def launchTestJobs(pipeline, testFilter)
 
     parallelJobs += parallelSlurmJobs
 
-    // Try to match what are being tested on x86 H100_PCIe.
-// SBSA machines from the Blossom machine pool
+    // SBSA machines from the Blossom machine pool
     SBSATestConfigs = [
         "GH200-TensorRT-Post-Merge-1": ["gh200", "l0_gh200", 1, 1],
         // DGX Spark is also named as GB10 Grace Blackwell Superchip.
diff --git a/jenkins/scripts/slurm_install.sh b/jenkins/scripts/slurm_install.sh
index bd312180e76..cb1ec4bc83c 100644
--- a/jenkins/scripts/slurm_install.sh
+++ b/jenkins/scripts/slurm_install.sh
@@ -12,7 +12,14 @@ slurm_install_setup() {
     cd $resourcePathNode
     llmSrcNode=$resourcePathNode/TensorRT-LLM/src
 
+    # Use unique lock file for this job ID
+    lock_file="install_lock_job_${SLURM_JOB_ID:-local}_node_${SLURM_NODEID:-0}.lock"
+
     if [ $SLURM_LOCALID -eq 0 ]; then
+        if [ -f "$lock_file" ]; then
+            rm -f "$lock_file"
+        fi
+
         retry_command bash -c "wget -nv $llmTarfile && tar -zxf $tarName"
         which python3
         python3 --version
@@ -27,11 +34,11 @@ slurm_install_setup() {
         hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
         echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
         echo "(Writing install lock) Current directory: $(pwd)"
-        touch install_lock.lock
+        touch "$lock_file"
     else
         echo "(Waiting for install lock) Current directory: $(pwd)"
-        while [ ! -f install_lock.lock ]; do
-            sleep 5
+        while [ ! -f "$lock_file" ]; do
+            sleep 10
         done
     fi
 }
diff --git a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh
index 6c26a0347c9..0c47d38f165 100755
--- a/jenkins/scripts/slurm_run.sh
+++ b/jenkins/scripts/slurm_run.sh
@@ -64,8 +64,8 @@ pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytestCommand")
 if [ $SLURM_PROCID -eq 0 ]; then
     sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
 else
-    # Sleep 10 seconds to wait for the coverage config file to be saved
-    sleep 10
+    # Sleep 30 seconds to wait for the coverage config file to be saved
+    sleep 30
 fi
 
 containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
@@ -108,6 +108,25 @@
 eval $pytestCommand
 pytest_exit_code=$?
 echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code"
+# DEBUG: Diagnose intermittent "unrecognized arguments" failure (Exit Code 4)
+# Remove this after the issue is resolved
+if [ $pytest_exit_code -eq 4 ]; then
+    echo "DEBUG: Pytest failed with usage error (exit code 4)"
+    echo "DEBUG: Directory state at $(pwd):"
+    ls -l
+    echo "DEBUG: Directory state at $llmSrcNode/tests/integration/defs:"
+    ls -l $llmSrcNode/tests/integration/defs
+
+    echo "DEBUG: conftest.py content:"
+    md5sum $llmSrcNode/tests/integration/defs/conftest.py
+
+    echo "DEBUG: pytest.ini content:"
+    md5sum $llmSrcNode/tests/integration/defs/pytest.ini
+
+    echo "DEBUG: Check importability of conftest.py"
+    python3 -c "import sys; sys.path.insert(0, '.'); import conftest; print('DEBUG: conftest imported successfully')"
+fi
+
 if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then
     if [[ "$stageName" == *PyTorch* ]]; then
         basePerfFilename="base_perf_pytorch.csv"
diff --git a/tests/integration/defs/conftest.py b/tests/integration/defs/conftest.py
index 1f187ce4e0e..c06b0d18bc2 100644
--- a/tests/integration/defs/conftest.py
+++ b/tests/integration/defs/conftest.py
@@ -2689,8 +2689,7 @@ def get_gpu_memory_wo_pynvml():
     import psutil
 
     logger.warning(
-        f"\nWarning: pynvml not available, using fallback commands for memory monitoring"
-    )
+        f"pynvml not available, using fallback commands for memory monitoring")
     gpu_memory = {}
     system_total_mb = 0
 