38 changes: 23 additions & 15 deletions jenkins/L0_Test.groovy
@@ -696,13 +696,11 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
}

slurmRunner = null
echo "${stageName} Slurm partition timeout: ${partition.time}"
def partitionTimeout = partition?.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
if (cluster.containerRuntime.toString() == "DOCKER") {
echo "${stageName} partitionTimeout: ${partition.time}"
def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, partitionTimeout, true)
} else if (cluster.containerRuntime.toString() == "ENROOT") {
echo "${stageName} partitionTimeout: ${partition.time}"
def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
slurmRunner = runInEnrootOnNode(nodeName, partitionTimeout)
} else {
throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}")
@@ -940,6 +938,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def coverageConfigFile = "${jobWorkspace}/.coveragerc"

stage("[${stageName}] Initializing Test") {
println("Selected Cluster: ${cluster.name}")
// Create Job Workspace folder in Frontend Node
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mkdir -p ${jobWorkspace}\""), numRetries: 3)

Expand Down Expand Up @@ -1213,6 +1212,18 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
true
)

+ def filesToKeepWhenRetry = [
+ scriptRunPathNode,
+ scriptInstallPathNode,
+ scriptBashUtilsPathNode,
+ scriptLaunchPathNode,
+ scriptExecPathNode,
+ testListPathNode,
+ waivesListPathNode,
+ coverageConfigFile
+ ]
+ def findKeepWhenRetryArgs = filesToKeepWhenRetry.collect { " ! -name \"\$(basename \"${it}\")\"" }.join("")

def scriptExec = """#!/bin/bash
set -xEeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
@@ -1222,14 +1233,12 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
previous_job_id=\$(cat "${jobWorkspace}/slurm_job_id.txt")
echo "Found previous Slurm job ID: \${previous_job_id}"
scancel "\${previous_job_id}" || true
rm -rf "${jobWorkspace}/slurm_job_id.txt"
# Wait for 60 seconds to ensure the previous job is canceled
sleep 60
# Wait for 120 seconds to ensure the previous job is canceled
sleep 120
fi
rm -rf "${jobWorkspace}/results.xml"
rm -rf "${jobWorkspace}/report.csv"
rm -rf "${jobWorkspace}/unfinished_test.txt"
rm -rf "${outputPath}"

+ # Clean up workspace: remove all files/dirs not in the keep list
+ find "${jobWorkspace}" -maxdepth 1 -mindepth 1 ${findKeepWhenRetryArgs} -exec rm -rf {} +

touch "${outputPath}"
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
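
The retry path above swaps the individual rm -rf calls for a single find sweep that deletes everything at the top level of the job workspace except the files named in filesToKeepWhenRetry. A minimal standalone sketch of that keep-list pattern, with a hypothetical workspace path and file names standing in for the pipeline's interpolated values (the real script also resolves each path with basename at run time):

#!/bin/bash
# Illustrative sketch only: workspace path and kept file names are placeholders,
# not the values interpolated by the Jenkins pipeline.
set -euo pipefail

jobWorkspace="/tmp/job-workspace"   # hypothetical

# Files a retried job still needs: scripts, test lists, coverage config.
keep=(slurm_run.sh slurm_install.sh bash_utils.sh slurm_launch.sh slurm_exec.sh test_list.txt waives.txt .coveragerc)

# Build one "! -name <file>" predicate per kept file, mirroring the Groovy
# filesToKeepWhenRetry.collect { ... }.join("") expression.
findKeepArgs=()
for f in "${keep[@]}"; do
    findKeepArgs+=(! -name "$f")
done

# Remove every direct child of the workspace that is not in the keep list.
# -maxdepth 1 -mindepth 1 restricts the sweep to top-level entries.
find "$jobWorkspace" -maxdepth 1 -mindepth 1 "${findKeepArgs[@]}" -exec rm -rf {} +

Keeping the sweep to one find invocation means a retried job starts from a clean workspace without re-uploading the scripts it is about to run.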
@@ -1665,7 +1674,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
targetCloud = "kubernetes"
// DGX Spark requires a special setting for accessing the device.
// It has 128GB unified memory as per spec. Use half of the memory at the CPU side.
- if (type == "gb10x") {
+ if (type.contains("gb10x")) {
targetCloud = "nvks-sparks-cloud"
memorySize = "64Gi"
tolerations = """
@@ -1684,7 +1693,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod

// The following GPU types doesn't support dynamic driver flashing.
if (REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }) {
- if (type == "gb10x") {
+ if (type.contains("gb10x")) {
selectors = """
kubernetes.io/arch: ${arch}
kubernetes.io/os: linux
@@ -3238,8 +3247,7 @@ def launchTestJobs(pipeline, testFilter)

parallelJobs += parallelSlurmJobs

- // Try to match what are being tested on x86 H100_PCIe.
- // SBSA machines from the Blossom machine pool
+ // SBSA machines from the Blossom machine pool
SBSATestConfigs = [
"GH200-TensorRT-Post-Merge-1": ["gh200", "l0_gh200", 1, 1],
// DGX Spark is also named as GB10 Grace Blackwell Superchip.
13 changes: 10 additions & 3 deletions jenkins/scripts/slurm_install.sh
@@ -12,7 +12,14 @@ slurm_install_setup() {
cd $resourcePathNode
llmSrcNode=$resourcePathNode/TensorRT-LLM/src

+ # Use unique lock file for this job ID
+ lock_file="install_lock_job_${SLURM_JOB_ID:-local}_node_${SLURM_NODEID:-0}.lock"

if [ $SLURM_LOCALID -eq 0 ]; then
+ if [ -f "$lock_file" ]; then
+ rm -f "$lock_file"
+ fi

retry_command bash -c "wget -nv $llmTarfile && tar -zxf $tarName"
which python3
python3 --version
@@ -27,11 +34,11 @@ slurm_install_setup() {
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
echo "(Writing install lock) Current directory: $(pwd)"
- touch install_lock.lock
+ touch "$lock_file"
else
echo "(Waiting for install lock) Current directory: $(pwd)"
- while [ ! -f install_lock.lock ]; do
- sleep 5
+ while [ ! -f "$lock_file" ]; do
+ sleep 10
done
fi
}
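The install script now derives its lock name from the Slurm job and node IDs, so retried or co-located jobs sharing a working directory no longer collide on a single install_lock.lock, and any stale lock is cleared before setup starts. A minimal sketch of the same local-rank-0-installs, everyone-else-waits pattern, assuming the usual SLURM_* variables (the setup step itself is a placeholder):

#!/bin/bash
# Sketch of a per-job install lock; the echo below stands in for the real
# one-time setup (downloading and unpacking the source tarball).
set -euo pipefail

lock_file="install_lock_job_${SLURM_JOB_ID:-local}_node_${SLURM_NODEID:-0}.lock"

if [ "${SLURM_LOCALID:-0}" -eq 0 ]; then
    # Clear a stale lock left behind by an earlier attempt in the same directory.
    rm -f "$lock_file"

    echo "local rank 0: running one-time setup"   # placeholder for the real work

    # Publish completion; the other local ranks are polling for this file.
    touch "$lock_file"
else
    # Non-zero local ranks block until rank 0 signals that setup is done.
    while [ ! -f "$lock_file" ]; do
        sleep 10
    done
fi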
23 changes: 21 additions & 2 deletions jenkins/scripts/slurm_run.sh
@@ -64,8 +64,8 @@ pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytest
if [ $SLURM_PROCID -eq 0 ]; then
sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
else
- # Sleep 10 seconds to wait for the coverage config file to be saved
- sleep 10
+ # Sleep 30 seconds to wait for the coverage config file to be saved
+ sleep 30
fi

containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
@@ -108,6 +108,25 @@ eval $pytestCommand
pytest_exit_code=$?
echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code"

+ # DEBUG: Diagnose intermittent "unrecognized arguments" failure (Exit Code 4)
+ # Remove this after the issue is resolved
+ if [ $pytest_exit_code -eq 4 ]; then
+ echo "DEBUG: Pytest failed with usage error (exit code 4)"
+ echo "DEBUG: Directory state at $(pwd):"
+ ls -l
+ echo "DEBUG: Directory state at $llmSrcNode/tests/integration/defs:"
+ ls -l $llmSrcNode/tests/integration/defs
+
+ echo "DEBUG: conftest.py content:"
+ md5sum $llmSrcNode/tests/integration/defs/conftest.py
+
+ echo "DEBUG: pytest.ini content:"
+ md5sum $llmSrcNode/tests/integration/defs/pytest.ini
+
+ echo "DEBUG: Check importability of conftest.py"
+ python3 -c "import sys; sys.path.insert(0, '.'); import conftest; print('DEBUG: conftest imported successfully')"
+ fi

if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then
if [[ "$stageName" == *PyTorch* ]]; then
basePerfFilename="base_perf_pytorch.csv"
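The new debug block fires only when pytest exits with status 4, which in pytest's exit-code scheme is a usage error such as unrecognized command-line arguments, distinct from ordinary test failures. A small reference sketch of those codes (based on pytest's documented ExitCode values, not code from this repository):

#!/bin/bash
# Reference sketch: map a pytest exit status to its documented meaning.
describe_pytest_exit() {
    case "$1" in
        0) echo "all tests passed" ;;
        1) echo "some tests failed" ;;
        2) echo "execution interrupted by the user" ;;
        3) echo "internal pytest error" ;;
        4) echo "usage error (bad or unrecognized command-line arguments)" ;;
        5) echo "no tests were collected" ;;
        *) echo "unexpected exit code: $1" ;;
    esac
}

# Example usage with a hypothetical test directory; capture the status even
# though the command may fail.
rc=0
pytest tests/ || rc=$?
echo "pytest exited with $rc: $(describe_pytest_exit "$rc")"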
3 changes: 1 addition & 2 deletions tests/integration/defs/conftest.py
@@ -2689,8 +2689,7 @@ def get_gpu_memory_wo_pynvml():
import psutil

logger.warning(
f"\nWarning: pynvml not available, using fallback commands for memory monitoring"
)
f"pynvml not available, using fallback commands for memory monitoring")

gpu_memory = {}
system_total_mb = 0