56 changes: 42 additions & 14 deletions jenkins/L0_Test.groovy
@@ -694,9 +694,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
}

slurmRunner = null
if (cluster.containerRuntime == ContainerRuntime.DOCKER) {
if (cluster.containerRuntime.toString() == "DOCKER") {
slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
} else if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
} else if (cluster.containerRuntime.toString() == "ENROOT") {
slurmRunner = runInEnrootOnNode(nodeName)
} else {
throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}")
@@ -799,7 +799,7 @@ def getPytestBaseCommandLine(
"LLM_BACKEND_ROOT=${llmSrc}/triton_backend",
"LLM_MODELS_ROOT=${MODEL_CACHE_DIR}",
"MODEL_CACHE_DIR=${MODEL_CACHE_DIR}",
"COLUMNS=200",
"COLUMNS=400",
extraInternalEnv,
portEnvVars,
pytestUtil,
@@ -860,11 +860,11 @@ def getMountListForSlurmTest(SlurmCluster cluster, boolean useSbatch = false)
}

// data/cache mounts
if (cluster.containerRuntime == ContainerRuntime.DOCKER) {
if (cluster.containerRuntime.toString() == "DOCKER") {
mounts += [
"/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro",
]
} else if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
} else if (cluster.containerRuntime.toString() == "ENROOT") {
if (!cluster.scratchPath) {
throw new Exception("Scratch path is not set for cluster: ${cluster.name}")
}
@@ -922,6 +922,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
def scriptInstallLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_install.sh"
def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh"
def scriptBashUtilsLocalPath = "${llmSrcLocal}/jenkins/scripts/bash_utils.sh"
def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
def testListPathNode = "${jobWorkspace}/${testList}.txt"
def waivesListPathNode = "${jobWorkspace}/waives.txt"
def outputPath = "${jobWorkspace}/job-output.log"
@@ -956,6 +958,14 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
scriptInstallPathNode,
true
)
Utils.exec(pipeline, script: "echo \"Script for Bash utilities: \" && cat ${scriptBashUtilsLocalPath}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
scriptBashUtilsLocalPath,
scriptBashUtilsPathNode,
true
)

// Generate Test List and Upload to Frontend Node
def makoArgs = getMakoArgsFromStageName(stageName, true)
@@ -1040,7 +1050,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG

def containerImageArg = container
def srunPrologue = ""
if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
if (cluster.containerRuntime.toString() == "ENROOT") {
def enrootImagePath = "${cluster.scratchPath}/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh"
containerImageArg = enrootImagePath

@@ -1127,9 +1137,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
set -xEeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR

echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
echo \$SLURM_JOB_ID > "$jobWorkspace/slurm_job_id.txt"

echo "Starting Slurm job \$SLURM_JOB_ID on \$SLURM_NODELIST"
export jobWorkspace=$jobWorkspace
export tarName=$tarName
export llmTarfile=$llmTarfile
@@ -1219,10 +1227,11 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
touch "${outputPath}"
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
if [ -z "\$jobId" ]; then
echo "Error: Job submission failed, no job ID returned."
echo "Error: Slurm job submission failed, no job ID returned."
exit 1
fi
echo "Submitted job \$jobId"
echo "Submitted Slurm job \$jobId"
echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt"
tail -f ${outputPath} &
tailPid=\$!
# Wait until sbatch job is done.
@@ -1232,9 +1241,28 @@
# Kill tail -f process
kill \$tailPid
# Check if the job failed or not
sleep 5
STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}')
EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
sleep 10
# Retry getting status and exit code as sacct might be delayed
for i in {1..3}; do
STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}')
EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')

if [ -n "\$STATUS" ] && [ -n "\$EXIT_CODE" ]; then
break
fi
echo "Waiting for sacct to update... attempt \$i"
sleep 10
done

if [ -z "\$EXIT_CODE" ]; then
echo "Error: Failed to get exit code from sacct after retries, defaulting to 1."
EXIT_CODE=1
fi
if [ -z "\$STATUS" ]; then
echo "Error: Failed to get status from sacct after retries, defaulting to UNKNOWN."
STATUS="UNKNOWN"
fi

if [[ "\$STATUS" == "COMPLETED" && \$EXIT_CODE -eq 0 ]]; then
echo "Pytest succeed in Slurm job \$jobId"
echo "Status: \$STATUS | Exit_code \$EXIT_CODE"
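Taken together, the hunks above implement a submit/tail/wait/check flow around sbatch. The condensed sketch below restates that flow outside the Groovy heredoc for readability; the paths and launch script name are placeholders, and the wait loop (collapsed in this diff behind "Wait until sbatch job is done.") is approximated with a squeue poll, which is an assumption rather than the pipeline's actual mechanism.

#!/bin/bash
# Condensed, illustrative sketch of the submit/monitor flow above (not the pipeline's script).
set -euo pipefail

outputPath=/tmp/job-output.log        # placeholder path
touch "$outputPath"

# Submit and capture the job ID ("Submitted batch job <id>" -> 4th field).
jobId=$(sbatch launch.sh | awk '{print $4}')
if [ -z "$jobId" ]; then
    echo "Error: Slurm job submission failed, no job ID returned."
    exit 1
fi
echo "$jobId" > slurm_job_id.txt

# Stream the job output while the job runs.
tail -f "$outputPath" &
tailPid=$!
while squeue -j "$jobId" -h 2>/dev/null | grep -q .; do   # assumed polling; the real wait is collapsed above
    sleep 30
done
kill "$tailPid"

# sacct can lag behind job completion, so retry before trusting its answer.
sleep 10
STATUS=""; EXIT_CODE=""
for i in {1..3}; do
    STATUS=$(sacct -j "$jobId" --format=State --noheader | head -n 1 | awk '{print $1}')
    EXIT_CODE=$(sacct -j "$jobId" --format=ExitCode -Pn --allocations | awk -F: '{print $1}')
    [ -n "$STATUS" ] && [ -n "$EXIT_CODE" ] && break
    echo "Waiting for sacct to update... attempt $i"
    sleep 10
done
EXIT_CODE=${EXIT_CODE:-1}
STATUS=${STATUS:-UNKNOWN}

if [[ "$STATUS" == "COMPLETED" && "$EXIT_CODE" -eq 0 ]]; then
    echo "Slurm job $jobId succeeded (Status: $STATUS, Exit code: $EXIT_CODE)"
else
    echo "Slurm job $jobId failed (Status: $STATUS, Exit code: $EXIT_CODE)"
    exit 1
fi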
45 changes: 45 additions & 0 deletions jenkins/scripts/bash_utils.sh
@@ -0,0 +1,45 @@
#!/bin/bash

# Retry a command with a specified number of retries and interval.
# Arguments:
# max_retries (optional): The maximum number of times to retry the command. Default: 3.
# interval (optional): The time in seconds to wait between retries. Default: 60.
# command: The command to run and its arguments.
# Usage:
# retry_command [max_retries] [interval] command...
# If only one numeric argument is provided, it is treated as max_retries.
function retry_command() {
local max_retries=3
local interval=60

if [[ "$1" =~ ^[0-9]+$ ]]; then
max_retries=$1
shift
fi

if [[ "$1" =~ ^[0-9]+$ ]]; then
interval=$1
shift
fi

local cmd=("$@")

local count=0
local rc=0

while [ $count -lt $max_retries ]; do
if "${cmd[@]}"; then
return 0
fi
rc=$?
count=$((count + 1))
echo "Command failed with exit code $rc. Attempt $count/$max_retries."
if [ $count -lt $max_retries ]; then
echo "Retrying in $interval seconds..."
sleep $interval
fi
done

echo "Command failed after $max_retries attempts."
return $rc
}
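For reference, a minimal usage sketch of retry_command as defined above; the sourced path, URL, and wrapped commands are illustrative only:

#!/bin/bash
# Illustrative usage of retry_command; adjust the source path to wherever bash_utils.sh lives.
source ./bash_utils.sh

# Defaults: 3 attempts, 60 seconds apart.
retry_command apt-get install -y libffi-dev

# Explicit limits: 5 attempts, 30 seconds apart. Compound commands are wrapped
# in `bash -c` so the whole sequence is retried as one unit.
retry_command 5 30 bash -c "wget -nv https://example.com/src.tar.gz && tar -zxf src.tar.gz"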
15 changes: 9 additions & 6 deletions jenkins/scripts/slurm_install.sh
@@ -4,22 +4,25 @@
set -xEeuo pipefail
trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR

# Source the shared bash utilities. Both scripts are copied to the node with a
# "<jobUID>-" filename prefix (see the Groovy changes above), so derive the
# prefixed bash_utils.sh name from this script's own filename.
bashUtilsPath="$(dirname "${BASH_SOURCE[0]}")/$(basename "${BASH_SOURCE[0]}" | sed 's/slurm_install\.sh/bash_utils.sh/')"
source "$bashUtilsPath"

slurm_install_setup() {
cd $resourcePathNode
llmSrcNode=$resourcePathNode/TensorRT-LLM/src

if [ $SLURM_LOCALID -eq 0 ]; then
wget -nv $llmTarfile
tar -zxf $tarName
retry_command bash -c "wget -nv $llmTarfile && tar -zxf $tarName"
which python3
python3 --version
apt-get install -y libffi-dev
retry_command apt-get install -y libffi-dev
nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
if [[ $pytestCommand == *--run-ray* ]]; then
pip3 install --retries 10 ray[default]
retry_command pip3 install --retries 10 ray[default]
fi
cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt
cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
retry_command bash -c "cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt"
retry_command bash -c "cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl"
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
41 changes: 37 additions & 4 deletions jenkins/scripts/slurm_run.sh
@@ -63,9 +63,10 @@ pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytest
# Only the first process will save the coverage config file
if [ $SLURM_PROCID -eq 0 ]; then
sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
else
# Sleep 10 seconds to wait for the coverage config file to be saved
sleep 10
fi
# Sleep 10 seconds to wait for the coverage config file to be saved
sleep 10

containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
containerPipLLMLibPath=$(echo "$containerPipLLMLibPath" | sed 's/[[:space:]]+/_/g')
@@ -95,8 +96,17 @@ echo "Full Command: $pytestCommand"
done
fi

# Turn off "exit on error" so the following lines always run
set +e

pytest_exit_code=0
perf_check_exit_code=0
perf_report_exit_code=0
perf_sanity_check_exit_code=0

eval $pytestCommand
echo "Rank${SLURM_PROCID} Pytest finished execution"
pytest_exit_code=$?
echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code"

if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then
if [[ "$stageName" == *PyTorch* ]]; then
@@ -109,15 +119,38 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Pe
python3 $llmSrcNode/tests/integration/defs/perf/sanity_perf_check.py \
$stageName/perf_script_test_results.csv \
$basePerfPath
echo "Check Perf Result"
perf_check_exit_code=$?

echo "Create Perf Report"
python3 $llmSrcNode/tests/integration/defs/perf/create_perf_comparison_report.py \
--output_path $stageName/report.pdf \
--files $stageName/perf_script_test_results.csv \
$basePerfPath
perf_report_exit_code=$?
echo "Rank${SLURM_PROCID} Perf report finished execution with exit code $perf_report_exit_code"

if [ "$perf_check_exit_code" -eq 0 ] && [ "$perf_report_exit_code" -ne 0 ]; then
perf_check_exit_code=$perf_report_exit_code
fi
echo "Rank${SLURM_PROCID} Perf check finished execution with exit code $perf_check_exit_code"
fi

if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" == *Perf-Sanity* ]]; then
echo "Check Perf-Sanity Result"
python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
$jobWorkspace
perf_sanity_check_exit_code=$?
echo "Rank${SLURM_PROCID} Perf-Sanity check finished execution with exit code $perf_sanity_check_exit_code"
fi

if [ "$pytest_exit_code" -ne 0 ]; then
final_exit_code=$pytest_exit_code
elif [ "$perf_check_exit_code" -ne 0 ]; then
final_exit_code=$perf_check_exit_code
elif [ "$perf_sanity_check_exit_code" -ne 0 ]; then
final_exit_code=$perf_sanity_check_exit_code
else
final_exit_code=0
fi
echo "Rank${SLURM_PROCID} Final Slurm run finished execution with exit code $final_exit_code"
exit $final_exit_code