
Commit de2ea25

[None][infra] Some improvements for Slurm execution path in the CI

Signed-off-by: Yanchao Lu <[email protected]>
Parent: 270be80

4 files changed: +117 -23 lines

jenkins/L0_Test.groovy

Lines changed: 41 additions & 13 deletions
@@ -694,9 +694,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
     }

     slurmRunner = null
-    if (cluster.containerRuntime == ContainerRuntime.DOCKER) {
+    if (cluster.containerRuntime.toString() == "DOCKER") {
         slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
-    } else if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
+    } else if (cluster.containerRuntime.toString() == "ENROOT") {
         slurmRunner = runInEnrootOnNode(nodeName)
     } else {
         throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}")
@@ -860,11 +860,11 @@ def getMountListForSlurmTest(SlurmCluster cluster, boolean useSbatch = false)
     }

     // data/cache mounts
-    if (cluster.containerRuntime == ContainerRuntime.DOCKER) {
+    if (cluster.containerRuntime.toString() == "DOCKER") {
         mounts += [
             "/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro",
         ]
-    } else if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
+    } else if (cluster.containerRuntime.toString() == "ENROOT") {
         if (!cluster.scratchPath) {
             throw new Exception("Scratch path is not set for cluster: ${cluster.name}")
         }
@@ -922,6 +922,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
     def scriptInstallLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_install.sh"
     def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh"
+    def scriptBashUtilsLocalPath = "${llmSrcLocal}/jenkins/scripts/bash_utils.sh"
+    def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
     def testListPathNode = "${jobWorkspace}/${testList}.txt"
     def waivesListPathNode = "${jobWorkspace}/waives.txt"
     def outputPath = "${jobWorkspace}/job-output.log"
@@ -956,6 +958,14 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         scriptInstallPathNode,
         true
     )
+    Utils.exec(pipeline, script: "echo \"Script for Bash utilities: \" && cat ${scriptBashUtilsLocalPath}")
+    Utils.copyFileToRemoteHost(
+        pipeline,
+        remote,
+        scriptBashUtilsLocalPath,
+        scriptBashUtilsPathNode,
+        true
+    )

     // Generate Test List and Upload to Frontend Node
     def makoArgs = getMakoArgsFromStageName(stageName, true)
@@ -1040,7 +1050,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG

     def containerImageArg = container
     def srunPrologue = ""
-    if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
+    if (cluster.containerRuntime.toString() == "ENROOT") {
         def enrootImagePath = "${cluster.scratchPath}/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh"
         containerImageArg = enrootImagePath

@@ -1127,9 +1137,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     set -xEeuo pipefail
     trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR

-    echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
-    echo \$SLURM_JOB_ID > "$jobWorkspace/slurm_job_id.txt"
-
+    echo "Starting Slurm job \$SLURM_JOB_ID on \$SLURM_NODELIST"
     export jobWorkspace=$jobWorkspace
     export tarName=$tarName
     export llmTarfile=$llmTarfile
@@ -1219,10 +1227,11 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     touch "${outputPath}"
     jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
     if [ -z "\$jobId" ]; then
-        echo "Error: Job submission failed, no job ID returned."
+        echo "Error: Slurm job submission failed, no job ID returned."
         exit 1
     fi
-    echo "Submitted job \$jobId"
+    echo "Submitted Slurm job \$jobId"
+    echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt"
     tail -f ${outputPath} &
     tailPid=\$!
     # Wait until sbatch job is done.
@@ -1232,9 +1241,28 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     # Kill tail -f process
     kill \$tailPid
     # Check if the job failed or not
-    sleep 5
-    STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}')
-    EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
+    sleep 10
+    # Retry getting status and exit code as sacct might be delayed
+    for i in {1..3}; do
+        STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}')
+        EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
+
+        if [ -n "\$STATUS" ] && [ -n "\$EXIT_CODE" ]; then
+            break
+        fi
+        echo "Waiting for sacct to update... attempt \$i"
+        sleep 10
+    done
+
+    if [ -z "\$EXIT_CODE" ]; then
+        echo "Error: Failed to get exit code from sacct after retries, defaulting to 1."
+        EXIT_CODE=1
+    fi
+    if [ -z "\$STATUS" ]; then
+        echo "Error: Failed to get status from sacct after retries, defaulting to UNKNOWN."
+        STATUS="UNKNOWN"
+    fi
+
     if [[ "\$STATUS" == "COMPLETED" && \$EXIT_CODE -eq 0 ]]; then
         echo "Pytest succeed in Slurm job \$jobId"
         echo "Status: \$STATUS | Exit_code \$EXIT_CODE"

jenkins/scripts/bash_utils.sh

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Retry a command with a specified number of retries and interval.
+# Arguments:
+# max_retries (optional): The maximum number of times to retry the command. Default: 3.
+# interval (optional): The time in seconds to wait between retries. Default: 60.
+# command: The command to run and its arguments.
+# Usage:
+# retry_command [max_retries] [interval] command...
+# If only one numeric argument is provided, it is treated as max_retries.
+function retry_command() {
+    local max_retries=3
+    local interval=60
+
+    if [[ "$1" =~ ^[0-9]+$ ]]; then
+        max_retries=$1
+        shift
+    fi
+
+    if [[ "$1" =~ ^[0-9]+$ ]]; then
+        interval=$1
+        shift
+    fi
+
+    local cmd=("$@")
+
+    local count=0
+    local rc=0
+
+    while [ $count -lt $max_retries ]; do
+        if "${cmd[@]}"; then
+            return 0
+        fi
+        rc=$?
+        count=$((count + 1))
+        echo "Command failed with exit code $rc. Attempt $count/$max_retries."
+        if [ $count -lt $max_retries ]; then
+            echo "Retrying in $interval seconds..."
+            sleep $interval
+        fi
+    done
+
+    echo "Command failed after $max_retries attempts."
+    return $rc
+}
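
Usage note (not part of the commit): per the header comment, the leading numeric arguments are optional, so the helper can be called with or without explicit retry settings once the file is sourced. The commands below are illustrative only, taken from the patterns in slurm_install.sh.

# Assumed to run from the repo root; commands are illustrative.
source jenkins/scripts/bash_utils.sh

# Default: 3 attempts, 60 seconds apart
retry_command apt-get install -y libffi-dev

# 5 attempts, 30 seconds apart; compound commands are wrapped in `bash -c`,
# the same way slurm_install.sh wraps its wget/tar and pip install steps
retry_command 5 30 bash -c "pip3 install --retries 10 -r requirements-dev.txt"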

jenkins/scripts/slurm_install.sh

Lines changed: 9 additions & 6 deletions
@@ -4,22 +4,25 @@
44
set -xEeuo pipefail
55
trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR
66

7+
# Source utilities
8+
bashUtilsPath="$(dirname "${BASH_SOURCE[0]}")/$(basename "${BASH_SOURCE[0]}" | sed 's/slurm_install\.sh/bash_utils.sh/')"
9+
source "$bashUtilsPath"
10+
711
slurm_install_setup() {
812
cd $resourcePathNode
913
llmSrcNode=$resourcePathNode/TensorRT-LLM/src
1014

1115
if [ $SLURM_LOCALID -eq 0 ]; then
12-
wget -nv $llmTarfile
13-
tar -zxf $tarName
16+
retry_command bash -c "wget -nv $llmTarfile && tar -zxf $tarName"
1417
which python3
1518
python3 --version
16-
apt-get install -y libffi-dev
19+
retry_command apt-get install -y libffi-dev
1720
nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
1821
if [[ $pytestCommand == *--run-ray* ]]; then
19-
pip3 install --retries 10 ray[default]
22+
retry_command pip3 install --retries 10 ray[default]
2023
fi
21-
cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt
22-
cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
24+
retry_command bash -c "cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt"
25+
retry_command bash -c "cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl"
2326
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
2427
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
2528
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"

jenkins/scripts/slurm_run.sh

Lines changed: 22 additions & 4 deletions
@@ -63,9 +63,10 @@ pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytest
 # Only the first process will save the coverage config file
 if [ $SLURM_PROCID -eq 0 ]; then
     sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
+else
+    # Sleep 10 seconds to wait for the coverage config file to be saved
+    sleep 10
 fi
-# Sleep 10 seconds to wait for the coverage config file to be saved
-sleep 10

 containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
 containerPipLLMLibPath=$(echo "$containerPipLLMLibPath" | sed 's/[[:space:]]+/_/g')
@@ -95,8 +96,16 @@ echo "Full Command: $pytestCommand"
     done
 fi

+# Turn off "exit on error" so the following lines always run
+set +e
+
+pytest_exit_code=0
+perf_check_exit_code=0
+perf_sanity_check_exit_code=0
+
 eval $pytestCommand
-echo "Rank${SLURM_PROCID} Pytest finished execution"
+pytest_exit_code=$?
+echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code"

 if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then
@@ -109,15 +118,24 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Pe
     python3 $llmSrcNode/tests/integration/defs/perf/sanity_perf_check.py \
         $stageName/perf_script_test_results.csv \
         $basePerfPath
-    echo "Check Perf Result"
+    perf_check_exit_code=$?
+    echo "Create Perf Report"
     python3 $llmSrcNode/tests/integration/defs/perf/create_perf_comparison_report.py \
         --output_path $stageName/report.pdf \
         --files $stageName/perf_script_test_results.csv \
         $basePerfPath
+    perf_check_exit_code=$((perf_check_exit_code + $?))
+    echo "Rank${SLURM_PROCID} Perf check finished execution with exit code $perf_check_exit_code"
 fi

 if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" == *Perf-Sanity* ]]; then
     echo "Check Perf-Sanity Result"
     python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
         $jobWorkspace
+    perf_sanity_check_exit_code=$?
+    echo "Rank${SLURM_PROCID} Perf-Sanity check finished execution with exit code $perf_sanity_check_exit_code"
 fi
+
+final_exit_code=$((pytest_exit_code + perf_check_exit_code + perf_sanity_check_exit_code))
+echo "Rank${SLURM_PROCID} Final Slurm run finished execution with exit code $final_exit_code"
+exit $final_exit_code
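
With set +e in effect, bash no longer aborts on the first failing command, so each step's status must be read from $? immediately after it runs, and the run reports a single aggregate code at the end. A minimal standalone sketch of that pattern, with placeholder commands standing in for pytest and the perf checks:

#!/bin/bash
set +e   # keep going after failures; collect statuses instead of aborting

step_one_exit_code=0
step_two_exit_code=0

false                      # placeholder for a failing step (e.g. the pytest run)
step_one_exit_code=$?      # must be read immediately, before any other command

true                       # placeholder for a succeeding step (e.g. a perf check)
step_two_exit_code=$?

final_exit_code=$((step_one_exit_code + step_two_exit_code))
echo "final exit code: $final_exit_code"
exit $final_exit_code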
