Skip to content

Commit c6c3fae

Browse files
committed
update
Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
1 parent 24f5cd7 commit c6c3fae

File tree

16 files changed

+2098
-435
lines changed

16 files changed

+2098
-435
lines changed

jenkins/L0_Test.groovy

Lines changed: 220 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1022,10 +1022,222 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
10221022
}
10231023
}
10241024

1025+
// Runs a multi-node disaggregated test list through SLURM by driving slurm_exec.py
// on a cluster login node over SSH.
//
// Flow:
//   1. "Initializing Test" stage: create the job workspace on the login node, download
//      and unpack the build tarball locally, copy the helper scripts (slurm_run.sh,
//      slurm_launch.sh, slurm_exec.py, install.sh), the rendered test list and a
//      generated .coveragerc to the login node, and build the pytest command line.
//   2. "Run Pytest" stage: invoke slurm_exec.py on the login node with all paths and
//      the pytest command as arguments.
//   3. Always (finally): upload results and clean up SLURM resources (up to 3 attempts).
//
// Parameters:
//   pipeline   - Jenkins pipeline object used for exec/credentials access.
//   platform   - key into SlurmConfig.partitionConfig.
//   testList   - test-db list name; also used as the perf-sanity config yaml base name.
//   config     - key into BUILD_CONFIGS (selects the tarball name).
//   perfMode   - forwarded to getPytestBaseCommandLine and to slurm_exec.py.
//   stageName  - used for stage titles, mako args, coverage data file and result upload.
//   splitId    - passed to pytest as --group.
//   splits     - passed to pytest as --splits.
//   nodeCount  - when > 1, pytest is wrapped with trtllm-llmapi-launch.
//   gpuCount, skipInstallWheel, cpver - accepted for signature parity with the sibling
//                runLLMTestlist* functions; not referenced in this body.
def runLLMMultiNodeDisaggTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=1, skipInstallWheel=false, cpver="cp312")
{
    SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
    SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]

    // Create a unique suffix for the job name
    String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
    def jobUID = "${cluster.host}-multi_node_disagg_test-${customSuffix}"

    Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")

    // NOTE(review): assigned below but never read in this function — confirm whether
    // it is still needed (kept to avoid dropping the SlurmConfig call).
    def slurmOutputFile = null

    try {
        // Run ssh command to start node in desired cluster via SLURM
        withCredentials([
            usernamePassword(
                credentialsId: 'svc_tensorrt',
                usernameVariable: 'USERNAME',
                passwordVariable: 'PASSWORD'
            )
        ]) {
            def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
            def remote = [
                ip           : randomLoginNode,
                host         : randomLoginNode,
                user         : "${pipeline.USERNAME}",
                passwd       : "${pipeline.PASSWORD}",
                allowAnyHosts: true,
            ]
            Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client python3")
            def tarName = BUILD_CONFIGS[config][TARNAME]
            def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${tarName}"
            def llmPath = sh (script: "realpath .", returnStdout: true).trim()
            def jobWorkspace = "/home/svc_tensorrt/bloom/scripts/${jobUID}"
            def resourcePathNode = "/tmp"
            def llmSrcNode = "${resourcePathNode}/TensorRT-LLM/src"
            def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
            def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
            slurmOutputFile = SlurmConfig.getOutputFilePath("/home/svc_tensorrt/slurm-logs", jobUID)
            def testListPathNode = "${jobWorkspace}/${testList}.txt"
            def outputPath = "${jobWorkspace}/job-output.log"
            def scriptLaunchPathNode = "${jobWorkspace}/slurm_launch.sh"
            def scriptExecPathNode = "${jobWorkspace}/slurm_exec.py"
            def scriptInstallPathNode = "${jobWorkspace}/install.sh"
            def coverageConfigFile = "${jobWorkspace}/.coveragerc"

            // Determine the config yaml file path based on testList
            // NOTE(review): this path lives in the Jenkins agent's checkout (llmSrcLocal),
            // yet it is handed to slurm_exec.py, which runs on the login node via SSH.
            // Confirm the path is reachable there, or copy the file into jobWorkspace.
            def configYamlFile = "${llmSrcLocal}/tests/scripts/perf-sanity/${testList}.yaml"

            // Declared here, outside the stage closures: it is produced by the
            // "Initializing Test" stage and consumed by the "Run Pytest" stage, and a
            // `def` inside the first closure would not be visible in the second.
            def pytestCommand = null

            stage("[${stageName}] Initializing Test") {
                // Create Job Workspace folder in Frontend Node
                Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mkdir -p ${jobWorkspace}\""), numRetries: 3)

                // Download and Unzip Tar File
                trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
                sh "cd ${llmPath} && tar -zxf ${tarName}"

                // Copy script files from repository to remote host
                // Copy slurm_run.sh
                def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/slurm_run.sh"
                Utils.copyFileToRemoteHost(
                    pipeline,
                    remote,
                    scriptRunLocalPath,
                    scriptRunNode,
                    true
                )

                // Copy slurm_launch.sh
                def scriptLaunchLocalPath = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/slurm_launch.sh"
                Utils.copyFileToRemoteHost(
                    pipeline,
                    remote,
                    scriptLaunchLocalPath,
                    scriptLaunchPathNode,
                    true
                )

                // Copy slurm_exec.py
                def scriptExecLocalPath = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/slurm_exec.py"
                Utils.copyFileToRemoteHost(
                    pipeline,
                    remote,
                    scriptExecLocalPath,
                    scriptExecPathNode,
                    true
                )

                // Copy install.sh
                def scriptInstallLocalPath = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/install.sh"
                Utils.copyFileToRemoteHost(
                    pipeline,
                    remote,
                    scriptInstallLocalPath,
                    scriptInstallPathNode,
                    true
                )

                // Generate Test List and Upload to Frontend Node
                def makoArgs = getMakoArgsFromStageName(stageName, true)
                def makoOptsJson = transformMakoArgsToJson(["Mako options:"] + makoArgs)
                def testListPathLocal = renderTestDB(testList, llmSrcLocal, stageName, makoOptsJson)
                Utils.copyFileToRemoteHost(
                    pipeline,
                    remote,
                    testListPathLocal,
                    testListPathNode
                )

                // generate .coveragerc in workspace and add file path to pytest command
                sh """
                    touch ./.coveragerc
                    echo '[run]' > ./.coveragerc
                    echo 'branch = True' >> ./.coveragerc
                    echo 'data_file = ${jobWorkspace}/.coverage.${stageName}' >> ./.coveragerc
                    echo '[paths]' >> ./.coveragerc
                    echo 'source =\n ${llmSrcNode}/tensorrt_llm/\n ---wheel_path---/tensorrt_llm//tensorrt_llm/' >> ./.coveragerc
                    cat ./.coveragerc
                """

                Utils.copyFileToRemoteHost(
                    pipeline,
                    remote,
                    "./.coveragerc",
                    coverageConfigFile
                )

                // Generate Pytest command
                String pytestUtil = ""
                if (nodeCount > 1) {
                    pytestUtil = "$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
                }

                // Assign (not declare) so the value survives into the next stage.
                pytestCommand = getPytestBaseCommandLine(
                    llmSrcNode,
                    stageName,
                    perfMode,
                    jobWorkspace,
                    "__PLACEHOLDER_TRTLLM_WHL_PATH__",
                    coverageConfigFile,
                    pytestUtil,
                    [
                        "--test-list=$testListPathNode",
                        "--splitting-algorithm least_duration",
                        "--splits $splits",
                        "--group $splitId"
                    ]
                ).join(" ")
            }

            stage("[${stageName}] Run Pytest") {
                // Define container and mounts for slurm_exec.py
                def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
                def mounts = "/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro,/home/svc_tensorrt/bloom/scripts:/home/svc_tensorrt/bloom/scripts"

                // Generate sbatch parameter string (without --nodes, --ntasks, etc. which will come from config yaml)
                // One parameter group per line; the last entry is empty when no partition is specified.
                def sbatchParams = [
                    "--output=${outputPath}",
                    "${partition.additionalArgs}",
                    (partition?.name && partition.name != "unspecified") ? "--partition=${partition.name}" : ""
                ].join("\n")

                // Single quotes embedded in sbatchParams / pytestCommand are rewritten to '\''
                // so that they survive the single-quoted argument on the remote shell.
                def python_exec_cmd = """python3 ${scriptExecPathNode} \\
                    --sbatch-params '${sbatchParams.replaceAll("'", "'\\\\''")}' \\
                    --config-yaml '${configYamlFile}' \\
                    --slurm-launch-sh '${scriptLaunchPathNode}' \\
                    --job-workspace '${jobWorkspace}' \\
                    --llm-tarfile '${llmTarfile}' \\
                    --tar-name '${tarName}' \\
                    --llm-src-node '${llmSrcNode}' \\
                    --stage-name '${stageName}' \\
                    --perf-mode '${perfMode}' \\
                    --resource-path-node '${resourcePathNode}' \\
                    --pytest-command '${pytestCommand.replaceAll("'", "'\\\\''")}' \\
                    --coverage-config-file '${coverageConfigFile}' \\
                    --container '${container}' \\
                    --mounts '${mounts}' \\
                    --script-run-node '${scriptRunNode}' \\
                    --script-install-node '${scriptInstallPathNode}' \\
                    --test-list-path-node '${testListPathNode}' \\
                    --output-path '${outputPath}'"""

                Utils.exec(
                    pipeline,
                    timeout: false,
                    script: Utils.sshUserCmd(
                        remote,
                        "\"${python_exec_cmd}\""
                    )
                )
            }

            echo "Finished test stage execution."
        }
    } finally {
        uploadResults(pipeline, cluster, jobUID, stageName)
        stage("Clean up SLURM Resources") {
            // Workaround to handle the interruption during clean up SLURM resources
            retry(3) {
                try {
                    cleanUpSlurmResources(pipeline, cluster, jobUID)
                } catch (Exception e) {
                    // Re-raise through `error` so that retry() sees a failure and tries again.
                    error "Error during clean up SLURM resources: ${e.getMessage()} and retrying."
                }
            }
        }
    }
}
1234+
10251235
def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=1, runWithSbatch=false, skipInstallWheel=false, cpver="cp312")
10261236
{
10271237
echo "Run Slurm job with native sbatch: $runWithSbatch"
1028-
if(nodeCount > 1 || runWithSbatch) {
1238+
if(nodeCount > 1 && stageName.contains("Perf-Sanity-Disagg")) {
1239+
runLLMMultiNodeDisaggTestlistWithSbatch(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, gpuCount, nodeCount, skipInstallWheel, cpver)
1240+
} else if(nodeCount > 1 || runWithSbatch) {
10291241
runLLMTestlistWithSbatch(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, gpuCount, nodeCount, skipInstallWheel, cpver)
10301242
} else {
10311243
runLLMTestlistWithAgent(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, gpuCount, skipInstallWheel, cpver)
@@ -2277,7 +2489,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
22772489
error "Some tests still failed after rerun attempts, please check the test report."
22782490
}
22792491

2280-
if (perfMode) {
2492+
if (perfMode && !stageName.contains("Perf-Sanity")) {
22812493
basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv"
22822494
basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}"
22832495
stage("Check perf result") {
@@ -2688,8 +2900,8 @@ def launchTestJobs(pipeline, testFilter)
26882900
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 1, 4, 1, true],
26892901
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
26902902
// Perf sanity post merge test
2691-
// Disable perf stages due to https://nvbugs/5643646
2692-
// "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4],
2903+
"DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4],
2904+
// "DGX_B200-8_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x8", "perf_sanity_l0_dgx_b200", 1, 1, 8],
26932905
// "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "perf_sanity_l0_dgx_b300", 1, 1, 4],
26942906
]
26952907
fullSet += x86SlurmTestConfigs.keySet()
@@ -2732,7 +2944,10 @@ def launchTestJobs(pipeline, testFilter)
27322944
// "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-trtllm", "l0_gb200_multi_nodes", 4, 5, 8, 2],
27332945
// "GB200-8_GPUs-2_Nodes-PyTorch-5": ["gb200-trtllm", "l0_gb200_multi_nodes", 5, 5, 8, 2],
27342946
// ]
2735-
multiNodesSBSAConfigs = [:]
2947+
multiNodesSBSAConfigs = [
2948+
"GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-trtllm", "perf_sanity_l0_gb200_multi_nodes", 1, 1, 8, 2],
2949+
"GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-trtllm", "perf_sanity_l0_gb200_multi_nodes_disagg", 1, 1, 8, 2],
2950+
]
27362951
def numMultiNodeTests = 3
27372952
multiNodesSBSAConfigs += (1..numMultiNodeTests).collectEntries { i ->
27382953
["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]

jenkins/scripts/open_search_db.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,8 @@
7171
POST_TIMEOUT_SECONDS = 20
7272
QUERY_TIMEOUT_SECONDS = 10
7373

74-
OPEN_SEARCH_DB_BASE_URL = os.getenv("OPEN_SEARCH_DB_BASE_URL", "")
74+
OPEN_SEARCH_DB_BASE_URL = os.getenv("OPEN_SEARCH_DB_BASE_URL",
75+
"http://gpuwa.nvidia.com")
7576
OPEN_SEARCH_DB_USERNAME = os.getenv("OPEN_SEARCH_DB_CREDENTIALS_USR", "")
7677
OPEN_SEARCH_DB_PASSWORD = os.getenv("OPEN_SEARCH_DB_CREDENTIALS_PSW", "")
7778

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/bash
# Prepares a SLURM node for a test run: records the job id, downloads and unpacks
# the build tarball, installs Python dependencies and the TensorRT-LLM wheel, and
# prints host/GPU identification.
#
# Reads these variables without defining them (with `set -u`, an unset one aborts
# the script): resourcePathNode, jobWorkspace, llmTarfile, tarName, pytestCommand,
# stageName, SLURM_JOB_ID. HOST_NODE_NAME is optional and defaults to the hostname.
set -Eeuo pipefail

cd "$resourcePathNode"
llmSrcNode="$resourcePathNode/TensorRT-LLM/src"

# Record the job id in the job workspace.
echo "$SLURM_JOB_ID" > "$jobWorkspace/slurm_job_id.txt"

wget -nv "$llmTarfile"
tar -zxf "$tarName"
which python3
python3 --version
apt-get install -y libffi-dev
nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
# Ray is only installed when the pytest command asks for it.
if [[ $pytestCommand == *--run-ray* ]]; then
    # Quoted so the brackets are not treated as a filename glob.
    pip3 install 'ray[default]'
fi
cd "$llmSrcNode" && pip3 install --retries 1 -r requirements-dev.txt
# The wheel glob is intentionally left unquoted so the shell expands it.
cd "$resourcePathNode" && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
git config --global --add safe.directory "*"
# `|| true` keeps a failing pipeline (pipefail) from aborting the script here.
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"

0 commit comments

Comments
 (0)