Skip to content

Commit 5ff1adb

Browse files
authored
[None][fix] WAR for tensorrt depending on the archived nvidia-cuda-runtime-cu13 package (#8858)
Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
1 parent f22a87f commit 5ff1adb

File tree

5 files changed

+11
-17
lines changed

5 files changed

+11
-17
lines changed

jenkins/Build.groovy

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -454,7 +454,7 @@ def runLLMBuild(pipeline, buildFlags, tarName, is_linux_x86_64)
454454
pipArgs = ""
455455
}
456456

457-
if (tarName.contains("_CU12")) {
457+
if (tarName.contains("CU12")) {
458458
trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${LLM_ROOT} && sed -i '/^# .*<For CUDA 12\\.9>\$/ {s/^# //; n; s/^/# /}' requirements.txt && cat requirements.txt")
459459
}
460460
// install python package

jenkins/BuildDockerImage.groovy

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -586,7 +586,7 @@ pipeline {
586586
catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
587587
container("python3") {
588588
// Install wget
589-
trtllm_utils.llmExecStepWithRetry(this, script: "apt-get update && apt-get -y install wget")
589+
trtllm_utils.llmExecStepWithRetry(this, script: "apt-get update && apt-get install -y wget")
590590

591591
// Poll for build artifacts
592592
def artifactBaseUrl = "https://urm.nvidia.com/artifactory/${UPLOAD_PATH}/"

jenkins/L0_MergeRequest.groovy

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -381,9 +381,7 @@ def preparation(pipeline, testFilter, globalVars)
381381
def launchReleaseCheck(pipeline)
382382
{
383383
stages = {
384-
trtllm_utils.llmExecStepWithRetry(pipeline, script: """apt-get update && apt-get install \
385-
python3-pip \
386-
-y""")
384+
trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y python3-pip")
387385
sh "pip3 config set global.break-system-packages true"
388386
sh "git config --global --add safe.directory \"*\""
389387
// Step 1: Clone the TRT-LLM source code

jenkins/L0_Test.groovy

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,8 +1064,7 @@ def runLLMDocBuild(pipeline, config)
10641064
trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl")
10651065

10661066
// Step 3: build doc
1067-
trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update")
1068-
trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get install doxygen python3-pip graphviz -y")
1067+
trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y doxygen python3-pip graphviz")
10691068

10701069
def containerPATH = sh(script: "echo \${PATH}", returnStdout: true).replaceAll("\\s", "")
10711070
if (!containerPATH.contains("/usr/local/bin:")) {
@@ -1104,9 +1103,7 @@ def launchTestListCheck(pipeline)
11041103
trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(LLM_DOCKER_IMAGE, "a10"), "trt-llm", {
11051104
try {
11061105
echoNodeAndGpuInfo(pipeline, stageName)
1107-
trtllm_utils.llmExecStepWithRetry(pipeline, script: """apt-get update && apt-get install \
1108-
libffi-dev \
1109-
-y""")
1106+
trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y libffi-dev")
11101107
sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
11111108
// download TRT-LLM tarfile
11121109
def tarName = BUILD_CONFIGS[VANILLA_CONFIG][TARNAME]
@@ -1527,8 +1524,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
15271524
// Set up HF_HOME to cache models and datasets
15281525
// Initialize the Hugging Face cache from NFS: the NFS is read-only, but HF_HOME needs to be writable, otherwise creating the file lock will fail
15291526
sh "mkdir -p ${HF_HOME} && ls -alh ${HF_HOME}"
1530-
trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update")
1531-
trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get install -y rsync")
1527+
trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y rsync")
15321528
trtllm_utils.llmExecStepWithRetry(pipeline, script: "rsync -r ${MODEL_CACHE_DIR}/hugging-face-cache/ ${HF_HOME}/ && ls -lh ${HF_HOME}")
15331529
sh "df -h"
15341530

@@ -2379,8 +2375,7 @@ def launchTestJobs(pipeline, testFilter)
23792375
if (values[5] == DLFW_IMAGE || values[5] == DLFW_IMAGE_12_9) {
23802376
trtllm_utils.llmExecStepWithRetry(pipeline, script: "[ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true")
23812377
}
2382-
trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update")
2383-
trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get -y install python3-pip git rsync curl wget")
2378+
trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y python3-pip git rsync curl wget")
23842379
trtllm_utils.checkoutSource(LLM_REPO, env.gitlabCommit, LLM_ROOT, true, true)
23852380
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 config set global.break-system-packages true")
23862381
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install requests")
@@ -2390,11 +2385,10 @@ def launchTestJobs(pipeline, testFilter)
23902385
def platform = cpu_arch == X86_64_TRIPLE ? "x86_64" : "sbsa"
23912386
trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget https://developer.download.nvidia.com/compute/cuda/repos/${ubuntu_version}/${platform}/cuda-keyring_1.1-1_all.deb")
23922387
trtllm_utils.llmExecStepWithRetry(pipeline, script: "dpkg -i cuda-keyring_1.1-1_all.deb")
2393-
trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update")
23942388
if (key.contains("CU12")) {
2395-
trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get -y install cuda-toolkit-12-9")
2389+
trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-12-9")
23962390
} else {
2397-
trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get -y install cuda-toolkit-13-0")
2391+
trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-13-0")
23982392
}
23992393
}
24002394
if (key.contains("CU12")) {

requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ pandas
2121
h5py==3.12.1
2222
StrEnum
2323
sentencepiece>=0.1.99
24+
# WAR (workaround) for tensorrt depending on the archived nvidia-cuda-runtime-cu13 package
25+
nvidia-cuda-runtime-cu13==0.0.0a0
2426
# tensorrt~=10.11.0 # <For CUDA 12.9>
2527
tensorrt~=10.13.0
2628
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-08.html#rel-25-08 uses 2.8.0a0.

0 commit comments

Comments
 (0)