@@ -425,19 +425,21 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
425425
426426 Utils . exec(pipeline, script : " apt-get update && apt-get install -y sshpass openssh-client" )
427427
428+ Utils . exec(pipeline, script : " echo Sleeping to allow Slurm job completion; sleep 30" )
429+
428430 def slurmJobID = Utils . exec(
429431 pipeline,
430432 // Try to grab the job id from ${jobWorkspace}/slurm_job_id.txt.
431433 // The slurm_run.sh will add the slurm job id in that file.
432434 script : Utils . sshUserCmd(
433435 remote,
434- " ' test -f ${ jobWorkspace} /slurm_job_id.txt && cat ${ jobWorkspace} /slurm_job_id.txt' "
436+ " \" test -f ${ jobWorkspace} /slurm_job_id.txt && cat ${ jobWorkspace} /slurm_job_id.txt || true \" "
435437 ),
436438 returnStdout : true
437439 ). trim()
438440
439441 if (! slurmJobID || ! slurmJobID. isNumber()) {
440- echo " Slurm job did not submit successfully. No job ID found."
442+ echo " Slurm job may not submit successfully. No job ID found."
441443 } else {
442444 Utils . exec(pipeline, script : " echo Slurm job ID: ${ slurmJobID} " )
443445
@@ -448,14 +450,15 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
448450 " \" scancel ${ slurmJobID} || true; sacct -j ${ slurmJobID} --format=JobID,JobName%100,Partition%15,Account%15,State,ExitCode,NodeList%30 || true; scontrol show job ${ slurmJobID} || true\" "
449451 )
450452 )
451-
452453 }
453454
455+ Utils . exec(pipeline, script : " echo Sleeping to allow Slurm job termination; sleep 30" )
456+
454457 Utils . exec(
455458 pipeline,
456459 script : Utils . sshUserCmd(
457460 remote,
458- " rm -rf ${ jobWorkspace} "
461+ " \" rm -rf ${ jobWorkspace} || true \" "
459462 )
460463 )
461464
@@ -1480,8 +1483,7 @@ def runLLMDocBuild(pipeline, config)
14801483 trtllm_utils. llmExecStepWithRetry(pipeline, script : " cd ${ llmPath} && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl" )
14811484
14821485 // Step 3: build doc
1483- trtllm_utils. llmExecStepWithRetry(pipeline, script : " apt-get update" )
1484- trtllm_utils. llmExecStepWithRetry(pipeline, script : " apt-get install doxygen python3-pip graphviz -y" )
1486+ trtllm_utils. llmExecStepWithRetry(pipeline, script : " apt-get update && apt-get install -y doxygen python3-pip graphviz" )
14851487
14861488 def containerPATH = sh(script : " echo \$ {PATH}" , returnStdout : true ). replaceAll(" \\ s" , " " )
14871489 if (! containerPATH. contains(" /usr/local/bin:" )) {
@@ -1520,9 +1522,7 @@ def launchTestListCheck(pipeline)
15201522 trtllm_utils. launchKubernetesPod(pipeline, createKubernetesPodConfig(LLM_DOCKER_IMAGE , " a10" ), " trt-llm" , {
15211523 try {
15221524 echoNodeAndGpuInfo(pipeline, stageName)
1523- trtllm_utils. llmExecStepWithRetry(pipeline, script : """ apt-get update && apt-get install \
1524- libffi-dev \
1525- -y""" )
1525+ trtllm_utils. llmExecStepWithRetry(pipeline, script : " apt-get update && apt-get install -y libffi-dev" )
15261526 sh " nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
15271527 // download TRT-LLM tarfile
15281528 def tarName = BUILD_CONFIGS [VANILLA_CONFIG ][TARNAME ]
@@ -2040,8 +2040,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
20402040 // setup HF_HOME to cache model and datasets
20412041 // init the huggingface cache from nfs, since the nfs is read-only, and HF_HOME needs to be writable, otherwise it will fail at creating file lock
20422042 sh " mkdir -p ${ HF_HOME} && ls -alh ${ HF_HOME} "
2043- trtllm_utils. llmExecStepWithRetry(pipeline, script : " apt-get update" )
2044- trtllm_utils. llmExecStepWithRetry(pipeline, script : " apt-get install -y rsync" )
2043+ trtllm_utils. llmExecStepWithRetry(pipeline, script : " apt-get update && apt-get install -y rsync" )
20452044 trtllm_utils. llmExecStepWithRetry(pipeline, script : " rsync -r ${ MODEL_CACHE_DIR} /hugging-face-cache/ ${ HF_HOME} / && ls -lh ${ HF_HOME} " )
20462045 sh " df -h"
20472046
@@ -2932,8 +2931,7 @@ def launchTestJobs(pipeline, testFilter)
29322931 if (values[5 ] == DLFW_IMAGE || values[5 ] == DLFW_IMAGE_12_9 ) {
29332932 trtllm_utils. llmExecStepWithRetry(pipeline, script : " [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true" )
29342933 }
2935- trtllm_utils. llmExecStepWithRetry(pipeline, script : " apt-get update" )
2936- trtllm_utils. llmExecStepWithRetry(pipeline, script : " apt-get -y install python3-pip git rsync curl wget" )
2934+ trtllm_utils. llmExecStepWithRetry(pipeline, script : " apt-get update && apt-get install -y python3-pip git rsync curl wget" )
29372935 trtllm_utils. checkoutSource(LLM_REPO , env. gitlabCommit, LLM_ROOT , true , true )
29382936 trtllm_utils. llmExecStepWithRetry(pipeline, script : " pip3 config set global.break-system-packages true" )
29392937 trtllm_utils. llmExecStepWithRetry(pipeline, script : " pip3 install requests" )
@@ -2943,11 +2941,10 @@ def launchTestJobs(pipeline, testFilter)
29432941 def platform = cpu_arch == X86_64_TRIPLE ? " x86_64" : " sbsa"
29442942 trtllm_utils. llmExecStepWithRetry(pipeline, script : " wget https://developer.download.nvidia.com/compute/cuda/repos/${ ubuntu_version} /${ platform} /cuda-keyring_1.1-1_all.deb" )
29452943 trtllm_utils. llmExecStepWithRetry(pipeline, script : " dpkg -i cuda-keyring_1.1-1_all.deb" )
2946- trtllm_utils. llmExecStepWithRetry(pipeline, script : " apt-get update" )
29472944 if (key. contains(" CU12" )) {
2948- trtllm_utils. llmExecStepWithRetry(pipeline, script : " apt-get -y install cuda-toolkit-12-9" )
2945+ trtllm_utils. llmExecStepWithRetry(pipeline, script : " apt-get update && apt-get install -y cuda-toolkit-12-9" )
29492946 } else {
2950- trtllm_utils. llmExecStepWithRetry(pipeline, script : " apt-get -y install cuda-toolkit-13-0" )
2947+ trtllm_utils. llmExecStepWithRetry(pipeline, script : " apt-get update && apt-get install -y cuda-toolkit-13-0" )
29512948 }
29522949 }
29532950 if (key. contains(" CU12" )) {
0 commit comments