From e9d5ad87732a1f2a93f76577bc7581ef81513539 Mon Sep 17 00:00:00 2001 From: Matt Lefebvre Date: Wed, 12 Nov 2025 19:11:56 -0800 Subject: [PATCH 1/5] Support enroot/pyxis clusters in multi-node SLURM Signed-off-by: Matt Lefebvre --- jenkins/L0_Test.groovy | 48 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index d71d510e36c..dfae55513bb 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -921,9 +921,50 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG taskArgs = [ *taskArgs, ] + + def containerImageArg = container + def srunPrologue = "" + if (cluster.containerRuntime == ContainerRuntime.ENROOT) { + mounts = [ + "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci:/scratch.trt_llm_data:ro", + "/home/svc_tensorrt:/home/svc_tensorrt", + "/home/svc_tensorrt/.cache:/root/.cache", + // workspace needs to be explicitly mounted if container runtime is enroot to avoid chroot error + "${jobWorkspace}", + ].join(",") + + def enrootImagePath = "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh" + containerImageArg = enrootImagePath + + srunPrologue = """ + export ENROOT_CACHE_PATH='/home/svc_tensorrt/.cache/enroot' + + retry_command() { + local cmd=\$1 + local max_attempts=\${2:-3} + local delay=\${3:-60} + local attempt=1 + + until \$cmd + do + if ((attempt >= max_attempts)) + then + echo "Command '\$cmd' failed after \$max_attempts attempts" + return 1 + fi + + echo "Command '\$cmd' failed (attempt \$attempt of \$max_attempts). Retrying in \${delay}s..." + sleep \$delay + ((attempt++)) + done + } + + retry_command "enroot import -o $enrootImagePath -- docker://$container" + """.replaceAll("(?m)^\\s*", "") + } + srunArgs = [ - "--container-image=$container", - "--container-workdir=/home/svc_tensorrt/bloom/scripts", + "--container-image=$containerImageArg", "--container-mounts=$mounts", "--container-env=NVIDIA_IMEX_CHANNELS" ] @@ -951,6 +992,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG export NVIDIA_IMEX_CHANNELS=0 export NVIDIA_IMEX_CHANNELS=0 export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1))) + + ${srunPrologue} + chmod +x $scriptRunNode srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunNode} """.replaceAll("(?m)^\\s*", "") From d7f5c2d3c3642f3794dca432cccb771e1a042d1e Mon Sep 17 00:00:00 2001 From: Matt Lefebvre Date: Thu, 13 Nov 2025 11:54:01 -0800 Subject: [PATCH 2/5] Enable oci-hsg GB200s in post-merge Signed-off-by: Matt Lefebvre --- jenkins/L0_Test.groovy | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index dfae55513bb..29f01976417 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -2762,7 +2762,7 @@ def launchTestJobs(pipeline, testFilter) // Disable GB300 stages due to nodes will be offline temporarily. // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1], "GB200-4_GPUs-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4], - "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4], + "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_gpus", 1, 1, 4], // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-trtllm", "l0_gb300_multi_gpus", 1, 1, 4], ] fullSet += SBSASlurmTestConfigs.keySet() @@ -2779,7 +2779,7 @@ def launchTestJobs(pipeline, testFilter) multiNodesSBSAConfigs = [:] def numMultiNodeTests = 3 multiNodesSBSAConfigs += (1..numMultiNodeTests).collectEntries { i -> - ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]] + ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-oci-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]] } fullSet += multiNodesSBSAConfigs.keySet() From 0ac061ded9c99d35c8085e1347320d2e62a224ae Mon Sep 17 00:00:00 2001 From: Matt Lefebvre Date: Thu, 13 Nov 2025 13:03:30 -0800 Subject: [PATCH 3/5] Fix platform for single node Signed-off-by: Matt Lefebvre --- jenkins/L0_Test.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 29f01976417..75b6f3d92b9 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -2762,7 +2762,7 @@ def launchTestJobs(pipeline, testFilter) // Disable GB300 stages due to nodes will be offline temporarily. // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1], "GB200-4_GPUs-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4], - "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_gpus", 1, 1, 4], + "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4], // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-trtllm", "l0_gb300_multi_gpus", 1, 1, 4], ] fullSet += SBSASlurmTestConfigs.keySet() From e89c4f4382f61e5ee9e207f4c5a5cc01ffd49e6d Mon Sep 17 00:00:00 2001 From: Matt Lefebvre Date: Thu, 13 Nov 2025 15:32:05 -0800 Subject: [PATCH 4/5] Readd --container-workdir and remove home/svc_tensorrt mount Signed-off-by: Matt Lefebvre --- jenkins/L0_Test.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 75b6f3d92b9..9897d95444d 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -927,7 +927,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG if (cluster.containerRuntime == ContainerRuntime.ENROOT) { mounts = [ "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci:/scratch.trt_llm_data:ro", - "/home/svc_tensorrt:/home/svc_tensorrt", "/home/svc_tensorrt/.cache:/root/.cache", // workspace needs to be explicitly mounted if container runtime is enroot to avoid chroot error "${jobWorkspace}", @@ -965,6 +964,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG srunArgs = [ "--container-image=$containerImageArg", + "--container-workdir=/home/svc_tensorrt/bloom/scripts", "--container-mounts=$mounts", "--container-env=NVIDIA_IMEX_CHANNELS" ] From b0158051962e9dce7cd3e2e637601595a7022567 Mon Sep 17 00:00:00 2001 From: Matt Lefebvre Date: Thu, 13 Nov 2025 15:40:09 -0800 Subject: [PATCH 5/5] Mount /home/svc_tensorrt/bloom/scripts Signed-off-by: Matt Lefebvre --- jenkins/L0_Test.groovy | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 9897d95444d..5bf19f58996 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -927,9 +927,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG if (cluster.containerRuntime == ContainerRuntime.ENROOT) { mounts = [ "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci:/scratch.trt_llm_data:ro", + "/home/svc_tensorrt/bloom/scripts", "/home/svc_tensorrt/.cache:/root/.cache", - // workspace needs to be explicitly mounted if container runtime is enroot to avoid chroot error - "${jobWorkspace}", ].join(",") def enrootImagePath = "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh"