From e9d5ad87732a1f2a93f76577bc7581ef81513539 Mon Sep 17 00:00:00 2001
From: Matt Lefebvre <mlefebvre@nvidia.com>
Date: Wed, 12 Nov 2025 19:11:56 -0800
Subject: [PATCH 1/5] Support enroot/pyxis clusters in multi-node SLURM

Signed-off-by: Matt Lefebvre <mlefebvre@nvidia.com>
---
 jenkins/L0_Test.groovy | 48 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index d71d510e36c..dfae55513bb 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -921,9 +921,50 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
                 taskArgs = [
                     *taskArgs,
                 ]
+
+                def containerImageArg = container
+                def srunPrologue = ""
+                if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
+                    mounts = [
+                        "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci:/scratch.trt_llm_data:ro",
+                        "/home/svc_tensorrt:/home/svc_tensorrt",
+                        "/home/svc_tensorrt/.cache:/root/.cache",
+                        // workspace needs to be explicitly mounted if container runtime is enroot to avoid chroot error
+                        "${jobWorkspace}",
+                    ].join(",")
+
+                    def enrootImagePath = "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh"
+                    containerImageArg = enrootImagePath
+
+                    srunPrologue = """
+                    export ENROOT_CACHE_PATH='/home/svc_tensorrt/.cache/enroot'
+
+                    retry_command() {
+                        local cmd=\$1
+                        local max_attempts=\${2:-3}
+                        local delay=\${3:-60}
+                        local attempt=1
+
+                        until \$cmd
+                        do
+                            if ((attempt >= max_attempts))
+                            then
+                                echo "Command '\$cmd' failed after \$max_attempts attempts"
+                                return 1
+                            fi
+
+                            echo "Command '\$cmd' failed (attempt \$attempt of \$max_attempts). Retrying in \${delay}s..."
+                            sleep \$delay
+                            ((attempt++))
+                        done
+                    }
+
+                    retry_command "enroot import -o $enrootImagePath -- docker://$container"
+                    """.replaceAll("(?m)^\\s*", "")
+                }
+
                 srunArgs = [
-                    "--container-image=$container",
-                    "--container-workdir=/home/svc_tensorrt/bloom/scripts",
+                    "--container-image=$containerImageArg",
                     "--container-mounts=$mounts",
                     "--container-env=NVIDIA_IMEX_CHANNELS"
                 ]
@@ -951,6 +992,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
                     export NVIDIA_IMEX_CHANNELS=0
                     export NVIDIA_IMEX_CHANNELS=0
                     export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))
+
+                    ${srunPrologue}
+
                     chmod +x $scriptRunNode
                     srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunNode}
                 """.replaceAll("(?m)^\\s*", "")

From d7f5c2d3c3642f3794dca432cccb771e1a042d1e Mon Sep 17 00:00:00 2001
From: Matt Lefebvre <mlefebvre@nvidia.com>
Date: Thu, 13 Nov 2025 11:54:01 -0800
Subject: [PATCH 2/5] Enable oci-hsg GB200s in post-merge

Signed-off-by: Matt Lefebvre <mlefebvre@nvidia.com>
---
 jenkins/L0_Test.groovy | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index dfae55513bb..29f01976417 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -2762,7 +2762,7 @@ def launchTestJobs(pipeline, testFilter)
         // Disable GB300 stages due to nodes will be offline temporarily.
         // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
         "GB200-4_GPUs-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
-        "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
+        "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
         // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-trtllm", "l0_gb300_multi_gpus", 1, 1, 4],
     ]
     fullSet += SBSASlurmTestConfigs.keySet()
@@ -2779,7 +2779,7 @@ def launchTestJobs(pipeline, testFilter)
     multiNodesSBSAConfigs = [:]
     def numMultiNodeTests = 3
     multiNodesSBSAConfigs += (1..numMultiNodeTests).collectEntries { i ->
-        ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]
+        ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-oci-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]
     }
     fullSet += multiNodesSBSAConfigs.keySet()
 

From 0ac061ded9c99d35c8085e1347320d2e62a224ae Mon Sep 17 00:00:00 2001
From: Matt Lefebvre <mlefebvre@nvidia.com>
Date: Thu, 13 Nov 2025 13:03:30 -0800
Subject: [PATCH 3/5] Fix platform for single node

Signed-off-by: Matt Lefebvre <mlefebvre@nvidia.com>
---
 jenkins/L0_Test.groovy | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 29f01976417..75b6f3d92b9 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -2762,7 +2762,7 @@ def launchTestJobs(pipeline, testFilter)
         // Disable GB300 stages due to nodes will be offline temporarily.
         // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
         "GB200-4_GPUs-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
-        "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
+        "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
         // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-trtllm", "l0_gb300_multi_gpus", 1, 1, 4],
     ]
     fullSet += SBSASlurmTestConfigs.keySet()

From e89c4f4382f61e5ee9e207f4c5a5cc01ffd49e6d Mon Sep 17 00:00:00 2001
From: Matt Lefebvre <mlefebvre@nvidia.com>
Date: Thu, 13 Nov 2025 15:32:05 -0800
Subject: [PATCH 4/5] Re-add --container-workdir and remove /home/svc_tensorrt
 mount

Signed-off-by: Matt Lefebvre <mlefebvre@nvidia.com>
---
 jenkins/L0_Test.groovy | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 75b6f3d92b9..9897d95444d 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -927,7 +927,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
                 if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
                     mounts = [
                         "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci:/scratch.trt_llm_data:ro",
-                        "/home/svc_tensorrt:/home/svc_tensorrt",
                         "/home/svc_tensorrt/.cache:/root/.cache",
                         // workspace needs to be explicitly mounted if container runtime is enroot to avoid chroot error
                         "${jobWorkspace}",
@@ -965,6 +964,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 
                 srunArgs = [
                     "--container-image=$containerImageArg",
+                    "--container-workdir=/home/svc_tensorrt/bloom/scripts",
                     "--container-mounts=$mounts",
                     "--container-env=NVIDIA_IMEX_CHANNELS"
                 ]

From b0158051962e9dce7cd3e2e637601595a7022567 Mon Sep 17 00:00:00 2001
From: Matt Lefebvre <mlefebvre@nvidia.com>
Date: Thu, 13 Nov 2025 15:40:09 -0800
Subject: [PATCH 5/5] Mount /home/svc_tensorrt/bloom/scripts

Signed-off-by: Matt Lefebvre <mlefebvre@nvidia.com>
---
 jenkins/L0_Test.groovy | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 9897d95444d..5bf19f58996 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -927,9 +927,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
                 if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
                     mounts = [
                         "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci:/scratch.trt_llm_data:ro",
+                        "/home/svc_tensorrt/bloom/scripts",
                         "/home/svc_tensorrt/.cache:/root/.cache",
-                        // workspace needs to be explicitly mounted if container runtime is enroot to avoid chroot error
-                        "${jobWorkspace}",
                     ].join(",")
 
                     def enrootImagePath = "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh"