Skip to content

Commit cae980d

Browse files
Merge branch 'main' into users/nzmora/robust_moe_activation
2 parents 1617395 + e3c9a97 commit cae980d

File tree

70 files changed

+791
-663
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

70 files changed

+791
-663
lines changed

cpp/tensorrt_llm/batch_manager/llmRequest.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,9 @@ void LlmRequest::createSerializedResult(
6969
/// Note that there is some dependency on the order of operations in this method. Modify with care!
7070
std::optional<executor::Result> LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank)
7171
{
72-
if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS)))
72+
auto const streamingInProgress = mIsStreaming
73+
&& (mState == LlmRequestState::kGENERATION_IN_PROGRESS || mState == LlmRequestState::kGENERATION_TO_COMPLETE);
74+
if (!(isFinished() || streamingInProgress))
7375
{
7476
return std::nullopt;
7577
}

cpp/tensorrt_llm/common/envUtils.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,28 @@ std::string getEnvNixlInterface()
318318
return nixlInterface;
319319
}
320320

321+
std::string getEnvNixlBackend()
322+
{
323+
static std::once_flag flag;
324+
static std::string nixlBackend;
325+
326+
std::call_once(flag,
327+
[&]()
328+
{
329+
char const* nixl_backend = std::getenv("TRTLLM_NIXL_KVCACHE_BACKEND");
330+
if (nixl_backend)
331+
{
332+
nixlBackend = nixl_backend;
333+
}
334+
else
335+
{
336+
// Default to UCX if not specified
337+
nixlBackend = "UCX";
338+
}
339+
});
340+
return nixlBackend;
341+
}
342+
321343
bool getEnvDisaggLayerwise()
322344
{
323345
static bool const disaggLayerwise = getBoolEnv("TRTLLM_DISAGG_LAYERWISE");

cpp/tensorrt_llm/common/envUtils.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ std::string getEnvUCXInterface();
8888

8989
std::string getEnvNixlInterface();
9090

91+
std::string getEnvNixlBackend();
92+
9193
bool getEnvDisaggLayerwise();
9294

9395
bool getEnvParallelCacheSend();

cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/transferAgent.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <netdb.h>
2929
#include <netinet/in.h>
3030
#include <nixl_types.h>
31+
#include <set>
3132
#include <sys/file.h>
3233
#include <sys/stat.h>
3334
#include <unistd.h>
@@ -345,15 +346,27 @@ NixlTransferAgent::NixlTransferAgent(BaseAgentConfig const& config)
345346
mRawAgent = std::make_unique<nixlAgent>(config.mName, std::move(nixlConfig));
346347
}
347348

349+
std::string nixlBackend = common::getEnvNixlBackend();
350+
// List of supported backends - extend this list as new backends are added
351+
static const std::set<std::string> kSUPPORTED_BACKENDS = {"UCX"};
352+
353+
if (kSUPPORTED_BACKENDS.find(nixlBackend) == kSUPPORTED_BACKENDS.end())
354+
{
355+
TLLM_LOG_ERROR("Unsupported NIXL backend: %s, fallback to UCX", nixlBackend.c_str());
356+
nixlBackend = "UCX";
357+
}
358+
359+
TLLM_LOG_INFO("NixlTransferAgent::NixlTransferAgent using NIXL backend: %s", nixlBackend.c_str());
360+
348361
nixl_b_params_t init1;
349362
nixl_mem_list_t mems1;
350-
status = mRawAgent->getPluginParams("UCX", mems1, init1);
363+
status = mRawAgent->getPluginParams(nixlBackend.c_str(), mems1, init1);
351364
TLLM_CHECK(status == NIXL_SUCCESS);
352365

353-
status = mRawAgent->createBackend("UCX", init1, mRawBackend);
366+
status = mRawAgent->createBackend(nixlBackend.c_str(), init1, mRawBackend);
354367
if (status != NIXL_SUCCESS || !mRawBackend)
355368
{
356-
TLLM_THROW("Failed to create NIXL backend");
369+
TLLM_THROW("Failed to create NIXL backend: %s", nixlBackend.c_str());
357370
}
358371
mExtraParams.backends.push_back(mRawBackend);
359372
TLLM_LOG_INFO("NixlTransferAgent::NixlTransferAgent mAddress: %s", mAddress.c_str());

cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ void tensorrt_llm::nanobind::batch_manager::algorithms::initBindings(nb::module_
6464
LlmRequestState>(),
6565
nb::arg("ctx_chunk_config") = std::nullopt, nb::arg("max_context_length") = std::nullopt,
6666
nb::arg("no_schedule_until_state") = LlmRequestState::kCONTEXT_INIT,
67-
nb::arg("no_schedule_after_state") = LlmRequestState::kGENERATION_COMPLETE)
67+
nb::arg("no_schedule_after_state") = LlmRequestState::kGENERATION_TO_COMPLETE)
6868
.def("__call__", &MicroBatchScheduler::operator(), nb::arg("active_requests"), nb::arg("inflight_req_ids"),
6969
nb::arg("max_batch_size_runtime"), nb::arg("max_num_tokens_runtime"))
7070
.def("name", [](MicroBatchScheduler const&) { return MicroBatchScheduler::name; });

cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ void initBindings(nb::module_& m)
103103
.def("get_last_tokens", nb::overload_cast<>(&GenLlmReq::getLastTokens))
104104
.def("get_beam_width_by_iter", &GenLlmReq::getBeamWidthByIter, nb::arg("for_next_iteration") = false)
105105
.def_prop_ro("max_num_generated_tokens", &GenLlmReq::getMaxNumGeneratedTokens)
106+
.def("will_complete_next_iteration", &GenLlmReq::willCompleteNextIteration)
106107
.def("add_new_token", &GenLlmReq::addNewToken, nb::arg("token"), nb::arg("beam"))
107108
.def("add_new_tokens", &GenLlmReq::addNewTokens, nb::arg("beam_tokens"))
108109
.def_prop_ro("num_draft_tokens", &GenLlmReq::getNumDraftTokens)

cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ void tensorrt_llm::pybind::batch_manager::algorithms::initBindings(pybind11::mod
6565
LlmRequestState>(),
6666
py::arg("ctx_chunk_config") = std::nullopt, py::arg("max_context_length") = std::nullopt,
6767
py::arg_v("no_schedule_until_state", LlmRequestState::kCONTEXT_INIT, "LlmRequestState.CONTEXT_INIT"),
68-
py::arg_v("no_schedule_after_state", LlmRequestState::kGENERATION_COMPLETE,
69-
"LlmRequestState.GENERATION_COMPLETE"))
68+
py::arg_v("no_schedule_after_state", LlmRequestState::kGENERATION_TO_COMPLETE,
69+
"LlmRequestState.GENERATION_TO_COMPLETE"))
7070
.def("__call__", &MicroBatchScheduler::operator(), py::arg("active_requests"), py::arg("inflight_req_ids"),
7171
py::arg("max_batch_size_runtime"), py::arg("max_num_tokens_runtime"))
7272
.def("name", [](MicroBatchScheduler const&) { return MicroBatchScheduler::name; });

cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ void initBindings(pybind11::module_& m)
107107
.def("get_last_tokens", py::overload_cast<>(&GenLlmReq::getLastTokens))
108108
.def("get_beam_width_by_iter", &GenLlmReq::getBeamWidthByIter, py::arg("for_next_iteration") = false)
109109
.def_property_readonly("max_num_generated_tokens", &GenLlmReq::getMaxNumGeneratedTokens)
110+
.def("will_complete_next_iteration", &GenLlmReq::willCompleteNextIteration)
110111
.def("add_new_token", &GenLlmReq::addNewToken, py::arg("token"), py::arg("beam"))
111112
.def("add_new_tokens", &GenLlmReq::addNewTokens, py::arg("beam_tokens"))
112113
.def_property_readonly("num_draft_tokens", &GenLlmReq::getNumDraftTokens)

jenkins/L0_Test.groovy

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -921,8 +921,48 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
921921
taskArgs = [
922922
*taskArgs,
923923
]
924+
925+
def containerImageArg = container
926+
def srunPrologue = ""
927+
if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
928+
mounts = [
929+
"/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci:/scratch.trt_llm_data:ro",
930+
"/home/svc_tensorrt/bloom/scripts",
931+
"/home/svc_tensorrt/.cache:/root/.cache",
932+
].join(",")
933+
934+
def enrootImagePath = "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh"
935+
containerImageArg = enrootImagePath
936+
937+
srunPrologue = """
938+
export ENROOT_CACHE_PATH='/home/svc_tensorrt/.cache/enroot'
939+
940+
retry_command() {
941+
local cmd=\$1
942+
local max_attempts=\${2:-3}
943+
local delay=\${3:-60}
944+
local attempt=1
945+
946+
until \$cmd
947+
do
948+
if ((attempt >= max_attempts))
949+
then
950+
echo "Command '\$cmd' failed after \$max_attempts attempts"
951+
return 1
952+
fi
953+
954+
echo "Command '\$cmd' failed (attempt \$attempt of \$max_attempts). Retrying in \${delay}s..."
955+
sleep \$delay
956+
((attempt++))
957+
done
958+
}
959+
960+
retry_command "enroot import -o $enrootImagePath -- docker://$container"
961+
""".replaceAll("(?m)^\\s*", "")
962+
}
963+
924964
srunArgs = [
925-
"--container-image=$container",
965+
"--container-image=$containerImageArg",
926966
"--container-workdir=/home/svc_tensorrt/bloom/scripts",
927967
"--container-mounts=$mounts",
928968
"--container-env=NVIDIA_IMEX_CHANNELS"
@@ -951,6 +991,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
951991
export NVIDIA_IMEX_CHANNELS=0
952992
export NVIDIA_IMEX_CHANNELS=0
953993
export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))
994+
995+
${srunPrologue}
996+
954997
chmod +x $scriptRunNode
955998
srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunNode}
956999
""".replaceAll("(?m)^\\s*", "")
@@ -2718,7 +2761,7 @@ def launchTestJobs(pipeline, testFilter)
27182761
// Disable GB300 stages due to nodes will be offline temporarily.
27192762
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
27202763
"GB200-4_GPUs-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
2721-
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
2764+
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
27222765
// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-trtllm", "l0_gb300_multi_gpus", 1, 1, 4],
27232766
]
27242767
fullSet += SBSASlurmTestConfigs.keySet()
@@ -2735,7 +2778,7 @@ def launchTestJobs(pipeline, testFilter)
27352778
multiNodesSBSAConfigs = [:]
27362779
def numMultiNodeTests = 3
27372780
multiNodesSBSAConfigs += (1..numMultiNodeTests).collectEntries { i ->
2738-
["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]
2781+
["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-oci-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]
27392782
}
27402783
fullSet += multiNodesSBSAConfigs.keySet()
27412784

scripts/generate_lock_file.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ def generate_metadata_json():
9292
"w",
9393
encoding="utf-8") as f:
9494
json.dump(data, f, indent=2)
95+
f.write("\n")
9596

9697

9798
if __name__ == "__main__":

0 commit comments

Comments
 (0)