Skip to content

Commit cae980d

Browse files
Merge branch 'main' into users/nzmora/robust_moe_activation
2 parents 1617395 + e3c9a97 commit cae980d

File tree

70 files changed

+791
-663
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

70 files changed

+791
-663
lines changed

cpp/tensorrt_llm/batch_manager/llmRequest.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,9 @@ void LlmRequest::createSerializedResult(
6969
/// Note that there is some dependency on the order of operations in this method. Modify with care!
7070
std::optional<executor::Result> LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank)
7171
{
72-
if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS)))
72+
auto const streamingInProgress = mIsStreaming
73+
&& (mState == LlmRequestState::kGENERATION_IN_PROGRESS || mState == LlmRequestState::kGENERATION_TO_COMPLETE);
74+
if (!(isFinished() || streamingInProgress))
7375
{
7476
return std::nullopt;
7577
}

cpp/tensorrt_llm/common/envUtils.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,28 @@ std::string getEnvNixlInterface()
318318
return nixlInterface;
319319
}
320320

321+
std::string getEnvNixlBackend()
322+
{
323+
static std::once_flag flag;
324+
static std::string nixlBackend;
325+
326+
std::call_once(flag,
327+
[&]()
328+
{
329+
char const* nixl_backend = std::getenv("TRTLLM_NIXL_KVCACHE_BACKEND");
330+
if (nixl_backend)
331+
{
332+
nixlBackend = nixl_backend;
333+
}
334+
else
335+
{
336+
// Default to UCX if not specified
337+
nixlBackend = "UCX";
338+
}
339+
});
340+
return nixlBackend;
341+
}
342+
321343
bool getEnvDisaggLayerwise()
322344
{
323345
static bool const disaggLayerwise = getBoolEnv("TRTLLM_DISAGG_LAYERWISE");

cpp/tensorrt_llm/common/envUtils.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ std::string getEnvUCXInterface();
8888

8989
std::string getEnvNixlInterface();
9090

91+
std::string getEnvNixlBackend();
92+
9193
bool getEnvDisaggLayerwise();
9294

9395
bool getEnvParallelCacheSend();

cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/transferAgent.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <netdb.h>
2929
#include <netinet/in.h>
3030
#include <nixl_types.h>
31+
#include <set>
3132
#include <sys/file.h>
3233
#include <sys/stat.h>
3334
#include <unistd.h>
@@ -345,15 +346,27 @@ NixlTransferAgent::NixlTransferAgent(BaseAgentConfig const& config)
345346
mRawAgent = std::make_unique<nixlAgent>(config.mName, std::move(nixlConfig));
346347
}
347348

349+
std::string nixlBackend = common::getEnvNixlBackend();
350+
// List of supported backends - extend this list as new backends are added
351+
static const std::set<std::string> kSUPPORTED_BACKENDS = {"UCX"};
352+
353+
if (kSUPPORTED_BACKENDS.find(nixlBackend) == kSUPPORTED_BACKENDS.end())
354+
{
355+
TLLM_LOG_ERROR("Unsupported NIXL backend: %s, fallback to UCX", nixlBackend.c_str());
356+
nixlBackend = "UCX";
357+
}
358+
359+
TLLM_LOG_INFO("NixlTransferAgent::NixlTransferAgent using NIXL backend: %s", nixlBackend.c_str());
360+
348361
nixl_b_params_t init1;
349362
nixl_mem_list_t mems1;
350-
status = mRawAgent->getPluginParams("UCX", mems1, init1);
363+
status = mRawAgent->getPluginParams(nixlBackend.c_str(), mems1, init1);
351364
TLLM_CHECK(status == NIXL_SUCCESS);
352365

353-
status = mRawAgent->createBackend("UCX", init1, mRawBackend);
366+
status = mRawAgent->createBackend(nixlBackend.c_str(), init1, mRawBackend);
354367
if (status != NIXL_SUCCESS || !mRawBackend)
355368
{
356-
TLLM_THROW("Failed to create NIXL backend");
369+
TLLM_THROW("Failed to create NIXL backend: %s", nixlBackend.c_str());
357370
}
358371
mExtraParams.backends.push_back(mRawBackend);
359372
TLLM_LOG_INFO("NixlTransferAgent::NixlTransferAgent mAddress: %s", mAddress.c_str());

cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ void tensorrt_llm::nanobind::batch_manager::algorithms::initBindings(nb::module_
6464
LlmRequestState>(),
6565
nb::arg("ctx_chunk_config") = std::nullopt, nb::arg("max_context_length") = std::nullopt,
6666
nb::arg("no_schedule_until_state") = LlmRequestState::kCONTEXT_INIT,
67-
nb::arg("no_schedule_after_state") = LlmRequestState::kGENERATION_COMPLETE)
67+
nb::arg("no_schedule_after_state") = LlmRequestState::kGENERATION_TO_COMPLETE)
6868
.def("__call__", &MicroBatchScheduler::operator(), nb::arg("active_requests"), nb::arg("inflight_req_ids"),
6969
nb::arg("max_batch_size_runtime"), nb::arg("max_num_tokens_runtime"))
7070
.def("name", [](MicroBatchScheduler const&) { return MicroBatchScheduler::name; });

cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ void initBindings(nb::module_& m)
103103
.def("get_last_tokens", nb::overload_cast<>(&GenLlmReq::getLastTokens))
104104
.def("get_beam_width_by_iter", &GenLlmReq::getBeamWidthByIter, nb::arg("for_next_iteration") = false)
105105
.def_prop_ro("max_num_generated_tokens", &GenLlmReq::getMaxNumGeneratedTokens)
106+
.def("will_complete_next_iteration", &GenLlmReq::willCompleteNextIteration)
106107
.def("add_new_token", &GenLlmReq::addNewToken, nb::arg("token"), nb::arg("beam"))
107108
.def("add_new_tokens", &GenLlmReq::addNewTokens, nb::arg("beam_tokens"))
108109
.def_prop_ro("num_draft_tokens", &GenLlmReq::getNumDraftTokens)

cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ void tensorrt_llm::pybind::batch_manager::algorithms::initBindings(pybind11::mod
6565
LlmRequestState>(),
6666
py::arg("ctx_chunk_config") = std::nullopt, py::arg("max_context_length") = std::nullopt,
6767
py::arg_v("no_schedule_until_state", LlmRequestState::kCONTEXT_INIT, "LlmRequestState.CONTEXT_INIT"),
68-
py::arg_v("no_schedule_after_state", LlmRequestState::kGENERATION_COMPLETE,
69-
"LlmRequestState.GENERATION_COMPLETE"))
68+
py::arg_v("no_schedule_after_state", LlmRequestState::kGENERATION_TO_COMPLETE,
69+
"LlmRequestState.GENERATION_TO_COMPLETE"))
7070
.def("__call__", &MicroBatchScheduler::operator(), py::arg("active_requests"), py::arg("inflight_req_ids"),
7171
py::arg("max_batch_size_runtime"), py::arg("max_num_tokens_runtime"))
7272
.def("name", [](MicroBatchScheduler const&) { return MicroBatchScheduler::name; });

cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ void initBindings(pybind11::module_& m)
107107
.def("get_last_tokens", py::overload_cast<>(&GenLlmReq::getLastTokens))
108108
.def("get_beam_width_by_iter", &GenLlmReq::getBeamWidthByIter, py::arg("for_next_iteration") = false)
109109
.def_property_readonly("max_num_generated_tokens", &GenLlmReq::getMaxNumGeneratedTokens)
110+
.def("will_complete_next_iteration", &GenLlmReq::willCompleteNextIteration)
110111
.def("add_new_token", &GenLlmReq::addNewToken, py::arg("token"), py::arg("beam"))
111112
.def("add_new_tokens", &GenLlmReq::addNewTokens, py::arg("beam_tokens"))
112113
.def_property_readonly("num_draft_tokens", &GenLlmReq::getNumDraftTokens)

jenkins/L0_Test.groovy

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -921,8 +921,48 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
921921
taskArgs = [
922922
*taskArgs,
923923
]
924+
925+
def containerImageArg = container
926+
def srunPrologue = ""
927+
if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
928+
mounts = [
929+
"/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci:/scratch.trt_llm_data:ro",
930+
"/home/svc_tensorrt/bloom/scripts",
931+
"/home/svc_tensorrt/.cache:/root/.cache",
932+
].join(",")
933+
934+
def enrootImagePath = "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh"
935+
containerImageArg = enrootImagePath
936+
937+
srunPrologue = """
938+
export ENROOT_CACHE_PATH='/home/svc_tensorrt/.cache/enroot'
939+
940+
retry_command() {
941+
local cmd=\$1
942+
local max_attempts=\${2:-3}
943+
local delay=\${3:-60}
944+
local attempt=1
945+
946+
until \$cmd
947+
do
948+
if ((attempt >= max_attempts))
949+
then
950+
echo "Command '\$cmd' failed after \$max_attempts attempts"
951+
return 1
952+
fi
953+
954+
echo "Command '\$cmd' failed (attempt \$attempt of \$max_attempts). Retrying in \${delay}s..."
955+
sleep \$delay
956+
((attempt++))
957+
done
958+
}
959+
960+
retry_command "enroot import -o $enrootImagePath -- docker://$container"
961+
""".replaceAll("(?m)^\\s*", "")
962+
}
963+
924964
srunArgs = [
925-
"--container-image=$container",
965+
"--container-image=$containerImageArg",
926966
"--container-workdir=/home/svc_tensorrt/bloom/scripts",
927967
"--container-mounts=$mounts",
928968
"--container-env=NVIDIA_IMEX_CHANNELS"
@@ -951,6 +991,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
951991
export NVIDIA_IMEX_CHANNELS=0
952992
export NVIDIA_IMEX_CHANNELS=0
953993
export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))
994+
995+
${srunPrologue}
996+
954997
chmod +x $scriptRunNode
955998
srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunNode}
956999
""".replaceAll("(?m)^\\s*", "")
@@ -2718,7 +2761,7 @@ def launchTestJobs(pipeline, testFilter)
27182761
// Disable GB300 stages due to nodes will be offline temporarily.
27192762
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
27202763
"GB200-4_GPUs-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
2721-
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
2764+
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
27222765
// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-trtllm", "l0_gb300_multi_gpus", 1, 1, 4],
27232766
]
27242767
fullSet += SBSASlurmTestConfigs.keySet()
@@ -2735,7 +2778,7 @@ def launchTestJobs(pipeline, testFilter)
27352778
multiNodesSBSAConfigs = [:]
27362779
def numMultiNodeTests = 3
27372780
multiNodesSBSAConfigs += (1..numMultiNodeTests).collectEntries { i ->
2738-
["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]
2781+
["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-oci-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]
27392782
}
27402783
fullSet += multiNodesSBSAConfigs.keySet()
27412784

scripts/generate_lock_file.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ def generate_metadata_json():
9292
"w",
9393
encoding="utf-8") as f:
9494
json.dump(data, f, indent=2)
95+
f.write("\n")
9596

9697

9798
if __name__ == "__main__":

0 commit comments

Comments
 (0)