Skip to content

Commit e6fd5a8

Browse files
authored
Merge branch 'main' into placement
2 parents a607bf2 + 44d1c75 commit e6fd5a8

File tree

42 files changed

+2828
-550
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed

+2828
-550
lines changed

cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue/thread/fused_activations.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,30 @@ __forceinline__ __device__ float tanh_opt(float x)
5959
#endif
6060
}
6161

62+
template <typename T>
63+
struct Relu2
64+
{
65+
static bool const kIsHeavy = false;
66+
67+
CUTLASS_HOST_DEVICE
68+
T operator()(T threshold, T value) const
69+
{
70+
ReLu<T> relu_op;
71+
multiplies<T> mul;
72+
T val = relu_op(threshold, value);
73+
return mul(val, val);
74+
}
75+
76+
CUTLASS_HOST_DEVICE
77+
T operator()(T value) const
78+
{
79+
ReLu<T> relu_op;
80+
multiplies<T> mul;
81+
T val = relu_op(value);
82+
return mul(val, val);
83+
}
84+
};
85+
6286
} // namespace thread
6387
} // namespace epilogue
6488
} // namespace cutlass

cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ enum class ActivationType
2929
Geglu,
3030
SwigluBias,
3131
Identity,
32+
Relu2,
3233
InvalidType
3334
};
3435

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -954,6 +954,7 @@ void MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::moeGemmBiasAct(
954954
case ActivationType::Identity: runGemm<cutlass_extensions::EpilogueOpDefault>(inputs, hopper_inputs); break;
955955
case ActivationType::Swiglu: runGemm<cutlass_extensions::EpilogueOpDefaultSilu>(inputs, hopper_inputs); break;
956956
case ActivationType::Geglu: runGemm<cutlass_extensions::EpilogueOpDefaultFtGelu>(inputs, hopper_inputs); break;
957+
case ActivationType::Relu2: TLLM_THROW("Relu2 is not supported."); break;
957958
case ActivationType::InvalidType: TLLM_THROW("Activation type for fpA_intB must be valid."); break;
958959
default: TLLM_THROW("Invalid activation type."); break;
959960
}

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2307,6 +2307,8 @@ void doActivation(T* output, GemmOutputType const* gemm_result, float const* fp8
23072307
decltype(block_scaling_type)::value>, // Geglu
23082308
&doActivationKernel<T, GemmOutputType, ScaleBiasType, SwigluBiasAdaptor,
23092309
decltype(block_scaling_type)::value>, // SwigluBias
2310+
&doActivationKernel<T, GemmOutputType, ScaleBiasType, IdentityAdaptor<cutlass::epilogue::thread::Relu2>,
2311+
decltype(block_scaling_type)::value>, // Relu2
23102312
&doActivationKernel<T, GemmOutputType, ScaleBiasType,
23112313
IdentityAdaptor<cutlass::epilogue::thread::Identity>,
23122314
decltype(block_scaling_type)::value> // Identity

cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_gemm_kernels.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ enum class ActivationType
5050
Geglu,
5151
SwigluBias,
5252
Identity,
53+
Relu2,
5354
InvalidType
5455
};
5556

cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingDeepSeek.cu

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -613,22 +613,8 @@ void run(Data& data, void* stream)
613613
TLLM_CHECK_WITH_INFO(data.mNumExpertGroups >= data.mNumLimitedGroups,
614614
"Routing kernel expects top groups %d to be limited by #expert groups %d", data.mNumLimitedGroups,
615615
data.mNumExpertGroups);
616-
if (data.mNumExpertGroups > 1)
617-
{
618-
TLLM_CHECK_WITH_INFO(data.mNumExpertGroups <= MaxNumGroups,
619-
"Routing kernel expects #experts groups %d to be <= #warps %d", data.mNumExpertGroups, MaxNumGroups);
620-
TLLM_CHECK_WITH_INFO(data.mNumExperts % data.mNumExpertGroups == 0,
621-
"Routing kernel expects #experts %d to be a multiple of #expert groups %d", data.mNumExperts,
622-
data.mNumExpertGroups);
623-
TLLM_CHECK_WITH_INFO(data.mNumExperts / data.mNumExpertGroups <= WarpSize,
624-
"Routing kernel expects #experts per group <= warp size, got %d, data.mNumExpertGroups %d",
625-
data.mNumExperts / data.mNumExpertGroups, data.mNumExpertGroups);
626-
}
627-
else
628-
{
629-
TLLM_CHECK_WITH_INFO(data.mTopK <= topk::MaxNumTopK, "Routing kernel expects top K %d to be <= #warps %d",
630-
data.mTopK, topk::MaxNumTopK);
631-
}
616+
// Note: Routing-specific constraints (experts per group, topK limits) are checked later
617+
// only when routing is actually needed (data.mPtrTopKIds == nullptr)
632618
TLLM_CHECK_WITH_INFO(
633619
data.mNumExperts % 4 == 0, "Routing kernel expects #experts %d to be a multiple of 4.", data.mNumExperts);
634620
int const numBlocks = data.mNumTokens;
@@ -663,6 +649,25 @@ void run(Data& data, void* stream)
663649
int const maxTokensCoop = (numBlocksCoop * numThreadsHist * 64) / data.mTopK;
664650
if (data.mPtrTopKIds == nullptr)
665651
{
652+
// Routing needs to be executed - validate routing kernel constraints
653+
if (data.mNumExpertGroups > 1)
654+
{
655+
TLLM_CHECK_WITH_INFO(data.mNumExpertGroups <= MaxNumGroups,
656+
"Routing kernel expects #expert groups %d to be <= max groups %d", data.mNumExpertGroups, MaxNumGroups);
657+
TLLM_CHECK_WITH_INFO(data.mNumExperts % data.mNumExpertGroups == 0,
658+
"Routing kernel expects #experts %d to be a multiple of #expert groups %d", data.mNumExperts,
659+
data.mNumExpertGroups);
660+
TLLM_CHECK_WITH_INFO(data.mNumExperts / data.mNumExpertGroups <= WarpSize,
661+
"Routing kernel expects #experts per group <= warp size (%d), got %d experts / %d groups = %d experts "
662+
"per group",
663+
WarpSize, data.mNumExperts, data.mNumExpertGroups, data.mNumExperts / data.mNumExpertGroups);
664+
}
665+
else
666+
{
667+
TLLM_CHECK_WITH_INFO(data.mTopK <= topk::MaxNumTopK, "Routing kernel expects top K %d to be <= max topk %d",
668+
data.mTopK, topk::MaxNumTopK);
669+
}
670+
666671
int const numThreadsMain = data.mNumExperts < NumDeepseekExperts ? NumDeepseekExperts : NumKimiK2Experts;
667672
LAUNCH_ROUTING_DEEPSEEK(data,
668673
/*coopLaunch=*/false, routingMainKernel, numBlocks, numThreadsMain,

cpp/tensorrt_llm/thop/moeOp.cpp

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -259,8 +259,8 @@ class FusedMoeRunner : public torch::CustomClassHolder
259259
torch::optional<torch::Tensor> const& swiglu_limit, int64_t const tp_size, int64_t const tp_rank,
260260
int64_t const ep_size, int64_t const ep_rank, int64_t const cluster_size, int64_t const cluster_rank,
261261
bool const enable_alltoall, bool min_latency_mode, torch::optional<c10::ArrayRef<int64_t>> const& profile_ids,
262-
torch::optional<int64_t> const& unpadded_hidden_size, torch::optional<int64_t> const& num_valid_tokens,
263-
torch::optional<torch::Tensor> const& out_tensor)
262+
torch::optional<int64_t> const& activation_type, torch::optional<int64_t> const& unpadded_hidden_size,
263+
torch::optional<int64_t> const& num_valid_tokens, torch::optional<torch::Tensor> const& out_tensor)
264264
{
265265
std::lock_guard<std::mutex> lock(mMutex);
266266
// Free the profile workspace to save memory
@@ -328,6 +328,9 @@ class FusedMoeRunner : public torch::CustomClassHolder
328328
TORCH_CHECK(fc1_expert_weights.sizes()[0] == fc2_expert_weights.sizes()[0],
329329
"fc1_expert_weights and fc2_expert_weights must have the same number of experts.");
330330

331+
ActivationType base_activation_type = activation_type.has_value()
332+
? static_cast<ActivationType>(activation_type.value())
333+
: ActivationType::Swiglu;
331334
if (mUseINT8WoqPerChannel)
332335
{
333336
// Note: The weight shape for INT8 weight only quantization is different, e.g., fc2_expert_weights:
@@ -337,8 +340,16 @@ class FusedMoeRunner : public torch::CustomClassHolder
337340
}
338341
else
339342
{
340-
TORCH_CHECK(fc1_expert_weights.sizes()[1] == fc2_expert_weights.sizes()[2] * mInnerDimMultiplier * 2,
341-
"fc1_expert_weights inter size must be fc2_expert_weights inter size.");
343+
if (isGatedActivation(base_activation_type))
344+
{
345+
TORCH_CHECK(fc1_expert_weights.sizes()[1] == fc2_expert_weights.sizes()[2] * mInnerDimMultiplier * 2,
346+
"fc1_expert_weights inter size must be 2 times fc2_expert_weights inter size.");
347+
}
348+
else
349+
{
350+
TORCH_CHECK(fc1_expert_weights.sizes()[1] == fc2_expert_weights.sizes()[2] * mInnerDimMultiplier,
351+
"fc1_expert_weights inter size must be equal to fc2_expert_weights inter size.");
352+
}
342353
}
343354

344355
int experts_per_token = token_selected_experts.sizes()[1];
@@ -375,7 +386,7 @@ class FusedMoeRunner : public torch::CustomClassHolder
375386
int const num_experts_on_rank = fc2_expert_weights.sizes()[0];
376387
auto const num_experts_total = static_cast<int>(num_experts_on_rank * ep_size);
377388
auto parallelism_config = kernels::MOEParallelismConfig(tp_size, tp_rank, ep_size, ep_rank);
378-
ActivationType base_activation_type = ActivationType::Swiglu;
389+
379390
if (swiglu_alpha.has_value())
380391
{
381392
CHECK_INPUT(swiglu_alpha.value(), at::ScalarType::Float);
@@ -474,8 +485,8 @@ class FusedMoeRunner : public torch::CustomClassHolder
474485
torch::optional<torch::Tensor> const& swiglu_limit, int64_t const tp_size, int64_t const tp_rank,
475486
int64_t const ep_size, int64_t const ep_rank, int64_t const cluster_size, int64_t const cluster_rank,
476487
bool const enable_alltoall, bool min_latency_mode, torch::optional<c10::ArrayRef<int64_t>> const& profile_ids,
477-
torch::optional<int64_t> const& unpadded_hidden_size, torch::optional<int64_t> const& num_valid_tokens,
478-
torch::optional<torch::Tensor> const& out_tensor)
488+
torch::optional<int64_t> const& activation_type, torch::optional<int64_t> const& unpadded_hidden_size,
489+
torch::optional<int64_t> const& num_valid_tokens, torch::optional<torch::Tensor> const& out_tensor)
479490
{
480491
std::lock_guard<std::mutex> lock(mMutex);
481492

@@ -541,7 +552,9 @@ class FusedMoeRunner : public torch::CustomClassHolder
541552
auto const num_experts_total = static_cast<int>(num_experts_on_rank * ep_size);
542553
auto parallelism_config
543554
= kernels::MOEParallelismConfig(tp_size, tp_rank, ep_size, ep_rank, cluster_size, cluster_rank);
544-
ActivationType base_activation_type = ActivationType::Swiglu;
555+
ActivationType base_activation_type = activation_type.has_value()
556+
? static_cast<ActivationType>(activation_type.value())
557+
: ActivationType::Swiglu;
545558
if (swiglu_alpha.has_value())
546559
{
547560
CHECK_INPUT(swiglu_alpha.value(), at::ScalarType::Float);
@@ -652,7 +665,8 @@ class FusedMoeRunner : public torch::CustomClassHolder
652665
torch::optional<torch::Tensor> const& fc2_expert_biases, int64_t const top_k, int64_t const tp_size,
653666
int64_t const tp_rank, int64_t const ep_size, int64_t const ep_rank, int64_t const cluster_size,
654667
int64_t const cluster_rank, bool const enable_alltoall, bool const min_latency_mode, int64_t const gemm_idx,
655-
int64_t const profile_id, bool const do_preparation, int64_t const unpadded_hidden_size)
668+
int64_t const profile_id, bool const do_preparation, int64_t const activation_type_int,
669+
int64_t const unpadded_hidden_size)
656670
{
657671
std::lock_guard<std::mutex> lock(mMutex);
658672

@@ -661,6 +675,7 @@ class FusedMoeRunner : public torch::CustomClassHolder
661675
{
662676
return;
663677
}
678+
ActivationType activation_type = static_cast<ActivationType>(activation_type_int);
664679

665680
int64_t const num_rows = input.sizes()[0];
666681
int64_t hidden_size = fc2_expert_weights.sizes()[1];
@@ -715,14 +730,14 @@ class FusedMoeRunner : public torch::CustomClassHolder
715730
tensorrt_llm::runtime::TorchUtils::dataType(mWeightDtype),
716731
tensorrt_llm::runtime::TorchUtils::dataType(mOutputDtype), num_experts, static_cast<int>(top_k),
717732
hidden_size, unpadded_hidden_size > 0 ? unpadded_hidden_size : hidden_size, inter_size, group_size,
718-
ActivationType::Swiglu, USE_BIAS, USE_LORA, min_latency_mode,
733+
activation_type, USE_BIAS, USE_LORA, min_latency_mode,
719734
/*need_weights*/ false, parallelism_config, enable_alltoall);
720735
#else
721736
mProfiler->init(*mKernelRunner.get(), mProfiler->mGemmToProfile,
722737
tensorrt_llm::runtime::TorchUtils::dataType(activation_dtype),
723738
tensorrt_llm::runtime::TorchUtils::dataType(mWeightDtype),
724739
tensorrt_llm::runtime::TorchUtils::dataType(mOutputDtype), num_experts, static_cast<int>(top_k),
725-
hidden_size, inter_size, group_size, ActivationType::Swiglu, USE_BIAS, USE_LORA, min_latency_mode,
740+
hidden_size, inter_size, group_size, activation_type, USE_BIAS, USE_LORA, min_latency_mode,
726741
/*need_weights*/ false, parallelism_config);
727742
#endif
728743

cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,12 @@
3434
#include <chrono>
3535
#include <cmath>
3636
#include <cstddef>
37+
#include <fcntl.h>
3738
#include <filesystem>
3839
#include <memory>
3940
#include <set>
4041
#include <thread>
42+
#include <unistd.h>
4143
#include <variant>
4244

4345
using namespace tensorrt_llm::batch_manager;
@@ -212,7 +214,10 @@ void writePatternToOffloadedBlocksGDS(
212214
{
213215
buffer[i] = i & mask;
214216
}
215-
::write(fd, buffer.data(), poolBlockSize * sizeof(T));
217+
auto const bytesToWrite = static_cast<size_t>(poolBlockSize) * sizeof(T);
218+
auto const written = ::write(fd, buffer.data(), bytesToWrite);
219+
EXPECT_EQ(written, static_cast<ssize_t>(bytesToWrite))
220+
<< "Failed to write pattern to offloaded block file " << filename;
216221
::close(fd);
217222
}
218223
}
@@ -3575,7 +3580,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
35753580
auto numAllocatedPrimaryBlocks = blockManager.getNumAllocatedBlocks() - blocksInSecondaryPool;
35763581
EXPECT_THAT(seq0.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2, 3, 4}));
35773582

3578-
EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest));
3583+
EXPECT_NO_THROW(static_cast<void>(kvCacheManager.removeSequence(requestId, llmRequest)));
35793584
numAllocatedPrimaryBlocks = blockManager.getNumAllocatedBlocks() - blocksInSecondaryPool;
35803585
EXPECT_EQ(numAllocatedPrimaryBlocks, 0);
35813586
// store blocks 0, 1, 2, 3, 4 for reuse ([1000,1001,1002,1003], [1004,1005,1006,1007], [1008,1009,1010,1011],
@@ -3601,7 +3606,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
36013606
kvCacheManager.addToken(requestId);
36023607
numTokens = llmRequest->getNumTokens(beamIdx);
36033608
EXPECT_THAT(seq1.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0, 5, 6}));
3604-
EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest));
3609+
EXPECT_NO_THROW(static_cast<void>(kvCacheManager.removeSequence(requestId, llmRequest)));
36053610

36063611
///////////////////////////////////////////////////////////////////////////
36073612
// add a medium request and then remove it
@@ -3615,7 +3620,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
36153620
GenerationRequest const& seq2 = kvCacheManager.getSequence(requestId);
36163621
EXPECT_EQ(llmRequest->getContextCurrentPosition(), 9);
36173622
EXPECT_THAT(seq2.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0, 1, 7}));
3618-
EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest));
3623+
EXPECT_NO_THROW(static_cast<void>(kvCacheManager.removeSequence(requestId, llmRequest)));
36193624

36203625
///////////////////////////////////////////////////////////////////////////
36213626
// add a longer request within attention window and try to reuse
@@ -3637,7 +3642,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
36373642
llmRequest->addNewToken(1016, beamIdx);
36383643
kvCacheManager.addToken(requestId);
36393644
EXPECT_THAT(seq3.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2, 8, 9}));
3640-
EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest));
3645+
EXPECT_NO_THROW(static_cast<void>(kvCacheManager.removeSequence(requestId, llmRequest)));
36413646
}
36423647

36433648
TEST_F(KVCacheManagerTest, KVCacheManagerSWAInvalidateReuseTest)
@@ -3715,8 +3720,8 @@ TEST_F(KVCacheManagerTest, KVCacheManagerSWAInvalidateReuseTest)
37153720
EXPECT_FALSE(blockManager.isSequenceValidForStoreForReuse(seq0.getRequestId(), onlyWindowSize));
37163721
EXPECT_TRUE(blockManager.isSequenceValidForStoreForReuse(seq1.getRequestId(), onlyWindowSize));
37173722

3718-
EXPECT_NO_THROW(kvCacheManager.removeSequence(seq0.getRequestId(), llmRequest0));
3719-
EXPECT_NO_THROW(kvCacheManager.removeSequence(seq1.getRequestId(), llmRequest1));
3723+
EXPECT_NO_THROW(static_cast<void>(kvCacheManager.removeSequence(seq0.getRequestId(), llmRequest0)));
3724+
EXPECT_NO_THROW(static_cast<void>(kvCacheManager.removeSequence(seq1.getRequestId(), llmRequest1)));
37203725
}
37213726

37223727
TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest)
@@ -3806,7 +3811,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest)
38063811
assertBlocks(seq0, {0, 1, 2}, {0, 1, 2});
38073812
auto numAllocatedPrimaryBlocks = blockManager.getNumAllocatedBlocks() - blocksInSecondaryPoolPerWindow * numWindows;
38083813

3809-
EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest));
3814+
EXPECT_NO_THROW(static_cast<void>(kvCacheManager.removeSequence(requestId, llmRequest)));
38103815
numAllocatedPrimaryBlocks = blockManager.getNumAllocatedBlocks() - blocksInSecondaryPoolPerWindow * numWindows;
38113816
EXPECT_EQ(numAllocatedPrimaryBlocks, 0);
38123817
// For both windows, store blocks 0, 1, 2 for reuse ([1000,1001,1002,1003], [1004,1005,1006,1007],
@@ -3832,7 +3837,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest)
38323837
llmRequest->addNewToken(1009, beamIdx);
38333838
kvCacheManager.addToken(requestId);
38343839
assertBlocks(seq1, {0, 3, 4}, {0, 3, 4});
3835-
EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest));
3840+
EXPECT_NO_THROW(static_cast<void>(kvCacheManager.removeSequence(requestId, llmRequest)));
38363841
}
38373842

38383843
TEST_F(KVCacheManagerTest, KVCacheManagerEventStreamOverflow)

tensorrt_llm/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,16 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16+
import os
17+
18+
# Disable UCC to WAR allgather issue before NGC PyTorch 25.12 upgrade.
19+
os.environ["OMPI_MCA_coll_ucc_enable"] = "0"
20+
1621

1722
def _add_trt_llm_dll_directory():
1823
import platform
1924
on_windows = platform.system() == "Windows"
2025
if on_windows:
21-
import os
2226
import sysconfig
2327
from pathlib import Path
2428
os.add_dll_directory(

tensorrt_llm/_torch/attention_backend/sparse/dsa.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1528,6 +1528,11 @@ def get_indexer_k_cache_buffers(self, layer_idx: int):
15281528
return self.indexer_k_cache_pool_per_layer[layer_offset].view(
15291529
self.num_blocks, block_size, 1, per_token_size)
15301530

1531+
def shutdown(self):
1532+
# Clear Python references BEFORE C++ frees the underlying CUDA buffers
1533+
self.indexer_k_cache_pool_per_layer = []
1534+
super().shutdown()
1535+
15311536
@staticmethod
15321537
def get_cache_size_per_token(model_config: ModelConfig, mapping: Mapping,
15331538
**kwargs):

0 commit comments

Comments
 (0)