Skip to content

Commit e6fd5a8

Browse files
authored
Merge branch 'main' into placement
2 parents a607bf2 + 44d1c75 commit e6fd5a8

File tree

42 files changed

+2828
-550
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed

+2828
-550
lines changed

cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue/thread/fused_activations.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,30 @@ __forceinline__ __device__ float tanh_opt(float x)
5959
#endif
6060
}
6161

62+
template <typename T>
63+
struct Relu2
64+
{
65+
static bool const kIsHeavy = false;
66+
67+
CUTLASS_HOST_DEVICE
68+
T operator()(T threshold, T value) const
69+
{
70+
ReLu<T> relu_op;
71+
multiplies<T> mul;
72+
T val = relu_op(threshold, value);
73+
return mul(val, val);
74+
}
75+
76+
CUTLASS_HOST_DEVICE
77+
T operator()(T value) const
78+
{
79+
ReLu<T> relu_op;
80+
multiplies<T> mul;
81+
T val = relu_op(value);
82+
return mul(val, val);
83+
}
84+
};
85+
6286
} // namespace thread
6387
} // namespace epilogue
6488
} // namespace cutlass

cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ enum class ActivationType
2929
Geglu,
3030
SwigluBias,
3131
Identity,
32+
Relu2,
3233
InvalidType
3334
};
3435

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -954,6 +954,7 @@ void MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::moeGemmBiasAct(
954954
case ActivationType::Identity: runGemm<cutlass_extensions::EpilogueOpDefault>(inputs, hopper_inputs); break;
955955
case ActivationType::Swiglu: runGemm<cutlass_extensions::EpilogueOpDefaultSilu>(inputs, hopper_inputs); break;
956956
case ActivationType::Geglu: runGemm<cutlass_extensions::EpilogueOpDefaultFtGelu>(inputs, hopper_inputs); break;
957+
case ActivationType::Relu2: TLLM_THROW("Relu2 is not supported."); break;
957958
case ActivationType::InvalidType: TLLM_THROW("Activation type for fpA_intB must be valid."); break;
958959
default: TLLM_THROW("Invalid activation type."); break;
959960
}

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2307,6 +2307,8 @@ void doActivation(T* output, GemmOutputType const* gemm_result, float const* fp8
23072307
decltype(block_scaling_type)::value>, // Geglu
23082308
&doActivationKernel<T, GemmOutputType, ScaleBiasType, SwigluBiasAdaptor,
23092309
decltype(block_scaling_type)::value>, // SwigluBias
2310+
&doActivationKernel<T, GemmOutputType, ScaleBiasType, IdentityAdaptor<cutlass::epilogue::thread::Relu2>,
2311+
decltype(block_scaling_type)::value>, // Relu2
23102312
&doActivationKernel<T, GemmOutputType, ScaleBiasType,
23112313
IdentityAdaptor<cutlass::epilogue::thread::Identity>,
23122314
decltype(block_scaling_type)::value> // Identity

cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_gemm_kernels.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ enum class ActivationType
5050
Geglu,
5151
SwigluBias,
5252
Identity,
53+
Relu2,
5354
InvalidType
5455
};
5556

cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingDeepSeek.cu

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -613,22 +613,8 @@ void run(Data& data, void* stream)
613613
TLLM_CHECK_WITH_INFO(data.mNumExpertGroups >= data.mNumLimitedGroups,
614614
"Routing kernel expects top groups %d to be limited by #expert groups %d", data.mNumLimitedGroups,
615615
data.mNumExpertGroups);
616-
if (data.mNumExpertGroups > 1)
617-
{
618-
TLLM_CHECK_WITH_INFO(data.mNumExpertGroups <= MaxNumGroups,
619-
"Routing kernel expects #experts groups %d to be <= #warps %d", data.mNumExpertGroups, MaxNumGroups);
620-
TLLM_CHECK_WITH_INFO(data.mNumExperts % data.mNumExpertGroups == 0,
621-
"Routing kernel expects #experts %d to be a multiple of #expert groups %d", data.mNumExperts,
622-
data.mNumExpertGroups);
623-
TLLM_CHECK_WITH_INFO(data.mNumExperts / data.mNumExpertGroups <= WarpSize,
624-
"Routing kernel expects #experts per group <= warp size, got %d, data.mNumExpertGroups %d",
625-
data.mNumExperts / data.mNumExpertGroups, data.mNumExpertGroups);
626-
}
627-
else
628-
{
629-
TLLM_CHECK_WITH_INFO(data.mTopK <= topk::MaxNumTopK, "Routing kernel expects top K %d to be <= #warps %d",
630-
data.mTopK, topk::MaxNumTopK);
631-
}
616+
// Note: Routing-specific constraints (experts per group, topK limits) are checked later
617+
// only when routing is actually needed (data.mPtrTopKIds == nullptr)
632618
TLLM_CHECK_WITH_INFO(
633619
data.mNumExperts % 4 == 0, "Routing kernel expects #experts %d to be a multiple of 4.", data.mNumExperts);
634620
int const numBlocks = data.mNumTokens;
@@ -663,6 +649,25 @@ void run(Data& data, void* stream)
663649
int const maxTokensCoop = (numBlocksCoop * numThreadsHist * 64) / data.mTopK;
664650
if (data.mPtrTopKIds == nullptr)
665651
{
652+
// Routing needs to be executed - validate routing kernel constraints
653+
if (data.mNumExpertGroups > 1)
654+
{
655+
TLLM_CHECK_WITH_INFO(data.mNumExpertGroups <= MaxNumGroups,
656+
"Routing kernel expects #expert groups %d to be <= max groups %d", data.mNumExpertGroups, MaxNumGroups);
657+
TLLM_CHECK_WITH_INFO(data.mNumExperts % data.mNumExpertGroups == 0,
658+
"Routing kernel expects #experts %d to be a multiple of #expert groups %d", data.mNumExperts,
659+
data.mNumExpertGroups);
660+
TLLM_CHECK_WITH_INFO(data.mNumExperts / data.mNumExpertGroups <= WarpSize,
661+
"Routing kernel expects #experts per group <= warp size (%d), got %d experts / %d groups = %d experts "
662+
"per group",
663+
WarpSize, data.mNumExperts, data.mNumExpertGroups, data.mNumExperts / data.mNumExpertGroups);
664+
}
665+
else
666+
{
667+
TLLM_CHECK_WITH_INFO(data.mTopK <= topk::MaxNumTopK, "Routing kernel expects top K %d to be <= max topk %d",
668+
data.mTopK, topk::MaxNumTopK);
669+
}
670+
666671
int const numThreadsMain = data.mNumExperts < NumDeepseekExperts ? NumDeepseekExperts : NumKimiK2Experts;
667672
LAUNCH_ROUTING_DEEPSEEK(data,
668673
/*coopLaunch=*/false, routingMainKernel, numBlocks, numThreadsMain,

cpp/tensorrt_llm/thop/moeOp.cpp

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -259,8 +259,8 @@ class FusedMoeRunner : public torch::CustomClassHolder
259259
torch::optional<torch::Tensor> const& swiglu_limit, int64_t const tp_size, int64_t const tp_rank,
260260
int64_t const ep_size, int64_t const ep_rank, int64_t const cluster_size, int64_t const cluster_rank,
261261
bool const enable_alltoall, bool min_latency_mode, torch::optional<c10::ArrayRef<int64_t>> const& profile_ids,
262-
torch::optional<int64_t> const& unpadded_hidden_size, torch::optional<int64_t> const& num_valid_tokens,
263-
torch::optional<torch::Tensor> const& out_tensor)
262+
torch::optional<int64_t> const& activation_type, torch::optional<int64_t> const& unpadded_hidden_size,
263+
torch::optional<int64_t> const& num_valid_tokens, torch::optional<torch::Tensor> const& out_tensor)
264264
{
265265
std::lock_guard<std::mutex> lock(mMutex);
266266
// Free the profile workspace to save memory
@@ -328,6 +328,9 @@ class FusedMoeRunner : public torch::CustomClassHolder
328328
TORCH_CHECK(fc1_expert_weights.sizes()[0] == fc2_expert_weights.sizes()[0],
329329
"fc1_expert_weights and fc2_expert_weights must have the same number of experts.");
330330

331+
ActivationType base_activation_type = activation_type.has_value()
332+
? static_cast<ActivationType>(activation_type.value())
333+
: ActivationType::Swiglu;
331334
if (mUseINT8WoqPerChannel)
332335
{
333336
// Note: The weight shape for INT8 weight only quantization is different, e.g., fc2_expert_weights:
@@ -337,8 +340,16 @@ class FusedMoeRunner : public torch::CustomClassHolder
337340
}
338341
else
339342
{
340-
TORCH_CHECK(fc1_expert_weights.sizes()[1] == fc2_expert_weights.sizes()[2] * mInnerDimMultiplier * 2,
341-
"fc1_expert_weights inter size must be fc2_expert_weights inter size.");
343+
if (isGatedActivation(base_activation_type))
344+
{
345+
TORCH_CHECK(fc1_expert_weights.sizes()[1] == fc2_expert_weights.sizes()[2] * mInnerDimMultiplier * 2,
346+
"fc1_expert_weights inter size must be 2 times fc2_expert_weights inter size.");
347+
}
348+
else
349+
{
350+
TORCH_CHECK(fc1_expert_weights.sizes()[1] == fc2_expert_weights.sizes()[2] * mInnerDimMultiplier,
351+
"fc1_expert_weights inter size must be equal to fc2_expert_weights inter size.");
352+
}
342353
}
343354

344355
int experts_per_token = token_selected_experts.sizes()[1];
@@ -375,7 +386,7 @@ class FusedMoeRunner : public torch::CustomClassHolder
375386
int const num_experts_on_rank = fc2_expert_weights.sizes()[0];
376387
auto const num_experts_total = static_cast<int>(num_experts_on_rank * ep_size);
377388
auto parallelism_config = kernels::MOEParallelismConfig(tp_size, tp_rank, ep_size, ep_rank);
378-
ActivationType base_activation_type = ActivationType::Swiglu;
389+
379390
if (swiglu_alpha.has_value())
380391
{
381392
CHECK_INPUT(swiglu_alpha.value(), at::ScalarType::Float);
@@ -474,8 +485,8 @@ class FusedMoeRunner : public torch::CustomClassHolder
474485
torch::optional<torch::Tensor> const& swiglu_limit, int64_t const tp_size, int64_t const tp_rank,
475486
int64_t const ep_size, int64_t const ep_rank, int64_t const cluster_size, int64_t const cluster_rank,
476487
bool const enable_alltoall, bool min_latency_mode, torch::optional<c10::ArrayRef<int64_t>> const& profile_ids,
477-
torch::optional<int64_t> const& unpadded_hidden_size, torch::optional<int64_t> const& num_valid_tokens,
478-
torch::optional<torch::Tensor> const& out_tensor)
488+
torch::optional<int64_t> const& activation_type, torch::optional<int64_t> const& unpadded_hidden_size,
489+
torch::optional<int64_t> const& num_valid_tokens, torch::optional<torch::Tensor> const& out_tensor)
479490
{
480491
std::lock_guard<std::mutex> lock(mMutex);
481492

@@ -541,7 +552,9 @@ class FusedMoeRunner : public torch::CustomClassHolder
541552
auto const num_experts_total = static_cast<int>(num_experts_on_rank * ep_size);
542553
auto parallelism_config
543554
= kernels::MOEParallelismConfig(tp_size, tp_rank, ep_size, ep_rank, cluster_size, cluster_rank);
544-
ActivationType base_activation_type = ActivationType::Swiglu;
555+
ActivationType base_activation_type = activation_type.has_value()
556+
? static_cast<ActivationType>(activation_type.value())
557+
: ActivationType::Swiglu;
545558
if (swiglu_alpha.has_value())
546559
{
547560
CHECK_INPUT(swiglu_alpha.value(), at::ScalarType::Float);
@@ -652,7 +665,8 @@ class FusedMoeRunner : public torch::CustomClassHolder
652665
torch::optional<torch::Tensor> const& fc2_expert_biases, int64_t const top_k, int64_t const tp_size,
653666
int64_t const tp_rank, int64_t const ep_size, int64_t const ep_rank, int64_t const cluster_size,
654667
int64_t const cluster_rank, bool const enable_alltoall, bool const min_latency_mode, int64_t const gemm_idx,
655-
int64_t const profile_id, bool const do_preparation, int64_t const unpadded_hidden_size)
668+
int64_t const profile_id, bool const do_preparation, int64_t const activation_type_int,
669+
int64_t const unpadded_hidden_size)
656670
{
657671
std::lock_guard<std::mutex> lock(mMutex);
658672

@@ -661,6 +675,7 @@ class FusedMoeRunner : public torch::CustomClassHolder
661675
{
662676
return;
663677
}
678+
ActivationType activation_type = static_cast<ActivationType>(activation_type_int);
664679

665680
int64_t const num_rows = input.sizes()[0];
666681
int64_t hidden_size = fc2_expert_weights.sizes()[1];
@@ -715,14 +730,14 @@ class FusedMoeRunner : public torch::CustomClassHolder
715730
tensorrt_llm::runtime::TorchUtils::dataType(mWeightDtype),
716731
tensorrt_llm::runtime::TorchUtils::dataType(mOutputDtype), num_experts, static_cast<int>(top_k),
717732
hidden_size, unpadded_hidden_size > 0 ? unpadded_hidden_size : hidden_size, inter_size, group_size,
718-
ActivationType::Swiglu, USE_BIAS, USE_LORA, min_latency_mode,
733+
activation_type, USE_BIAS, USE_LORA, min_latency_mode,
719734
/*need_weights*/ false, parallelism_config, enable_alltoall);
720735
#else
721736
mProfiler->init(*mKernelRunner.get(), mProfiler->mGemmToProfile,
722737
tensorrt_llm::runtime::TorchUtils::dataType(activation_dtype),
723738
tensorrt_llm::runtime::TorchUtils::dataType(mWeightDtype),
724739
tensorrt_llm::runtime::TorchUtils::dataType(mOutputDtype), num_experts, static_cast<int>(top_k),
725-
hidden_size, inter_size, group_size, ActivationType::Swiglu, USE_BIAS, USE_LORA, min_latency_mode,
740+
hidden_size, inter_size, group_size, activation_type, USE_BIAS, USE_LORA, min_latency_mode,
726741
/*need_weights*/ false, parallelism_config);
727742
#endif
728743

cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,12 @@
3434
#include <chrono>
3535
#include <cmath>
3636
#include <cstddef>
37+
#include <fcntl.h>
3738
#include <filesystem>
3839
#include <memory>
3940
#include <set>
4041
#include <thread>
42+
#include <unistd.h>
4143
#include <variant>
4244

4345
using namespace tensorrt_llm::batch_manager;
@@ -212,7 +214,10 @@ void writePatternToOffloadedBlocksGDS(
212214
{
213215
buffer[i] = i & mask;
214216
}
215-
::write(fd, buffer.data(), poolBlockSize * sizeof(T));
217+
auto const bytesToWrite = static_cast<size_t>(poolBlockSize) * sizeof(T);
218+
auto const written = ::write(fd, buffer.data(), bytesToWrite);
219+
EXPECT_EQ(written, static_cast<ssize_t>(bytesToWrite))
220+
<< "Failed to write pattern to offloaded block file " << filename;
216221
::close(fd);
217222
}
218223
}
@@ -3575,7 +3580,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
35753580
auto numAllocatedPrimaryBlocks = blockManager.getNumAllocatedBlocks() - blocksInSecondaryPool;
35763581
EXPECT_THAT(seq0.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2, 3, 4}));
35773582

3578-
EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest));
3583+
EXPECT_NO_THROW(static_cast<void>(kvCacheManager.removeSequence(requestId, llmRequest)));
35793584
numAllocatedPrimaryBlocks = blockManager.getNumAllocatedBlocks() - blocksInSecondaryPool;
35803585
EXPECT_EQ(numAllocatedPrimaryBlocks, 0);
35813586
// store blocks 0, 1, 2, 3, 4 for reuse ([1000,1001,1002,1003], [1004,1005,1006,1007], [1008,1009,1010,1011],
@@ -3601,7 +3606,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
36013606
kvCacheManager.addToken(requestId);
36023607
numTokens = llmRequest->getNumTokens(beamIdx);
36033608
EXPECT_THAT(seq1.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0, 5, 6}));
3604-
EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest));
3609+
EXPECT_NO_THROW(static_cast<void>(kvCacheManager.removeSequence(requestId, llmRequest)));
36053610

36063611
///////////////////////////////////////////////////////////////////////////
36073612
// add a medium request and then remove it
@@ -3615,7 +3620,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
36153620
GenerationRequest const& seq2 = kvCacheManager.getSequence(requestId);
36163621
EXPECT_EQ(llmRequest->getContextCurrentPosition(), 9);
36173622
EXPECT_THAT(seq2.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0, 1, 7}));
3618-
EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest));
3623+
EXPECT_NO_THROW(static_cast<void>(kvCacheManager.removeSequence(requestId, llmRequest)));
36193624

36203625
///////////////////////////////////////////////////////////////////////////
36213626
// add a longer request within attention window and try to reuse
@@ -3637,7 +3642,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerMaxAttentionWindowWithReuseTest)
36373642
llmRequest->addNewToken(1016, beamIdx);
36383643
kvCacheManager.addToken(requestId);
36393644
EXPECT_THAT(seq3.getCacheBlockIds(onlyWindowSize).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2, 8, 9}));
3640-
EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest));
3645+
EXPECT_NO_THROW(static_cast<void>(kvCacheManager.removeSequence(requestId, llmRequest)));
36413646
}
36423647

36433648
TEST_F(KVCacheManagerTest, KVCacheManagerSWAInvalidateReuseTest)
@@ -3715,8 +3720,8 @@ TEST_F(KVCacheManagerTest, KVCacheManagerSWAInvalidateReuseTest)
37153720
EXPECT_FALSE(blockManager.isSequenceValidForStoreForReuse(seq0.getRequestId(), onlyWindowSize));
37163721
EXPECT_TRUE(blockManager.isSequenceValidForStoreForReuse(seq1.getRequestId(), onlyWindowSize));
37173722

3718-
EXPECT_NO_THROW(kvCacheManager.removeSequence(seq0.getRequestId(), llmRequest0));
3719-
EXPECT_NO_THROW(kvCacheManager.removeSequence(seq1.getRequestId(), llmRequest1));
3723+
EXPECT_NO_THROW(static_cast<void>(kvCacheManager.removeSequence(seq0.getRequestId(), llmRequest0)));
3724+
EXPECT_NO_THROW(static_cast<void>(kvCacheManager.removeSequence(seq1.getRequestId(), llmRequest1)));
37203725
}
37213726

37223727
TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest)
@@ -3806,7 +3811,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest)
38063811
assertBlocks(seq0, {0, 1, 2}, {0, 1, 2});
38073812
auto numAllocatedPrimaryBlocks = blockManager.getNumAllocatedBlocks() - blocksInSecondaryPoolPerWindow * numWindows;
38083813

3809-
EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest));
3814+
EXPECT_NO_THROW(static_cast<void>(kvCacheManager.removeSequence(requestId, llmRequest)));
38103815
numAllocatedPrimaryBlocks = blockManager.getNumAllocatedBlocks() - blocksInSecondaryPoolPerWindow * numWindows;
38113816
EXPECT_EQ(numAllocatedPrimaryBlocks, 0);
38123817
// For both windows, store blocks 0, 1, 2 for reuse ([1000,1001,1002,1003], [1004,1005,1006,1007],
@@ -3832,7 +3837,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerVariableWindowAttentionWithReuseTest)
38323837
llmRequest->addNewToken(1009, beamIdx);
38333838
kvCacheManager.addToken(requestId);
38343839
assertBlocks(seq1, {0, 3, 4}, {0, 3, 4});
3835-
EXPECT_NO_THROW(kvCacheManager.removeSequence(requestId, llmRequest));
3840+
EXPECT_NO_THROW(static_cast<void>(kvCacheManager.removeSequence(requestId, llmRequest)));
38363841
}
38373842

38383843
TEST_F(KVCacheManagerTest, KVCacheManagerEventStreamOverflow)

tensorrt_llm/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,16 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16+
import os
17+
18+
# Disable UCC to WAR allgather issue before NGC PyTorch 25.12 upgrade.
19+
os.environ["OMPI_MCA_coll_ucc_enable"] = "0"
20+
1621

1722
def _add_trt_llm_dll_directory():
1823
import platform
1924
on_windows = platform.system() == "Windows"
2025
if on_windows:
21-
import os
2226
import sysconfig
2327
from pathlib import Path
2428
os.add_dll_directory(

tensorrt_llm/_torch/attention_backend/sparse/dsa.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1528,6 +1528,11 @@ def get_indexer_k_cache_buffers(self, layer_idx: int):
15281528
return self.indexer_k_cache_pool_per_layer[layer_offset].view(
15291529
self.num_blocks, block_size, 1, per_token_size)
15301530

1531+
def shutdown(self):
1532+
# Clear Python references BEFORE C++ frees the underlying CUDA buffers
1533+
self.indexer_k_cache_pool_per_layer = []
1534+
super().shutdown()
1535+
15311536
@staticmethod
15321537
def get_cache_size_per_token(model_config: ModelConfig, mapping: Mapping,
15331538
**kwargs):

0 commit comments

Comments
 (0)