Skip to content

Commit 8277381

Browse files
authored
[None][feat] update TRT-LLM Gen DS FP8 MoE cubins and optimize finalize kernel (#11104)
Signed-off-by: Nikita Korobov <14355239+nekorobov@users.noreply.github.com>
1 parent 48206f3 commit 8277381

File tree

923 files changed

+15328
-8881
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

923 files changed

+15328
-8881
lines changed

cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -240,14 +240,6 @@ TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(TrtllmGenBatchedGemmRunne
240240
}
241241
}
242242

243-
if (options.mUseDeepSeekFp8)
244-
{
245-
if (!acceptIf(options.mUseShuffledMatrixA == false, "useShuffledMatrixA should be false for DeepSeek Fp8"))
246-
{
247-
continue;
248-
}
249-
}
250-
251243
if (options.mFusedAct)
252244
{
253245
if (!acceptIf(options.mActType == static_cast<batchedGemm::gemmGatedAct::ActType>(mOptions.actType),
@@ -452,7 +444,7 @@ void TrtllmGenBatchedGemmRunner::run(int32_t m, int32_t n, int32_t k, int32_t va
452444
bmm.runInitBeforeWorldSync(config, gemmData, static_cast<void*>(stream));
453445

454446
auto const err = bmm.run(config, workspace, gemmData, static_cast<void*>(stream), multiProcessorCount,
455-
tensorrt_llm::common::getEnvEnablePDL(), globalTrtllmGenBatchedGemmModuleCache);
447+
tensorrt_llm::common::getEnvEnablePDL(), /* pinnedHostBuffer */ nullptr, globalTrtllmGenBatchedGemmModuleCache);
456448

457449
CUresult cuErr = static_cast<CUresult>(err);
458450
char const* cuErrStr = nullptr;

cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmEnums.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION &
2+
* SPDX-FileCopyrightText: Copyright (c) 1993-2026 NVIDIA CORPORATION &
33
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
44
*
55
* Licensed under the Apache License, Version 2.0 (the "License");

cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h

Lines changed: 181 additions & 45 deletions
Large diffs are not rendered by default.

cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h

Lines changed: 127 additions & 46 deletions
Large diffs are not rendered by default.

cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION &
2+
* SPDX-FileCopyrightText: Copyright (c) 1993-2026 NVIDIA CORPORATION &
33
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
44
*
55
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -93,13 +93,38 @@ enum class BiasType : uint32_t
9393

9494
////////////////////////////////////////////////////////////////////////////////////////////////////
9595

96+
// Type of the element-wise activation to apply after the Gemm.
enum class EltwiseActType
{
    // No activation: the Gemm output is used as-is.
    None = 0,
    // Gelu is defined as:
    //   act = x0 * phi(x0)
    // where x0 is the output of the Gemm and phi is the CDF of the standard normal
    // distribution, approximated (tanh form) by:
    //   phi(x) = 0.5 * (1 + tanh(0.7978845608028654 * (x + 0.044715 * x * x * x)))
    Gelu,
    // Relu2 (a.k.a. squared Relu) is defined as:
    //   act = relu(x0) ^ 2
    // where x0 is the output of the Gemm.
    Relu2,
};
111+
112+
////////////////////////////////////////////////////////////////////////////////////////////////////
113+
96114
// Strategy used to distribute output tiles across CTAs.
enum class TileScheduler
{
    // Static scheduler (non-persistent): one CTA per tile.
    Static = 0,
    // Dynamic persistent scheduler for SM100+.
    Persistent,
    // Static persistent scheduler. Launches a fixed grid size based on the number of SMs and
    // relies on the underlying PersistentTileSchedulerSm90 for static work distribution. Each
    // CTA iterates through tiles and leaves the loop by setting is_valid_tile to false once
    // its work is exhausted.
    StaticPersistent,
    // Dynamic persistent scheduler for SM90+ using atomicAdd on a global counter.
    // Uses DynamicPersistentPipelinedTileSchedulerSm90, which enables work-stealing among
    // CTAs by atomically fetching work tile indices from a global counter.
    PersistentSm90,
};
104129

105130
////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -154,6 +179,28 @@ BIAS_TYPE_FUNCTION(Mn)
154179

155180
////////////////////////////////////////////////////////////////////////////////////////////////////
156181

182+
// Helper function to check if a scheduler is persistent.
183+
inline bool isPersistentScheduler(TileScheduler scheduler)
184+
{
185+
return scheduler == TileScheduler::Persistent || scheduler == TileScheduler::StaticPersistent
186+
|| scheduler == TileScheduler::PersistentSm90;
187+
}
188+
189+
////////////////////////////////////////////////////////////////////////////////////////////////////
190+
191+
// Helper function to check if CTA rasterization order is compatible with clean early exit of the
192+
// kernel. Clean early exit requires CTA indices to increase monotonically along the batch
193+
// dimension, so when a CTA exits the kernel early, it exits with all valid tiles already done.
194+
// Zigzag or batch-major patterns are NOT compatible because they may cause valid tiles to be
195+
// skipped when exiting early.
196+
inline bool supportsCleanEarlyExit(CtaSwizzleType swizzleType, bool batchM, TileScheduler /* scheduler */)
197+
{
198+
return (
199+
batchM ? (swizzleType == CtaSwizzleType::RasterizeAlongN) : (swizzleType == CtaSwizzleType::RasterizeAlongM));
200+
}
201+
202+
////////////////////////////////////////////////////////////////////////////////////////////////////
203+
157204
} // namespace gemm
158205

159206
} // namespace batchedGemm

cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmGatedActOptions.h

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION &
2+
* SPDX-FileCopyrightText: Copyright (c) 1993-2026 NVIDIA CORPORATION &
33
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
44
*
55
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -83,6 +83,8 @@ enum class ActType
8383
// where x0 and x1 are the raw numbers from Gemm, while scaleC and scaleGate are input scales,
8484
// beta' = beta / scaleAb, scaleC' = scaleC * scaleAb.
8585
GeGlu,
86+
// Placeholder for no activation; not implemented in codegen
87+
None,
8688
};
8789

8890
////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -137,16 +139,26 @@ struct GemmGatedActOptions : public gemm::GemmOptions
137139
inline bool checkAndUpdateGemmGatedActOptions(
138140
gemmGatedAct::GemmGatedActOptions& options, tg::CudaArch cudaArch, bool updateOptions = true)
139141
{
142+
auto isValid = gemm::checkAndUpdateGemmOptions(options, cudaArch,
143+
/* tpGrpSize */ 1, updateOptions);
144+
if (!isValid)
145+
{
146+
return false;
147+
}
140148

149+
if (options.mActType == gemmGatedAct::ActType::None)
150+
{
151+
TLLM_CHECK_ERROR(false, "ActType None is not supported");
152+
}
141153
// tmpOut is already transposed at this stage
142154
auto const hiddenSizeStr = options.mTransposeMmaOutput ? "M" : "N";
143155
auto const hiddenSize = options.mTransposeMmaOutput ? options.mM : options.mN;
144156
auto const hiddenEpilogueTileSize = options.mTransposeMmaOutput ? options.mEpilogueTileM : options.mEpilogueTileN;
145157

146158
TLLM_CHECK_ERROR(hiddenSize % 2 == 0, hiddenSizeStr, " must be a multiple of 2.");
147159

148-
TLLM_CHECK_ERROR((options.mTransposeMmaOutput ^ options.mUseShuffledMatrixA) == 0,
149-
"Transpose mma output can only be used with shuffled A matrix. And vice versa.");
160+
TLLM_CHECK_ERROR((options.mTransposeMmaOutput && !options.mUseShuffledMatrix) == false,
161+
"Transpose mma output can only be used with shuffled matrix.");
150162

151163
if (options.mUseTmaStore)
152164
{
@@ -157,19 +169,11 @@ inline bool checkAndUpdateGemmGatedActOptions(
157169
if (options.mDtypeC == tg::Dtype::E2m1 || options.mDtypeC == tg::Dtype::MxE4m3)
158170
{
159171
int const outHiddenSize = (options.mTransposeMmaOutput ? options.mM : options.mN) / 2;
160-
int const hiddenGranularity = 4 * tg::dtypeNumEltsPerSf(options.mDtypeC);
172+
int const hiddenGranularity = 4 * options.mSfBlockSizeC;
161173
TLLM_CHECK_ERROR(outHiddenSize % hiddenGranularity == 0, "Output hidden size (", outHiddenSize,
162174
") must be a multiple of ", hiddenGranularity, " for block-scaled outputs.");
163175
}
164176

165-
auto isValid = gemm::checkAndUpdateGemmOptions(options, cudaArch,
166-
/* tpGrpSize */ 1, updateOptions);
167-
168-
if (!isValid)
169-
{
170-
return false;
171-
}
172-
173177
auto const validHiddenSize = options.mTransposeMmaOutput ? options.mValidM : options.mValidN;
174178
if (options.mUseDeepSeekFp8)
175179
{
@@ -178,12 +182,12 @@ inline bool checkAndUpdateGemmGatedActOptions(
178182
}
179183

180184
//
181-
if (options.mUseShuffledMatrixA)
185+
if (options.mUseShuffledMatrix)
182186
{
183187
auto const shuffleBlockSize = gemm::getShuffleBlockSize(options.mEpilogueTileM);
184188
TLLM_CHECK_ERROR(hiddenSize % (2 * shuffleBlockSize) == 0 && validHiddenSize % (2 * shuffleBlockSize) == 0,
185189
"M/validM must be a multiple of 2 * shuffle block size (", 2 * shuffleBlockSize,
186-
") when useShuffledMatrixA");
190+
") when useShuffledMatrix");
187191
}
188192
if (options.mNumSlicesForSplitK > 1)
189193
{

0 commit comments

Comments
 (0)