NVIDIA
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h
Lines changed: 23 additions & 5 deletions
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h
Lines changed: 66 additions & 42 deletions
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h
Lines changed: 18 additions & 0 deletions
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmGatedActOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmGatedActOptions.h
Lines changed: 1 addition & 0 deletions
@@ -235,7 +235,7 @@ struct BatchedGemmData
         void const* mPtrBias{nullptr};
 
         // The output tensor scaling factor for Fp8 (not DeepSeek FP8) and NvFp4 quantization.
-        // TensorRT LLM API requires a scaling factor on the device.
+        // TensorRT-LLM API requires a scaling factor on the device.
         // scaleC = dequantA * dequantB * quantC,
         // where dequantA is global dequantization scaling factor of A
         //    if dtypeA is FP8, it transforms the range from [-448, 448] to [-amaxA, amaxA]
@@ -250,7 +250,7 @@ struct BatchedGemmData
         float const* mPtrScaleC{nullptr};
 
         // The output gate scale for Fp8 (not DeepSeek FP8) and NvFp4 quantization.
-        // TensorRT LLM API requires a scaling factor on the device.
+        // TensorRT-LLM API requires a scaling factor on the device.
         // scaleGate = dequantA * dequantB,
         // where dequantA is global dequantization scaling factor of A
         //    if dtypeA is FP8, it transforms the range from [-448, 448] to [-amaxA, amaxA]
@@ -507,8 +507,25 @@ class BatchedGemmInterface
             throw std::invalid_argument("Invalid combination of options");
         }
 
-        int32_t const numCtasTile
+        if (batchM)
+        {
+            numCtasBatch = gemm::divUpMul(numCtasBatch, options.mClusterDimX);
+        }
+        else
+        {
+            numCtasBatch = gemm::divUpMul(numCtasBatch, options.mClusterDimY);
+        }
+
+        int32_t numCtasTile
             = batchM ? gemm::divUp(options.mN, options.mTileN) : gemm::divUp(options.mM, options.mTileM);
+        if (batchM)
+        {
+            numCtasTile = gemm::divUpMul(numCtasTile, options.mClusterDimY);
+        }
+        else
+        {
+            numCtasTile = gemm::divUpMul(numCtasTile, options.mClusterDimX);
+        }
         int32_t const numCtasInner = options.mNumSlicesForSplitK;
         return std::make_tuple(numCtasBatch, numCtasTile, numCtasInner);
     }
@@ -531,7 +548,6 @@ class BatchedGemmInterface
     // Aligns the pointer to the alignment
     template <typename Dtype>
     inline Dtype* alignPtr(Dtype* ptr, int64_t alignment) const;
-
     // Returns the size of the workspace buffers in bytes
     std::vector<size_t> getWorkspaceSizesInBytes(BatchedGemmConfig const& config, BatchedGemmData const& data) const;
 
@@ -792,7 +808,9 @@ int32_t BatchedGemmInterface::run(BatchedGemmConfig const& config, void* workspa
         cuModuleUnload(cuModule);
     }
 #else
-    config.mCudaRunner->run((void*) &kernelParams, (void*) cudaStream, grid);
+    config.mCudaRunner->run((void*) &kernelParams, (void*) cudaStream, grid,
+        /* cluster */ {},
+        /* instanceId */ config.mInstanceIdx);
 #endif
 
     return 0;
 
@@ -86,34 +86,36 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions
 
     // FIXME We create explicit constructor with all options to WAR stubgen issue in TRT-LLM.
     BatchedGemmOptions(gemm::AllReduceAlgo allReduceAlgo, gemm::BiasType biasType, int blockK, int clusterDimX,
-        int clusterDimY, int clusterDimZ, tg::Dtype dtypeAcc, tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC,
-        tg::Dtype dtypeMmaA, tg::Dtype dtypeMmaB, bool enablesEarlyExit, bool enablesDelayedEarlyExit,
-        bool enablesGlobalPtxKnobs, int epilogueLdtmDps, int epilogueLdtmBits, int epilogueTileM, int epilogueTileN,
-        bool gridTriggerSecondaryA, bool gridTriggerSecondaryB, bool gridWaitForPrimaryEarlyExit,
-        bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k,
-        gemm::KernelTraits kernelTraits, gemm::MatrixLayout layoutA, gemm::MatrixLayout layoutB, int m, int mmaK,
-        tg::MmaKind mmaKind, int mmaM, int mmaN, bool mockAllReduce, int n, int numSlicesForSplitK,
-        int numSlicesForSliceK, int numStages, int numStagesMma, int numStagesMmaWithinWorkTile,
-        int numStagesMmaAcrossWorkTile, int numStagesWorkId, bool outputDebugTensors, bool patchF2fp,
-        std::optional<int32_t> sfBlockSizeA, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC,
-        int32_t sfReshapeFactor, bool sliceK, gemm::SplitK splitK, int tileK, int tileM, int tileN,
-        gemm::TileScheduler tileScheduler, bool transposeMmaOutput, bool useCustomMmaSchedule, bool useDeepSeekFp8,
-        bool useHoistTryWaitForCustomMmaSchedule, bool usePerTokenSfA, bool usePerTokenSfB, bool useShuffledMatrixA,
-        bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, bool useUnrollLoop2xForMma, int worldSize,
-        gemmGatedAct::ActType actType, bool clampBeforeAct, std::vector<int> batchedM, std::vector<int> batchedN,
-        BatchMode batchMode, int numBatches, bool isStaticBatch, int numTokens, RouteImpl routeImpl,
-        bool gridWaitForPrimaryRouting, bool fusedAct, int numRegsPerThreadNonEpilogueWarp,
-        int numRegsPerThreadEpilogueWarp, int numRegsCastAWarps, bool useTmaOobOpt)
+        int clusterDimY, int clusterDimZ, gemm::CtaSwizzleType ctaSwizzleType, tg::Dtype dtypeAcc, tg::Dtype dtypeA,
+        tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeMmaA, tg::Dtype dtypeMmaB, bool enablesEarlyExit,
+        bool enablesDelayedEarlyExit, bool enablesGlobalPtxKnobs, int epilogueLdtmDps, int epilogueLdtmBits,
+        int epilogueTileM, int epilogueTileN, bool gridTriggerSecondaryA, bool gridTriggerSecondaryB,
+        bool gridWaitForPrimaryEarlyExit, bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit,
+        bool hoistMmaTaskTryWaits, int k, gemm::KernelTraits kernelTraits, gemm::MatrixLayout layoutA,
+        gemm::MatrixLayout layoutB, int m, int mmaK, tg::MmaKind mmaKind, int mmaM, int mmaN, bool mockAllReduce, int n,
+        int numRegsCastAWarps, int numRegsCopySfLdsSttm, int numRegsPerThreadEpilogueWarp,
+        int numRegsPerThreadNonEpilogueWarp, int numSlicesForSplitK, int numSlicesForSliceK, int numStages,
+        int numStagesMma, int numStagesMmaWithinWorkTile, int numStagesMmaAcrossWorkTile, int numStagesWorkId,
+        bool outputDebugTensors, bool patchF2fp, std::optional<int32_t> sfBlockSizeA, tg::SfLayout sfLayoutA,
+        tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC, int32_t sfReshapeFactor, bool sliceK, gemm::SplitK splitK,
+        int tileK, int tileM, int tileN, gemm::TileScheduler tileScheduler, bool transposeMmaOutput,
+        bool useCustomMmaSchedule, bool useDeepSeekFp8, bool useHoistTryWaitForCustomMmaSchedule, bool usePerTokenSfA,
+        bool usePerTokenSfB, bool useShuffledMatrixA, bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps,
+        bool useUnrollLoop2xForMma, int worldSize, gemmGatedAct::ActType actType, bool clampBeforeAct,
+        std::vector<int> batchedM, std::vector<int> batchedN, BatchMode batchMode, int numBatches, bool isStaticBatch,
+        int numTokens, RouteImpl routeImpl, std::optional<RouteImpl> routeSfsImpl, bool gridWaitForPrimaryRouting,
+        bool fusedAct, bool useTmaOobOpt)
         : gemmGatedAct::GemmGatedActOptions(
-            gemm::GemmOptions(allReduceAlgo, biasType, blockK, clusterDimX, clusterDimY, clusterDimZ, dtypeAcc, dtypeA,
-                dtypeB, dtypeC, dtypeMmaA, dtypeMmaB, enablesEarlyExit, enablesDelayedEarlyExit, enablesGlobalPtxKnobs,
-                epilogueLdtmDps, epilogueLdtmBits, epilogueTileM, epilogueTileN, gridTriggerSecondaryA,
-                gridTriggerSecondaryB, gridWaitForPrimaryEarlyExit, gridWaitForPrimaryA, gridWaitForPrimaryB,
-                hoistLoadTaskInit, hoistMmaTaskTryWaits, k, kernelTraits, layoutA, layoutB, m, mmaK, mmaKind, mmaM,
-                mmaN, mockAllReduce, n, numSlicesForSplitK, numSlicesForSliceK, numStages, numStagesMma,
-                numStagesMmaWithinWorkTile, numStagesMmaAcrossWorkTile, numStagesWorkId, outputDebugTensors, patchF2fp,
-                sfBlockSizeA, sfLayoutA, sfLayoutB, sfLayoutC, sfReshapeFactor, sliceK, splitK, tileK, tileM, tileN,
-                tileScheduler, transposeMmaOutput, useCustomMmaSchedule, useDeepSeekFp8,
+            gemm::GemmOptions(allReduceAlgo, biasType, blockK, clusterDimX, clusterDimY, clusterDimZ, ctaSwizzleType,
+                dtypeAcc, dtypeA, dtypeB, dtypeC, dtypeMmaA, dtypeMmaB, enablesEarlyExit, enablesDelayedEarlyExit,
+                enablesGlobalPtxKnobs, epilogueLdtmDps, epilogueLdtmBits, epilogueTileM, epilogueTileN,
+                gridTriggerSecondaryA, gridTriggerSecondaryB, gridWaitForPrimaryEarlyExit, gridWaitForPrimaryA,
+                gridWaitForPrimaryB, hoistLoadTaskInit, hoistMmaTaskTryWaits, k, kernelTraits, layoutA, layoutB, m,
+                mmaK, mmaKind, mmaM, mmaN, mockAllReduce, n, numRegsCastAWarps, numRegsCopySfLdsSttm,
+                numRegsPerThreadEpilogueWarp, numRegsPerThreadNonEpilogueWarp, numSlicesForSplitK, numSlicesForSliceK,
+                numStages, numStagesMma, numStagesMmaWithinWorkTile, numStagesMmaAcrossWorkTile, numStagesWorkId,
+                outputDebugTensors, patchF2fp, sfBlockSizeA, sfLayoutA, sfLayoutB, sfLayoutC, sfReshapeFactor, sliceK,
+                splitK, tileK, tileM, tileN, tileScheduler, transposeMmaOutput, useCustomMmaSchedule, useDeepSeekFp8,
                 useHoistTryWaitForCustomMmaSchedule, usePerTokenSfA, usePerTokenSfB, useShuffledMatrixA, useTmaStore,
                 useTwoTmaLoadWarps, useTwoMmaWarps, useUnrollLoop2xForMma, worldSize),
             actType, clampBeforeAct)
@@ -124,11 +126,9 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions
         , mGridWaitForPrimaryRouting(gridWaitForPrimaryRouting)
         , mIsStaticBatch(isStaticBatch)
         , mNumBatches(numBatches)
-        , mNumRegsPerThreadNonEpilogueWarp(numRegsPerThreadNonEpilogueWarp)
-        , mNumRegsPerThreadEpilogueWarp(numRegsPerThreadEpilogueWarp)
-        , mNumRegsCastAWarps(numRegsCastAWarps)
         , mNumTokens(numTokens)
         , mRouteImpl(routeImpl)
+        , mRouteSfsImpl(routeSfsImpl)
         , mUseTmaOobOpt(useTmaOobOpt)
     {
     }
@@ -148,16 +148,12 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions
     bool mIsStaticBatch{true};
     // Number of Gemm batches.
     int mNumBatches;
-    // Number of registers per thread for non-epilogue warps
-    int mNumRegsPerThreadNonEpilogueWarp{0};
-    // Number of registers per thread for epilogue warps
-    int mNumRegsPerThreadEpilogueWarp{0};
-    // Number of registers for the cast A warps.
-    int mNumRegsCastAWarps{0};
     // Total number of tokens.
     int mNumTokens{32};
     // Whether load the input tokens and do routing.
     RouteImpl mRouteImpl{RouteImpl::NoRoute};
+    // Routing logic for scaling factors. If not specified, mRouteImpl is used.
+    std::optional<RouteImpl> mRouteSfsImpl{std::nullopt};
     // Whether to use TMA out-of-bounds optimization to reduce wasted traffic. See details in
     // BatchedGemm/KernelParamsDecl.h.
     bool mUseTmaOobOpt{false};
@@ -255,6 +251,24 @@ bool checkAndUpdateBatchedGemmOptions(BatchedGemmOptions& options, bool isBlackw
             "E2m1 is not supported with DeepSeek FP8");
     }
 
+    if (options.mRouteSfsImpl.has_value() && options.mRouteSfsImpl.value() != options.mRouteImpl)
+    {
+        TLLM_CHECK_ERROR(options.mRouteSfsImpl.value() == RouteImpl::Ldgsts && options.mRouteImpl == RouteImpl::Tma,
+            "RouteSfsImpl must be equal to RouteImpl, or Ldgsts, when RouteImpl is Tma");
+    }
+    else if (!options.mRouteSfsImpl.has_value())
+    {
+        if (updateOptions)
+        {
+            options.mRouteSfsImpl = options.mRouteImpl;
+        }
+        else
+        {
+            TLLM_LOG_ERROR("RouteSfsImpl must be specified");
+            return false;
+        }
+    }
+
     if (batchM)
     {
         if (options.mDtypeA == tg::Dtype::MxE2m1 && options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4)
@@ -299,20 +313,23 @@ bool checkAndUpdateBatchedGemmOptions(BatchedGemmOptions& options, bool isBlackw
             }
         }
 
-        if (doesRouteImplUseTma(options.mRouteImpl))
+        if (doesRouteImplUseTma(options.mRouteSfsImpl.value()))
         {
             TLLM_CHECK_ERROR(!batchM, "UTMALDG.GATHER4 only supported for batch N.");
 
             if (tg::mmaKindIsBlockFmt(options.mMmaKind))
             {
                 auto dtypeRoute = batchM ? options.mDtypeA : options.mDtypeB;
-                TLLM_CHECK_ERROR(options.mTileK % tg::dtypeNumEltsPerSf(dtypeRoute) == 0,
-                    "tileK needs to be a multiple of 16 * tg::dtypeNumEltsPerSf(dtypeA).");
                 TLLM_CHECK_ERROR(options.mTileK % (tg::dtypeNumEltsPerSf(dtypeRoute) * 16) == 0,
                     "tileK needs to be a multiple of 16 * tg::dtypeNumEltsPerSf(dtypeA).");
             }
         }
 
+        if (options.mClusterDimX > 1)
+        {
+            TLLM_CHECK_ERROR(!batchM, "2CTA Gemm currently only supports batch N.");
+        }
+
         if (!batchM || doesRouteImplUseNoRoute(options.mRouteImpl))
         {
             TLLM_CHECK_ERROR(options.mSfLayoutA == tg::SfLayout::R128c4,
@@ -336,6 +353,13 @@ bool checkAndUpdateBatchedGemmOptions(BatchedGemmOptions& options, bool isBlackw
         TLLM_CHECK_ERROR(options.mK % options.mTileK == 0, "K must be a multiple of TileK");
     }
 
+    if (options.mClusterDimX > 1 && batchM && options.mRouteImpl != RouteImpl::NoRoute)
+    {
+        TLLM_CHECK_ERROR(false,
+            "2CTA BatchedGemm does not support routing along M dimension. To support it, "
+            "change the input routing data layout to be padded to clusterDimX size.");
+    }
+
     return isValid;
 }
 
@@ -359,6 +383,7 @@ struct BatchedGemmConfig
     char const* mHash{nullptr};
 #else
     trtllm::gen::CudaRunner* mCudaRunner{nullptr};
+    int32_t mInstanceIdx{0};
 #endif
 
     BatchedGemmOptions mOptions;
@@ -379,11 +404,10 @@ inline std::string dumpOptions(BatchedGemmOptions const& options)
     ss << "mIsStaticBatch=" << options.mIsStaticBatch << "," << std::endl;
     ss << "mNumTokens=" << options.mNumTokens << "," << std::endl;
     ss << "mRouteImpl=batchedGemm::RouteImpl(" << static_cast<int32_t>(options.mRouteImpl) << ")," << std::endl;
+    ss << "mRouteSfsImpl={batchedGemm::RouteImpl(" << static_cast<int32_t>(options.mRouteSfsImpl.value()) << ")},"
+       << std::endl;
     ss << "mGridWaitForPrimaryRouting=" << options.mGridWaitForPrimaryRouting << "," << std::endl;
     ss << "mFusedAct=" << options.mFusedAct << "," << std::endl;
-    ss << "mNumRegsPerThreadNonEpilogueWarp=" << options.mNumRegsPerThreadNonEpilogueWarp << "," << std::endl;
-    ss << "mNumRegsPerThreadEpilogueWarp=" << options.mNumRegsPerThreadEpilogueWarp << "," << std::endl;
-    ss << "mNumRegsCastAWarps=" << options.mNumRegsCastAWarps << "," << std::endl;
     ss << "mUseTmaOobOpt=" << options.mUseTmaOobOpt << std::endl;
     return ss.str();
 }
 
@@ -104,6 +104,24 @@ enum class TileScheduler
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+enum class CtaSwizzleType : uint32_t // CTA ordering options: rasterize or zig-zag swizzle along M or N.
+{
+    // Rasterize CTAs along the M dimension. Explicitly 0; the enumerators below count up from it.
+    RasterizeAlongM = 0,
+    // Rasterize CTAs along the N dimension.
+    RasterizeAlongN,
+    // Swizzle CTAs in a zig-zag pattern along the M dimension; the zig-zag width is 2.
+    ZigZagAlongM2,
+    // Swizzle CTAs in a zig-zag pattern along the N dimension; the zig-zag width is 2.
+    ZigZagAlongN2,
+    // Swizzle CTAs in a zig-zag pattern along the M dimension; the zig-zag width is 4.
+    ZigZagAlongM4,
+    // Swizzle CTAs in a zig-zag pattern along the N dimension; the zig-zag width is 4.
+    ZigZagAlongN4,
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 // Helper functions to check the SplitK type.
 
 #define SPLIT_K_FUNCTION(Mode)                                                                                         \
 
@@ -210,6 +210,7 @@ struct GemmGatedActConfig
     char const* mHash{nullptr};
 #else
     trtllm::gen::CudaRunner* mCudaRunner{nullptr};
+    int32_t mInstanceIdx{0};
 #endif
 
     GemmGatedActOptions mOptions{};