flashinfer-ai
diff --git a/csrc/trtllm_batched_gemm_runner.cu b/csrc/trtllm_batched_gemm_runner.cu
Lines changed: 20 additions & 3 deletions
diff --git a/flashinfer/artifacts.py b/flashinfer/artifacts.py
Lines changed: 2 additions & 4 deletions
diff --git a/flashinfer/fused_moe/core.py b/flashinfer/fused_moe/core.py
Lines changed: 9 additions & 7 deletions
diff --git a/flashinfer/utils.py b/flashinfer/utils.py
Lines changed: 7 additions & 3 deletions
diff --git a/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmInterface.h b/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmInterface.h
Lines changed: 12 additions & 1 deletion
diff --git a/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmOptions.h b/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmOptions.h
Lines changed: 63 additions & 48 deletions
diff --git a/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/Enums.h b/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/Enums.h
Lines changed: 17 additions & 0 deletions
@@ -100,9 +100,17 @@ TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(
         options.mTransposeMmaOutput == mOptions.transposeMmaOutput &&
         (!doesRouteImplUseNoRoute(options.mRouteImpl)) == mOptions.routeAct &&
         options.mFusedAct == mOptions.fusedAct && options.mIsStaticBatch == mOptions.staticBatch &&
-        tileSize == mOptions.tileSize &&
-        options.mUseShuffledMatrixA == mOptions.useShuffledMatrixA &&
-        options.mLayoutA == mOptions.weightLayout) {
+        tileSize == mOptions.tileSize) {
+      auto sm = configs[i].mSm;
+      if (sm != SmVersion::Sm100f) {
+        int smVersion = tensorrt_llm::common::getSMVersion();
+        if (smVersion == 100 && sm != SmVersion::Sm100a) {
+          continue;
+        } else if (smVersion == 103 && sm != SmVersion::Sm103a) {
+          continue;
+        }
+      }
+
       if (options.mFusedAct) {
         if (options.mActType != static_cast<batchedGemm::gemmGatedAct::ActType>(mOptions.actType)) {
           continue;
@@ -161,6 +169,7 @@ void TrtllmGenBatchedGemmRunner::run(
   auto const configs = bmm.getBatchedGemmConfigs();
 
   auto const& config = configs[configIndex];
+  std::cout << "config function name: " << config.mFunctionName << std::endl;
 
   FLASHINFER_CHECK(numBatches > 0, "Batched GEMM requires numBatches > 0");
   if (!mOptions.staticBatch) {
@@ -367,6 +376,7 @@ std::vector<int64_t> TrtllmGenBatchedGemmRunner::getValidConfigIndices(
 
     return false;
   };
+
   // Sort configs by options.
   std::vector<int64_t> sortedIndices = mPassingConfigIndices;
   std::sort(sortedIndices.begin(), sortedIndices.end(), cmpFunc);
@@ -381,6 +391,13 @@ std::vector<int64_t> TrtllmGenBatchedGemmRunner::getValidConfigIndices(
     auto const& config = configs[configIndex];
     auto isValidConfig = bmm.isValidConfig(config, gemmData);
     if (isValidConfig) {
+      // if (static_cast<int32_t>(config.mOptions.mLayoutA) == 0 ){
+      //   std::cout << "config.mLayoutA: " << static_cast<int32_t>(config.mOptions.mLayoutA) <<
+      //   std::endl; std::cout << "config.mLayoutB: " <<
+      //   static_cast<int32_t>(config.mOptions.mLayoutB)  << std::endl; std::cout <<
+      //   "config.mFunctionName: " << config.mFunctionName << std::endl;
+      //   validConfigIndices.push_back(configIndex);
+      // }
       validConfigIndices.push_back(configIndex);
     }
   }
 
@@ -76,7 +76,7 @@ def get_available_cubin_files(
 class ArtifactPath:
     TRTLLM_GEN_FMHA: str = "7206d64e67f4c8949286246d6e2e07706af5d223/fmha/trtllm-gen"
     TRTLLM_GEN_BMM: str = (
-        "e6f22dcc3fdeb29ff87af2f4a2cb3d30b8d273e0/batched_gemm-45beda1-ee6a802"
+        "696906bd3985f84662799054f377b4b47a1907d3/batched_gemm-074aec4-3df1e6c"
     )
     TRTLLM_GEN_GEMM: str = (
         "037e528e719ec3456a7d7d654f26b805e44c63b1/gemm-8704aa4-f91dc9e"
@@ -90,9 +90,7 @@ class MetaInfoHash:
     TRTLLM_GEN_FMHA: str = (
         "2f605255e71d673768f5bece66dde9e2e9f4c873347bfe8fefcffbf86a3c847d"
     )
-    TRTLLM_GEN_BMM: str = (
-        "c98b4ce69a39fd41556d67033c30ea814ef76b0a2fe16e798e55baf0104acc34"
-    )
+    TRTLLM_GEN_BMM: str = "696906bd3985f84662799054f377b4b47a1907d3"
     DEEPGEMM: str = "b4374f857c3066089c4ec6b5e79e785559fa2c05ce2623710b0b04bf86414a48"
     TRTLLM_GEN_GEMM: str = (
         "0345358c916d990709f9670e113e93f35c76aa22715e2d5128ec2ca8740be5ba"
 
@@ -894,7 +894,9 @@ def __init__(
             self.gated_act_type = gated_act_type
             self.tile_tokens_dim = tile_tokens_dim
 
-        def get_tile_tokens_dim(self, num_tokens: int, top_k: int):
+        def get_tile_tokens_dim(
+            self, num_tokens: int, top_k: int, max_tile_tokens_dim: int = 128
+        ):
             # Factor to account for the imbalance of the experts.
             # factor equals to the
             # max_real_num_tokens_per_expert / perfect_num_tokens_per_expert
@@ -910,10 +912,10 @@ def get_tile_tokens_dim(self, num_tokens: int, top_k: int):
             num_tokens_per_expert = int(num_tokens_per_expert * imbalance_factor)
             # And pad the number to the next power of 2.
             tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert)
-            # Cap to 8-64 tokens per CTA tile
-            # as it's the range supported by the kernel.
-            tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
-
+            if num_tokens_per_expert > 128 and num_tokens_per_expert < 256:
+                tile_tokens_dim = 192
+            # Clamp to [8, max_tile_tokens_dim] tokens per CTA tile, the range supported by the kernel.
+            tile_tokens_dim = min(max(tile_tokens_dim, 8), max_tile_tokens_dim)
             return tile_tokens_dim
 
         def get_valid_tactics(
@@ -931,7 +933,7 @@ def get_valid_tactics(
             ) = inputs
             num_tokens = routing_logits.shape[0]
             tile_tokens_dim = (
-                self.get_tile_tokens_dim(num_tokens, self.top_k)
+                self.get_tile_tokens_dim(num_tokens, self.top_k, 128)
                 if self.tile_tokens_dim is None
                 else self.tile_tokens_dim
             )
@@ -975,7 +977,7 @@ def forward(
             ) = inputs
             num_tokens = routing_logits.shape[0]
             tile_tokens_dim = (
-                self.get_tile_tokens_dim(num_tokens, self.top_k)
+                self.get_tile_tokens_dim(num_tokens, self.top_k, 128)
                 if self.tile_tokens_dim is None
                 else self.tile_tokens_dim
             )
 
@@ -113,14 +113,18 @@ def next_positive_power_of_2(x: int) -> int:
     return n + 1
 
 
-def calculate_tile_tokens_dim(num_tokens: int, num_experts: int, top_k: int) -> int:
+def calculate_tile_tokens_dim(
+    num_tokens: int, num_experts: int, top_k: int, max_tile_tokens_dim: int = 128
+) -> int:
     # Guess tokens per expert assuming perfect expert distribution first.
     num_tokens_per_expert = num_tokens * top_k // num_experts
 
     # And pad the number to the next power of 2.
     tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert)
-    # Cap to 8-64 tokens per CTA tile as it's the range supported by the kernel.
-    tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
+    if num_tokens_per_expert > 128 and num_tokens_per_expert < 256:
+        tile_tokens_dim = 192
+    # Clamp to [8, max_tile_tokens_dim] tokens per CTA tile, the range supported by the kernel.
+    tile_tokens_dim = min(max(tile_tokens_dim, 8), max_tile_tokens_dim)
 
     return tile_tokens_dim
 
 
@@ -506,8 +506,19 @@ class BatchedGemmInterface {
       throw std::invalid_argument("Invalid combination of options");
     }
 
-    int32_t const numCtasTile =
+    if (batchM) {
+      numCtasBatch = gemm::divUpMul(numCtasBatch, options.mClusterDimX);
+    } else {
+      numCtasBatch = gemm::divUpMul(numCtasBatch, options.mClusterDimY);
+    }
+
+    int32_t numCtasTile =
         batchM ? gemm::divUp(options.mN, options.mTileN) : gemm::divUp(options.mM, options.mTileM);
+    if (batchM) {
+      numCtasTile = gemm::divUpMul(numCtasTile, options.mClusterDimY);
+    } else {
+      numCtasTile = gemm::divUpMul(numCtasTile, options.mClusterDimX);
+    }
     int32_t const numCtasInner = options.mNumSlicesForSplitK;
     return std::make_tuple(numCtasBatch, numCtasTile, numCtasInner);
   }
 
@@ -76,39 +76,43 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions {
   // FIXME We create explicit constructor with all options to WAR stubgen issue in TRT-LLM.
   BatchedGemmOptions(
       gemm::AllReduceAlgo allReduceAlgo, gemm::BiasType biasType, int blockK, int clusterDimX,
-      int clusterDimY, int clusterDimZ, tg::Dtype dtypeAcc, tg::Dtype dtypeA, tg::Dtype dtypeB,
-      tg::Dtype dtypeC, tg::Dtype dtypeMmaA, tg::Dtype dtypeMmaB, bool enablesEarlyExit,
-      bool enablesDelayedEarlyExit, bool enablesGlobalPtxKnobs, int epilogueLdtmDps,
-      int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, bool gridTriggerSecondaryA,
-      bool gridTriggerSecondaryB, bool gridWaitForPrimaryEarlyExit, bool gridWaitForPrimaryA,
-      bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k,
-      gemm::KernelTraits kernelTraits, gemm::MatrixLayout layoutA, gemm::MatrixLayout layoutB,
-      int m, int mmaK, tg::MmaKind mmaKind, int mmaM, int mmaN, bool mockAllReduce, int n,
-      int numSlicesForSplitK, int numSlicesForSliceK, int numStages, int numStagesMma,
-      int numStagesMmaWithinWorkTile, int numStagesMmaAcrossWorkTile, int numStagesWorkId,
-      bool outputDebugTensors, bool patchF2fp, std::optional<int32_t> sfBlockSizeA,
-      tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC,
-      int32_t sfReshapeFactor, bool sliceK, gemm::SplitK splitK, int tileK, int tileM, int tileN,
-      gemm::TileScheduler tileScheduler, bool transposeMmaOutput, bool useCustomMmaSchedule,
-      bool useDeepSeekFp8, bool useHoistTryWaitForCustomMmaSchedule, bool usePerTokenSfA,
-      bool usePerTokenSfB, bool useShuffledMatrixA, bool useTmaStore, bool useTwoTmaLoadWarps,
-      bool useTwoMmaWarps, bool useUnrollLoop2xForMma, int worldSize, gemmGatedAct::ActType actType,
-      bool clampBeforeAct, std::vector<int> batchedM, std::vector<int> batchedN,
-      BatchMode batchMode, int numBatches, bool isStaticBatch, int numTokens, RouteImpl routeImpl,
-      bool gridWaitForPrimaryRouting, bool fusedAct, int numRegsPerThreadNonEpilogueWarp,
-      int numRegsPerThreadEpilogueWarp, int numRegsCastAWarps, bool useTmaOobOpt)
+      int clusterDimY, int clusterDimZ, gemm::CtaSwizzleType ctaSwizzleType, tg::Dtype dtypeAcc,
+      tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeMmaA,
+      tg::Dtype dtypeMmaB, bool enablesEarlyExit, bool enablesDelayedEarlyExit,
+      bool enablesGlobalPtxKnobs, int epilogueLdtmDps, int epilogueLdtmBits, int epilogueTileM,
+      int epilogueTileN, bool gridTriggerSecondaryA, bool gridTriggerSecondaryB,
+      bool gridWaitForPrimaryEarlyExit, bool gridWaitForPrimaryA, bool gridWaitForPrimaryB,
+      bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k, gemm::KernelTraits kernelTraits,
+      gemm::MatrixLayout layoutA, gemm::MatrixLayout layoutB, int m, int mmaK, tg::MmaKind mmaKind,
+      int mmaM, int mmaN, bool mockAllReduce, int n, int numRegsCastAWarps,
+      int numRegsCopySfLdsSttm, int numRegsPerThreadEpilogueWarp,
+      int numRegsPerThreadNonEpilogueWarp, int numSlicesForSplitK, int numSlicesForSliceK,
+      int numStages, int numStagesMma, int numStagesMmaWithinWorkTile,
+      int numStagesMmaAcrossWorkTile, int numStagesWorkId, bool outputDebugTensors, bool patchF2fp,
+      std::optional<int32_t> sfBlockSizeA, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB,
+      tg::SfLayout sfLayoutC, int32_t sfReshapeFactor, bool sliceK, gemm::SplitK splitK, int tileK,
+      int tileM, int tileN, gemm::TileScheduler tileScheduler, bool transposeMmaOutput,
+      bool useCustomMmaSchedule, bool useDeepSeekFp8, bool useHoistTryWaitForCustomMmaSchedule,
+      bool usePerTokenSfA, bool usePerTokenSfB, bool useShuffledMatrixA, bool useTmaStore,
+      bool useTwoTmaLoadWarps, bool useTwoMmaWarps, bool useUnrollLoop2xForMma, int worldSize,
+      gemmGatedAct::ActType actType, bool clampBeforeAct, std::vector<int> batchedM,
+      std::vector<int> batchedN, BatchMode batchMode, int numBatches, bool isStaticBatch,
+      int numTokens, RouteImpl routeImpl, std::optional<RouteImpl> routeSfsImpl,
+      bool gridWaitForPrimaryRouting, bool fusedAct, bool useTmaOobOpt)
       : gemmGatedAct::GemmGatedActOptions(
             gemm::GemmOptions(
-                allReduceAlgo, biasType, blockK, clusterDimX, clusterDimY, clusterDimZ, dtypeAcc,
-                dtypeA, dtypeB, dtypeC, dtypeMmaA, dtypeMmaB, enablesEarlyExit,
-                enablesDelayedEarlyExit, enablesGlobalPtxKnobs, epilogueLdtmDps, epilogueLdtmBits,
-                epilogueTileM, epilogueTileN, gridTriggerSecondaryA, gridTriggerSecondaryB,
-                gridWaitForPrimaryEarlyExit, gridWaitForPrimaryA, gridWaitForPrimaryB,
-                hoistLoadTaskInit, hoistMmaTaskTryWaits, k, kernelTraits, layoutA, layoutB, m, mmaK,
-                mmaKind, mmaM, mmaN, mockAllReduce, n, numSlicesForSplitK, numSlicesForSliceK,
-                numStages, numStagesMma, numStagesMmaWithinWorkTile, numStagesMmaAcrossWorkTile,
-                numStagesWorkId, outputDebugTensors, patchF2fp, sfBlockSizeA, sfLayoutA, sfLayoutB,
-                sfLayoutC, sfReshapeFactor, sliceK, splitK, tileK, tileM, tileN, tileScheduler,
+                allReduceAlgo, biasType, blockK, clusterDimX, clusterDimY, clusterDimZ,
+                ctaSwizzleType, dtypeAcc, dtypeA, dtypeB, dtypeC, dtypeMmaA, dtypeMmaB,
+                enablesEarlyExit, enablesDelayedEarlyExit, enablesGlobalPtxKnobs, epilogueLdtmDps,
+                epilogueLdtmBits, epilogueTileM, epilogueTileN, gridTriggerSecondaryA,
+                gridTriggerSecondaryB, gridWaitForPrimaryEarlyExit, gridWaitForPrimaryA,
+                gridWaitForPrimaryB, hoistLoadTaskInit, hoistMmaTaskTryWaits, k, kernelTraits,
+                layoutA, layoutB, m, mmaK, mmaKind, mmaM, mmaN, mockAllReduce, n, numRegsCastAWarps,
+                numRegsCopySfLdsSttm, numRegsPerThreadEpilogueWarp, numRegsPerThreadNonEpilogueWarp,
+                numSlicesForSplitK, numSlicesForSliceK, numStages, numStagesMma,
+                numStagesMmaWithinWorkTile, numStagesMmaAcrossWorkTile, numStagesWorkId,
+                outputDebugTensors, patchF2fp, sfBlockSizeA, sfLayoutA, sfLayoutB, sfLayoutC,
+                sfReshapeFactor, sliceK, splitK, tileK, tileM, tileN, tileScheduler,
                 transposeMmaOutput, useCustomMmaSchedule, useDeepSeekFp8,
                 useHoistTryWaitForCustomMmaSchedule, usePerTokenSfA, usePerTokenSfB,
                 useShuffledMatrixA, useTmaStore, useTwoTmaLoadWarps, useTwoMmaWarps,
@@ -121,11 +125,9 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions {
         mGridWaitForPrimaryRouting(gridWaitForPrimaryRouting),
         mIsStaticBatch(isStaticBatch),
         mNumBatches(numBatches),
-        mNumRegsPerThreadNonEpilogueWarp(numRegsPerThreadNonEpilogueWarp),
-        mNumRegsPerThreadEpilogueWarp(numRegsPerThreadEpilogueWarp),
-        mNumRegsCastAWarps(numRegsCastAWarps),
         mNumTokens(numTokens),
         mRouteImpl(routeImpl),
+        mRouteSfsImpl(routeSfsImpl),
         mUseTmaOobOpt(useTmaOobOpt) {}
 
   // Batched M-dimensions of GEMM.
@@ -143,16 +145,12 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions {
   bool mIsStaticBatch{true};
   // Number of Gemm batches.
   int mNumBatches;
-  // Number of registers per thread for non-epilogue warps
-  int mNumRegsPerThreadNonEpilogueWarp{0};
-  // Number of registers per thread for epilogue warps
-  int mNumRegsPerThreadEpilogueWarp{0};
-  // Number of registers for the cast A warps.
-  int mNumRegsCastAWarps{0};
   // Total number of tokens.
   int mNumTokens{32};
   // Whether load the input tokens and do routing.
   RouteImpl mRouteImpl{RouteImpl::NoRoute};
+  // Routing logic for scaling factors. If not specified, mRouteImpl is used.
+  std::optional<RouteImpl> mRouteSfsImpl{std::nullopt};
   // Whether to use TMA out-of-bounds optimization to reduce wasted traffic. See details in
   // BatchedGemm/KernelParamsDecl.h.
   bool mUseTmaOobOpt{false};
@@ -235,6 +233,18 @@ bool checkAndUpdateBatchedGemmOptions(BatchedGemmOptions& options, bool isBlackw
                      "E2m1 is not supported with DeepSeek FP8");
   }
 
+  if (options.mRouteSfsImpl.has_value() && options.mRouteSfsImpl.value() != options.mRouteImpl) {
+    TLLM_CHECK_ERROR(
+        options.mRouteSfsImpl.value() == RouteImpl::Ldgsts && options.mRouteImpl == RouteImpl::Tma,
+        "RouteSfsImpl must be equal to RouteImpl, or Ldgsts, when RouteImpl is Tma");
+  } else if (!options.mRouteSfsImpl.has_value()) {
+    if (updateOptions) {
+      options.mRouteSfsImpl = options.mRouteImpl;
+    } else {
+      TLLM_LOG_ERROR("RouteSfsImpl must be specified");
+      return false;
+    }
+  }
   if (batchM) {
     if (options.mDtypeA == tg::Dtype::MxE2m1 && options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4) {
       TLLM_CHECK_ERROR(doesRouteImplUseNoRoute(options.mRouteImpl),
@@ -269,18 +279,20 @@ bool checkAndUpdateBatchedGemmOptions(BatchedGemmOptions& options, bool isBlackw
       }
     }
 
-    if (doesRouteImplUseTma(options.mRouteImpl)) {
+    if (doesRouteImplUseTma(options.mRouteSfsImpl.value())) {
       TLLM_CHECK_ERROR(!batchM, "UTMALDG.GATHER4 only supported for batch N.");
 
       if (tg::mmaKindIsBlockFmt(options.mMmaKind)) {
         auto dtypeRoute = batchM ? options.mDtypeA : options.mDtypeB;
-        TLLM_CHECK_ERROR(options.mTileK % tg::dtypeNumEltsPerSf(dtypeRoute) == 0,
-                         "tileK needs to be a multiple of 16 * tg::dtypeNumEltsPerSf(dtypeA).");
         TLLM_CHECK_ERROR(options.mTileK % (tg::dtypeNumEltsPerSf(dtypeRoute) * 16) == 0,
                          "tileK needs to be a multiple of 16 * tg::dtypeNumEltsPerSf(dtypeA).");
       }
     }
 
+    if (options.mClusterDimX > 1) {
+      TLLM_CHECK_ERROR(!batchM, "2CTA Gemm currently only supports batch N.");
+    }
+
     if (!batchM || doesRouteImplUseNoRoute(options.mRouteImpl)) {
       TLLM_CHECK_ERROR(options.mSfLayoutA == tg::SfLayout::R128c4,
                        "options.mSfLayoutA has to be tg::SfLayout::R128c4 when not being routed");
@@ -301,6 +313,11 @@ bool checkAndUpdateBatchedGemmOptions(BatchedGemmOptions& options, bool isBlackw
     TLLM_CHECK_ERROR(options.mK % options.mTileK == 0, "K must be a multiple of TileK");
   }
 
+  if (options.mClusterDimX > 1 && batchM && options.mRouteImpl != RouteImpl::NoRoute) {
+    TLLM_CHECK_ERROR(false,
+                     "2CTA BatchedGemm does not support routing along M dimension. To support it, "
+                     "change the input routing data layout to be padded to clusterDimX size.");
+  }
   return isValid;
 }
 
@@ -323,6 +340,7 @@ struct BatchedGemmConfig {
   char const* mHash{nullptr};
 #else
   trtllm::gen::CudaRunner* mCudaRunner{nullptr};
+  int32_t mInstanceIdx{0};
 #endif
 
   BatchedGemmOptions mOptions;
@@ -343,13 +361,10 @@ inline std::string dumpOptions(BatchedGemmOptions const& options) {
   ss << "mNumTokens=" << options.mNumTokens << "," << std::endl;
   ss << "mRouteImpl=batchedGemm::RouteImpl(" << static_cast<int32_t>(options.mRouteImpl) << "),"
      << std::endl;
+  ss << "mRouteSfsImpl={batchedGemm::RouteImpl("
+     << static_cast<int32_t>(options.mRouteSfsImpl.value()) << ")}," << std::endl;
   ss << "mGridWaitForPrimaryRouting=" << options.mGridWaitForPrimaryRouting << "," << std::endl;
   ss << "mFusedAct=" << options.mFusedAct << "," << std::endl;
-  ss << "mNumRegsPerThreadNonEpilogueWarp=" << options.mNumRegsPerThreadNonEpilogueWarp << ","
-     << std::endl;
-  ss << "mNumRegsPerThreadEpilogueWarp=" << options.mNumRegsPerThreadEpilogueWarp << ","
-     << std::endl;
-  ss << "mNumRegsCastAWarps=" << options.mNumRegsCastAWarps << "," << std::endl;
   ss << "mUseTmaOobOpt=" << options.mUseTmaOobOpt << std::endl;
   return ss.str();
 }
 
@@ -97,6 +97,23 @@ enum class TileScheduler {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+enum class CtaSwizzleType : uint32_t {
+  // Rasterize CTAs along the M dimension.
+  RasterizeAlongM = 0,
+  // Rasterize CTAs along the N dimension.
+  RasterizeAlongN,
+  // Swizzle CTAs in zig-zag pattern along M dimension, Zig-zag width is 2.
+  ZigZagAlongM2,
+  // Swizzle CTAs in zig-zag pattern along N dimension, Zig-zag width is 2.
+  ZigZagAlongN2,
+  // Swizzle CTAs in zig-zag pattern along M dimension, Zig-zag width is 4.
+  ZigZagAlongM4,
+  // Swizzle CTAs in zig-zag pattern along N dimension, Zig-zag width is 4.
+  ZigZagAlongN4,
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 // Helper functions to check the SplitK type.
 
 #define SPLIT_K_FUNCTION(Mode) \