Commit d98f090

fix wip

1 parent 3e8f69a commit d98f090

File tree

5 files changed: +102 -36 lines changed

  flashinfer/fused_moe.py
  include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmInterface.h
  include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmOptions.h
  include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/GemmGatedActOptions.h
  include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/KernelParams.h

flashinfer/fused_moe.py

Lines changed: 3 additions & 3 deletions

@@ -774,15 +774,15 @@ def cutlass_fused_moe(
 
 
 def trtllm_gen_fused_moe_sm100_module() -> JitSpec:
-    hash = "6b93c394210c89dccef13833c89797f1b8f8aefb"
-    tllm_gen_commit = "ce8ce46"
+    hash = "5e0cff4583554d182ae3fee461ff87b481ff3464"
+    tllm_gen_commit = "573cd5a"
     tllm_gen_config_hash = "2dc78d9"
     include_path = (
         f"{hash}/batched_gemm-{tllm_gen_commit}-{tllm_gen_config_hash}/include"
     )
     metainfo = get_cubin(
         f"{include_path}/flashinferMetaInfo",
-        "b24fd5e7ae6b20e903c866ecb1d4a68f238301ba9b76df6a536056f2059a0d56",
+        "a13e1ca232f60ca9eefb3298153aba03ccab6916748cf7e68b731d8dc4e9ccbc",
         ".h",
     )
     assert metainfo, "KernelMetaInfo.h not found"
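
Note: this hunk only repoints the prebuilt kernel artifacts. hash selects the artifact directory, tllm_gen_commit and tllm_gen_config_hash pin the trtllm-gen revision and config, and the 64-character hex string passed to get_cubin is presumably the SHA-256 checksum against which the downloaded flashinferMetaInfo header is verified. The new artifacts pick up the clamp-limit and TMA OOB plumbing added in the headers below.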

include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmInterface.h

Lines changed: 41 additions & 3 deletions

@@ -243,11 +243,48 @@ struct BatchedGemmData {
   // Shape is [B].
   float const* mPtrScaleGate{nullptr};
 
+  // The clamp limit for the accumulator before applying the activation.
+  // Shape is [B].
+  // Clamp is INF if nullptr.
+  // When the input is FP8 or NVFP4, the clamp has to be scaled by limit' = limit / dequantAb.
+  // If applied on SwiGlu, it will be:
+  //
+  //   x_glu = x_glu.clamp(min=None, max=limit)
+  //   x_linear = x_linear.clamp(min=-limit, max=limit)
+  //
+  // The given clamp limit applies to the dequantized values, so the order of operations would
+  // look something like this:
+  //
+  //   x0 = x0 * dqAb
+  //   x0 = clamp(x0, none, limit)
+  //   x0 = x0 * sigmoid(alpha * x0)
+  //   x1 = dqAb * x1
+  //   x1 = clamp(x1, -limit, limit)
+  //   out = qC * (x1 + beta) * x0
+  //
+  // Given that dqAb and qC are combined into scaleC, we can fold dqAb into the clamp
+  // limit and apply the clamping prior to dequantization:
+  //
+  //   x0 = clamp(x0, none, limit / dqAb)
+  //   x0 = x0 * dqAb
+  //   x0 = x0 * sigmoid(alpha * x0)
+  //   x1 = clamp(x1, -limit / dqAb, limit / dqAb)
+  //   scaleC = dqAb * qC
+  //   beta' = beta / dqAb
+  //   out = scaleC * (x1 + beta') * x0
+  //
+  // Note that this assumes scaleAb == scaleGate, which is true in the TRT-LLM MoE use-case.
+  //
+  float const* mPtrClampLimit{nullptr};
+
   // The alpha and beta for SwiGlu.
   // gatedActivation <- (x0 + beta) * activation(x1, alpha)
   // Shape is [B].
   // Alpha is 1.f if nullptr.
   // Beta is 0.f if nullptr.
+  // The formula:
+  //
+  //   out_glu = x_glu * torch.sigmoid(alpha * x_glu) * (x_linear + beta)
   float const* mPtrSwiGluAlpha{nullptr};
   float const* mPtrSwiGluBeta{nullptr};
 
@@ -630,9 +667,10 @@ int32_t BatchedGemmInterface::run(BatchedGemmConfig const& config, void* workspa
         batchedGemmData.mInputBuffers.mPtrSfB, batchedGemmData.mInputBuffers.mPtrPerTokenSfA,
         batchedGemmData.mInputBuffers.mPtrPerTokenSfB, batchedGemmData.mInputBuffers.mPtrBias,
         batchedGemmData.mOutputBuffers.mPtrSfC, batchedGemmData.mInputBuffers.mPtrScaleC,
-        batchedGemmData.mInputBuffers.mPtrScaleGate, batchedGemmData.mInputBuffers.mPtrSwiGluAlpha,
-        batchedGemmData.mInputBuffers.mPtrSwiGluBeta, batchedGemmData.mInputBuffers.mPtrRouteMap,
-        dPtrRowMax, dPtrRowMaxBars, batchedGemmData.mInputBuffers.mPtrNumNonExitingCtas,
+        batchedGemmData.mInputBuffers.mPtrScaleGate, batchedGemmData.mInputBuffers.mPtrClampLimit,
+        batchedGemmData.mInputBuffers.mPtrSwiGluAlpha, batchedGemmData.mInputBuffers.mPtrSwiGluBeta,
+        batchedGemmData.mInputBuffers.mPtrRouteMap, dPtrRowMax, dPtrRowMaxBars,
+        batchedGemmData.mInputBuffers.mPtrNumNonExitingCtas,
         batchedGemmData.mInputBuffers.mPtrTotalNumPaddedTokens,
         batchedGemmData.mInputBuffers.mPtrCtaIdxXyToBatchIdx,
         batchedGemmData.mInputBuffers.mPtrCtaIdxXyToMnLimit, maxNumCtasInBatchDim);
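
The clamp-folding argument in the comment above is easy to check numerically. Below is a minimal PyTorch sketch of the two orderings; the values of dqAb, qC, alpha, beta, and limit are made-up examples, and the kernel of course operates on quantized tensors with all of these steps fused:

import torch

torch.manual_seed(0)
x0 = 10.0 * torch.randn(8)  # gate branch accumulator, pre-dequantization
x1 = 10.0 * torch.randn(8)  # linear branch accumulator, pre-dequantization
dqAb, qC, alpha, beta, limit = 0.5, 2.0, 1.702, 1.0, 7.0

# Reference ordering: dequantize first, then clamp with the given limit.
x_glu = (x0 * dqAb).clamp(max=limit)
x_lin = (x1 * dqAb).clamp(min=-limit, max=limit)
ref = qC * (x_lin + beta) * (x_glu * torch.sigmoid(alpha * x_glu))

# Folded ordering: clamp with limit / dqAb before dequantization, then fold
# dqAb into scaleC = dqAb * qC and beta' = beta / dqAb.
y0 = x0.clamp(max=limit / dqAb) * dqAb
y1 = x1.clamp(min=-limit / dqAb, max=limit / dqAb)
out = (dqAb * qC) * (y1 + beta / dqAb) * (y0 * torch.sigmoid(alpha * y0))

assert torch.allclose(ref, out)  # equal up to float rounding

The two orderings agree because dqAb > 0, so scaling commutes with clamping: clamp(x * dqAb, limit) == clamp(x, limit / dqAb) * dqAb.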

include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmOptions.h

Lines changed: 39 additions & 25 deletions

@@ -90,10 +90,10 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions {
       bool usePerTokenSfB, bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps,
       tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC,
       int32_t sfReshapeFactor, gemm::TileScheduler tileScheduler, gemmGatedAct::ActType actType,
-      std::vector<int> batchedM, std::vector<int> batchedN, BatchMode batchMode, int numBatches,
-      bool isStaticBatch, int numTokens, RouteImpl routeImpl, bool gridWaitForPrimaryRouting,
-      bool fusedAct, int numRegsPerThreadNonEpilogueWarp, int numRegsPerThreadEpilogueWarp,
-      int numRegsCastAWarps)
+      bool clampBeforeAct, std::vector<int> batchedM, std::vector<int> batchedN,
+      BatchMode batchMode, int numBatches, bool isStaticBatch, int numTokens, RouteImpl routeImpl,
+      bool gridWaitForPrimaryRouting, bool fusedAct, int numRegsPerThreadNonEpilogueWarp,
+      int numRegsPerThreadEpilogueWarp, int numRegsCastAWarps, bool useTmaOobOpt)
       : gemmGatedAct::GemmGatedActOptions(
             gemm::GemmOptions(
                 allReduceAlgo, biasType, blockK, clusterDimX, clusterDimY, clusterDimZ, dtypeAcc,

@@ -109,48 +109,49 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions {
                 useCustomMmaSchedule, useHoistTryWaitForCustomMmaSchedule, useDeepSeekFp8,
                 usePerTokenSfA, usePerTokenSfB, useTmaStore, useTwoTmaLoadWarps, useTwoMmaWarps,
                 sfLayoutA, sfLayoutB, sfLayoutC, sfReshapeFactor, tileScheduler),
-            actType),
+            actType, clampBeforeAct),
         mBatchedM(batchedM),
         mBatchedN(batchedN),
         mBatchMode(BatchMode(batchMode)),
-        mNumBatches(numBatches),
-        mIsStaticBatch(isStaticBatch),
-        mNumTokens(numTokens),
-        mRouteImpl(routeImpl),
-        mGridWaitForPrimaryRouting(gridWaitForPrimaryRouting),
         mFusedAct(fusedAct),
+        mGridWaitForPrimaryRouting(gridWaitForPrimaryRouting),
+        mIsStaticBatch(isStaticBatch),
+        mNumBatches(numBatches),
         mNumRegsPerThreadNonEpilogueWarp(numRegsPerThreadNonEpilogueWarp),
         mNumRegsPerThreadEpilogueWarp(numRegsPerThreadEpilogueWarp),
-        mNumRegsCastAWarps(numRegsCastAWarps) {}
+        mNumRegsCastAWarps(numRegsCastAWarps),
+        mNumTokens(numTokens),
+        mRouteImpl(routeImpl),
+        mUseTmaOobOpt(useTmaOobOpt) {}
 
   // Batched M-dimensions of GEMM.
   std::vector<int> mBatchedM;
   // Batched N-dimensions of GEMM.
   std::vector<int> mBatchedN;
   // Whether batching M or N.
   BatchMode mBatchMode{BatchMode::BatchM};
-  // Number of Gemm batches.
-  int mNumBatches;
-
-  // Whether the batch size is static (i.e. known at kernel launch time).
-  bool mIsStaticBatch{true};
-  // Total number of tokens.
-  int mNumTokens{32};
-  // Whether load the input tokens and do routing.
-  RouteImpl mRouteImpl{RouteImpl::NoRoute};
+  // Whether to perform a fused gated activation.
+  bool mFusedAct{false};
   // Whether the loads that load from ptrRouteMap, ptrTotalNumPaddedTokens,
   // ptrCtaIdxXyToBatchIdx, etc. should wait on a grid dependency.
   bool mGridWaitForPrimaryRouting{true};
-
-  // Whether to perform a fused gated activation.
-  bool mFusedAct{false};
-
+  // Whether the batch size is static (i.e. known at kernel launch time).
+  bool mIsStaticBatch{true};
+  // Number of GEMM batches.
+  int mNumBatches;
   // Number of registers per thread for non-epilogue warps.
   int mNumRegsPerThreadNonEpilogueWarp{0};
   // Number of registers per thread for epilogue warps.
   int mNumRegsPerThreadEpilogueWarp{0};
   // Number of registers for the cast-A warps.
   int mNumRegsCastAWarps{0};
+  // Total number of tokens.
+  int mNumTokens{32};
+  // Whether to load the input tokens and do routing.
+  RouteImpl mRouteImpl{RouteImpl::NoRoute};
+  // Whether to use the TMA out-of-bounds optimization to reduce wasted traffic. See details in
+  // BatchedGemm/KernelParamsDecl.h.
+  bool mUseTmaOobOpt{false};
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -159,6 +160,16 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions {
 bool checkAndUpdateBatchedGemmOptions(BatchedGemmOptions& options, bool isBlackwell,
                                       bool updateOptions = true) {
   bool isValid = true;
+  if (options.mUseTmaOobOpt && !options.mUseTwoTmaLoadWarps) {
+    if (updateOptions) {
+      // Any routing (mRouteAct != NoRoute) requires mUseTwoTmaLoadWarps == true, so a
+      // single TMA load warp is not the target use case for the OOB optimization.
+      options.mUseTmaOobOpt = false;
+    } else {
+      TLLM_CHECK_ERROR(false, "TMA OOB optimization requires two TMA load warps.");
+      return false;
+    }
+  }
   if (options.mFusedAct) {
     // Ensure that we check the fused options as well.
     isValid = gemmGatedAct::checkAndUpdateGemmGatedActOptions(options, isBlackwell, updateOptions);

@@ -302,6 +313,8 @@ struct BatchedGemmConfig {
   // defined. In this case, the cubins will be loaded from the provided data and function name.
   // Otherwise, the kernel will be loaded from the CudaRunner.
 #ifdef TLLM_GEN_EXPORT_INTERFACE
+  uint8_t const* mData{nullptr};
+  uint32_t const mSize{0};
   uint32_t const mSharedMemSize{0};
   char const* mFunctionName{nullptr};
   uint32_t const mNumThreadsPerCTA{0};

@@ -334,7 +347,8 @@ inline std::string dumpOptions(BatchedGemmOptions const& options) {
      << std::endl;
   ss << "mNumRegsPerThreadEpilogueWarp=" << options.mNumRegsPerThreadEpilogueWarp << ","
      << std::endl;
-  ss << "mNumRegsCastAWarps=" << options.mNumRegsCastAWarps << std::endl;
+  ss << "mNumRegsCastAWarps=" << options.mNumRegsCastAWarps << "," << std::endl;
+  ss << "mUseTmaOobOpt=" << options.mUseTmaOobOpt << std::endl;
   return ss.str();
 }
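
With this change and the matching one in GemmGatedActOptions.h below, each dumped field line ends in a comma except the last, so the tail of the dumped string now looks roughly like this (illustrative values, not real output):

mNumRegsPerThreadEpilogueWarp=0,
mNumRegsCastAWarps=0,
mUseTmaOobOpt=0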

include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/GemmGatedActOptions.h

Lines changed: 6 additions & 3 deletions

@@ -91,11 +91,13 @@ inline std::string getActTypeName(ActType type) {
 
 struct GemmGatedActOptions : public gemm::GemmOptions {
   GemmGatedActOptions() = default;
-  GemmGatedActOptions(gemm::GemmOptions options, ActType actType)
-      : gemm::GemmOptions(options), mActType(actType) {}
+  GemmGatedActOptions(gemm::GemmOptions options, ActType actType, bool clampBeforeAct)
+      : gemm::GemmOptions(options), mActType(actType), mClampBeforeAct(clampBeforeAct) {}
 
   // Type of the gated activation.
   ActType mActType{ActType::SwiGlu};
+  // Clamp the dequantized values to the range [-limit, limit].
+  bool mClampBeforeAct{false};
 };

@@ -156,8 +158,9 @@ inline bool checkAndUpdateGemmGatedActOptions(gemmGatedAct::GemmGatedActOptions&
 inline std::string dumpOptions(GemmGatedActOptions const& options) {
   std::stringstream ss;
   ss << gemm::dumpOptions(options) << ", ";
-  ss << "mActType=" << "gemmGatedAct::ActType(" << static_cast<int32_t>(options.mActType) << ")"
+  ss << "mActType=" << "gemmGatedAct::ActType(" << static_cast<int32_t>(options.mActType) << "),"
      << std::endl;
+  ss << "mClampBeforeAct=" << options.mClampBeforeAct << std::endl;
   return ss.str();
 }

include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/KernelParams.h

Lines changed: 13 additions & 2 deletions

@@ -212,6 +212,15 @@ struct KernelParams {
   // Shape is [B]. One scaling factor per tensor in batch.
   float const* ptrScaleGate{nullptr};
 
+  // The clamp limit before the activation.
+  // Shape is [1].
+  // Clamp is INF if nullptr.
+  // If applied on SwiGlu, it will be:
+  //
+  //   x_glu = x_glu.clamp(min=None, max=limit)
+  //   x_linear = x_linear.clamp(min=-limit, max=limit)
+  float const* ptrClampLimit{nullptr};
+
   // The alpha and beta for SwiGlu.
   // Shape is [B]. One alpha and one beta per tensor in batch.
   // Alpha is 1.f if nullptr.

@@ -695,8 +704,8 @@
       GemmOptions_ const& options, bool const batchM, void const* ptrA, void const* ptrB,
       void* ptrC, void const* dSfA, void const* dSfB, void const* ptrPerTokenSfA,
       void const* ptrPerTokenSfB, void const* ptrBias, void* dSfC, float const* ptrScaleC,
-      float const* ptrScaleGate, float const* ptrSwiGluAlpha, float const* ptrSwiGluBeta,
-      int32_t const* routeMap, float* rowMax, uint32_t* rowMaxBars,
+      float const* ptrScaleGate, float const* ptrClampLimit, float const* ptrSwiGluAlpha,
+      float const* ptrSwiGluBeta, int32_t const* routeMap, float* rowMax, uint32_t* rowMaxBars,
       int32_t const* ptrNumNonExitingCtas = nullptr,
       int32_t const* ptrTotalNumPaddedTokens = nullptr,
       int32_t const* ptrCtaIdxXyToBatchIdx = nullptr, int32_t const* ptrCtaIdxXyToMnLimit = nullptr,

@@ -713,6 +722,8 @@
     params.ptrScaleC = ptrScaleC;
     params.ptrScaleGate = ptrScaleGate;
 
+    params.ptrClampLimit = ptrClampLimit;
+
     params.ptrSwiGluAlpha = ptrSwiGluAlpha;
     params.ptrSwiGluBeta = ptrSwiGluBeta;
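
Putting the documented nullptr defaults together (clamp limit of INF, alpha of 1.f, beta of 0.f), the activation these parameters feed corresponds to roughly the following reference; a sketch of the math under those assumptions, not the kernel code:

import torch

def swiglu_clamp_ref(x_glu: torch.Tensor, x_linear: torch.Tensor,
                     alpha: float = 1.0, beta: float = 0.0,
                     limit: float | None = None) -> torch.Tensor:
    # limit=None mirrors ptrClampLimit == nullptr (clamp is INF); the alpha and
    # beta defaults mirror ptrSwiGluAlpha/ptrSwiGluBeta == nullptr.
    if limit is not None:
        x_glu = x_glu.clamp(max=limit)
        x_linear = x_linear.clamp(min=-limit, max=limit)
    return x_glu * torch.sigmoid(alpha * x_glu) * (x_linear + beta)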
