Skip to content

Commit 8a3b870

Browse files
authored
[None][feat] Update TRTLLM MoE MxFP4 cubins; autotune tileN (#8156)
Signed-off-by: Anthony Chang <27950904+rosenrodt@users.noreply.github.com>
1 parent 15de45d commit 8a3b870

File tree

1,240 files changed

+8406
-10379
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,240 files changed

+8406
-10379
lines changed

cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -144,12 +144,6 @@ TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(TrtllmGenBatchedGemmRunne
144144
}
145145
}
146146

147-
// FIXME: Disable split-k for now.
148-
if (options.mClusterDimZ != 1)
149-
{
150-
continue;
151-
}
152-
153147
if (options.mFusedAct)
154148
{
155149
if (options.mActType != static_cast<batchedGemm::gemmGatedAct::ActType>(mOptions.actType))
@@ -158,14 +152,29 @@ TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(TrtllmGenBatchedGemmRunne
158152
}
159153
}
160154

155+
// FIXME: Disables a few static scheduler kernels (schedS) that appear to have issues;
156+
// found after commit e257cb3533; still under investigation. Offending kernels:
157+
// bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a
158+
// bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f
159+
if (options.mTileScheduler == TileScheduler::Static && options.mUseTmaOobOpt == true
160+
&& options.mTileN == 64)
161+
{
162+
continue;
163+
}
164+
161165
if (mOptions.transposeMmaOutput && options.mEpilogueTileM == mOptions.epilogueTileM)
162166
{
163167
mPassingConfigIndices.push_back(i);
164168
}
165169
}
166170
}
167171

168-
TLLM_CHECK_WITH_INFO(!mPassingConfigIndices.empty(), "No kernel found for the given options");
172+
TLLM_CHECK_WITH_INFO(!mPassingConfigIndices.empty(),
173+
"No kernel found for the given options: mDtypeA: %s, mDtypeB: %s, mDtypeC: %s, mUseDeepSeekFp8: %d, "
174+
"mTransposeMmaOutput: %d, mRouteAct: %d, mFusedAct: %d, mIsStaticBatch: %d, mTileSize: %d",
175+
tg::dtypeToString(mOptions.dtypeA).c_str(), tg::dtypeToString(mOptions.dtypeB).c_str(),
176+
tg::dtypeToString(mOptions.dtypeC).c_str(), mOptions.deepSeekFp8, mOptions.transposeMmaOutput,
177+
mOptions.routeAct, mOptions.fusedAct, mOptions.staticBatch, mOptions.tileSize);
169178
}
170179

171180
size_t TrtllmGenBatchedGemmRunner::getWorkspaceSizeInBytes(int32_t m, int32_t n, int32_t k,
@@ -277,7 +286,8 @@ void TrtllmGenBatchedGemmRunner::run(int32_t m, int32_t n, int32_t k, std::vecto
277286
auto envVarVal = std::getenv("TLLM_BATCHED_GEMM_PRINT_NAME");
278287
if (envVarVal && std::atoi(envVarVal) == 1)
279288
{
280-
TLLM_LOG_INFO("numBatches %d Gemm %d %d %d Kernel %s\n", numBatches, m, n, k, config.mFunctionName);
289+
TLLM_LOG_INFO("NumBatches %d, MaxNumCtasInBatchDim %d, ShapeMNK %d %d %d, Kernel %s", numBatches,
290+
maxNumCtasInBatchDim, m, n, k, config.mFunctionName);
281291
}
282292
// FIXME: once we start using all-reduce in the epilogue of the bmm, this can be moved elsewhere
283293
bmm.runInitBeforeWorldSync(config, gemmData, static_cast<void*>(stream));

cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,12 @@ class TrtllmGenBatchedGemmRunner
7676
int32_t const* ctaIdxXyToBatchIdx, int32_t const* ctaIdxXyToMnLimit, int32_t const* numNonExitingCtas,
7777
void* workspace, CUstream stream, int device, int32_t configIndex);
7878

79-
// NVFP4 per-block scaling GEMM
79+
// Block-scaling GEMM
8080
void run(int32_t m, int32_t n, int32_t k, std::vector<int32_t> const& batchedTokens, void const* a, void const* sfA,
8181
void const* b, void const* sfB, void* c, void* outSfC, void* workspace, CUstream stream, int device,
8282
int32_t configIndex);
8383

84+
// Block-scaling GEMM with SwiGLU activation
8485
void run(int32_t m, int32_t n, int32_t k, std::vector<int32_t> const& batchedTokens, void const* a, void const* sfA,
8586
void const* b, void const* sfB, float const* bias, float const* swiGluAlpha, float const* swiGluBeta,
8687
float const* clampLimit, void* c, void* outSfC, void* workspace, CUstream stream, int device,

cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,9 @@ class BatchedGemmInterface
530530
return std::make_tuple(numCtasBatch, numCtasTile, numCtasInner);
531531
}
532532

533+
// Creates GemmOptions from kernel and data.
534+
BatchedGemmOptions getOptionsFromConfigAndData(BatchedGemmConfig const& config, BatchedGemmData const& data) const;
535+
533536
// Returns the number of CTAs of the current kernel.
534537
int32_t getNumCtas(
535538
BatchedGemmOptions const& options, std::optional<int32_t> maxNumCtasInBatchDim = std::nullopt) const
@@ -541,9 +544,6 @@ class BatchedGemmInterface
541544
// Returns true if the configuration of the cubin can be executed for the given params.
542545
bool isValidConfig(BatchedGemmConfig const& config, BatchedGemmData const& data) const;
543546

544-
// Creates GemmOptions from kernel and data.
545-
BatchedGemmOptions getOptionsFromConfigAndData(BatchedGemmConfig const& config, BatchedGemmData const& data) const;
546-
547547
private:
548548
// Aligns the pointer to the alignment
549549
template <typename Dtype>

0 commit comments

Comments
 (0)