@@ -144,12 +144,6 @@ TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(TrtllmGenBatchedGemmRunne
144144 }
145145 }
146146
147- // FIXME: Disable split-k for now.
148- if (options.mClusterDimZ != 1 )
149- {
150- continue ;
151- }
152-
153147 if (options.mFusedAct )
154148 {
155149 if (options.mActType != static_cast <batchedGemm::gemmGatedAct::ActType>(mOptions .actType ))
@@ -158,14 +152,29 @@ TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(TrtllmGenBatchedGemmRunne
158152 }
159153 }
160154
155+ // FIXME: Disable a few static scheduler kernels (schedS) that appear to have issues;
156+ // found after commit e257cb3533; still under investigation. Offending kernels:
157+ // bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a
158+ // bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f
159+ if (options.mTileScheduler == TileScheduler::Static && options.mUseTmaOobOpt == true
160+ && options.mTileN == 64 )
161+ {
162+ continue ;
163+ }
164+
161165 if (mOptions .transposeMmaOutput && options.mEpilogueTileM == mOptions .epilogueTileM )
162166 {
163167 mPassingConfigIndices .push_back (i);
164168 }
165169 }
166170 }
167171
168- TLLM_CHECK_WITH_INFO (!mPassingConfigIndices .empty (), " No kernel found for the given options" );
172+ TLLM_CHECK_WITH_INFO (!mPassingConfigIndices .empty (),
173+ " No kernel found for the given options: mDtypeA: %s, mDtypeB: %s, mDtypeC: %s, mUseDeepSeekFp8: %d, "
174+ " mTransposeMmaOutput: %d, mRouteAct: %d, mFusedAct: %d, mIsStaticBatch: %d, mTileSize: %d" ,
175+ tg::dtypeToString (mOptions .dtypeA ).c_str (), tg::dtypeToString (mOptions .dtypeB ).c_str (),
176+ tg::dtypeToString (mOptions .dtypeC ).c_str (), mOptions .deepSeekFp8 , mOptions .transposeMmaOutput ,
177+ mOptions .routeAct , mOptions .fusedAct , mOptions .staticBatch , mOptions .tileSize );
169178}
170179
171180size_t TrtllmGenBatchedGemmRunner::getWorkspaceSizeInBytes (int32_t m, int32_t n, int32_t k,
@@ -277,7 +286,8 @@ void TrtllmGenBatchedGemmRunner::run(int32_t m, int32_t n, int32_t k, std::vecto
277286 auto envVarVal = std::getenv (" TLLM_BATCHED_GEMM_PRINT_NAME" );
278287 if (envVarVal && std::atoi (envVarVal) == 1 )
279288 {
280- TLLM_LOG_INFO (" numBatches %d Gemm %d %d %d Kernel %s\n " , numBatches, m, n, k, config.mFunctionName );
289+ TLLM_LOG_INFO (" NumBatches %d, MaxNumCtasInBatchDim %d, ShapeMNK %d %d %d, Kernel %s" , numBatches,
290+ maxNumCtasInBatchDim, m, n, k, config.mFunctionName );
281291 }
282292 // FIXME once we start using all-reduce in the epilogue of the bmm this can be moved elsewhere
283293 bmm.runInitBeforeWorldSync (config, gemmData, static_cast <void *>(stream));
0 commit comments