clean up

dongfengy · dongfengy · commit d2d086acb142 · 2025-11-13T19:00:20.000Z
Signed-off-by: Dongfeng Yu <dongfengy@nvidia.com>
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp
@@ -211,8 +211,6 @@ void TrtllmGenBatchedGemmRunner::run(int32_t m, int32_t n, int32_t k, std::vecto
     int32_t const* ctaIdxXyToMnLimit, int32_t const* numNonExitingCtas, void* workspace, CUstream stream, int device,
     int32_t configIndex)
 {
-    std::cout << "run 1 fixed" << std::endl;
-    std::cout << ptrBias << std::endl;
     auto bmm = BatchedGemmInterface();
 
     BatchedGemmData gemmData;
@@ -253,8 +251,8 @@ void TrtllmGenBatchedGemmRunner::run(int32_t m, int32_t n, int32_t k, std::vecto
     gemmData.mProblemDimensions.mK = k;
     gemmData.mProblemDimensions.mRank = 0;
     gemmData.mProblemDimensions.mWorldSize = 1;
-    gemmData.mProblemDimensions.mValidM = n;
-    gemmData.mProblemDimensions.mValidN = m;
+    gemmData.mProblemDimensions.mValidM = mOptions.transposeMmaOutput ? n : m;
+    gemmData.mProblemDimensions.mValidN = mOptions.transposeMmaOutput ? m : n;
     gemmData.mProblemDimensions.mValidK = k;
 
     // Inputs
@@ -310,8 +308,6 @@ void TrtllmGenBatchedGemmRunner::run(int32_t m, int32_t n, int32_t k, std::vecto
     void const* a, void const* sfA, void const* b, void const* sfB, void* c, void* outSfC, void* workspace,
     CUstream stream, int device, int32_t configIndex)
 {
-    std::cout << "run 2" << std::endl;
-    std::cout << "no bias" << std::endl;
     // Dispatch with block scaling factors and with static batching.
     run(m, n, k, batchedTokens, /* numTokens */ 0, batchedTokens.size(), /* maxNumCtasInBatchDim */ 0, a, sfA, b, sfB,
         /* perTokensSfA */ nullptr, /* perTokensSfB */ nullptr,
@@ -327,8 +323,6 @@ void TrtllmGenBatchedGemmRunner::run(int32_t m, int32_t n, int32_t k, std::vecto
     float const* ptrBeta, float const* ptrClampLimit, void* c, void* outSfC, void* workspace, CUstream stream,
     int device, int32_t configIndex)
 {
-    std::cout << "run 3" << std::endl;
-    std::cout << ptrBias << std::endl;
     // Dispatch with block scaling factors and with static batching.
     run(m, n, k, batchedTokens, /* numTokens */ 0, batchedTokens.size(), /* maxNumCtasInBatchDim */ 0, a, sfA, b, sfB,
         /* perTokensSfA */ nullptr, /* perTokensSfB */ nullptr,
@@ -342,8 +336,6 @@ void TrtllmGenBatchedGemmRunner::run(int32_t m, int32_t n, int32_t k, std::vecto
     void const* a, void const* b, float const* scaleC, float const* scaleGateC, void* c, void* workspace,
     CUstream stream, int device, int32_t configIndex)
 {
-    std::cout << "run 4" << std::endl;
-    std::cout << "no bias" << std::endl;
     // Dispatch with block scaling factors and with static batching.
     run(m, n, k, batchedTokens, /* numTokens */ 0, batchedTokens.size(), /* maxNumCtasInBatchDim */ 0, a,
         /* sfA */ nullptr, b, /* sfB */ nullptr, /* perTokensSfA */ nullptr, /* perTokensSfB */ nullptr, scaleC,
@@ -377,8 +369,8 @@ std::vector<int64_t> TrtllmGenBatchedGemmRunner::getValidConfigIndices(int32_t m
     gemmData.mProblemDimensions.mRank = 0;
     gemmData.mProblemDimensions.mWorldSize = 1;
     gemmData.mProblemDimensions.mMaxNumCtasInTokenDim = maxNumCtasInBatchDim;
-    gemmData.mProblemDimensions.mValidM = n;
-    gemmData.mProblemDimensions.mValidN = m;
+    gemmData.mProblemDimensions.mValidM = mOptions.transposeMmaOutput ? n : m;
+    gemmData.mProblemDimensions.mValidN = mOptions.transposeMmaOutput ? m : n;
     gemmData.mProblemDimensions.mValidK = k;
     auto cmpFunc = [&configs, &gemmData, &bmm, &multiProcessorCount](int64_t idx0, int64_t idx1)
     {
@@ -450,7 +442,6 @@ std::vector<int64_t> TrtllmGenBatchedGemmRunner::getValidConfigIndices(int32_t m
     std::vector<int64_t> validConfigIndices;
     for (auto const& configIndex : prioritizedIndices)
     {
-        std::cout << "checking config index " << configIndex << std::endl;
         auto const& config = configs[configIndex];
         auto isValidConfig = bmm.isValidConfig(config, gemmData);
         if (isValidConfig)
@@ -494,8 +485,8 @@ bool TrtllmGenBatchedGemmRunner::isValidConfigIndex(int32_t configIndex, int32_t
     gemmData.mProblemDimensions.mRank = 0;
     gemmData.mProblemDimensions.mWorldSize = 1;
     gemmData.mProblemDimensions.mMaxNumCtasInTokenDim = maxNumCtasInBatchDim;
-    gemmData.mProblemDimensions.mValidM = n;
-    gemmData.mProblemDimensions.mValidN = m;
+    gemmData.mProblemDimensions.mValidM = mOptions.transposeMmaOutput ? n : m;
+    gemmData.mProblemDimensions.mValidN = mOptions.transposeMmaOutput ? m : n;
     gemmData.mProblemDimensions.mValidK = k;
 
     auto const& config = configs[configIndex];
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h
@@ -209,13 +209,11 @@ inline bool checkAndUpdateBatchedGemmOptions(
     }
     if (options.mFusedAct)
     {
-        std::cout << "checking fused act options" << std::endl;
         // ensure that we check the fused options as well
         isValid = gemmGatedAct::checkAndUpdateGemmGatedActOptions(options, cudaArch, updateOptions);
     }
     else
     {
-        std::cout << "checking gemm options" << std::endl;
         isValid = gemm::checkAndUpdateGemmOptions(options, cudaArch, 1 /* tpGrpSize */, updateOptions);
     }
 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmGatedActOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmGatedActOptions.h
@@ -161,11 +161,8 @@ inline bool checkAndUpdateGemmGatedActOptions(
             ") must be a multiple of ", hiddenGranularity, " for block-scaled outputs.");
     }
 
-    std::cout << "checking gemm options instead" << std::endl;
     auto isValid = gemm::checkAndUpdateGemmOptions(options, cudaArch,
         /* tpGrpSize */ 1, updateOptions);
-    std::cout << "finished checking gemm options" << std::endl;
-    std::cout << "the result is " << isValid << std::endl;
 
     if (!isValid)
     {
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h
@@ -629,7 +629,6 @@ inline int32_t getShuffleBlockSize(int epilogueTileM)
 inline bool checkAndUpdateGemmOptions(
     GemmOptions& options, tg::CudaArch cudaArch, int tpGrpSize, bool updateOptions = true)
 {
-    std::cout << "Checking GemmOptions..." << std::endl;
     options.mWorldSize = tpGrpSize;
 
     bool isBlackwell = tg::isArchBlackwell(cudaArch);
@@ -642,11 +641,9 @@ inline bool checkAndUpdateGemmOptions(
         }
         else
         {
-            std::cout << "failed at dtypeB" << std::endl;
             return false;
         }
     }
-    std::cout << "ckpt 0" << std::endl;
 
     // If not specified, used the input dtypes as MMA dtypes (no cast required).
     if (options.mDtypeMmaA == tg::Dtype::Void)
@@ -657,7 +654,6 @@ inline bool checkAndUpdateGemmOptions(
         }
         else
         {
-            std::cout << "failed at dtypeMmaA" << std::endl;
             return false;
         }
     }
@@ -669,7 +665,6 @@ inline bool checkAndUpdateGemmOptions(
         }
         else
         {
-            std::cout << "failed at dtypeMmaB" << std::endl;
             return false;
         }
     }
@@ -691,13 +686,8 @@ inline bool checkAndUpdateGemmOptions(
     // It must not exceed the padded dimensions.
     if (options.mValidM > options.mM || options.mValidN > options.mN || options.mValidK > options.mK)
     {
-        std::cout << "test validM/N/K start" << std::endl;
-        std::cout << "options.mValidM=" << options.mValidM << ", options.mM=" << options.mM << std::endl;
-        std::cout << "options.mValidN=" << options.mValidN << ", options.mN=" << options.mN << std::endl;
-        std::cout << "options.mValidK=" << options.mValidK << ", options.mK=" << options.mK << std::endl;
         TLLM_LOG_WARNING(options.mValidK <= options.mK,
             "ValidM, ValidN, and ValidK must be less than or equal to M, N, and K respectively.");
-        std::cout << "test validM/N/K start2" << std::endl;
         if (updateOptions)
         {
             options.mValidM = std::min(options.mValidM, options.mM);
@@ -706,7 +696,6 @@ inline bool checkAndUpdateGemmOptions(
         }
         else
         {
-            std::cout << "failed at validM/N/K" << std::endl;
             return false;
         }
     }
@@ -717,12 +706,10 @@ inline bool checkAndUpdateGemmOptions(
         bool hasValidParams = (options.mValidM != -1 && options.mValidM != options.mM)
             || (options.mValidN != -1 && options.mValidN != options.mN)
             || (options.mValidK != -1 && options.mValidK != options.mK);
-        std::cout << "test BlockMajorK start" << std::endl;
         TLLM_CHECK_ERROR(!hasValidParams,
             "BlockMajorK layout does not support validM/validN/validK parameters due to swizzled layout. "
             "Found validM=",
             options.mValidM, " validN=", options.mValidN, " validK=", options.mValidK);
-        std::cout << "test BlockMajorK start2" << std::endl;
     }
 
 #ifdef TLLM_PUBLIC_RELEASE
@@ -731,7 +718,6 @@ inline bool checkAndUpdateGemmOptions(
         TLLM_CHECK_ERROR(false, "E2m1 x E4m3 is not supported for JIT compile. Use cubins instead.");
     }
 #endif // TLLM_PUBLIC_RELEASE
-    std::cout << "ckpt 1" << std::endl;
     // Check that the A cast is supported.
     // Currently, we only support {MxFp4, NvFp4} -> Bf16.
     TLLM_CHECK_ERROR((options.mDtypeA == options.mDtypeMmaA)
@@ -775,7 +761,6 @@ inline bool checkAndUpdateGemmOptions(
         TLLM_CHECK_ERROR(options.mDtypeMmaB == tg::Dtype::E4m3 || options.mDtypeMmaB == tg::Dtype::E2m1,
             "For dtypeMmaA = E4m3/E2m1 A, dtypeMmaB must also be E4m3/E2m1.");
     }
-    std::cout << "ckpt 2" << std::endl;
     // kind::mxf8f6f4
     if (options.mDtypeMmaA == tg::Dtype::MxE4m3 || options.mDtypeMmaA == tg::Dtype::MxE2m1)
     {
@@ -787,7 +772,6 @@ inline bool checkAndUpdateGemmOptions(
         TLLM_CHECK_ERROR(options.mDtypeMmaA == tg::Dtype::MxE4m3 || options.mDtypeMmaA == tg::Dtype::MxE2m1,
             "For dtypeMmaB = MxE4m3 or MxE2m1, dtypeMmaA must also be MxE4m3 or MxE2m1.");
     }
-    std::cout << "ckpt 3" << std::endl;
     // kind::f16
     if (options.mDtypeMmaA == tg::Dtype::Fp16 || options.mDtypeMmaA == tg::Dtype::Bfloat16)
     {
@@ -819,7 +803,6 @@ inline bool checkAndUpdateGemmOptions(
         }
         else
         {
-            std::cout << "failed at mmaKind" << std::endl;
             return false;
         }
     }
@@ -836,7 +819,6 @@ inline bool checkAndUpdateGemmOptions(
         }
         else
         {
-            std::cout << "failed at mmaK" << std::endl;
             return false;
         }
     }
@@ -867,7 +849,6 @@ inline bool checkAndUpdateGemmOptions(
             "Hopper does not use TMEM. The register layout corresponds to 16dp256bit. Got ", options.mEpilogueLdtmDps,
             "dp", options.mEpilogueLdtmBits, "bit.");
     }
-    std::cout << "ckpt 4" << std::endl;
     // Constraints for NvFp4 and MxFp8.
     if ((options.mMmaKind == tg::MmaKind::MxFp4NvFp4 || options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4
             || options.mDtypeC == tg::Dtype::MxE4m3)
@@ -887,7 +868,6 @@ inline bool checkAndUpdateGemmOptions(
             }
             else
             {
-                std::cout << "failed at mmaM" << std::endl;
                 return false;
             }
         }
@@ -932,7 +912,6 @@ inline bool checkAndUpdateGemmOptions(
             }
             else
             {
-                std::cout << "failed at mmaK" << std::endl;
                 return false;
             }
         }
@@ -1039,7 +1018,6 @@ inline bool checkAndUpdateGemmOptions(
         }
         else
         {
-            std::cout << "failed at dtypeC" << std::endl;
             return false;
         }
     }
@@ -1055,7 +1033,6 @@ inline bool checkAndUpdateGemmOptions(
         }
         else
         {
-            std::cout << "failed at epilogueTileM" << std::endl;
             return false;
         }
     }
@@ -1070,7 +1047,6 @@ inline bool checkAndUpdateGemmOptions(
         }
         else
         {
-            std::cout << "failed at epilogueTileN" << std::endl;
             return false;
         }
     }
@@ -1086,7 +1062,6 @@ inline bool checkAndUpdateGemmOptions(
         }
         else
         {
-            std::cout << "failed at epilogueTileM/N" << std::endl;
             return false;
         }
     }
@@ -1101,7 +1076,6 @@ inline bool checkAndUpdateGemmOptions(
         }
         else
         {
-            std::cout << "failed at epilogueTileM" << std::endl;
             return false;
         }
     }
@@ -1222,7 +1196,6 @@ inline bool checkAndUpdateGemmOptions(
             }
             else
             {
-                std::cout << "failed at epilogueTileM/N" << std::endl;
                 return false;
             }
         }
@@ -1246,7 +1219,6 @@ inline bool checkAndUpdateGemmOptions(
         }
         else
         {
-            std::cout << "failed at mmaStages" << std::endl;
             return false;
         }
     }
@@ -1258,7 +1230,6 @@ inline bool checkAndUpdateGemmOptions(
         }
         else
         {
-            std::cout << "failed at mmaStages" << std::endl;
             return false;
         }
     }
@@ -1270,7 +1241,6 @@ inline bool checkAndUpdateGemmOptions(
         }
         else
         {
-            std::cout << "failed at mmaStages" << std::endl;
             return false;
         }
     }
@@ -1367,7 +1337,6 @@ inline bool checkAndUpdateGemmOptions(
             }
             else
             {
-                std::cout << "failed at tileM" << std::endl;
                 return false;
             }
         }
@@ -1382,7 +1351,6 @@ inline bool checkAndUpdateGemmOptions(
             }
             else
             {
-                std::cout << "failed at numSlicesForSliceK" << std::endl;
                 return false;
             }
         }
@@ -1427,7 +1395,6 @@ inline bool checkAndUpdateGemmOptions(
             }
             else
             {
-                std::cout << "failed at unrollLoop2xForMma" << std::endl;
                 return false;
             }
         }
@@ -1448,7 +1415,6 @@ inline bool checkAndUpdateGemmOptions(
         }
         else
         {
-            std::cout << "failed at tileScheduler" << std::endl;
             return false;
         }
     }
@@ -1464,7 +1430,6 @@ inline bool checkAndUpdateGemmOptions(
         }
         else
         {
-            std::cout << "failed at earlyExit" << std::endl;
             return false;
         }
     }
@@ -1552,7 +1517,6 @@ inline bool checkAndUpdateGemmOptions(
             }
             else
             {
-                std::cout << "failed at blockK" << std::endl;
                 return false;
             }
         }
diff --git a/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp b/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp
diff --git a/tests/unittest/_torch/thop/parallel/test_moe.py b/tests/unittest/_torch/thop/parallel/test_moe.py

Original file line number	Diff line number	Diff line change
`@@ -209,13 +209,11 @@ inline bool checkAndUpdateBatchedGemmOptions(`
`209`	`209`	`}`
`210`	`210`	`if (options.mFusedAct)`
`211`	`211`	`{`
`212`		`- std::cout << "checking fused act options" << std::endl;`
`213`	`212`	`// ensure that we check the fused options as well`
`214`	`213`	`isValid = gemmGatedAct::checkAndUpdateGemmGatedActOptions(options, cudaArch, updateOptions);`
`215`	`214`	`}`
`216`	`215`	`else`
`217`	`216`	`{`
`218`		`- std::cout << "checking gemm options" << std::endl;`
`219`	`217`	`isValid = gemm::checkAndUpdateGemmOptions(options, cudaArch, 1 /* tpGrpSize */, updateOptions);`
`220`	`218`	`}`
`221`	`219`
Original file line number	Diff line number	Diff line change
`@@ -161,11 +161,8 @@ inline bool checkAndUpdateGemmGatedActOptions(`
`161`	`161`	`") must be a multiple of ", hiddenGranularity, " for block-scaled outputs.");`
`162`	`162`	`}`
`163`	`163`
`164`		`- std::cout << "checking gemm options instead" << std::endl;`
`165`	`164`	`auto isValid = gemm::checkAndUpdateGemmOptions(options, cudaArch,`
`166`	`165`	`/* tpGrpSize */ 1, updateOptions);`
`167`		`- std::cout << "finished checking gemm options" << std::endl;`
`168`		`- std::cout << "the result is " << isValid << std::endl;`
`169`	`166`
`170`	`167`	`if (!isValid)`
`171`	`168`	`{`
Original file line number	Diff line number	Diff line change
`@@ -629,7 +629,6 @@ inline int32_t getShuffleBlockSize(int epilogueTileM)`
`629`	`629`	`inline bool checkAndUpdateGemmOptions(`
`630`	`630`	`GemmOptions& options, tg::CudaArch cudaArch, int tpGrpSize, bool updateOptions = true)`
`631`	`631`	`{`
`632`		`- std::cout << "Checking GemmOptions..." << std::endl;`
`633`	`632`	`options.mWorldSize = tpGrpSize;`
`634`	`633`
`635`	`634`	`bool isBlackwell = tg::isArchBlackwell(cudaArch);`
`@@ -642,11 +641,9 @@ inline bool checkAndUpdateGemmOptions(`
`642`	`641`	`}`
`643`	`642`	`else`
`644`	`643`	`{`
`645`		`- std::cout << "failed at dtypeB" << std::endl;`
`646`	`644`	`return false;`
`647`	`645`	`}`
`648`	`646`	`}`
`649`		`- std::cout << "ckpt 0" << std::endl;`
`650`	`647`
`651`	`648`	`// If not specified, used the input dtypes as MMA dtypes (no cast required).`
`652`	`649`	`if (options.mDtypeMmaA == tg::Dtype::Void)`
`@@ -657,7 +654,6 @@ inline bool checkAndUpdateGemmOptions(`
`657`	`654`	`}`
`658`	`655`	`else`
`659`	`656`	`{`
`660`		`- std::cout << "failed at dtypeMmaA" << std::endl;`
`661`	`657`	`return false;`
`662`	`658`	`}`
`663`	`659`	`}`
`@@ -669,7 +665,6 @@ inline bool checkAndUpdateGemmOptions(`
`669`	`665`	`}`
`670`	`666`	`else`
`671`	`667`	`{`
`672`		`- std::cout << "failed at dtypeMmaB" << std::endl;`
`673`	`668`	`return false;`
`674`	`669`	`}`
`675`	`670`	`}`
`@@ -691,13 +686,8 @@ inline bool checkAndUpdateGemmOptions(`
`691`	`686`	`// It must not exceed the padded dimensions.`
`692`	`687`	`if (options.mValidM > options.mM \|\| options.mValidN > options.mN \|\| options.mValidK > options.mK)`
`693`	`688`	`{`
`694`		`- std::cout << "test validM/N/K start" << std::endl;`
`695`		`- std::cout << "options.mValidM=" << options.mValidM << ", options.mM=" << options.mM << std::endl;`
`696`		`- std::cout << "options.mValidN=" << options.mValidN << ", options.mN=" << options.mN << std::endl;`
`697`		`- std::cout << "options.mValidK=" << options.mValidK << ", options.mK=" << options.mK << std::endl;`
`698`	`689`	`TLLM_LOG_WARNING(options.mValidK <= options.mK,`
`699`	`690`	`"ValidM, ValidN, and ValidK must be less than or equal to M, N, and K respectively.");`
`700`		`- std::cout << "test validM/N/K start2" << std::endl;`
`701`	`691`	`if (updateOptions)`
`702`	`692`	`{`
`703`	`693`	`options.mValidM = std::min(options.mValidM, options.mM);`
`@@ -706,7 +696,6 @@ inline bool checkAndUpdateGemmOptions(`
`706`	`696`	`}`
`707`	`697`	`else`
`708`	`698`	`{`
`709`		`- std::cout << "failed at validM/N/K" << std::endl;`
`710`	`699`	`return false;`
`711`	`700`	`}`
`712`	`701`	`}`
`@@ -717,12 +706,10 @@ inline bool checkAndUpdateGemmOptions(`
`717`	`706`	`bool hasValidParams = (options.mValidM != -1 && options.mValidM != options.mM)`
`718`	`707`	`\|\| (options.mValidN != -1 && options.mValidN != options.mN)`
`719`	`708`	`\|\| (options.mValidK != -1 && options.mValidK != options.mK);`
`720`		`- std::cout << "test BlockMajorK start" << std::endl;`
`721`	`709`	`TLLM_CHECK_ERROR(!hasValidParams,`
`722`	`710`	`"BlockMajorK layout does not support validM/validN/validK parameters due to swizzled layout. "`
`723`	`711`	`"Found validM=",`
`724`	`712`	`options.mValidM, " validN=", options.mValidN, " validK=", options.mValidK);`
`725`		`- std::cout << "test BlockMajorK start2" << std::endl;`
`726`	`713`	`}`
`727`	`714`
`728`	`715`	`#ifdef TLLM_PUBLIC_RELEASE`
`@@ -731,7 +718,6 @@ inline bool checkAndUpdateGemmOptions(`
`731`	`718`	`TLLM_CHECK_ERROR(false, "E2m1 x E4m3 is not supported for JIT compile. Use cubins instead.");`
`732`	`719`	`}`
`733`	`720`	`#endif // TLLM_PUBLIC_RELEASE`
`734`		`- std::cout << "ckpt 1" << std::endl;`
`735`	`721`	`// Check that the A cast is supported.`
`736`	`722`	`// Currently, we only support {MxFp4, NvFp4} -> Bf16.`
`737`	`723`	`TLLM_CHECK_ERROR((options.mDtypeA == options.mDtypeMmaA)`
`@@ -775,7 +761,6 @@ inline bool checkAndUpdateGemmOptions(`
`775`	`761`	`TLLM_CHECK_ERROR(options.mDtypeMmaB == tg::Dtype::E4m3 \|\| options.mDtypeMmaB == tg::Dtype::E2m1,`
`776`	`762`	`"For dtypeMmaA = E4m3/E2m1 A, dtypeMmaB must also be E4m3/E2m1.");`
`777`	`763`	`}`
`778`		`- std::cout << "ckpt 2" << std::endl;`
`779`	`764`	`// kind::mxf8f6f4`
`780`	`765`	`if (options.mDtypeMmaA == tg::Dtype::MxE4m3 \|\| options.mDtypeMmaA == tg::Dtype::MxE2m1)`
`781`	`766`	`{`
`@@ -787,7 +772,6 @@ inline bool checkAndUpdateGemmOptions(`
`787`	`772`	`TLLM_CHECK_ERROR(options.mDtypeMmaA == tg::Dtype::MxE4m3 \|\| options.mDtypeMmaA == tg::Dtype::MxE2m1,`
`788`	`773`	`"For dtypeMmaB = MxE4m3 or MxE2m1, dtypeMmaA must also be MxE4m3 or MxE2m1.");`
`789`	`774`	`}`
`790`		`- std::cout << "ckpt 3" << std::endl;`
`791`	`775`	`// kind::f16`
`792`	`776`	`if (options.mDtypeMmaA == tg::Dtype::Fp16 \|\| options.mDtypeMmaA == tg::Dtype::Bfloat16)`
`793`	`777`	`{`
`@@ -819,7 +803,6 @@ inline bool checkAndUpdateGemmOptions(`
`819`	`803`	`}`
`820`	`804`	`else`
`821`	`805`	`{`
`822`		`- std::cout << "failed at mmaKind" << std::endl;`
`823`	`806`	`return false;`
`824`	`807`	`}`
`825`	`808`	`}`
`@@ -836,7 +819,6 @@ inline bool checkAndUpdateGemmOptions(`
`836`	`819`	`}`
`837`	`820`	`else`
`838`	`821`	`{`
`839`		`- std::cout << "failed at mmaK" << std::endl;`
`840`	`822`	`return false;`
`841`	`823`	`}`
`842`	`824`	`}`
`@@ -867,7 +849,6 @@ inline bool checkAndUpdateGemmOptions(`
`867`	`849`	`"Hopper does not use TMEM. The register layout corresponds to 16dp256bit. Got ", options.mEpilogueLdtmDps,`
`868`	`850`	`"dp", options.mEpilogueLdtmBits, "bit.");`
`869`	`851`	`}`
`870`		`- std::cout << "ckpt 4" << std::endl;`
`871`	`852`	`// Constraints for NvFp4 and MxFp8.`
`872`	`853`	`if ((options.mMmaKind == tg::MmaKind::MxFp4NvFp4 \|\| options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4`
`873`	`854`	`\|\| options.mDtypeC == tg::Dtype::MxE4m3)`
`@@ -887,7 +868,6 @@ inline bool checkAndUpdateGemmOptions(`
`887`	`868`	`}`
`888`	`869`	`else`
`889`	`870`	`{`
`890`		`- std::cout << "failed at mmaM" << std::endl;`
`891`	`871`	`return false;`
`892`	`872`	`}`
`893`	`873`	`}`
`@@ -932,7 +912,6 @@ inline bool checkAndUpdateGemmOptions(`
`932`	`912`	`}`
`933`	`913`	`else`
`934`	`914`	`{`
`935`		`- std::cout << "failed at mmaK" << std::endl;`
`936`	`915`	`return false;`
`937`	`916`	`}`
`938`	`917`	`}`
`@@ -1039,7 +1018,6 @@ inline bool checkAndUpdateGemmOptions(`
`1039`	`1018`	`}`
`1040`	`1019`	`else`
`1041`	`1020`	`{`
`1042`		`- std::cout << "failed at dtypeC" << std::endl;`
`1043`	`1021`	`return false;`
`1044`	`1022`	`}`
`1045`	`1023`	`}`
`@@ -1055,7 +1033,6 @@ inline bool checkAndUpdateGemmOptions(`
`1055`	`1033`	`}`
`1056`	`1034`	`else`
`1057`	`1035`	`{`
`1058`		`- std::cout << "failed at epilogueTileM" << std::endl;`
`1059`	`1036`	`return false;`
`1060`	`1037`	`}`
`1061`	`1038`	`}`
`@@ -1070,7 +1047,6 @@ inline bool checkAndUpdateGemmOptions(`
`1070`	`1047`	`}`
`1071`	`1048`	`else`
`1072`	`1049`	`{`
`1073`		`- std::cout << "failed at epilogueTileN" << std::endl;`
`1074`	`1050`	`return false;`
`1075`	`1051`	`}`
`1076`	`1052`	`}`
`@@ -1086,7 +1062,6 @@ inline bool checkAndUpdateGemmOptions(`
`1086`	`1062`	`}`
`1087`	`1063`	`else`
`1088`	`1064`	`{`
`1089`		`- std::cout << "failed at epilogueTileM/N" << std::endl;`
`1090`	`1065`	`return false;`
`1091`	`1066`	`}`
`1092`	`1067`	`}`
`@@ -1101,7 +1076,6 @@ inline bool checkAndUpdateGemmOptions(`
`1101`	`1076`	`}`
`1102`	`1077`	`else`
`1103`	`1078`	`{`
`1104`		`- std::cout << "failed at epilogueTileM" << std::endl;`
`1105`	`1079`	`return false;`
`1106`	`1080`	`}`
`1107`	`1081`	`}`
`@@ -1222,7 +1196,6 @@ inline bool checkAndUpdateGemmOptions(`
`1222`	`1196`	`}`
`1223`	`1197`	`else`
`1224`	`1198`	`{`
`1225`		`- std::cout << "failed at epilogueTileM/N" << std::endl;`
`1226`	`1199`	`return false;`
`1227`	`1200`	`}`
`1228`	`1201`	`}`
`@@ -1246,7 +1219,6 @@ inline bool checkAndUpdateGemmOptions(`
`1246`	`1219`	`}`
`1247`	`1220`	`else`
`1248`	`1221`	`{`
`1249`		`- std::cout << "failed at mmaStages" << std::endl;`
`1250`	`1222`	`return false;`
`1251`	`1223`	`}`
`1252`	`1224`	`}`
`@@ -1258,7 +1230,6 @@ inline bool checkAndUpdateGemmOptions(`
`1258`	`1230`	`}`
`1259`	`1231`	`else`
`1260`	`1232`	`{`
`1261`		`- std::cout << "failed at mmaStages" << std::endl;`
`1262`	`1233`	`return false;`
`1263`	`1234`	`}`
`1264`	`1235`	`}`
`@@ -1270,7 +1241,6 @@ inline bool checkAndUpdateGemmOptions(`
`1270`	`1241`	`}`
`1271`	`1242`	`else`
`1272`	`1243`	`{`
`1273`		`- std::cout << "failed at mmaStages" << std::endl;`
`1274`	`1244`	`return false;`
`1275`	`1245`	`}`
`1276`	`1246`	`}`
`@@ -1367,7 +1337,6 @@ inline bool checkAndUpdateGemmOptions(`
`1367`	`1337`	`}`
`1368`	`1338`	`else`
`1369`	`1339`	`{`
`1370`		`- std::cout << "failed at tileM" << std::endl;`
`1371`	`1340`	`return false;`
`1372`	`1341`	`}`
`1373`	`1342`	`}`
`@@ -1382,7 +1351,6 @@ inline bool checkAndUpdateGemmOptions(`
`1382`	`1351`	`}`
`1383`	`1352`	`else`
`1384`	`1353`	`{`
`1385`		`- std::cout << "failed at numSlicesForSliceK" << std::endl;`
`1386`	`1354`	`return false;`
`1387`	`1355`	`}`
`1388`	`1356`	`}`
`@@ -1427,7 +1395,6 @@ inline bool checkAndUpdateGemmOptions(`
`1427`	`1395`	`}`
`1428`	`1396`	`else`
`1429`	`1397`	`{`
`1430`		`- std::cout << "failed at unrollLoop2xForMma" << std::endl;`
`1431`	`1398`	`return false;`
`1432`	`1399`	`}`
`1433`	`1400`	`}`
`@@ -1448,7 +1415,6 @@ inline bool checkAndUpdateGemmOptions(`
`1448`	`1415`	`}`
`1449`	`1416`	`else`
`1450`	`1417`	`{`
`1451`		`- std::cout << "failed at tileScheduler" << std::endl;`
`1452`	`1418`	`return false;`
`1453`	`1419`	`}`
`1454`	`1420`	`}`
`@@ -1464,7 +1430,6 @@ inline bool checkAndUpdateGemmOptions(`
`1464`	`1430`	`}`
`1465`	`1431`	`else`
`1466`	`1432`	`{`
`1467`		`- std::cout << "failed at earlyExit" << std::endl;`
`1468`	`1433`	`return false;`
`1469`	`1434`	`}`
`1470`	`1435`	`}`
`@@ -1552,7 +1517,6 @@ inline bool checkAndUpdateGemmOptions(`
`1552`	`1517`	`}`
`1553`	`1518`	`else`
`1554`	`1519`	`{`
`1555`		`- std::cout << "failed at blockK" << std::endl;`
`1556`	`1520`	`return false;`
`1557`	`1521`	`}`
`1558`	`1522`	`}`