diff --git a/flashinfer/artifacts.py b/flashinfer/artifacts.py index 28d441b9f..24c4c4e8c 100644 --- a/flashinfer/artifacts.py +++ b/flashinfer/artifacts.py @@ -50,10 +50,10 @@ def get_available_cubin_files(source, retries=3, delay=5, timeout=10): class ArtifactPath: TRTLLM_GEN_FMHA: str = "c8e0abb4b0438880a2b0a9b68449e3cf1513aadf/fmha/trtllm-gen/" TRTLLM_GEN_BMM: str = ( - "c8e0abb4b0438880a2b0a9b68449e3cf1513aadf/batched_gemm-32110eb-a15c257/" + "5d347c6234c9f0e7f1ab6519ea933183b48216ed/batched_gemm-32110eb-5262bae/" ) TRTLLM_GEN_GEMM: str = ( - "07a5f242a649533ff6885f87c42b2476a9e46233/gemm-c603ed2-434a6e1/" + "5d347c6234c9f0e7f1ab6519ea933183b48216ed/gemm-32110eb-434a6e1/" ) CUDNN_SDPA: str = "4c623163877c8fef5751c9c7a59940cd2baae02e/fmha/cudnn/" DEEPGEMM: str = "d25901733420c7cddc1adf799b0d4639ed1e162f/deep-gemm/" @@ -63,9 +63,12 @@ class MetaInfoHash: TRTLLM_GEN_FMHA: str = ( "0d124e546c8a2e9fa59499625e8a6d140a2465573d4a3944f9d29f29f73292fb" ) + TRTLLM_GEN_BMM: str = ( + "aae02e5703ee0ce696c4b3a1f2a32936fcc960dcb69fdef52b6d0f8a7b673000" + ) DEEPGEMM: str = "69aa277b7f3663ed929e73f9c57301792b8c594dac15a465b44a5d151b6a1d50" TRTLLM_GEN_GEMM: str = ( - "50c5627324003c822efbdd1d368b1e569f4f67f4bb0a2fbb7397cd56c6d14c2a" + "a00ef9d834cb66c724ec7c72337bc955dc53070a65a6f68b34f852d144fa6ea3" ) @@ -74,7 +77,8 @@ def download_artifacts() -> bool: os.environ["FLASHINFER_CUBIN_CHECKSUM_DISABLED"] = "1" cubin_files = [ (ArtifactPath.TRTLLM_GEN_FMHA + "flashInferMetaInfo", ".h"), - (ArtifactPath.TRTLLM_GEN_GEMM + "KernelMetaInfo", ".h"), + (ArtifactPath.TRTLLM_GEN_GEMM + "include/flashinferMetaInfo", ".h"), + (ArtifactPath.TRTLLM_GEN_BMM + "include/flashinferMetaInfo", ".h"), ] for kernel in [ ArtifactPath.TRTLLM_GEN_FMHA, diff --git a/flashinfer/deep_gemm.py b/flashinfer/deep_gemm.py index a6b5eac96..177eafac0 100644 --- a/flashinfer/deep_gemm.py +++ b/flashinfer/deep_gemm.py @@ -37,7 +37,7 @@ from .artifacts import ArtifactPath, MetaInfoHash from .cuda_utils import checkCudaErrors from .jit.cubin_loader import get_cubin -from .jit.env import FLASHINFER_CACHE_DIR +from .jit.env import FLASHINFER_CUBIN_DIR from .utils import ceil_div, round_up @@ -907,11 +907,7 @@ def load_all(): continue symbol, sha256 = KERNEL_MAP[cubin_name] get_cubin(ArtifactPath.DEEPGEMM + cubin_name, sha256) - path = ( - FLASHINFER_CACHE_DIR - / "cubins" - / f"{ArtifactPath.DEEPGEMM + cubin_name}.cubin" - ) + path = FLASHINFER_CUBIN_DIR / f"{ArtifactPath.DEEPGEMM + cubin_name}.cubin" assert path.exists() RUNTIME_CACHE[cubin_name] = SM100FP8GemmRuntime(str(path), symbol) @@ -925,9 +921,7 @@ def load(name: str, code: str) -> SM100FP8GemmRuntime: return RUNTIME_CACHE[cubin_name] symbol, sha256 = KERNEL_MAP[cubin_name] get_cubin(ArtifactPath.DEEPGEMM + cubin_name, sha256) - path = ( - FLASHINFER_CACHE_DIR / "cubins" / f"{ArtifactPath.DEEPGEMM + cubin_name}.cubin" - ) + path = FLASHINFER_CUBIN_DIR / f"{ArtifactPath.DEEPGEMM + cubin_name}.cubin" assert path.exists() RUNTIME_CACHE[cubin_name] = SM100FP8GemmRuntime(str(path), symbol) return RUNTIME_CACHE[cubin_name] @@ -1460,7 +1454,7 @@ def init_indices(self): assert get_cubin(indice_path, self.sha256, file_extension=".json"), ( "cubin kernel map file not found, nor downloaded with matched sha256" ) - path = FLASHINFER_CACHE_DIR / "cubins" / f"{indice_path}.json" + path = FLASHINFER_CUBIN_DIR / f"{indice_path}.json" assert path.exists() with open(path, "r") as f: self.indice = json.load(f) diff --git a/flashinfer/fused_moe/core.py b/flashinfer/fused_moe/core.py index 06efa2439..b9ece319e 100644 --- a/flashinfer/fused_moe/core.py +++ b/flashinfer/fused_moe/core.py @@ -16,13 +16,12 @@ import functools from enum import IntEnum -from pathlib import Path from types import SimpleNamespace from typing import Any, Dict, List, Optional, Tuple, Union import torch -from ..artifacts import ArtifactPath +from ..artifacts import ArtifactPath, MetaInfoHash from ..autotuner import ( AutoTuner, DynamicTensorSpec, @@ -33,6 +32,7 @@ from ..jit import JitSpec from ..jit import env as jit_env from ..jit import gen_jit_spec, setup_cubin_loader, sm100a_nvcc_flags +from ..jit.cubin_loader import get_cubin from ..jit.cutlass_gemm.generate_kernels import generate_gemm_operations from ..utils import ( _check_shape_dtype_device, @@ -819,15 +819,18 @@ def cutlass_fused_moe( def trtllm_gen_fused_moe_sm100_module() -> JitSpec: - debug_cubin_path = ( - jit_env.FLASHINFER_INCLUDE_DIR - / "flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/cubins" + # Fetch "flashinferMetaInfo.h" from the online kernel cache. This file + # contains the `tllmGenBatchedGemmList` as the list of available kernels + # online. It is included when compiling `trtllm_fused_moe_runner.cu`, etc. + include_path = f"{ArtifactPath.TRTLLM_GEN_BMM}/include" + header_name = "flashinferMetaInfo" + + # use `get_cubin` to get "flashinferMetaInfo.h" + metainfo = get_cubin( + f"{include_path}/{header_name}", MetaInfoHash.TRTLLM_GEN_BMM, ".h" ) - import glob - - debug_cubin_files = [ - Path(p) for p in glob.glob(str(debug_cubin_path / "Bmm_*.cpp")) - ] + # make sure "flashinferMetaInfo.h" is downloaded or cached + assert metainfo, f"{header_name}.h not found" return gen_jit_spec( "fused_moe_trtllm_sm100", @@ -844,8 +847,7 @@ def trtllm_gen_fused_moe_sm100_module() -> JitSpec: jit_env.FLASHINFER_CSRC_DIR / "trtllm_fused_moe_routing_renormalize.cu", jit_env.FLASHINFER_CSRC_DIR / "trtllm_fused_moe_dev_kernel.cu", jit_env.FLASHINFER_CSRC_DIR / "trtllm_batched_gemm_runner.cu", - ] - + debug_cubin_files, + ], extra_cuda_cflags=[ "-DTLLM_GEN_EXPORT_INTERFACE", "-DTLLM_ENABLE_CUDA", @@ -857,6 +859,8 @@ def trtllm_gen_fused_moe_sm100_module() -> JitSpec: + sm100a_nvcc_flags, extra_ldflags=["-lcuda"], extra_include_paths=[ + # link "include" sub-directory in cache + jit_env.FLASHINFER_CUBIN_DIR / include_path, jit_env.FLASHINFER_CSRC_DIR / "nv_internal", jit_env.FLASHINFER_CSRC_DIR / "nv_internal/include", ], diff --git a/flashinfer/gemm.py b/flashinfer/gemm.py index ae5c0042c..7f799fa7f 100755 --- a/flashinfer/gemm.py +++ b/flashinfer/gemm.py @@ -349,12 +349,19 @@ def get_gemm_sm100_module(): def trtllm_gemm_gen_module() -> JitSpec: - header_name = "KernelMetaInfo" + # Fetch "flashinferMetaInfo.h" from the online kernel cache. This file + # contains the `tllmGenGemmList` as the list of available kernels online. + # It is included when compiling `trtllm_gemm_runner.cu`. + include_path = f"{ArtifactPath.TRTLLM_GEN_GEMM}/include" + header_name = "flashinferMetaInfo" + + # use `get_cubin` to get "flashinferMetaInfo.h" metainfo = get_cubin( - f"{ArtifactPath.TRTLLM_GEN_GEMM}/{header_name}", + f"{include_path}/{header_name}", MetaInfoHash.TRTLLM_GEN_GEMM, ".h", ) + # make sure "flashinferMetaInfo.h" is downloaded or cached assert metainfo, f"{header_name}.h not found" return gen_jit_spec( "trtllm_gemm", @@ -367,11 +374,8 @@ def trtllm_gemm_gen_module() -> JitSpec: f'-DTLLM_GEN_GEMM_CUBIN_PATH=\\"{ArtifactPath.TRTLLM_GEN_GEMM}\\"', ] + sm100a_nvcc_flags, - extra_include_paths=[ - jit_env.FLASHINFER_CACHE_DIR / "cubins" / ArtifactPath.TRTLLM_GEN_GEMM, - jit_env.FLASHINFER_INCLUDE_DIR - / "flashinfer/trtllm/gemm/trtllmGen_gemm_export", - ], + # link "include" sub-directory in cache + extra_include_paths=[jit_env.FLASHINFER_CUBIN_DIR / include_path], extra_ldflags=["-lcuda"], ) diff --git a/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmInterface.h b/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmInterface.h index fb81ff077..65c956104 100644 --- a/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmInterface.h +++ b/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmInterface.h @@ -24,7 +24,7 @@ #include "trtllm/gen/CudaKernelLauncher.h" #ifdef TLLM_GEN_EXPORT_INTERFACE -#include "KernelMetaInfo.h" +#include "flashinferMetaInfo.h" #endif // TLLM_GEN_EXPORT_INTERFACE #ifdef TLLM_GEN_BMM_CUBIN_PATH @@ -509,7 +509,8 @@ BatchedGemmConfig const* BatchedGemmInterface::getBatchedGemmConfigs() const { size_t BatchedGemmInterface::getNumBatchedGemmConfigs() const { #ifdef TLLM_GEN_EXPORT_INTERFACE - return tensorrt_llm::kernels::tllmGenBatchedGemmListLen; + return sizeof(tensorrt_llm::kernels::tllmGenBatchedGemmList) / + sizeof(tensorrt_llm::kernels::tllmGenBatchedGemmList[0]); #else return 0; #endif diff --git a/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/KernelMetaInfo.h b/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/KernelMetaInfo.h deleted file mode 100644 index f5c064af7..000000000 --- a/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/KernelMetaInfo.h +++ /dev/null @@ -1,34727 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & - * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "BatchedGemmOptions.h" - -namespace batchedGemm { - -namespace tensorrt_llm { -namespace kernels { -// clang-format off - -#define TLLM_GEN_COMMIT "32110eb" -#define TLLM_GEN_EXPORT_VERSION "7.0.3.0.3.0" - -static constexpr size_t tllmGenBatchedGemmListLen = 408; - -#ifndef EXCLUDE_SM_100 -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr; -#endif // EXCLUDE_SM_100 - -#ifndef EXCLUDE_SM_100 -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0; -#endif // EXCLUDE_SM_100 - - -static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { -#ifndef EXCLUDE_SM_100 -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "3f50040873bad87a6494c9cfe1fdb74cf105a262da64d6eeffb7cde948e4c8b0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "421c9180d5ef84c582c13ca6f758e3eb88db2c0221d6d5358481994c793fff9e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "9a5af1b5f22f8cfad3585bc94c7ab9e3889608184af7a90fa72355ec3580e37b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "fbb8bd03a8e4ccf046a6d26ba70538b6c9295f38d9558fe74c472dc6f9500407", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "1211763de22658770e6fe0eeeaf075b6c3b9fc404184e27919e920c9cd89ae6d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "80f36089bf16902e0cfe5530fdf73181181e42bbe1148a06f528c9fcfe951c1f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "0382740f8904e76ac8ba65bd139c0703098be937ca6a48f8d805828d2e9fb4d9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "e9ca5eae59b91497e401da818bda7a66c07c5c03b9b12870b8b46b1ead340290", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "a18ea12a8da554dd67a5cade11d3e2162bb2cca0349a3858d2b18de571423898", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "c8691567af794c5de14d5a5cc4f2a5d28536ca6073422b4df47c9afbb8f2be06", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "28d8f178f4347d48c162b4a35786bdab9378c9dc471a02c80b3aed6650a4e6a5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "b8269703cdd5409fa0bad9f8ed695dca41ff739af1ef94b346c3f0be0c404760", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "f30f8d237cd1cf14e5ec6e46384e15fb0f29b49ac06d115760f7cd5de96139d3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "9bf99980b7a6c15a6ae98733aeec0db642207ff052c875884147723f39b6413b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "78318af7a0405b458217987427d2fe31e668b1e5316c3d14d72ec23b71117aaa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "c9df13716958e4441c1d29ac5c1fa6cb2d293389fd8f779bc03fd2abfe62423a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "0a9146fb80f1248e9b89895999cb31834c3994ea3b490ad9a8a444386742d16f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "bd37ba431b1765a9ddf90d5f46549ef5bd453a20037cdd497baa79be476b781b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "f6435ef5c7d9d09aed443595c849477def79f95a9995dc0a938aed6ff6591a16", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "47277e74746b699eb163ed30cb063fec743743f969b881247a8723296927a452", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "4226a60504e7275e4b4a42f764e98294258e66f8cc4a10e14e171df7b2f5066b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "bf4fafd9e3cb30a9b0446dd8838849db06a959f16689df21dfad17513682bc63", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "86f8fffb1cea50303e40100d3bc51e0be57b3a70adc2a1859a8694f79049349d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "1e121aa979c8bd4768d793bdb0003889e7be3f4a31ff2624b285fe4c92e7cb87", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "7a0b234601ff19b9a74f9a5e049cfa9bbb2412a588027cabb73ac41726388a05", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "273bd4ae9a7fcd4dc3b4beacae4660a8aeafe93ec01cf13c99ddda7cde90c6f6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "235c7cecc53e4b5b0fca2a72f8e9a467e9fee921739087acf3a27cdebc688cc2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "f14ef8c6fb913f5fe433f4f7cef57d22453d9827e152b7998154f6701193d4c7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "37779c1a8bc4082b5c10f813cec67884a076d704450abb72006730c9d643da3c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "82c9d523368e936cecb66b5964259c128095eefddc099fa5679c963d70ae1a86", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a", 448, "132269771417a940aa39f43c0dbf496c585ad385b38dc182f28274c97307aca4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "f564fad6aed8f446292c1019b7ec6fd8700afefe5ae9db03f9338e153fc8685f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "298d1a1794a17c5d0585684e0b33711daaa2f6e0b00c8f47dc3f9e08f68c79f7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a", 448, "7bb43ecdf9e503d1f0b510b6161ca4c7c98390fb36255fa2c4c094ed474cf381", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "033f3c0b194580bd4a8e0d32296ec950d3336a90f22c1d2c19601b3f04706025", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "a4853e328a5ecc75c50ffa766da7f3dfc2474bab0d31e5b17fca6323b6dd06b9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "10f1d796023187047bba9a36b6b4be2cf3625804b39e9119ba60c8f9ebafea56", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "e90f00f9cc3a99474284d1d40300be041cc4616412dedfbf3af6fd8991fee6aa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "b9ff246be6e1494a34a8fd737cad55ce0d7cd492f0ad0981c11e7d8e56a1b6fe", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "4c7c2cfaf1a4ff72cbcc6da48792ebf84c4a01cb17e076bf5bc71a4ca3863853", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "23af50bafd4ff3219b3b2e77185babd14dc564196e59d4a7ce0c71f9f63f6e1d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "63ec7f175f16f2148f9112466e7759c6d080de0f21c0657ad4fe32903ed220e8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "0227acf26857469604694c037b2ec247e95f1d363b502579227bddd36474d22f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "7649909f8952a983e3573f19be8d0eeef90ae853f3730cae014dd97bfe3ca225", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "0b0535fe5e1dc64350dc3b525ae1cfa4edf7c8e5ce140431c474786041e74f53", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "f40181a72c1a3a9de321ea65d10398214ee7f7de27a8e74e67b6d32e58194807", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "87fe035315d329d83ab5dcd4201a173a43190ee7ee2949a0836309afad140734", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "c92032ee7452c8f79ac9526fb9db84b0d50ab18da19011dfdcba6e6b3d852397", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "033db21cba9333e6789784c5d851b60e431e5d421753bd2713cf3da0063fc283", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "3be81e7035ee5a5d3488f9501197fea2caf7159de69217eab8d041dd013f5f7c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "97266c0137ed61bca912f69fb299aa6369fbb4bc53ce3883031308e93c8923df", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "bf85855e9e7a943c409e87a393fab52c09d8fea4e3d49e750f8b463576555d23", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "e749eb37196ae08cf93ea4f2a7e65b085c2cad49d109102ee211748953e57b01", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "4cd14db05dde5b92cfff810910d8fca58b1607850cba9a6fcaac41dc204398e5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "834628f9c031e0e74b5634a732bb1af96039f7736fb2c39c29efe97a60435d80", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "c6f35190ec003d65f903d0d017577377ffe50a4c9ba8ef7398232e514e775bcd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "7400ee66aa0eaa3218c7e43e06c3b449bd9d263fee808a62e3019cd67cf71c78", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "401fd2bfcd51e41ed9abd426842f6d9a8e59ef3b85fff0b4e43a15a59dce992a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "dc8fa6335b2667f779f474cbb5cfe8931fdc8b6ea6b54bce03d15f34a555b71c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "e2e6c0e37a9bc30aedd1219d2e907f25d70ed2d0c507e82b4eac4ad9fcbe354d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "c7fd0a259771c1c208fb20b3f7a0bb9859a47b6a655e47a4b03ad8a4567f32ee", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "5c3392364b2d53496a1505a6bcb3b179ee718a91aba166d1352a89e6f29447e1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "8cc703d5b28e8bb936a4ca647f749aba2b49a41cbc8c33dcc874b54b074a0f8b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "bc9c55c5b547cac8ecceb6a48b3efa2f53c58fd7b1ffb8f8cb0a232c40af29af", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "14d6b39d1ca45a50381c359b6c5cbc432e5b61f4a3e5aa90ed1769eb5d8c394c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "fa0384182891b60447683a86f986a9d1193388760b3cd27c8440781bccb513ce", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "d687e41aad68fcc7fcb695bf62dcac712024fa879e7fd370cbc55370e542e940", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "eb7cf8dfe1dc909900275dc5ff5693591e5546eaa195941336ee4207dc54aa67", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "a14678fe5f84b7b8541398c5bc5fdb732cadc8d974dd985e6a907cf195c64813", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "12946559dc5d971ad4ded3d1faa415eec7b395ff8b3e55b5f0da8b372de0dd18", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "648ef4107c15ad51db68a834255b4fc43e099b684f2ec5df19d7ba7473e26eae", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "9a90dbaa0accef76e6b3cce8c8a5ad4339bed854a69bfd186519652af3cce24b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "8da6a223f8df73432b0601dc953b4ecd3542a2e499ba1ccd36a9dabb8d5774f2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "5dbb3a228c68dfc4cef3a5c987306fa6e387b801f133d57b897bce0626fb07cc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "ec38064002c223d2f08fa823db4f902e671f5c1b7029e70ff786a0a08c129341", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "dceea69d8a13feddcb5a63e3300afd3760bbd4186de64294f0452243fae2a8b7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "7cc2536574de984495b44de0ef8f6c45f6381f9abdaab01f0cc367c0a8661a69", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "0a74ae88978f2aad22d47816aeb36e1229b398cddb63d2a1c00266e33f98ebb3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "dd18f671d972032a78dd12641870f989cb3b8df3176fd867331224f492d74227", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "597374f4d8c6227a98a134bb20ff6e45491f6f994d44633b4d8a339e8c50cdb9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "cb8335a75a566e201ce08b7bd12a270adaa71f0c0a7ced206684fb8c544f891b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "c2ad2912f7315a2abfddc2e8a2f007306ff689da1475a60b7b8e219a97dd2fbe", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a", 384, "027475aef4ed9229b874e3b01dfd136f909f1c1d0bb3fc382e283f944eb7fe06", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a", 384, "98860c3803b347f64ce0a07d13e1e713630b45a01c705a70eee3611ba0e650b7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a", 384, "4119c179a085bacf3f07d57cd888269cb12b8cb0e8ea12d50d1add78ffdfbb4c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a", 384, "6475afd431e333182fd56f19060a99c16dd676c15115a99b877ca582737e269e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "d27479f40eb777039050cf3452590e2a0c1fd35437b1535fba4861b89874e677", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "28c787a8fde1474cb1c432c3299d42896b1660cc987b1696671819ed2ce96b2b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "47dde169efefa34d2faba0364d087d1fdf986134d9e1de0764aa733354336d2d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "e53508cca19d5b67ad3cc789f344377bb8896494c05bfcd8bb90c01198b71899", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "5fb79245cf1df99a0cf864e665c2098da795adbd377e2e4bdd24756e30661744", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "69abbc9a29f37e190aed3fcc1c7a2cb4f2c0ced73b42a54e7f3b8d84d0ec508a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "3ce175c6dfbe2ed1296a15a5573169ef8f3630e81191fa8bd1183784a6907ada", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "43bfe6367d451c1641749b52244ae40ed9bc084d9ce825e78705e42cee76f7c9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a", 384, "d7c94aa9865646a2d3d7b36b7daa9b06b6145688924532f6f1fcd651ea1e448e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a", 384, "27f5bed11c1be2270461d7a16bd04c2d5b0c3d1da365926a02b694749f047fbf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a", 384, "ef5d746adc2a09e3294b52ce6684d33f8cef0fc6734fe70e38efdfc105642e5c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a", 384, "10a957da353c29e2cd57852d7dcd90d3fea0980ce130d3ef3d5ac9f6d27f742c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "f8c169eddeb2274d509a10fcfed8754a2b9657267d56e804da0dcc5f72ba54a2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "6fe46321db31e54185cbc03e20e154fb7895c0ad1888365b9d51cedd63d5764c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "3ff9f922cf74e03cb5b3f03172d4be34ceadf911ff3b9ce6d6b646ace5eb9a77", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "b4c54f4f30f019d312346106faa8384065f7e99e9803e46343e7838a1a818664", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "790b2b58ace3c7a9365e8f37f34de8e2b6dabb5c7742cdb33c06d16484383ae9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "a916d59461e1c44f8897a40d017f42b68f1fa86e93679517fef80c1bc49f043d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "e40531b337de3dff738fd228d732675485ba0e32a3a0f4cfba34f106edce258a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "53d992332e02a59947bd40929f9c8efd3da79234a77d6fddabd868d58bdff5d5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 256, "b9f33b3ab29d19f2db6159a55ac761b1e4b128b06f6d7108881aeff8ae59dfff", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 224, "1db2412318ecc631ab85bd07feaf9564167ccfa09f89d4aa61110f221fb6e279", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a", 224, "9c7d64b40f4417659ebc3447b30a198f2eeff2bad4e30453f0dc3a00656d33db", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 256, "6f2017653064416fc97c0dda5e5070d3d72e07e2a7c423f7ff085162898f08ed", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 224, "7313e5403903ff8374ebe03bc3069a248caeab61f28ae356afcde042a4b13996", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a", 224, "16180b696decf4306a25573ae19dad86217014aecd96aaa44de1b513f3acb422", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 416, "52b258085b7e0970dc5268ec7ca7ca3a5c4980b94e2b63991fae61dac9f2a763", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 448, "fe0e4a968b4472f8856c2198f59dfb3fe778e733b166fc0fcc042d822964ceac", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 384, "3b112df01a3b928b1b1bb19f278011a18e9b866715cd19887ccbe200dac218d9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 416, "d4aa941092162be422042c26e6868843f229c8d3f374a6760d0627922d0105c0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 416, "f3edac4357dc88e43c54167c4314f321943c0e63eea05c8234ee121074b57e3f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 448, "fa31b14ddda97d48a48d09d14d3c9ce86fdf9285344875bb3a8ef7a717c8bc9a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 384, "dfeda97aa906fed9d8564bf3c1ddfbde59633837fb730f60006fb593ac60d51b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 416, "cc440cac9aacdd9008b8e8b1ab1040459eda7702fdbd2f895bb4bc926bd2fc4f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 416, "06048dcfaaa81ff67c1f72634fdfbbafb81092dfef6bbb28c7b75e1d57b4ff2a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 448, "b85032d1fc6b97f44cd9d3778cba54b25bdcf1cedd90efe34d15f8ecf6eb691e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 384, "9a52732e9b7fc056e74775739c71b9659f9561386dcfb7ff18a13f634ca59ced", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 416, "448185b1fd23f3315d5b1fd2ef833066d53184700117b86fa3b6bfa33a820c5c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 416, "84ea4dd3bb574e7668780351d3c33c7846921eb7fc2a00794a7cbc91c96f7ba4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 448, "4cde38e9d6178a2da1028c81bb7897fd0ed5be177cd949ce7f8887ef1f0bd811", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 384, "79bbbc653e8be33b7e8b2ca2400a7e2a9eb94752912f18fc291a0d3b6407e979", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 416, "5c1d3679cb47511def1d4db96988ba8104914592a46b4ad3df554e2ffa482577", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 416, "5c579bd1d00f0f0ab698aa5cfd70596283f367c962081c707b7b8f101ab84793", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 448, "d2800340cfeda9ccccbcc4951b64233263b58bdc12e79896f61ea26cb41edbab", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 384, "469c956f7e2e095908ddd733b1db48d1effbff506bca5efc43f7395d72c1c48f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 416, "13efa9ae5dd6d8e88d98c2f5af777eef65a538a3530cccfa08054c6e26714c93", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 416, "a6e68338e9fe01626be6c254deda824bf588a69be035e195254de082ca68c723", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 448, "28fae92ddb56b8569597955cef7b840c995d6d8c375bd0b6404086dfe722b025", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 384, "c4a5d32e52bc38ebc69fd0102bd1c4f4444f114345b24674832acae0e6b34a11", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 416, "9763f6a7f20c8fa74157036a62a755dc6569ebee9cf054d1c088182633bb234e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 416, "3a28fe8644a7bc7a1cc014a8897ab0fe2166436e2cd5cfaed18d8fb57ad69413", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 448, "fae33ca8316876b52ad4442738631714bb966dce0092b74026b4b1186ec074b3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 384, "84acc24ca690945c29621f601d016a8dd6a0d1469b8d735806fc726b41bd1793", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 416, "b00a33a971a83513b638849fb432d0fbe11eb4b38ae4a696227295964b939aa5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 416, "9e7147c1111d5312284b3711b5cb5e5615212529b87d67cf319d923276dc7d31", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 448, "15abd2a84edab7a22acf98b74499a0789fd767a68a813e147f6ecd3372f0f851", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 384, "b386b8c7e1c9339659e6837240fa2f21f8fe927dd5374fce061d57517c9fcf1e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 416, "ef2e3792a633ceba03139e91beb01517ee09d2b7539f67e146be12104e284280", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1052672) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 16 -, /* mMmaKind */ trtllm::gen::MmaKind(1) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 1 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "38b5f0e760783280d227c998b300f0e8dd71a3f26373714a22e2f31348a3e5ed", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "a3a12a8fad98acfb1dee2a7c4d869a39890ffce9d5db5966da9a82a46da2ed62", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "6b3293be7d956ba7b71371bea3abaaedd83b8f579db9a059c6fc99693223c4cd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "22b5ab9b379205e9fc715a447023dbd202303b0f8d491ed3bffc77c33be2587c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 226304, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "ef6fd532b1f40ca268e0ae2f2dd2c862ebf43e67bf57058d889346a0e25f1725", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 226304, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "33a419aa9f7d3efe0f05b76483b1f745471a65cf59cfb76946d3abca189152af", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 226304, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "169635b45a94a5ac3d7abc68ac2d3de3cd5438afab71b672b4f2eec22f16cdd0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 226304, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "839da645ae7ddebfa72c9a73ee25bc0af44c233f92744a74ae2b3a0853caceb6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "0994bfa66ea8a078b03947f88850366fca52db246742509924bf7a6f213c7bfb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "ec379b195e72575bf96f9ff16ea0c35eb91e9b291a5a3719751407d6daed6337", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "5b5d1151de0658df84e2810b3f1916d297be4ce3651d95d9a9de889e4afd1c86", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "3b0858a36167ad32d9905500e2aa59743b6d91bb2c2373a70f9c3814aa074795", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "91874a966f6eb543323f8467c2e3f66600f20f0ab8754026abde6b510c699f98", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "955ae7dd9ea62cf762ca07c8ee875e24303b0d95b46b03768c07ad3631e5b143", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "e11a6189ded24497219f2bf335f60a83fb17e163dc32459c7d166a015e2720be", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "be1acac35d67130541d30ead271c206f7ac6f70ac4a0e46db4c1845053a8c3a1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "0033ff8a409809884ee01fd3837975710d5cf654422b8055892c126cfe11419d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "7ad06f231f869ea9e835d3d1083e027d7ecca6f7a5fb03752f9d351f7ae2fc45", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "620b6d07cfbcf2cfbbc16d4e4dedaba7eccdac2f878dda447431a34594cb95d4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "4e8fea6373057cb4cee2f2a32a4d0edee7817c060b3449471b0001bc9dd2d482", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 125952, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "0b7ed55068401563c1d4f1b8d0decb49821a0aaa2151c02ce4feca7ca038329b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 125952, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "198533be3cee3890a08a67e379a1223f4daec31058c520726cdd29bdaf13560a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 163840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "f1fd3a527c984a2775575ada2a9f8c3dc5b1a67a16e2f8c33b7a5536cdc3a00e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 163840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "d54f4f208b17054371d86be133399e8eca22070d2698208cb8706b537baa67a4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 125952, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "be7f908008125834306e4050d77214cb231f8f405e7eee09d01defa33a3cf2f8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 125952, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "9a71d9dabf06f8c485511ffbf2596ba8c56d05acc22f82fc16b3f53dc35d74a7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 163840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "ab7b410bd93cefea6d39fb831410c13fa2b9f72d772b7055f9a504f4e0b9cef9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 163840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "bb56c004e9600811b1287fd79ee6b2d5ba1a6111bbebb3a01590b4a52042dcb7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 142336, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "3dd7d632ae2bd741bd66105cca2ff630e928956a57cb269180ace28ed0a2bb9a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 142336, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "675e511eb3d5efc57b05609cfa044a3228077d83033845d272790997b613346f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 185344, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "13f1e99d4a76143862bac3e5f420a10587bd6be5286615de4aad9b2ead3948c7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 185344, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "9bbfd28c75e78ae54f4d2b58aff19bea505e96dc397d3dfc5031c815601826ea", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 142336, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "712986701528a94354455e39c9759c5573c463a8a47df822c2286a65fe0daefb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 142336, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "ea38d7b3c60b20595e79c692903917e5cee01bfc2ac9e35af00ca03286e62956", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 185344, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "a1c9af9aa0f57eac713088b8b31f5b216a145c503cdf7944880bebdde359dbe8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 185344, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "aef9a91ee0738e331f8d7c3bb3faa862113a247b9754ba6e12ac9766d452eb27", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 177152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "64f070388a1a75610038a835e814097a80e468aa6386be16006bc24289a245e9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 177152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "1b6974f8108e1b0cdc6e3b066b84e0ec4cd580cfd3267ac1ec941bda41d954c8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 228352, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "553f0aa838c31e24f41ac380d797de027c41358352544642cf585919e69b22dc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 228352, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "5336f9f353d3b5d2a204a95b22ea52c774bd6362ea2659a53a401062512cea56", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 177152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "587c54db90f77353dc7eaaf0130bc816e1acdc27ff84207b7ae45af5f203e63f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 177152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "79c219a8c1326757393006cdaf72de6389a5aa9a3dda955141b54e4f61e832b4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 228352, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "e2e741c1d17440c4463d8f941baa4dbd9d63ac2ef5f5183df5ad1bf25cf06491", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 228352, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "3cc64b02c875b4660ee70fa2db4029bee0d7c60eb95446852a6732150133b6e1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 153600, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "1fa7ae2040138a8e919c787e62ff4d5e235ac397ff75c78c56ca1eed1ea2c113", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 153600, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "b98f704b2003d04bb36e2a9307a7319e6471e7d4ad7afa2080b657c34ee6d2c8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 159744, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "037db60f5ab7b61a4e80ffaaa61f36fc32dba72bacf09334b15e1da592a53332", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 151552, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "bdb9e4f703807f6ce8abc9ae7bc8e4b8bc6f1dc4d026a35d482d0ba0ea2e85aa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "e89cf7342fc0013e957c2f58ddb53f84d05d679462c152378ad047134817a689", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "27c64ecb23ece3132d8fa547e9d577d20042063f526bcc73d16a10e9e53481e9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 195584, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "11d7f6c4c4a8d7f382fed2b2dc7f608743064587b19ae1b96d412d9db106a290", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 187392, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "3df6963d871d8fc2915f6e4be2d899ead0f06d4fb3156487ea39e2febda7bff7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "b419076e6f8be20e164ea1ed8d4770b9aba41a20baeb35ba8e0ff72f9c88f5f7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "2bef689b3d1f86873e618973d7aef905630574fb54b043dfd3bd1dd6d14d6f23", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 231424, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "e425bf76a44f9c244a8b04add431d3dec324e745c3ab58df1890d42c335f8e80", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 223232, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "ceec79d213aca4da2dd3a230b3f81fa5d8a39890fa81befae3421de09d5316f8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 153600, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "58962324203c255d56409d2fd7bbbe91872efd46168e157a22ac1ac70b7b42b7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 153600, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "ab07787ab1420a630f9cd816a0151aed9b5b3f6270429b42e9255c227676e6b3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 159744, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "c25077e7fc632546ab3389dace009de63e2972d38b9e77e7b980d010b15822c6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 151552, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "534eb114f2c8c1b77de1f33bab90959259d163280e912429b945302d0e06cf28", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "1862ac649a69e4b532651012a2dff768d45cbca4cf467ccc4041910eb1de1176", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "233baae772c1b23100c09b46a79cedb27f13605556f0bc3e60a8839dde6a39e0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 195584, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "c21ac81f0799a8365f557430e12ce95f448cdb21eab8fe146533078b644c2d3f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 187392, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "eb55eb91b197919dcff2a184c1037c07363cf1b5a4c68d33ff53d738266af975", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "df7d18206ee418bfc9a636f5c326ea35a6c687fb519c3ed4e8ad1fd4b7e224f4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "6377876adb967003b602d3ad5e786c703b1ff72331929d490c28eb0f7c863bc6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 231424, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "615bd354244e6679359592eb69ee4bb1fac02720ea6de3d254f5ef4234a5a208", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 223232, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "a41e0feccf2584546ec060c487d0bc2f0a5b98ae348cf9c593b51c98ec6fd6d5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "e464cee5683c77a9b0a8710a79df18c81c30bd1e1ad70cc1110f508283d74a70", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "1f1b53b8475b8915cb927f17cb914c8f39d3bea4b19cc2dc08a9cb18b13376c4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 231424, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "b6fedcccdb8c1f4a716e93165cd6dbf826b06889abcd2b5548192dccdc5c2464", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 223232, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "6f3ad785800894aaca7d8ff8aa211b31800d2e21975bd72751a81f0687e3f2a7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "7399ce8bb98825c9f4dc063932e7bde7d4d32aebff3023d53fca90155fb42fef", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "cd7dd69304a1f7208ba22a4dc2630741bf84a4ec67a664abdb7528a30bce4511", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 231424, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "3dc4e382e28111d76580bd38dc1af4da201b2dea9f0fd5bb1cc0adf35b9d8d81", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 223232, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "9048f79c720a2ce32103f677813c48ff741b475431290107f693960b4144637b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 0 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "da31f6d9c67d08e85b7072d5a43266d0d6ee30735817b37f982c0477de645a06", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "42e9065be76998f1a64683085d9ee72071e85c3f6051fc9aec97b8cb4a8bc7cb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "46c62f5dc36d3a412416fb040ad22f3bbd4d65f5706fd2f32672bb629dc6dd18", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "8e5a000b9df97eed3710350f1126dc698bb7f2cac3ca94e063b469769a176366", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "5d621271aba16b84dfe6283b5d93ac500800b567bff705d8a73d2d2a963e67bd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "17abea2a2e8181c85657a1c3654da4d5783bb186fc0547237b6cb88dff148d31", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "ef8e4838f68f4020afda48c29d032ea4f56239df6b101d1b8f6c3a15f85d744c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "f306d4c0b809848b99d22554426480770767cef45047dec3e80f8b161f2c140d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "39dc8af745963ed8342183716ed83d62266c28ddbe2290a00b1401de33537e42", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "932c455add6e5927757c2a56c4adfc81ad2dc114bd0c55924950dfc424432fd8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "7492c306444a92b35ff1666b496b0b518b1e5e1a3bf571334d4e29ab6129666f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "c068236aac9e7192e3c904b839834bdda81a1f8b07d2f69dbe032d0d05374822", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "e470d3aabe53bc2484d130ce968c0ebb8776d1a2c563377c0e6a128ec42efe2b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a", 448, "0e1aa67fd306295f4c21d072e436bcdc1d51fef87aa2fa0b0493b91aefc481c4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "142afcee25cb84bf6611830b8a71e127deb69232f580a05df56e6ee505a09f05", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "238320111ff22f9aedabaa9d79954e7db0a395df3c3ff50af7306616307219b1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a", 448, "ba95baba14a5cb89863871c716d247bef2b461df8c88a1fd6668e782254bb8ee", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "9ae64768181f24c6ca90e899261eabbf76d93319ca54c212cac0764e5f9f89b7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "8157f44703c59388fb09afac934d18fb37a94aec57207801f07c8603687ecdbe", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "7477fff17f203078d1a94b4ec28bc0b061c1a1df194da9dfa3e313820f3c010e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "2f8ad526b885e8baa349e382a4ae0ac982d886242594141ae02519979b71b13b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "3e8760064685db77751a0651e21f944c694263647635671dd3958ed3f7fada28", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "508b4b08f46d803db6ece157302800429d3a315b448f82c594f09dc224054f50", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "fb124fa8b6e12b989155fab1c39e1e9c02f8aaa7b10a48a990e1e69f87baa9d0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "7cec49ee4eb48238caac11db363f407c6af4b5155fc1f422a89ca35ff92d2d7a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "b63a1328798a2b363926f1e908e874ef1a94f5dfa4bdff87f24731d7c5a68636", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "bfd97bbe6574a722b831054f05d8f9a60f921836e77fe307bcae58e37198be70", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "ab4005174cdc2b54fb322c32c184952ecbcbdfbd99d173d7e242d382da88fdf6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "3c46c6b249e5a0cd7340f3dbc7b8072077652ec4c00f825b388f8b8bbea69cbf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "d38baf93035e39c7f6d3928ac5f5e95037202d5f5a668c47559e47725049baa7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "3d88d7cdf8878e120633daa4940007ff6fd4ea0c78dca63819117adeea74afde", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "5fc17978448b4e1bbac488501ae56fbbcdade025754f46e2cb05e94a63b67a82", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "2eaeea22b7f06a714c66e5e7cb07a926c9813c6442c7f81b0d21624d9c8a5920", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "028d9e988fc094d7ffe0d9b9416236f89fba7f1bcb8a5fbcfb5dc499b2174955", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 229376, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x512_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 256, "f29f44b4a5e9666850a7ae0503f150a76fa4af45d18242fa53652c7830a492d6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 229376, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x512u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 256, "e3c30edacd1a1361c6e983c5724253f6f0963c2af08a722f855e0a476fd02d41", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "3156acfad50fdc17cad8a8361003459321a3464ced26e0028ac7b0e323b134b6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "96fcdb017b1d06c94709523848b757776a2c00ef601e2b7583f0f6d84e762741", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "020a6868ec1616be5d6672af219d2f37babd931ed22ea7f6b351c51be22f2b2c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "d1f59e9d0d394769ff8709dccaf513fdf564efd007962aac9ab9f6cc4ab5a0e7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "7612966658dff79d86fd595ddec5ffe687dad3df5381593a88f837720c236964", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "1c6e04f534cadf6f08823442154ac262c67ffcf39f9ccac08a21a4d9d2a901dd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "1133ac386e288984301755cb9ea35960ce9356eff891c4f0e125a3216ea36c3e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "04f3fe5b18e41354a51d94d394f030f5196619ac176e2332c5b58f16fc4ef8c0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "e9a6951414db31e6fe9792c25afee459948dd0d2866a5fc78f6faf0441a52dd1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "be24ef39cdd4e0c681b6e30e5558549487f2db3006a396cbf1a4a71446f99b31", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "62c5de8afbf9ae1890e508e6ff4ebbef8db86ad0f25233bfb24e1e894da6860d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "543cd01fcc2f8a4bd4250bf645085f3d3663c50dcc17e1529c13794677b5accf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "6032dad65ba48e4b92f9a2fcaaf48f375f9bdd235b8948be74df9e1befb92ecb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "2a620944a3085c1054f57d0f07c6d19ac9d7958858607658a2961fd0c0eef409", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "f7bdd240292559badd385d400b123e051f2487e3e4dc16baf4b74b71cb4b1cef", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "72ece78605b6d5fafc6bfbbaaef349293f31d58fe16f1ae1bd548f0754b57c4e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 174080, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 256, "15cdfa203783589d46b84b273c10c7e0848a5fa3cf158ff0d5c39c9ac0c26bd0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 174080, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 256, "870c199a8a501a4b629f5d1bf97d1c8d76d06c295d76f33cf010e35cc253e99d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "fe3731381664f21d8643a66720b297fdf4dd2b2e22475bb19149c23c4cb49856", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "0a5c5f1765b677f4b12a5a77bd42255d2b44f00d1f79a33656a248d7f1673b5a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "ae778ccd730dc872964386dd1f58bf3aa7c396bfa63f5879f6f1e6723bbfb745", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "92968153ee4f2bbc6fee0614b1424ebd9639a9f476ba287ccea6d66931429eab", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "b2efeb8d3eb03af3ddbfc5b6a02fcaebc2d6f7d966a53f11ce5b6246c05f1eac", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "9f9f0047824e413b32a53d05b28fc137716be2d3fa77718a31fc8762932530b8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "3c0b2b33d7a82d5eeb15bbfca04aa57e0a3d1e124623c87ab7be6923619e0f2d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "c813ada1612f3d3529058eca1a77529917a0cdd30adb9e4f6a023134be4342e1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "0f3a5c9b19f422182c88f22ef643708e3a7608cbc075eb0a64fe286805056570", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "c684c20e117ed0c344362db6def8db796da95338da784c470a0a469aceffc1b2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "9ea1b44d1ae9b5625b2c8c7bee3ea564a299441610c6e8c5305691f7972487f8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "1d3d411a31710c47d5820a6ac8b14efd137694b3dcf085e9cc11f79b72dcec3b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "c1352b94785c2f2aaf5b073b4738128a81b7a119d25da73f72745aaa0113bcbe", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "9c3ecac762384d250afc9ea993dded8ed1122a529b7ba6f7699683ad9b322e0c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "085bc5bf30ea47141214a0d9c5b166fd6df569cfc14c38ad7d9e44174fc1af0f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "4870220f05c78df32faddc8de55a4edf07991b400bb1b206300106ef72a8cb55", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 211968, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 256, "8fc4f3a93233412c4dad29628b229a2a32ef9149b992cad845c3bde762d427e3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 211968, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 256, "530590ee26d518825cbf6584939b657dcbc1ea07f6c2d89c736f7630ec7aa9bd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a", 384, "d42f2fd34e8cf59c328baa0435afc72534d22185f8808487075df41e56fc09ca", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a", 384, "ec7648b300c2908dfa94d1ee2140649ede8eaf25c283bbbb0374718c18c50c79", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a", 384, "dc6001cf0e3e3c41a23eb6ae9b929d0bed1f79528225b0dcb3bec818334c9049", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a", 384, "b624dad0cbb1c4b9fe803dfee5bbd9d2d05c3269d47dd7014031dbddbf836699", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "b7b0cc6d9c6ae0ebe46a369e14273a1ce80dcbf3a52f9b25e54a7d7a9820598a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "aec05f561a21befa59a224f356b959b406fd2c2ea01c6c0cef9e9f884f7edefa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "ff0f609512a66261d9cacdd4f6d7c6f0a0df1f9ec6e8b87d45fe7807962cbcf8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "47ec62d07f57e6931f53a84a34fc80ba662dd932a740cde606196a18832e5234", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "e1c21a7380cbe830099603605416a0ce4a3ae04287ce0db38d45202610486425", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "e9d722f5c6c64c294fc44cb3265d930a5aa1e2db57b6785ca02a2c20b7a53a52", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "97c72e81f143cb0f5316f3890b2f2efdb4e8e6f96f673392ecc95bc964e3cfb7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "013a501582343cf23c12e814739f06d0fcc6c46723c016cf862c0dc8446c58f8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a", 384, "f69a3334f3dd7dbba1b8055012f9f6a7c4716360c4081daccad177d6bdbe1de5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a", 384, "26f9cdd0e653cb1d23bfcc62971c095cd9ec1e7b903680e0bcb52295bcf7bf4d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a", 384, "e7a181f3677b6ab072fe2af99e1a5b7929f78a9562c9d39e9de3c1fe714211a0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a", 384, "dddcf3923d544d600a0722153bce5eae2c6ab823a97cbdf291277bf5e01c4ab0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "46c66d9f32de8f08c6db20bcdd8744ba8fdeadc37898b2c5352fa6db9ebbd2db", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "eac88b97c198140559ef278cafa4cf63575292e7b35346416b6e78cb1c9bb5c5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "9c96ccc014badff11a1f50c3c40f5b299ad03fd5ba8a32c8e3bc42f490dba266", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "60d81ae7da341580c6ba652f6c5b9c25c9c0cd3c55bb510e6641f923b7b3633e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "9daf7ce62048baf89dcc9b28d09fb98b4033ce57f25654a95e79ec819e1d84fb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "bfbe62be4152bd934ee6553669dd7f59e211f139770ab4b64c21882e3f52c270", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "52fe80baf1b55bfac15ca876b5c2b4111d1f475fa176219ba491543cbe908360", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "bc2ffee4831f12acf4e54ac3ef3f6279c5517100ee04d16167587114ecabf409", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a", 224, "6329f7ec972e432372452c9988851e27dffa49e10b922723e5bef3f9540b2987", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 256, "54364b012c0467825b7a02615275ac53a6131027fbe10d78589c2211af6bd4b0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a", 224, "13992f122109be6fdfc576480470f0a0a8a56c6bfa2210b317debac5045e1fcf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 256, "c3b553e55fb4e898d86ee14ff1bb37ec1a6a3468d5f337e0fc369667a7da4218", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 1 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 199680, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 352, "f1a7ea01b8739ed28f5a7e5a3c68715b0feeed07b8a372ee76b024c32e454481", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 199680, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 320, "8f87812e830b0ef68528fb1185fcd396369825795664fedbb01903d14aa1b45a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 199680, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 352, "a2e4f42148f636846ccf23df49547a8624cd6390973c265165222271a15a7839", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 199680, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 320, "66b38fcde671df041618bb144ad0ac9dca3af15338960c3d72f32763b0f46f9b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 222208, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 352, "14c49c0ba19d98a5daeac0f875923a81f1280a2aa997ee2b17532588118fd747", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 222208, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 320, "206d9ef92877403581df57ff9dafbd8846022f42d05460a6999bfbd3c13346f5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 222208, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 352, "cf4a9445827af25ad5509c977520e71d240c1cf481218c5a4ce8f42e4365497b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 222208, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 320, "6638a120534f8e5057d418a804d85ed31d792805a765dcf31007d2d544e59a58", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 217088, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 352, "ba3623bd75c6d2886dcb8d909ea4d22aebfcb818322f82d31112b1f0b4c788d6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 217088, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 320, "fbbc609f53caa9b124fb5a57fb7e0a0eaebe8bb331ca20f17e8906627ac31c1d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 217088, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 352, "5ff1d8d57151079800893c8a7349faa3aa489e6c1a01c197ded53a390d3af1fc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 217088, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 320, "c04da5f63259b901c8aaed6749707268f14f784e923908e5555a7d6042f8793e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 352, "ebc4029975da6fc6d2ac7c907a89c1a1e06aae8136ee6ccd9ae1ed86e7fc26a2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 320, "ea5395a0463c0a39e6dae7be45d4080c0695fc3edc6678f813a6769c2508eb16", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 352, "9bdac69520fd1a4c1915b0612ceb733cc58653e21c749af4cfdb42a5fe08e8e4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 320, "71c50c6ccb8be1c7cde7c18375e561f4463133a041691f8592e1fecc67d2d7e4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 352, "6c8b4c915bb3b8131301c4ca922e96c5d5af6efcca0beba70a43b4ea08a887db", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 320, "5581245c8c20d8357f33d296c4d3448143d62406575bc5ccfa175e133ec32dfa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 352, "9609ea9566913c76e84a837b7511ec63b835af3653a3e41ca4c0f591fc661b44", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 320, "2a2184a71fcc77e60a4a7686a152bf8eb660c5da27df105eb786915ea670d2e0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin, Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len, 166912, "bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a", 448, "6d28d0eec821539c8790a3f998ff68987199a3c34d8d3775ac653289a8bab2da", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin, Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len, 166912, "bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a", 448, "ee70583d209fdd0267dbcadfed212c7a11cf263c63ed566fe27d479d1b7f5d21", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(1) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a", 384, "0c5f5ffa8eb28913c87e78e5bf259dc4dce206544479aec4235750e74ac11835", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a", 384, "98c29f7ebdc1e95425a63da5002644a84dd4ff5b0c9d9c3222b405a3768c6806", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a", 384, "d140f74147be90c9ccb65d49f231dde181e34e3742158a7a25700617974bfa89", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a", 384, "ddf6ebc5fd61ee1618e7a846e2010d90b8f97e940b9d15fb1a4013c2f77ee92f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a", 384, "b942baa865cf92a34a6cb05de9f87fdd099a64d5240634619c164207bb0f6ce4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a", 384, "2c03409fe9e5a05e92ba863cb541d81c150593d8da22c72987e1d8ab97b7f037", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ 128 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(2) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a", 384, "df6216b63156119ec1904594120b1b27a8e80b24930691c4cfaa55fe85fcffde", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a", 384, "7919353c891ab408893abc7298e876421fd8c59eed469ed98e31124752b1c931", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len, 217088, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a", 224, "42552b1145a7637fe53aeb1b91fad1cfb07668f59439e9996507f3079eb475c8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len, 217088, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a", 224, "194aa97105e1648571be4a622ec13fe9b5fed84902680eb7450be52b3a1c5fe6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 123904, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "78505ef3c73f04a0d53445d639ff707704ee4c6d886831736c14ecec82600efe", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 123904, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "599659954bd80cfda041f664d83c13ae89c605f0cbda94cada70f00a310bb2ea", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 161792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "6db5f9d10464bade0417c0fc45dccc499cd8cb1283aae6a36d2b902531a484c3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 161792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "daa5e40fe2a0fd55c17306459dfd56952c665863ffdb544b2e7160040ca210f5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 123904, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "de84c185f683a95f68b3f0628f43d6ace368189f7534677f34df4bdb23ca65fa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 123904, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "a16e7157ebc69edc07507cb56255930a104b86096e923e31252c28179540f337", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 161792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "186dce0ee0341e4f90ce0e673478e7041fbb57f0338d8f9d90dd255d4622820f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 161792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "91f75f1bbb0773becd4cc8f2fe187ba67af9aa9c640d6adc7036803c4c882fe7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 16 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 16 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 16 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 138240, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "4bf94cf88b3e79a1232408d75aa723d1c8a4d5e6cee60bbd6dce06fdf8b67bdc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 138240, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "a7646e3bd92185c13588d0238802f633e04a171c95c14a134b1786b3c9bca4fb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 181248, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "76fd23564cb7223eacd20c08df3aace2866882fc4a0edda811c6db79532b653b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 181248, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "613d320526ab9f209870c363945d2815213cc0238e1ba5c448e3459b7e7bd531", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 138240, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "be616bf5cd5656a26d2304d2359c1cc82a5eb750a30298b8420f50089f2bcf23", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 138240, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "46d660330199d6395de999cfa7734fc16791d535777b54d565e68424d62ca10b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 181248, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "f8a6d222dc9affb478e6dbb2ac57b61e607af8200225f9d7e433f74cfe2fd289", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 181248, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "fcbc96fd294b7fdd364c235b03cb2e04c151573a0564f3dbb28a407c707c4377", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 32 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 32 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 32 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 168960, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "786f358bf1789989723720d0d932ec12853e49d019db3b09be5426cdeca358f0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 168960, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "5804ce859c989fa2266dfa1d550f5a6c0df19b28d7ca6f9814980424cdb81d09", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 220160, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "ab64018790ccb1f457648cb75296ec7d4d091d54350cd17bc7e5e468a0cb3316", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 220160, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "e7f67ed99ebf367b53811ecb55579173cf697806f420ed85a80151d056c23415", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 168960, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "c7ed8f6d742ea142fda745c5715ba9dcbf8941a343af271d7844d55f02feeb57", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 168960, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "2cee9f40f0ed9d515faabb3a8d0352b74103bfb56bcbff14674f620527fe50fb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 220160, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "17c89d1bb74c646281024f4348eaff386b34dcda2ec28b4df28d52c53e825cfe", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 220160, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "fc0af5af884f070d4f82fd35292f2433fd4904594af766f3db1bba3961f9cf5f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 64 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 64 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 64 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 152576, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "ffba61927b3ea1e1b7b29084f89e33dd5a6e83fe32d2b3e79eddb3446da2a761", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 152576, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "49824cbed86455cc7836b05c0b5261ee51e94ecee498b37f64d1cc923e9a43c9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 159744, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "fda32440ec4d9a6c98afd95404e63e22274f1b3ab4ff7744cfb6ba354379aa93", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 151552, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "a60d72364a4180f4599e443f62fea83f37830df9e328a34d3fae7e1f825a0a94", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "3bcb4529126bf44527c333e75a1b45122e2e141abba802146be75c524c4a8e57", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "afd00d9e20e5f50fa2d83bc2d4179582b100c6faf73fb627e1334f049fde0ce0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 195584, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "10b2e0624341b0f3b28d01764245248f5a942553d82018cdbf2d5d0655f09aad", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 187392, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "87b861e015c9409b2aff171d508a02308772948e4c36afdc9c9053dd699bd940", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "26abed2f58502b931d728d01683291c0f7a5415b275e7b476db2397654ba5a02", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "ead5254e30dfe83e246606071c686200dec7f40e4f3e72119b4cf42d975e5e40", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 231424, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "c2467d2a97ac86edb03b061d148702df24715febb02edb7aa371c694eef8a37a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 223232, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "3a197c4d4b40880ce4a6b9c309c38fa36f2df1c5c02a0b1199aba2d560fc8a50", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 152576, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "7584dabe78ad0dc6b1b627e7eec0d21c91c4b9c1cd53eff3240b990d10e552ca", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 152576, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "71dda7f91b6925618277b436bdb95c1c609c89d25f6c9227b0c421dfdd8f0382", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 159744, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "2542a115ddfab3bbcb8816a2b4af26cc2d881f2c6d6552bd7b4200b40ff4b60c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 151552, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "6283f013fe3b5444f423f42a61ad151f7beae9145748ced3caa0674b0675c3d4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "acc297829bcfd0e45ced4300cf69edeed6e727a32eda1d067da6403747784e3e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "92139dfe591eaca228b2f79721e981679803376aa692a3366e351f94ac51ede7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 195584, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "5a86ddfe2f7b573941b49fe1e99497fa10b302fd96d3dc4dde0304311912ad7e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 187392, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "d869e97fc378587d8f55934e4118be427c5c31a2156dbafdce2e9a4931e20848", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 5 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "f8cba023c6150b5e80771d19703d377e1ef61ba0ddaa80bfbcd7a4b52d762bbe", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "1de7b15c33a7b5dfb87cde99a2693f6bf2ece88f3706c94b3507413ada41aa60", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 231424, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "8d3c3428f4acb8c1d45109f2d26df94f8112142689c7ea37e369628ae6215463", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 223232, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "6d2b7221be879c4e19b857ef5099521a111254cfdc1211d6415f0bc2c0ae152f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 6 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "66059be69e378daef40ad4f19a17ac1d9dd5305a589f963a408a88080ad8bcb6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "77f8dab69fc0c346e977b67408f16749de76ed4df80c438f8af37d2ad610befa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 231424, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "267bfa7448c730809d49cc3707168e1b1f9d4a7824b8449af51ef0e1bcd6ef68", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 223232, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "91b2af33c93bf7e89afca5ea4b8c58b99db79ed713f550d2e31243cce072d5e0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "713657671934397700abf3f77cdd20da3df2cf9cc9eaebff78bc2cd5763bf866", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "6cc8de91f83ec44751953e155beee55a2bcbfaf6c0fc4a1256e527466484c61a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 231424, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "0321ef95643ff3da86a79a505034de18b1fc0d9f7f6e287b17e82d925322569a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 223232, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "4380454862dfce06463093bc1feead0c827a31e2080b33cf6bf9988210e6c6eb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(1) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 2 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826828) -, /* mDtypeB */ trtllm::gen::Dtype(17827853) -, /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(5) -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 2 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(0) -, /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 1 -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 1 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -#endif // EXCLUDE_SM_100 -}; -// clang-format on -} // namespace kernels -} // namespace tensorrt_llm -} // namespace batchedGemm diff --git a/include/flashinfer/trtllm/gemm/trtllmGen_gemm_export/GemmInterface.h b/include/flashinfer/trtllm/gemm/trtllmGen_gemm_export/GemmInterface.h index ba5d7c06d..16a0baed2 100644 --- a/include/flashinfer/trtllm/gemm/trtllmGen_gemm_export/GemmInterface.h +++ b/include/flashinfer/trtllm/gemm/trtllmGen_gemm_export/GemmInterface.h @@ -24,7 +24,7 @@ #include "trtllm/gen/CudaKernelLauncher.h" #ifdef TLLM_GEN_EXPORT_INTERFACE -#include "KernelMetaInfo.h" +#include "flashinferMetaInfo.h" #endif // TLLM_GEN_EXPORT_INTERFACE #ifdef TLLM_GEN_GEMM_CUBIN_PATH @@ -309,7 +309,8 @@ GemmConfig const* GemmInterface::getGemmConfigs() const { size_t GemmInterface::getNumGemmConfigs() const { #ifdef TLLM_GEN_EXPORT_INTERFACE - return tensorrt_llm::kernels::tllmGenGemmListLen; + return sizeof(tensorrt_llm::kernels::tllmGenGemmList) / + sizeof(tensorrt_llm::kernels::tllmGenGemmList[0]); #else return 0; #endif