
Commit 180b91f

update fmha_v2 (NVIDIA#4895)
Signed-off-by: Qidi Sang <[email protected]>
1 parent 51652b9 commit 180b91f

7 files changed: +217 -73 lines changed

cpp/kernels/fmha_v2/fmha_test.py

Lines changed: 1 addition & 1 deletion
@@ -157,7 +157,7 @@ def test_trtllm_context_mla_attention_fmha(dtype, s):
     epsilon += ' -epsilon 0.03'

     sm_version = getSMVersion()
-    if dtype in ["-e4m3", "-e4m3 -bf16-output"] and sm_version != 89:
+    if sm_version != 89:
         pytest.skip("FP8 MLAs only supported on sm89 currently.")

     # Context phase kernels.
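
The net effect: the context-MLA test is now gated on SM 89 for every dtype, not only the FP8 ("-e4m3") variants. A minimal sketch of the resulting guard, assuming getSMVersion() returns the device's SM number as an integer, as the test already does (the helper name maybe_skip_mla_test is hypothetical; the real check sits inline in test_trtllm_context_mla_attention_fmha):

import pytest

def maybe_skip_mla_test(sm_version: int) -> None:
    # After this commit the skip depends only on the SM version, not on dtype.
    if sm_version != 89:
        pytest.skip("FP8 MLAs only supported on sm89 currently.")
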

cpp/kernels/fmha_v2/setup.py

Lines changed: 70 additions & 7 deletions
@@ -189,8 +189,7 @@ class InputLayout(IntEnum):
 ns_close = r"""
 // clang-format on
 } // namespace kernels
-} // namespace tensorrt_llm
-""" if generate_cu_trtllm else ""
+} // namespace tensorrt_llm""" if generate_cu_trtllm else ""

 copyright = '''\
 /***************************************************************************************************
@@ -1344,7 +1343,7 @@ def get_makefile_code(specs_names):

 #endif // sliding_or_chunked_causal_mask

-void {launcher_name}_nl({params_type} &params,
+void {launcher_name}_nl({fused_multihead_attention_params_v2_str} &params,
     const Launch_params& launch_params, cudaStream_t stream){{
     constexpr int loop_iters = {seq_len} / {noloop_step};
     static_assert(loop_iters * {noloop_step} == {seq_len}, "");
@@ -1431,6 +1430,7 @@ def get_makefile_code(specs_names):
     {loop_step},
     {kv_loop_step},
     {head_size},
+    {head_size_v},
     {q_tile_buffers},
     {kv_tile_buffers},
     NUM_COMPUTE_GROUPS,
@@ -1453,6 +1453,7 @@ def get_makefile_code(specs_names):
     {loop_step},
     {kv_loop_step},
     {head_size},
+    {head_size_v},
     {q_tile_buffers},
     {kv_tile_buffers},
     NUM_COMPUTE_GROUPS,
@@ -1472,6 +1473,7 @@ def get_makefile_code(specs_names):
     {loop_step},
     {kv_loop_step},
     {head_size},
+    {head_size_v},
     {q_tile_buffers},
     {kv_tile_buffers},
     NUM_COMPUTE_GROUPS,
@@ -1491,6 +1493,7 @@ def get_makefile_code(specs_names):
     {loop_step},
     {kv_loop_step},
     {head_size},
+    {head_size_v},
     {q_tile_buffers},
     {kv_tile_buffers},
     NUM_COMPUTE_GROUPS,
@@ -2881,6 +2884,7 @@ def get_kernel_traits_code(specs_names):
     {loop_step},
     {kv_loop_step},
     {head_size},
+    {head_size_v},
     {q_tile_buffers},
     {kv_tile_buffers},
     NUM_COMPUTE_GROUPS,
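
The hunks above all add the same {head_size_v} placeholder to the generated template-argument lists, so the emitted kernel traits now carry the V head size separately from the Q/K head size. A stripped-down sketch of that generator pattern (illustrative only; the real templates in get_makefile_code and get_kernel_traits_code wrap these arguments in full traits instantiations with dtype, flags, and more):

def make_traits_args(loop_step, kv_loop_step, head_size, head_size_v,
                     q_tile_buffers, kv_tile_buffers):
    # Expands a template-argument list the way the generators do, with
    # {head_size_v} now emitted right after {head_size}.
    return '''\
{loop_step},
{kv_loop_step},
{head_size},
{head_size_v},
{q_tile_buffers},
{kv_tile_buffers},
NUM_COMPUTE_GROUPS,'''.format(**locals())

# Values borrowed from the new DeepSeek 192/128 context spec added further down.
print(make_traits_args(64, 128, 192, 128, 1, 2))
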
@@ -3213,7 +3217,7 @@ def get_lname_from_kname(kname: str) -> str:
         return 'nullptr'
     lname = kname.replace('_kernel', '')
     mask_types = [
-        '_sliding_window_causal', '_custom_mask', '_causal'
+        '_sliding_or_chunked_causal', '_custom_mask', '_causal'
     ]
     for mask_type in mask_types:
         lname = lname.replace(mask_type, '')
@@ -3228,6 +3232,12 @@ def get_lname_from_kname(kname: str) -> str:
 {cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \
 {attention_input_layout_value}, {is_il}, {is_flash_atten}, {is_warp_specialization}, {is_fp32_accu}, \
 {is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}, {lname}}}\
+'''.format(**locals()) if 'sage' in kname and 'sm90' in kname else '''\
+{{ DATA_TYPE_{prec}, DATA_TYPE_{output_prec}, {seq_len}, {q_step}, {kv_step}, {head_size}, {head_size_v}, \
+{sage_block_sizes[0]}, {sage_block_sizes[1]}, {sage_block_sizes[2]}, kSM_{sm}, nullptr, \
+0, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \
+{attention_input_layout_value}, {is_il}, {is_flash_atten}, {is_warp_specialization}, {is_fp32_accu}, \
+{is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}, {lname}}}\
 '''.format(**locals())
         else:
             code = '''\
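
The added branch chooses between two metadata templates with a conditional expression: kernel names containing both 'sage' and 'sm90' keep the entry that embeds the compiled cubin ({cubin_name}, {cubin_name}_len), everything else in this branch gets the new entry that passes nullptr/0 and also carries {head_size_v}. A generic sketch of that idiom, with placeholder field layouts rather than the real fmha_v2 metadata struct:

def pick_metadata_entry(kname, head_size, head_size_v, cubin_name):
    # Illustrative only: mirrors the "template-A if ... else template-B" shape
    # of the generator above, not the actual metadata fields.
    entry = '''\
{{ {head_size}, {cubin_name}, {cubin_name}_len }}\
''' if 'sage' in kname and 'sm90' in kname else '''\
{{ {head_size}, {head_size_v}, nullptr, 0 }}\
'''
    return entry.format(**locals())

# Hypothetical kernel/cubin names, purely for illustration.
print(pick_metadata_entry("fmha_v2_bf16_192_128_sm90_kernel", 192, 128, "cubin_fmha_bf16"))
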
@@ -3332,7 +3342,6 @@ def get_lname_from_kname(kname: str) -> str:
 {metadata_v2}
 }};
 {local_ns_close}
-
 '''.format(**locals(), copyright=copyright)

     else:
@@ -3540,7 +3549,10 @@ def enumerate_hgmma_ldgsts_kernels(specs, sm=90, dtype='fp16'):


 # Note this will be used in TRT-LLM.
-def enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='fp16'):
+def enumerate_hgmma_flash_warpspec_kernels(specs,
+                                           sm=90,
+                                           dtype='fp16',
+                                           head_size_v=0):

     scheduling_mode = int(os.getenv('SCHEDULING_MODE', '1'))

@@ -3563,6 +3575,7 @@ def enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='fp16'):
                 dtype=dtype,
                 seq_len=0, # support any sequence length
                 head_size=[32, 40, 48, 64],
+                head_size_v=head_size_v,
                 warps_m=4, #4x1 warpgroups
                 warps_n=1,
                 version=2,
@@ -3595,6 +3608,7 @@ def enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='fp16'):
                 dtype=dtype,
                 seq_len=0, # support any sequence length
                 head_size=[72, 80, 96, 104, 128],
+                head_size_v=head_size_v,
                 warps_m=4, #4x1 warpgroups
                 warps_n=1,
                 version=2,
@@ -3627,6 +3641,7 @@ def enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='fp16'):
                 dtype=dtype,
                 seq_len=0, # support any sequence length
                 head_size=[160, 192, 256],
+                head_size_v=head_size_v,
                 warps_m=4, #4x1 warpgroups
                 warps_n=1,
                 version=2,
@@ -3652,6 +3667,40 @@ def enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='fp16'):
                 scheduling_mode=scheduling_mode,
                 input_layout=input_layout))

+        # for deepseek context 192/128, kv_step=128
+        specs.append(
+            kernel_spec(
+                sm=sm,
+                sm_mma=90,
+                dtype=dtype,
+                seq_len=0, # support any sequence length
+                head_size=192,
+                head_size_v=128,
+                warps_m=4, #4x1 warpgroups
+                warps_n=1,
+                version=2,
+                interleaved=False,
+                ldgsts_q=
+                False, # for Hopper kernels, ldgsts = False signals TMA usage.
+                ldgsts_k=False,
+                ldgsts_v=False,
+                share_smem_k_v=False,
+                loop_step=64,
+                q_tile_buffers=1, # only used by warp specialized kernels
+                has_noloop=0,
+                noloop_step=64,
+                kv_loop_step=128,
+                kv_tile_buffers=2, # only used by warp specialized kernels
+                unroll_threshold=1,
+                has_scale_max=False,
+                flash_attention=True,
+                warp_specialization=True,
+                alibi=alibi,
+                enable_attn_logit_softcapping=enable_attn_logit_softcapping,
+                return_softmax_stats=return_softmax,
+                scheduling_mode=scheduling_mode,
+                input_layout=input_layout))
+

 # Note this will be used in TRT-LLM.
 def enumerate_qgmma_flash_warpspec_kernels(specs,
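
For orientation, the new spec describes a warp-specialized context kernel with a 64-row Q tile, a 128-row KV tile, a 192-wide Q/K head and a 128-wide V head (the bf16 variant is the one selected later in enumerate_kernels). A rough back-of-the-envelope look at what those tile shapes mean in bytes, assuming 2-byte bf16 elements and ignoring swizzling, padding, and any other buffers, so this is not the kernel's real shared-memory budget:

BYTES_PER_BF16 = 2
LOOP_STEP, KV_LOOP_STEP = 64, 128        # Q-tile rows, KV-tile rows
HEAD_SIZE, HEAD_SIZE_V = 192, 128        # Q/K head dim vs. V head dim
Q_TILE_BUFFERS, KV_TILE_BUFFERS = 1, 2   # buffer counts from the spec

q_tile_kib = LOOP_STEP * HEAD_SIZE * BYTES_PER_BF16 * Q_TILE_BUFFERS / 1024
k_tile_kib = KV_LOOP_STEP * HEAD_SIZE * BYTES_PER_BF16 * KV_TILE_BUFFERS / 1024
v_tile_kib = KV_LOOP_STEP * HEAD_SIZE_V * BYTES_PER_BF16 * KV_TILE_BUFFERS / 1024
print(q_tile_kib, k_tile_kib, v_tile_kib)  # 24.0 96.0 64.0 (KiB)
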
@@ -6215,7 +6264,21 @@ def enumerate_kernels():
                 and kspec.cross_mha == False
                 and kspec.flash_attention == True
                 and kspec.warp_specialization == False
-                and kspec.tiled == True)
+                and kspec.tiled == True
+                and not (kspec.sm == 90 and (kspec.head_size, kspec.head_size_v) == (192, 128)))
+            # Deepseek MLA (hopper-style context 192/128 packed + paged)
+            or (kspec.sm == 90
+                and kspec.dtype == 'bf16'
+                and kspec.head_size == 192
+                and kspec.head_size_v == 128
+                and kspec.sage_block_sizes is None
+                and kspec.version == 2
+                and kspec.cross_mha == False
+                and kspec.flash_attention == True
+                and kspec.warp_specialization == True
+                and kspec.input_layout in [InputLayout.PACKED_QKV, InputLayout.Q_PAGED_KV]
+                and kspec.alibi == False
+                and kspec.enable_attn_logit_softcapping == False)
             # SageAttention (warp_spec, head_size in (80, 128), packed QKV, padding mask)
             or (kspec.sm == 90
                 and kspec.head_size in [80, 128]
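
Restated outside the big filter expression: the new branch selects warp-specialized SM90 bf16 kernels with the 192/128 head-size pair for packed-QKV and paged-KV layouts, while the first branch now explicitly excludes that pair from the non-warp-specialized tiled kernels. A standalone sketch of the new predicate (the InputLayout values below are stand-ins, not the real enum in setup.py):

from enum import IntEnum
from types import SimpleNamespace

class InputLayout(IntEnum):  # placeholder values for illustration
    PACKED_QKV = 0
    Q_PAGED_KV = 2

def is_deepseek_mla_context_spec(kspec) -> bool:
    return (kspec.sm == 90
            and kspec.dtype == 'bf16'
            and kspec.head_size == 192
            and kspec.head_size_v == 128
            and kspec.sage_block_sizes is None
            and kspec.version == 2
            and not kspec.cross_mha
            and kspec.flash_attention
            and kspec.warp_specialization
            and kspec.input_layout in (InputLayout.PACKED_QKV, InputLayout.Q_PAGED_KV)
            and not kspec.alibi
            and not kspec.enable_attn_logit_softcapping)

example = SimpleNamespace(sm=90, dtype='bf16', head_size=192, head_size_v=128,
                          sage_block_sizes=None, version=2, cross_mha=False,
                          flash_attention=True, warp_specialization=True,
                          input_layout=InputLayout.PACKED_QKV, alibi=False,
                          enable_attn_logit_softcapping=False)
print(is_deepseek_mla_context_spec(example))  # True
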

cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h

Lines changed: 1 addition & 1 deletion
@@ -173,7 +173,7 @@ struct Compute

     enum
     {
-        TILE_SIZE_V = STEP_KV * Kernel_traits::D
+        TILE_SIZE_V = STEP_KV * Kernel_traits::DV
     };

     enum
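
The fix sizes the V tile with the V head dimension (Kernel_traits::DV) instead of the Q/K head dimension (Kernel_traits::D). For kernels where the two are equal nothing changes; for the new 192/128 MLA kernels the old expression over-sized the tile. A quick check with the values from the new spec (STEP_KV corresponds to kv_loop_step, D/DV to head_size/head_size_v):

STEP_KV = 128      # kv_loop_step of the new 192/128 context spec
D, DV = 192, 128   # Q/K head dim vs. V head dim

tile_size_v_old = STEP_KV * D   # 24576 elements, sized for the Q/K head dim
tile_size_v_new = STEP_KV * DV  # 16384 elements, sized for the V head dim
print(tile_size_v_old, tile_size_v_new)
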
