def use_cubin_header(sm,
                     head_size,
                     dtype,
                     output_dtype=None,
                     enable_skip_softmax=False,
                     attention_mask_type=None):
    """Decide whether a kernel spec should go through the cubin-header path.

    Args:
        sm: target SM architecture number (e.g. 89, 90).
        head_size: attention head size of the kernel.
        dtype: kernel data-type string (checked for an 'e4m3' substring).
        output_dtype: optional output data-type string ('bf16', 'fp16', ...).
        enable_skip_softmax: True when the kernel variant skips softmax.
        attention_mask_type: optional AttentionMaskType enum value.

    Returns:
        True only for the (SM90, head_size 128) and (SM89, e4m3) combinations,
        and never for skip-softmax kernels, e4m3 kernels emitting bf16/fp16,
        or bidirectional sliding-window attention masks.
    """
    # Guard clauses: variants explicitly excluded from the cubin-header path.
    if enable_skip_softmax:
        return False
    if 'e4m3' in dtype and output_dtype in ('bf16', 'fp16'):
        return False
    if attention_mask_type == AttentionMaskType.BIDIRECTIONAL_SLIDING_WINDOW:
        return False
    # Remaining eligibility: SM90 with head size 128, or SM89 with e4m3.
    if sm == 90 and head_size == 128:
        return True
    return sm == 89 and 'e4m3' in dtype
33423345
33433346
@@ -3349,9 +3352,11 @@ def get_cubin_header(kernel_traits, specs_names):
33493352 cubin_lens_dict = {}
33503353 launchers_dict = {}
33513354 for kspec , fname , lname , kname in specs_names :
3355+ mask_type = AttentionMaskType .BIDIRECTIONAL_SLIDING_WINDOW \
3356+ if '_bidirectional_sliding_window' in kname else None
33523357 if generate_cu_trtllm and not use_cubin_header (
33533358 kspec .sm , kspec .head_size , kspec .dtype , kspec .output_dtype ,
3354- kspec .enable_skip_softmax ):
3359+ kspec .enable_skip_softmax , mask_type ):
33553360 continue
33563361 name = fname .replace ('.' , '_' )
33573362 data = 'extern unsigned char cubin_{name}_cubin[];' .format (name = name )
@@ -3487,7 +3492,8 @@ def get_cubin_header(kernel_traits, specs_names):
34873492 return_softmax_stats_flag = pythonBoolean2cpp [sm != '90' or (
34883493 sm == '90' and '_softmax' in kname )]
34893494
3490- enable_skip_softmax_flag = pythonBoolean2cpp ['_skipSoftmax' in kname ]
3495+ enable_skip_softmax = '_skipSoftmax' in kname
3496+ enable_skip_softmax_flag = pythonBoolean2cpp [enable_skip_softmax ]
34913497
34923498 # meta_unroll_step
34933499 meta_unroll_step = unroll_step if ('_nl' in kname
@@ -3516,7 +3522,8 @@ def get_cubin_header(kernel_traits, specs_names):
35163522 def get_lname_from_kname (kname : str ) -> str :
35173523 if use_cubin_header (int (sm ), int (head_size ), prec .lower (),
35183524 output_prec .lower (),
3519- enable_skip_softmax_flag ):
3525+ enable_skip_softmax ,
3526+ attention_mask_type ):
35203527 return 'nullptr'
35213528 lname = kname .replace ('_kernel' , '' )
35223529 mask_types = [
@@ -3537,9 +3544,9 @@ def get_lname_from_kname(kname: str) -> str:
35373544 {cubin_name}_len, \" {kname}\" , {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \
35383545 {attention_input_layout_value}, {is_il}, {is_flash_atten}, {is_warp_specialization}, {is_fp32_accu}, \
35393546 {is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}, {enable_skip_softmax_flag}, {lname}}}\
3540- ''' .format (** locals ()) if use_cubin_header (int ( sm ), int ( head_size ),
3541- prec .lower (), output_prec .lower (),
3542- enable_skip_softmax_flag ) else '''\
3547+ ''' .format (** locals ()) if use_cubin_header (
3548+ int ( sm ), int ( head_size ), prec .lower (), output_prec .lower (),
3549+ enable_skip_softmax , attention_mask_type ) else '''\
35433550 {{ DATA_TYPE_{prec}, DATA_TYPE_{output_prec}, {seq_len}, {q_step}, {kv_step}, {head_size}, {head_size_v}, \
35443551 {sage_block_sizes[0]}, {sage_block_sizes[1]}, {sage_block_sizes[2]}, kSM_{sm}, nullptr, \
35453552 0, \" {kname}\" , {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \
0 commit comments