Commit 095b7a3

[https://nvbugs/5521253][fix] Enable Gemma3 12B & 27B on SM100 (#8666)
Signed-off-by: Balaram Buddharaju <[email protected]>
1 parent 9f1d274 commit 095b7a3

File tree

3 files changed: +15 -3 lines

cpp/kernels/fmha_v2/setup.py

Lines changed: 10 additions & 0 deletions
@@ -6379,6 +6379,16 @@ def enumerate_kernels():
                 and kspec.version == 2
                 and kspec.cross_mha == False
                 and kspec.flash_attention == False)
+            # Gemma3 VL support.
+            or (kspec.sm == 100
+                and kspec.dtype in ['fp16', 'bf16', 'fp16_fp32', 'e4m3', 'e4m3_fp32']
+                and kspec.head_size == 72
+                and kspec.head_size_v == 0
+                and kspec.sage_block_sizes is None
+                and kspec.version == 2
+                and kspec.cross_mha == False
+                and kspec.flash_attention == True
+                and kspec.input_layout != InputLayout.SEPARATE_Q_K_V)
             # Deepseek MLA (generation 576/512 paged)
             or (kspec.sm in [90, 100, 120]
                 and kspec.dtype in ['bf16', 'e4m3_fp32']
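
The added clause whitelists SM100 flash-attention kernel specs with head size 72, the non-power-of-2 head size Gemma3 needs (per the "Gemma3 VL support." comment). As a rough illustration of how such an enumeration filter behaves, here is a minimal, self-contained Python sketch; KSpec and its fields are simplified stand-ins for the real kernel-spec objects in fmha_v2/setup.py, not the actual implementation.

# Minimal sketch (not the real setup.py): how an enumeration filter of this
# kind selects kernel specs. KSpec and its fields are simplified stand-ins
# for the actual kernel-spec structure in fmha_v2/setup.py.
from dataclasses import dataclass

@dataclass
class KSpec:
    sm: int
    dtype: str
    head_size: int
    flash_attention: bool

def is_gemma3_vl_spec(kspec: KSpec) -> bool:
    # Mirrors the added clause: SM100 flash-attention kernels with head size 72.
    return (kspec.sm == 100
            and kspec.dtype in ['fp16', 'bf16', 'fp16_fp32', 'e4m3', 'e4m3_fp32']
            and kspec.head_size == 72
            and kspec.flash_attention)

specs = [KSpec(100, 'bf16', 72, True), KSpec(100, 'bf16', 128, True)]
print([s.head_size for s in specs if is_gemma3_vl_spec(s)])  # -> [72]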

cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp

Lines changed: 4 additions & 1 deletion
@@ -46,7 +46,10 @@ QkvLayout AttentionInputLayoutToQkvLayout(AttentionInputLayout layout)
 
 FmhaDispatcher::FmhaDispatcher(MHARunnerFixedParams fixedParams)
     : mFixedParams(fixedParams)
-    , mUseTllmGen(tensorrt_llm::common::isSM100Family())
+    // TRTLLM-GEN only supports power of 2 head sizes.
+    // The exception will fall back to fmha v2.
+    // Please update fmha_v2/setup.py if you want to add more supported head sizes.
+    , mUseTllmGen(tensorrt_llm::common::isSM100Family() && fixedParams.headSize != 72)
 {
     if (mUseTllmGen)
     {
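
With this change the dispatcher selects the TRTLLM-GEN backend on the SM100 family only for head sizes it supports; head size 72 falls back to fmha v2, which is why the setup.py change above adds matching kernels. Below is a small Python model of that decision, purely illustrative; the authoritative logic is the C++ initializer shown in the diff, and the function name and arguments here are stand-ins, not TensorRT-LLM APIs.

# Illustrative Python model of the dispatch decision (not a TensorRT-LLM API).
def use_tllm_gen(is_sm100_family: bool, head_size: int) -> bool:
    # TRTLLM-GEN covers power-of-2 head sizes on the SM100 family; head size 72
    # (needed for Gemma3) is the exception routed to fmha v2.
    return is_sm100_family and head_size != 72

for hs in (64, 72, 128):
    backend = "TRTLLM-GEN" if use_tllm_gen(True, hs) else "fmha v2"
    print(f"SM100 family, head_size={hs}: {backend}")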

tests/integration/defs/test_e2e.py

Lines changed: 1 addition & 2 deletions
@@ -2447,8 +2447,7 @@ def test_ptp_quickstart_advanced_mixed_precision(llm_root, llm_venv):
                  marks=pytest.mark.skip_less_device_memory(80000)),
     pytest.param("gemma-3-27b-it",
                  "gemma/gemma-3-27b-it",
-                 marks=(pytest.mark.skip_less_device_memory(80000),
-                        skip_post_blackwell)),
+                 marks=pytest.mark.skip_less_device_memory(80000)),
     pytest.param(
         "Nano-v2-VLM",
         "Nano-v2-VLM",
