Commit a37606b

use block 128
1 parent 9fbf44f commit a37606b

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py

Lines changed: 4 additions & 1 deletion
@@ -201,11 +201,14 @@ def make_default_opt_flags_nvidia(
         block_m = 128
     else:
         if tokens_per_expt <= 64 and routing_data is not None and routing_data.expt_hist is not None:
-            # Ragged and likely memory bound; set the block size higher to minimize loading weights more than once.
             if lhs_dtype == torch.bfloat16 and rhs_dtype == FP4 and tokens_per_expt >= 16 and torch.cuda.get_device_capability()[0] >= 10:
+                # Ragged and likely memory bound; set the block size higher to minimize loading weights more than once.
                 block_m = max(16, min(triton.next_power_of_2(8 * tokens_per_expt), 128))
             else:
                 block_m = max(16, min(triton.next_power_of_2(2 * tokens_per_expt), 64))
+            if block_m == 64 and precision_config.out_scale is not None and rhs_dtype == FP4 and torch.cuda.get_device_capability()[0] >= 10:
+                # when having both fused_activation and mxfp8 downcast in the epilogue, block_m=64 causes shared memory overflow
+                block_m = 128
         else:
             block_m = max(16, min(triton.next_power_of_2(tokens_per_expt), 128))
     # block n
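
To make the effect of this commit easier to see, below is a minimal standalone sketch of the block_m heuristic after the change. The helper pick_block_m, its boolean parameters (is_ragged, lhs_is_bf16, rhs_is_fp4, has_out_scale, sm_major), and the local next_power_of_2 are simplified stand-ins introduced for illustration only; the real logic lives inside make_default_opt_flags_nvidia and reads routing_data, the torch dtypes, precision_config.out_scale, and torch.cuda.get_device_capability().

    def next_power_of_2(n: int) -> int:
        # Mirrors triton.next_power_of_2: smallest power of two >= n.
        return 1 if n <= 1 else 1 << (n - 1).bit_length()

    def pick_block_m(tokens_per_expt, is_ragged, lhs_is_bf16, rhs_is_fp4,
                     has_out_scale, sm_major):
        # Simplified restatement of the block_m selection above
        # (hypothetical helper, not the library API).
        if tokens_per_expt <= 64 and is_ragged:
            if lhs_is_bf16 and rhs_is_fp4 and tokens_per_expt >= 16 and sm_major >= 10:
                # Ragged and likely memory bound; use a larger block so weights
                # are loaded as few times as possible.
                block_m = max(16, min(next_power_of_2(8 * tokens_per_expt), 128))
            else:
                block_m = max(16, min(next_power_of_2(2 * tokens_per_expt), 64))
            if block_m == 64 and has_out_scale and rhs_is_fp4 and sm_major >= 10:
                # Guard added by this commit: fused activation plus mxfp8 downcast
                # in the epilogue overflows shared memory at block_m=64.
                block_m = 128
        else:
            block_m = max(16, min(next_power_of_2(tokens_per_expt), 128))
        return block_m

    # Example: 32 ragged tokens per expert, non-bf16 LHS, FP4 RHS, an output
    # scale, SM100: the 2*tokens path yields 64, which the new guard bumps to 128.
    print(pick_block_m(32, True, False, True, True, 10))   # 128
    # Without the output scale, the previous choice of 64 is kept.
    print(pick_block_m(32, True, False, True, False, 10))  # 64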
