Commit cfc6143

refactor logical check for gemm+allreduce fusion
Signed-off-by: benzh-2025 <[email protected]>
1 parent: da85f02

File tree: 3 files changed (+35, -32 lines)

tensorrt_llm/_torch/models/modeling_llama.py (21 additions, 16 deletions)

@@ -672,13 +672,17 @@ def __init__(
         # Disable fusion for small models due to accuracy issues
         self.enable_fusion &= config.hidden_size > 4096

-        use_fused_gemm_allreduce = True
-        use_fused_gemm_allreduce &= (not mpi_disabled())
-        use_fused_gemm_allreduce &= (self.mapping.tp_size > 1)
-        use_fused_gemm_allreduce &= (config.torch_dtype
-                                     in (torch.float16, torch.bfloat16))
-        use_fused_gemm_allreduce &= (self.is_nvfp4 is not None
-                                     and self.is_nvfp4)
+        mpi_enabled = not mpi_disabled()
+        dtype_supported = config.torch_dtype in (torch.float16, torch.bfloat16)
+        tp_valid = self.mapping.tp_size > 1
+        quant_valid = self.is_nvfp4 is not None and self.is_nvfp4
+        use_fused_gemm_allreduce = all(
+            [mpi_enabled, dtype_supported, tp_valid, quant_valid])
+
+        def check_in_out_features(in_features, out_features):
+            in_feature_valid = in_features % 128 == 0 and in_features >= 1024
+            out_feature_valid = out_features % 64 == 0 and out_features >= 1024
+            return all([in_feature_valid, out_feature_valid])

         num_heads = config.num_attention_heads
         head_dim = getattr(config, 'head_dim', None)
@@ -687,21 +691,22 @@ def __init__(

         in_features = num_heads * head_dim
         out_features = config.hidden_size
-        in_features_div_by = 128
-        attn_fused_gemm_allreduce = use_fused_gemm_allreduce and in_features % in_features_div_by == 0 and in_features >= 1024
-        attn_fused_gemm_allreduce &= (out_features % 64 == 0
-                                      and out_features >= 1024)
+        in_out_features_valid = check_in_out_features(in_features, out_features)

+        attn_fused_gemm_allreduce = all(
+            [use_fused_gemm_allreduce, in_out_features_valid])
         self.PRE_MLP_FUSION = not attn_fused_gemm_allreduce and self.mapping.has_tp(
         ) and not self.enable_attention_dp and self.enable_fusion

         in_features = config.intermediate_size
         out_features = config.hidden_size
-        in_features_div_by = 128 * self.mapping.tp_size
-        mlp_fused_gemm_allreduce = use_fused_gemm_allreduce and in_features % in_features_div_by == 0 and in_features >= 1024
-        mlp_fused_gemm_allreduce &= (out_features % 64 == 0
-                                     and out_features >= 1024)
-
+        in_features_aligned_with_tp = in_features % self.mapping.tp_size == 0
+        in_out_features_valid = check_in_out_features(
+            in_features // self.mapping.tp_size, out_features)
+        mlp_fused_gemm_allreduce = all([
+            use_fused_gemm_allreduce, in_features_aligned_with_tp,
+            in_out_features_valid
+        ])
         self.POST_MLP_FUSION = not mlp_fused_gemm_allreduce and self.mapping.has_tp(
         ) and self.enable_fusion

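To read the refactored shape gate outside the diff, here is a minimal standalone sketch. The `check_in_out_features` helper and its thresholds are copied from the hunk above; the example shapes are hand-picked Llama-70B-style values standing in for the model's config/mapping objects, so the numbers are illustrative only:

    # Standalone sketch of the fusion shape gate; thresholds match the diff,
    # the shapes below are illustrative stand-ins for config/mapping values.
    def check_in_out_features(in_features, out_features):
        # K must be a multiple of 128, N a multiple of 64, and both at least 1024.
        in_feature_valid = in_features % 128 == 0 and in_features >= 1024
        out_feature_valid = out_features % 64 == 0 and out_features >= 1024
        return all([in_feature_valid, out_feature_valid])

    # Attention o_proj: in_features = num_heads * head_dim, out_features = hidden_size
    num_heads, head_dim, hidden_size, tp_size = 64, 128, 8192, 4
    attn_ok = check_in_out_features(num_heads * head_dim, hidden_size)            # True

    # MLP down_proj: the per-rank K is intermediate_size // tp_size, and the
    # intermediate size must first divide evenly across the TP group.
    intermediate_size = 28672
    mlp_ok = (intermediate_size % tp_size == 0 and
              check_in_out_features(intermediate_size // tp_size, hidden_size))   # True
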
tensorrt_llm/_torch/modules/linear.py (11 additions, 13 deletions)

@@ -2156,19 +2156,17 @@ def __init__(
         self.use_custom_cublas_mm = use_custom_cublas_mm
         self.lora = lora

-        use_fused_gemm_allreduce = True
-        use_fused_gemm_allreduce &= (not mpi_disabled())
-        use_fused_gemm_allreduce &= self.dtype in (torch.float16,
-                                                   torch.bfloat16)
-        use_fused_gemm_allreduce &= (self.in_features % 128 == 0)
-        use_fused_gemm_allreduce &= (self.tp_mode is not None
-                                     and self.tp_mode == TensorParallelMode.ROW)
-        use_fused_gemm_allreduce &= (self.tp_size > 1 and self.reduce_output)
-        use_fused_gemm_allreduce &= (self.out_features % 64 == 0)
-        use_fused_gemm_allreduce &= (
-            self.quant_config is not None
-            and self.quant_config.layer_quant_mode.has_nvfp4())
-        self.use_fused_gemm_allreduce = use_fused_gemm_allreduce
+        mpi_enabled = not mpi_disabled()
+        dtype_supported = self.dtype in (torch.float16, torch.bfloat16)
+        in_features_aligned = self.in_features % 128 == 0
+        out_features_aligned = self.out_features % 64 == 0
+        tp_valid = self.tp_mode is not None and self.tp_mode == TensorParallelMode.ROW and self.tp_size > 1
+        quant_valid = self.quant_config is not None and self.quant_config.layer_quant_mode.has_nvfp4(
+        )
+        self.use_fused_gemm_allreduce = all([
+            self.reduce_output, mpi_enabled, dtype_supported,
+            in_features_aligned, out_features_aligned, tp_valid, quant_valid
+        ])

         self.enable_cuda_core = False
         if torch.cuda.is_available():

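The linear.py change collapses the chained `&=` updates into one `all([...])` over named predicates. A hypothetical standalone helper (not part of the module; the real check reads `self.*` attributes inline in the constructor above) that mirrors the same decision:

    import torch

    # Hypothetical mirror of the inline check; "row" stands in for
    # TensorParallelMode.ROW and has_nvfp4 for the quant-config query.
    def is_fusion_eligible(dtype, in_features, out_features, tp_mode, tp_size,
                           reduce_output, has_nvfp4, mpi_enabled=True):
        dtype_supported = dtype in (torch.float16, torch.bfloat16)
        in_features_aligned = in_features % 128 == 0
        out_features_aligned = out_features % 64 == 0
        tp_valid = tp_mode == "row" and tp_size > 1
        return all([reduce_output, mpi_enabled, dtype_supported,
                    in_features_aligned, out_features_aligned, tp_valid,
                    has_nvfp4])

    # Eligible: NVFP4 row-parallel projection with aligned shapes.
    print(is_fusion_eligible(torch.bfloat16, 1024, 8192, "row", 2, True, True))  # True
    # Not eligible: in_features is not a multiple of 128.
    print(is_fusion_eligible(torch.bfloat16, 1000, 8192, "row", 2, True, True))  # False
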
tests/unittest/_torch/multi_gpu/test_linear.py (3 additions, 3 deletions)

@@ -420,16 +420,16 @@ def fp4_row_linear_allreduce_run_single_rank(func, tp_size, seq_len,
         func(tp_size, local_rank, seq_len, output_size, hidden_size, dtype,
              output_ref, x_sf_global, w_sf_global, x_fp4s, w_fp4, x_sf_blocks,
              w_sf_block_unswizzled)
-    except Exception:
-        traceback.print_exc()
+    except Exception as e:
+        print(f"Error: {e}")
         raise
     return True


 @skip_pre_blackwell
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason='needs 2 GPUs to run this test')
-@pytest.mark.parametrize("seq_len", [256], ids=lambda x: f"seqlen:{x}")
+@pytest.mark.parametrize("seq_len", [256, 400], ids=lambda x: f"seqlen:{x}")
 @pytest.mark.parametrize("output_size", [32, 64], ids=lambda x: f"output:{x}")
 @pytest.mark.parametrize("hidden_size", [128, 256], ids=lambda x: f"hidden:{x}")
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16],

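The test tweak swaps `traceback.print_exc()` for a one-line message but keeps the bare `raise`, so the exception, with its original traceback, still propagates out of the per-rank worker. A tiny sketch, unrelated to the repo, of why that works:

    # Minimal illustration only: a bare `raise` inside `except` re-raises the
    # same exception object, so the caller still sees the original traceback.
    def worker():
        try:
            raise ValueError("bad scale factor")  # stand-in for the rank body
        except Exception as e:
            print(f"Error: {e}")                  # short, rank-local message
            raise                                 # propagate unchanged

    try:
        worker()
    except ValueError as err:
        assert str(err) == "bad scale factor"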