@@ -309,6 +309,12 @@ def apply(self, module: Linear, input: torch.Tensor,
               bias: Optional[torch.Tensor], *args, **kwargs):
         raise NotImplementedError
 
+    @abstractmethod
+    def apply_linear_allreduce(self, module: Linear, input: torch.Tensor,
+                               bias: Optional[torch.Tensor], tp_rank: int,
+                               tp_group: List[int], *args, **kwargs):
+        raise NotImplementedError
+
     def load_weights(self,
                      module: Linear,
                      weights: List[Dict],
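For context on the contract this new abstract hook establishes: compute the local partial GEMM against this rank's weight shard, then sum the partial outputs across the tensor-parallel group. The sketch below is illustrative only and is not part of the diff (the PR leaves the unquantized path unimplemented); it assumes torch.distributed is already initialized, and a real implementation would cache the process group instead of rebuilding it per call.

from typing import List, Optional

import torch
import torch.distributed as dist
import torch.nn.functional as F


def reference_apply_linear_allreduce(module, input: torch.Tensor,
                                     bias: Optional[torch.Tensor], tp_rank: int,
                                     tp_group: List[int]) -> torch.Tensor:
    # Partial GEMM on this rank's shard of the weight. Callers are expected to pass
    # bias only on tp_rank 0, so it contributes exactly once to the reduced result.
    output = F.linear(input, module.weight, bias)
    # Sum the partial outputs across the tensor-parallel ranks.
    group = dist.new_group(ranks=tp_group)  # illustrative; cache this in practice
    dist.all_reduce(output, op=dist.ReduceOp.SUM, group=group)
    return output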
@@ -395,6 +401,11 @@ def apply(self, module: Linear, input: torch.Tensor,
         output = F.linear(input, module.weight, bias)
         return output
 
+    def apply_linear_allreduce(self, module: Linear, input: torch.Tensor,
+                               bias: Optional[torch.Tensor], tp_rank: int,
+                               tp_group: List[int], *args, **kwargs):
+        raise NotImplementedError
+
     def load_weights_vanilla(self,
                              module: Linear,
                              weights: List[Dict],
@@ -511,6 +522,11 @@ def apply(self, module: Linear, input: torch.Tensor,
             output = output + bias
         return output
 
+    def apply_linear_allreduce(self, module: Linear, input: torch.Tensor,
+                               bias: Optional[torch.Tensor], tp_rank: int,
+                               tp_group: List[int], *args, **kwargs):
+        raise NotImplementedError
+
     def load_kv_scales(self, weights: List[Dict]):
         k_scale, v_scale = [], []
         for w in weights:
@@ -655,6 +671,11 @@ def apply(self, module: Linear, input: torch.Tensor,
             output = output + bias
         return output
 
+    def apply_linear_allreduce(self, module: Linear, input: torch.Tensor,
+                               bias: Optional[torch.Tensor], tp_rank: int,
+                               tp_group: List[int], *args, **kwargs):
+        raise NotImplementedError
+
     def _get_scale_name(self, weights: List[Dict]):
         # `weight_scale_inv` for DS recipe and `weight_scale` for ModelOpt recipe.
         # Actually they hold identical values of data_amax / 448.
@@ -769,6 +790,11 @@ def apply(self, module: Linear, input: torch.Tensor,
             output = output + bias
         return output
 
+    def apply_linear_allreduce(self, module: Linear, input: torch.Tensor,
+                               bias: Optional[torch.Tensor], tp_rank: int,
+                               tp_group: List[int], *args, **kwargs):
+        raise NotImplementedError
+
     def _get_scale_name(self, weights: List[Dict]):
         # `weight_scale_inv` for DS recipe and `weight_scale` for ModelOpt recipe.
         # Actually they hold identical values of data_amax / 448.
@@ -950,6 +976,28 @@ def apply(self, module: Linear, input: torch.Tensor,
             output = output + bias
         return output
 
+    def apply_linear_allreduce(self, module: Linear, input: torch.Tensor,
+                               bias: Optional[torch.Tensor], tp_rank: int,
+                               tp_group: List[int], *args, **kwargs):
+        if isinstance(input, Fp4QuantizedTensor):
+            act_fp4, act_sf = input.fp4_tensor, input.scaling_factor
+        elif isinstance(input, tuple):
+            act_fp4, act_sf = input
+        else:
+            act_fp4, act_sf = torch.ops.trtllm.fp4_quantize(
+                input, module.input_scale, module.scaling_vector_size, False)
+
+        output = torch.ops.trtllm.nvfp4_gemm_allreduce(
+            act_fp4, module.weight, act_sf, module.weight_scale, module.alpha,
+            module.dtype, tp_rank, tp_group)
+        # Take the dim of out_features if padded. Make sure the output is contiguous.
+        if output.shape[-1] > module.out_features:
+            output = output[..., :module.out_features].contiguous()
+
+        if bias is not None:
+            output = output + bias
+        return output
+
     def load_kv_scales(self, weights: List[Dict]):
         k_scale, v_scale = [], []
         for w in weights:
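The slice at the end of the new NVFP4 method exists because the fused GEMM may return an output whose last dimension is padded past module.out_features. The snippet below only demonstrates that trimming step in isolation; the concrete sizes and the alignment value are assumptions, not values taken from the kernel.

import torch

# Hypothetical sizes: a layer with out_features=3000 whose fused GEMM output came
# back padded to 3072 columns (assumed alignment).
out_features = 3000
output = torch.randn(8, 3072)

# Trim the padded columns and make the result contiguous, as in the diff above.
if output.shape[-1] > out_features:
    output = output[..., :out_features].contiguous()

assert output.shape == (8, 3000) and output.is_contiguous()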
@@ -1189,6 +1237,11 @@ def apply(self, module: Linear, input: torch.Tensor,
             output = output + bias
         return output
 
+    def apply_linear_allreduce(self, module: Linear, input: torch.Tensor,
+                               bias: Optional[torch.Tensor], tp_rank: int,
+                               tp_group: List[int], *args, **kwargs):
+        raise NotImplementedError
+
     def load_weight_scales(
         self,
         weights: List[Dict],
@@ -1357,6 +1410,11 @@ def apply(self, module: Linear, input: torch.Tensor,
             output = output + bias
         return output
 
+    def apply_linear_allreduce(self, module: Linear, input: torch.Tensor,
+                               bias: Optional[torch.Tensor], tp_rank: int,
+                               tp_group: List[int], *args, **kwargs):
+        raise NotImplementedError
+
     def load_weight_scales(self,
                            weights: List[Dict],
                            tp_size: int = 1,
@@ -2016,6 +2079,7 @@ def __init__(
         use_cublaslt_nvfp4_blockscaling_mm: bool = False,
         disable_deep_gemm: bool = False,
         fused_weight_shard_indices_mapping: Optional[dict] = None,
+        use_fused_gemm_allreduce: bool = False,
     ):
         from ..distributed import AllReduce
 
@@ -2065,6 +2129,8 @@ def __init__(
         self.reduce_output = reduce_output
         self.use_custom_cublas_mm = use_custom_cublas_mm
         self.lora = lora
+        self.use_fused_gemm_allreduce = use_fused_gemm_allreduce and self.quant_config.layer_quant_mode.has_nvfp4(
+        )
 
         self.enable_cuda_core = False
         if torch.cuda.is_available():
@@ -2164,6 +2230,20 @@ def apply_linear(self,
                 output = output + lora_result
         return output
 
+    def apply_linear_allreduce(self,
+                               input,
+                               bias,
+                               lora_params: Optional[dict] = None,
+                               layer_idx: Optional[int] = None):
+        output = self.quant_method.apply_linear_allreduce(
+            self, input, bias, self.tp_rank, self.mapping.tp_group)
+
+        if self.lora is not None and bool(lora_params):
+            lora_result = self.lora(input, lora_params, layer_idx)
+            if lora_result is not None:
+                output = output + lora_result
+        return output
+
     def _maybe_fuse_bias_into_allreduce(
         self,
         bias: Optional[torch.Tensor],
@@ -2190,16 +2270,23 @@ def forward(
         layer_idx: Optional[int] = None,
     ) -> torch.Tensor:
         if self.tp_mode == TensorParallelMode.ROW:
+            use_fused_gemm_allreduce = self.use_fused_gemm_allreduce and (
+                all_reduce_params is None or
+                (all_reduce_params.enable_allreduce
+                 and all_reduce_params.fusion_op == AllReduceFusionOp.NONE))
             bias = None if (self.tp_rank > 0) else self.bias
             if self.reduce_output:
-                fuse_bias = self._maybe_fuse_bias_into_allreduce(
-                    bias, all_reduce_params)
-                bias = None if fuse_bias else bias
-                output = self.apply_linear(input, bias, lora_params, layer_idx)
-                output = self.all_reduce(
-                    output,
-                    all_reduce_params=all_reduce_params,
-                )
+                if use_fused_gemm_allreduce:
+                    output = self.apply_linear_allreduce(
+                        input, bias, lora_params, layer_idx)
+                else:
+                    fuse_bias = self._maybe_fuse_bias_into_allreduce(
+                        bias, all_reduce_params)
+                    bias = None if fuse_bias else bias
+                    output = self.apply_linear(input, bias, lora_params,
+                                               layer_idx)
+                    output = self.all_reduce(
+                        output, all_reduce_params=all_reduce_params)
             else:
                 output = self.apply_linear(input, bias, lora_params, layer_idx)
         elif self.tp_mode == TensorParallelMode.COLUMN:
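To make the new gating condition in forward() easy to check in isolation, here is a hedged, self-contained sketch. AllReduceFusionOp and AllReduceParams below are minimal stand-ins that mirror only the fields the predicate reads; they are not the real classes from the library.

from dataclasses import dataclass
from enum import Enum
from typing import Optional


class AllReduceFusionOp(Enum):
    NONE = 0
    RESIDUAL_RMS_NORM = 1


@dataclass
class AllReduceParams:
    enable_allreduce: bool = True
    fusion_op: AllReduceFusionOp = AllReduceFusionOp.NONE


def should_use_fused_gemm_allreduce(layer_enabled: bool,
                                    params: Optional[AllReduceParams]) -> bool:
    # Fused GEMM+all-reduce is taken only when the layer opted in (NVFP4 weights) and
    # the caller either passed no AllReduceParams or asked for a plain, unfused
    # all-reduce; any requested fusion falls back to GEMM followed by AllReduce.
    return layer_enabled and (params is None or
                              (params.enable_allreduce
                               and params.fusion_op == AllReduceFusionOp.NONE))


assert should_use_fused_gemm_allreduce(True, None)
assert not should_use_fused_gemm_allreduce(
    True, AllReduceParams(fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM))
assert not should_use_fused_gemm_allreduce(False, None)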