|
38 | 38 |
|
39 | 39 | from typing import Optional, Tuple, Union
|
40 | 40 |
|
| 41 | +from torch import nn |
41 | 42 | import torch
|
42 | 43 | import torch.distributed as dist
|
43 | 44 | import torch_npu
|
@@ -182,6 +183,69 @@ def apply_impl(
|
182 | 183 | return output, output_bias
|
183 | 184 |
|
184 | 185 |
|
| 186 | +class Flashcomm2MergedColumnParallelOp(CustomColumnParallelOp): |
| 187 | + |
| 188 | + def apply_impl( |
| 189 | + self, input_: torch.Tensor |
| 190 | + ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]: |
| 191 | + """Linear layer with column parallelism. |
| 192 | +
|
| 193 | + Implements several optimizations for dense models, such as FlashComm and |
| 194 | + communication-computation fusion. |
| 195 | + """ |
| 196 | + |
| 197 | + bias = self.bias if not self.skip_bias_add else None |
| 198 | + |
| 199 | + # Matrix multiply. |
| 200 | + assert self.quant_method is not None |
| 201 | + |
| 202 | + input_ = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(input_, True) |
| 203 | + output_parallel = self.quant_method.apply(self.layer, input_, bias) |
| 204 | + |
| 205 | + if self.gather_output: |
| 206 | + # All-gather across the partitions. |
| 207 | + output = self.comm_group.all_gather(output_parallel) |
| 208 | + else: |
| 209 | + output = output_parallel |
| 210 | + output_bias = self.bias if self.skip_bias_add else None |
| 211 | + return output, output_bias |
| 212 | + |
| 213 | + |
| 214 | +class Flashcomm2QKVParallelOp(CustomColumnParallelOp): |
| 215 | + |
| 216 | + def __init__(self, layer, prefix): |
| 217 | + super().__init__(layer) |
| 218 | + self.prefix = prefix |
| 219 | + |
| 220 | + def apply_impl( |
| 221 | + self, input_: torch.Tensor |
| 222 | + ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]: |
| 223 | + """Linear layer with column parallelism. |
| 224 | +
|
| 226 | + Implements several optimizations for dense models, such as FlashComm and |
| 226 | + communication-computation fusion. |
| 227 | + """ |
| 228 | + |
| 229 | + bias = self.bias if not self.skip_bias_add else None |
| 230 | + |
| 231 | + # Matrix multiply. |
| 232 | + assert self.quant_method is not None |
| 233 | + |
| 234 | + layer_num = self.prefix.split('.')[2] |
| 235 | + |
| 236 | + input_ = torch.ops.vllm.maybe_all_gather_and_maybe_unpad( |
| 237 | + input_, layer_num != '0') |
| 238 | + output_parallel = self.quant_method.apply(self.layer, input_, bias) |
| 239 | + |
| 240 | + if self.gather_output: |
| 241 | + # All-gather across the partitions. |
| 242 | + output = self.comm_group.all_gather(output_parallel) |
| 243 | + else: |
| 244 | + output = output_parallel |
| 245 | + output_bias = self.bias if self.skip_bias_add else None |
| 246 | + return output, output_bias |
| 247 | + |
| 248 | + |
185 | 249 | class SequenceQKVParallelOp(CustomColumnParallelOp):
|
186 | 250 |
|
187 | 251 | def __init__(self, layer, prefix):
|
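A minimal, single-process sketch of the column-parallel pattern that the two Flashcomm2 ops above implement. It is illustrative only: plain torch.cat stands in for the TP all-gather performed by torch.ops.vllm.maybe_all_gather_and_maybe_unpad, the tensor sizes are invented, and only rank 0's weight shard is used. The QKV variant in the diff differs only in that it skips the gather for the first layer (it passes layer_num != '0' as the flag), reading that index from self.prefix.split('.')[2], which presumably assumes prefixes of the form model.layers.<N>.self_attn.qkv_proj.

import torch

tp_size, tokens_per_rank, hidden_size, out_features = 2, 4, 8, 16

# Each TP rank enters with activations scattered along the token dimension.
scattered = [torch.randn(tokens_per_rank, hidden_size) for _ in range(tp_size)]

# Column parallelism: every rank owns a slice of the output columns.
weight_shards = [torch.randn(hidden_size, out_features // tp_size)
                 for _ in range(tp_size)]

# Step 1: gather the tokens so the local GEMM sees the full sequence
# (the role played by maybe_all_gather_and_maybe_unpad in the diff above).
full_input = torch.cat(scattered, dim=0)         # [tp_size * tokens_per_rank, hidden_size]

# Step 2: local matmul against this rank's column shard (rank 0 shown).
output_parallel = full_input @ weight_shards[0]  # [tp_size * tokens_per_rank, out_features // tp_size]

print(output_parallel.shape)                     # torch.Size([8, 8])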
@@ -330,6 +394,10 @@ def apply_impl(
|
330 | 394 | self,
|
331 | 395 | input_: torch.Tensor,
|
332 | 396 | ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
|
| 397 | + """Linear layer for Flashcomm2. |
| 398 | + Input.shape = [batchsize*seqlength, headnum*headdim/TP] |
| 399 | + Output.shape = [(batchsize*seqlength+padsize)/TP, hiddensize] |
| 400 | + """ |
333 | 401 | # Handle input parallelism - split or use as-is
|
334 | 402 | if self.input_is_parallel:
|
335 | 403 | input_parallel = input_
|
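To make the shape docstring above concrete, here is the arithmetic with invented numbers (TP size, token count, and head geometry are illustrative, not taken from this change):

tp_size = 4
num_tokens = 1002                     # batch_size * seq_length
num_heads, head_dim = 32, 128
hidden_size = num_heads * head_dim    # 4096

# Per-rank input to the row-parallel projection.
input_shape = (num_tokens, num_heads * head_dim // tp_size)       # (1002, 1024)

# Tokens are padded up to a multiple of TP so the reduce-scatter
# along dim 0 splits evenly across ranks.
pad_size = (-num_tokens) % tp_size                                # 2
output_shape = ((num_tokens + pad_size) // tp_size, hidden_size)  # (251, 4096)

print(input_shape, output_shape)

After the projection and reduce-scatter, each rank holds 1/TP of the padded tokens but the full hidden size, which is what the docstring's output shape expresses.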
@@ -392,11 +460,6 @@ def apply_impl(
|
392 | 460 | output = self.comm_group.reduce_scatter(output_parallel, dim=0)
|
393 | 461 | else:
|
394 | 462 | output = output_parallel
|
395 | | - if not forward_context.flashcomm1_ds_prefill: |
|
396 | | -     # flashcomm1 not enabled |
|
397 | | -     output = get_tp_group().all_gather(output, 0) |
|
398 | | -     if num_padding_tokens > 0: |
|
399 | | -         output = output[:-num_padding_tokens] |
400 | 463 |
|
401 | 464 | output_bias = self.bias if self.skip_bias_add else None
|
402 | 465 |
|
@@ -510,22 +573,27 @@ def update_attrs(self):
|
510 | 573 | def get_column_parallel_op(
|
511 | 574 | disable_tp, prefix, layer
|
512 | 575 | ) -> Tuple[Optional[Union[MLPColumnParallelOp, SequenceMergedColumnParallelOp,
|
513 | | - SequenceQKVParallelOp]], int, int]: |
| 576 | + SequenceQKVParallelOp, Flashcomm2MergedColumnParallelOp, Flashcomm2QKVParallelOp]], int, int]: |
514 | 577 | if disable_tp:
|
515 | 578 | return None, 0, 1
|
516 | 579 |
|
517 | 580 | custom_op: Optional[Union[
|
518 | 581 | MLPColumnParallelOp,
|
519 | 582 | SequenceMergedColumnParallelOp,
|
520 | 583 | SequenceQKVParallelOp,
|
| 584 | + Flashcomm2MergedColumnParallelOp, |
| 585 | + Flashcomm2QKVParallelOp |
521 | 586 | ]] = None
|
522 | 587 | if "gate_up_proj" in prefix and mlp_tp_enable():
|
523 | 588 | custom_op = MLPColumnParallelOp(layer)
|
524 | 589 | elif "gate_up_proj" in prefix and enable_sp():
|
525 | 590 | custom_op = SequenceMergedColumnParallelOp(layer)
|
| 591 | + elif "gate_up_proj" in prefix and flashcomm2_enable(): |
| 592 | + custom_op = Flashcomm2MergedColumnParallelOp(layer) |
526 | 593 | elif enable_sp():
|
527 | 594 | custom_op = SequenceQKVParallelOp(layer, prefix)
|
528 | | - |
| 595 | + elif flashcomm2_enable(): |
| 596 | + custom_op = Flashcomm2QKVParallelOp(layer, prefix) |
529 | 597 | if custom_op is not None:
|
530 | 598 | return custom_op, custom_op.tp_rank, custom_op.tp_size
|
531 | 599 |
|
|