Skip to content

Commit ac397ae

Browse files
authored
[Diffusion][Perf] Remove Redundant Communication Cost by Refining SP Hook Design (#1275)
Signed-off-by: mxuax <mxuax@connect.ust.hk>
Signed-off-by: XU Mingshi <91017482+mxuax@users.noreply.github.com>
1 parent d3ea943 commit ac397ae

File tree

3 files changed

+52
-5
lines changed

3 files changed

+52
-5
lines changed

vllm_omni/diffusion/attention/layer.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,11 @@
1414
from vllm_omni.diffusion.attention.backends.abstract import AttentionMetadata
1515
from vllm_omni.diffusion.attention.backends.sdpa import SDPABackend
1616
from vllm_omni.diffusion.attention.parallel import build_parallel_attention_strategy
17+
from vllm_omni.diffusion.attention.parallel.base import NoParallelAttention
1718
from vllm_omni.diffusion.attention.parallel.ring import RingParallelAttention
1819
from vllm_omni.diffusion.attention.selector import get_attn_backend
1920
from vllm_omni.diffusion.distributed.parallel_state import get_sp_group
20-
from vllm_omni.diffusion.forward_context import get_forward_context
21+
from vllm_omni.diffusion.forward_context import get_forward_context, is_forward_context_available
2122

2223
logger = init_logger(__name__)
2324

@@ -87,6 +88,21 @@ def __init__(
8788
gather_idx=gather_idx,
8889
use_sync=use_sync,
8990
)
91+
# Fallback strategy when SP is not active (outside sharded regions)
92+
self._no_parallel_strategy = NoParallelAttention()
93+
94+
def _get_active_parallel_strategy(self):
95+
"""Get the parallel strategy based on current SP active state.
96+
97+
Returns NoParallelAttention if we're outside an SP sharded region
98+
(e.g., in noise_refiner/context_refiner before unified_prepare in Z-Image).
99+
This avoids unnecessary SP communication for layers not covered by _sp_plan.
100+
"""
101+
if is_forward_context_available():
102+
ctx = get_forward_context()
103+
if not ctx.sp_active:
104+
return self._no_parallel_strategy
105+
return self.parallel_strategy
90106

91107
def forward(
92108
self,
@@ -95,20 +111,23 @@ def forward(
95111
value: torch.Tensor,
96112
attn_metadata: AttentionMetadata = None,
97113
) -> torch.Tensor:
114+
# Get the appropriate parallel strategy based on SP active state
115+
strategy = self._get_active_parallel_strategy()
116+
98117
# 1. Prepare inputs (Communication / Resharding)
99118
# For Ulysses: AllToAll Q/K/V; Slicing joint_q/k/v
100119
# For Ring: Concat joint_q
101-
query, key, value, attn_metadata, ctx = self.parallel_strategy.pre_attention(query, key, value, attn_metadata)
120+
query, key, value, attn_metadata, ctx = strategy.pre_attention(query, key, value, attn_metadata)
102121

103122
# 2. Kernel Execution (Computation)
104-
if self.use_ring:
123+
if self.use_ring and strategy is not self._no_parallel_strategy:
105124
out = self._run_ring_attention(query, key, value, attn_metadata)
106125
else:
107126
out = self._run_local_attention(query, key, value, attn_metadata)
108127

109128
# 3. Post-processing (Reverse Communication)
110129
# For Ulysses: AllToAll Output, and AllGather Joint Output
111-
out = self.parallel_strategy.post_attention(out, ctx)
130+
out = strategy.post_attention(out, ctx)
112131

113132
return out
114133

vllm_omni/diffusion/forward_context.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,16 @@ class ForwardContext:
3131
# Original sequence length before padding (for removing padding in gather)
3232
sp_original_seq_len: int | None = None
3333

34+
# SP active scope tracking
35+
# Tracks the depth of SP sharding - incremented on shard, decremented on gather
36+
# Used by attention layers to determine if SP communication should be enabled
37+
_sp_shard_depth: int = 0
38+
39+
@property
40+
def sp_active(self) -> bool:
41+
"""Returns True when inside an SP sharded region (between shard and gather)."""
42+
return self._sp_shard_depth > 0
43+
3444
def __post_init__(self):
3545
pass
3646

vllm_omni/diffusion/hooks/sequence_parallel.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,8 @@ def pre_forward(self, module: nn.Module, *args: Any, **kwargs: Any) -> tuple[tup
235235

236236
def post_forward(self, module: nn.Module, output: Any) -> Any:
237237
"""Shard outputs for split_output=True entries."""
238+
from vllm_omni.diffusion.forward_context import get_forward_context, is_forward_context_available
239+
238240
is_tensor = isinstance(output, torch.Tensor)
239241
is_tensor_list = isinstance(output, (list, tuple)) and all(isinstance(x, torch.Tensor) for x in output)
240242

@@ -243,6 +245,7 @@ def post_forward(self, module: nn.Module, output: Any) -> Any:
243245
return output
244246

245247
output_list = [output] if is_tensor else list(output)
248+
actually_sharded = False
246249

247250
for index, spm in self.metadata.items():
248251
if not isinstance(index, int):
@@ -252,7 +255,14 @@ def post_forward(self, module: nn.Module, output: Any) -> Any:
252255
if index >= len(output_list):
253256
raise ValueError(f"Index {index} out of bounds for output of length {len(output_list)}.")
254257

255-
output_list[index] = self._prepare_sp_input(output_list[index], spm, self._last_args, self._last_kwargs)
258+
original = output_list[index]
259+
output_list[index] = self._prepare_sp_input(original, spm, self._last_args, self._last_kwargs)
260+
if output_list[index] is not original:
261+
actually_sharded = True
262+
263+
# Mark SP as active only if at least one tensor was actually sharded
264+
if actually_sharded and is_forward_context_available():
265+
get_forward_context()._sp_shard_depth += 1
256266

257267
return output_list[0] if is_tensor else type(output)(output_list)
258268

@@ -445,6 +455,8 @@ def post_forward(self, module: nn.Module, output: Any) -> Any:
445455
ctx = get_forward_context()
446456
original_seq_len = ctx.sp_original_seq_len
447457

458+
actually_gathered = False
459+
448460
for i, spm in enumerate(self.metadata):
449461
if spm is None:
450462
continue
@@ -465,6 +477,12 @@ def post_forward(self, module: nn.Module, output: Any) -> Any:
465477
logger.debug(f"Removed padding: gathered shape {gathered.shape} (original_seq_len={original_seq_len})")
466478

467479
output[i] = gathered
480+
actually_gathered = True
481+
482+
# Mark SP as inactive only if at least one tensor was actually gathered
483+
if actually_gathered and is_forward_context_available():
484+
ctx = get_forward_context()
485+
ctx._sp_shard_depth = max(0, ctx._sp_shard_depth - 1)
468486

469487
return output[0] if is_tensor else type(output)(output)
470488

0 commit comments

Comments (0)