 # SPDX-License-Identifier: Apache-2.0
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/distributed/device_communicators/base_device_communicator.py

-from typing import Optional
+from typing import Any, Optional, Tuple

 import torch
 import torch.distributed as dist
-from torch.distributed import ProcessGroup
+from torch import Tensor
+from torch.distributed import ProcessGroup, ReduceOp
+
+
+class DistributedAutograd:
+    """Collection of autograd functions for distributed operations.
+
+    This class provides custom autograd functions for distributed operations like all_reduce,
+    all_gather, and all_to_all. Each operation is implemented as a static inner class with
+    proper forward and backward implementations.
+    """
+
+    class AllReduce(torch.autograd.Function):
+        """Differentiable all_reduce operation.
+
+        The gradient of all_reduce is another all_reduce operation since the operation
+        combines values from all ranks equally.
+        """
+
+        @staticmethod
+        def forward(ctx: Any,
+                    group: ProcessGroup,
+                    input_: Tensor,
+                    op: Optional[dist.ReduceOp] = None) -> Tensor:
+            ctx.group = group
+            ctx.op = op
+            output = input_.clone()
+            dist.all_reduce(output, group=group, op=op)
+            return output
+
+        @staticmethod
+        def backward(ctx: Any,
+                     grad_output: Tensor) -> Tuple[None, Tensor, None]:
+            grad_output = grad_output.clone()
+            dist.all_reduce(grad_output, group=ctx.group, op=ctx.op)
+            return None, grad_output, None
+
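
(Editor's note, not part of the diff: a minimal usage sketch of the AllReduce wrapper above, assuming torch.distributed has already been initialized on every rank. It shows that the incoming gradient is itself all-reduced, so each rank ends up with x.grad filled with world_size.)

# Illustrative sketch only (assumes dist.init_process_group(...) has run on every rank).
def _demo_all_reduce(group):
    x = torch.ones(4, requires_grad=True)
    y = DistributedAutograd.AllReduce.apply(group, x, dist.ReduceOp.SUM)
    y.sum().backward()
    # grad_output is all ones; backward all-reduces it, so x.grad == world_size everywhere.
    assert torch.all(x.grad == dist.get_world_size(group))
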
+    class AllGather(torch.autograd.Function):
+        """Differentiable all_gather operation.
+
+        The operation gathers tensors from all ranks and concatenates them along a specified dimension.
+        The backward pass uses reduce_scatter to efficiently distribute gradients back to source ranks.
+        """
+
+        @staticmethod
+        def forward(ctx: Any, group: ProcessGroup, input_: Tensor,
+                    world_size: int, dim: int) -> Tensor:
+            ctx.group = group
+            ctx.world_size = world_size
+            ctx.dim = dim
+            ctx.input_shape = input_.shape
+
+            input_size = input_.size()
+            output_size = (input_size[0] * world_size, ) + input_size[1:]
+            output_tensor = torch.empty(output_size,
+                                        dtype=input_.dtype,
+                                        device=input_.device)
+
+            dist.all_gather_into_tensor(output_tensor, input_, group=group)
+
+            output_tensor = output_tensor.reshape((world_size, ) + input_size)
+            output_tensor = output_tensor.movedim(0, dim)
+            output_tensor = output_tensor.reshape(input_size[:dim] +
+                                                  (world_size *
+                                                   input_size[dim], ) +
+                                                  input_size[dim + 1:])
+            return output_tensor
+
+        @staticmethod
+        def backward(ctx: Any,
+                     grad_output: Tensor) -> Tuple[None, Tensor, None, None]:
+            # Split the gradient tensor along the gathered dimension
+            dim_size = grad_output.size(ctx.dim) // ctx.world_size
+            grad_chunks = grad_output.reshape(grad_output.shape[:ctx.dim] +
+                                              (ctx.world_size, dim_size) +
+                                              grad_output.shape[ctx.dim + 1:])
+            grad_chunks = grad_chunks.movedim(ctx.dim, 0)
+
+            # Each rank only needs its corresponding gradient
+            grad_input = torch.empty(ctx.input_shape,
+                                     dtype=grad_output.dtype,
+                                     device=grad_output.device)
+            dist.reduce_scatter_tensor(grad_input,
+                                       grad_chunks.contiguous(),
+                                       group=ctx.group)
+
+            return None, grad_input, None, None
+
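
(Editor's note, not part of the diff: the reshape/movedim bookkeeping in AllGather.forward can be checked without any communication. The sketch below, with hypothetical shapes, fakes the concat-style output of all_gather_into_tensor on a single process and applies the same rearrangement.)

# Illustrative sketch only: emulate a 4-rank gather along dim=1 locally.
world_size, dim = 4, 1
chunks = [torch.full((2, 3), float(r)) for r in range(world_size)]  # one tensor per "rank"
input_size = chunks[0].size()
gathered = torch.cat(chunks, dim=0)  # what all_gather_into_tensor would produce: (8, 3)
out = gathered.reshape((world_size, ) + input_size).movedim(0, dim)
out = out.reshape(input_size[:dim] + (world_size * input_size[dim], ) +
                  input_size[dim + 1:])
assert out.shape == (2, 12)  # the gathered dimension grew by world_size
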
+    class AllToAll4D(torch.autograd.Function):
+        """Differentiable all_to_all operation specialized for 4D tensors.
+
+        This operation is particularly useful for attention operations where we need to
+        redistribute data across ranks for efficient parallel processing.
+
+        The operation supports two modes:
+        1. scatter_dim=2, gather_dim=1: Used for redistributing attention heads
+        2. scatter_dim=1, gather_dim=2: Used for redistributing sequence dimensions
+        """
+
+        @staticmethod
+        def forward(ctx: Any, group: ProcessGroup, input_: Tensor,
+                    world_size: int, scatter_dim: int,
+                    gather_dim: int) -> Tensor:
+            ctx.group = group
+            ctx.world_size = world_size
+            ctx.scatter_dim = scatter_dim
+            ctx.gather_dim = gather_dim
+
+            if world_size == 1:
+                return input_
+
+            assert input_.dim(
+            ) == 4, f"input must be 4D tensor, got {input_.dim()} and shape {input_.shape}"
+
+            if scatter_dim == 2 and gather_dim == 1:
+                bs, shard_seqlen, hc, hs = input_.shape
+                seqlen = shard_seqlen * world_size
+                shard_hc = hc // world_size
+
+                input_t = input_.reshape(bs, shard_seqlen, world_size, shard_hc,
+                                         hs).transpose(0, 2).contiguous()
+                output = torch.empty_like(input_t)
+
+                dist.all_to_all_single(output, input_t, group=group)
+
+                output = output.reshape(seqlen, bs, shard_hc,
+                                        hs).transpose(0, 1).contiguous()
+                output = output.reshape(bs, seqlen, shard_hc, hs)
+
+                return output
+            elif scatter_dim == 1 and gather_dim == 2:
+                bs, seqlen, shard_hc, hs = input_.shape
+                hc = shard_hc * world_size
+                shard_seqlen = seqlen // world_size
+
+                input_t = input_.reshape(bs, world_size, shard_seqlen, shard_hc,
+                                         hs)
+                input_t = input_t.transpose(0, 3).transpose(0, 1).contiguous()
+                input_t = input_t.reshape(world_size, shard_hc, shard_seqlen,
+                                          bs, hs)
+
+                output = torch.empty_like(input_t)
+                dist.all_to_all_single(output, input_t, group=group)
+
+                output = output.reshape(hc, shard_seqlen, bs, hs)
+                output = output.transpose(0, 2).contiguous()
+                output = output.reshape(bs, shard_seqlen, hc, hs)
+
+                return output
+            else:
+                raise RuntimeError(
+                    f"Invalid scatter_dim={scatter_dim}, gather_dim={gather_dim}. "
+                    f"Only (scatter_dim=2, gather_dim=1) and (scatter_dim=1, gather_dim=2) are supported."
+                )
+
+        @staticmethod
+        def backward(
+                ctx: Any,
+                grad_output: Tensor) -> Tuple[None, Tensor, None, None, None]:
+            if ctx.world_size == 1:
+                return None, grad_output, None, None, None
+
+            # For backward pass, we swap scatter_dim and gather_dim
+            output = DistributedAutograd.AllToAll4D.apply(
+                ctx.group, grad_output, ctx.world_size, ctx.gather_dim,
+                ctx.scatter_dim)
+            return None, output, None, None, None


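(Editor's note, not part of the diff: a shape walk-through with hypothetical sizes that makes the two modes in the AllToAll4D docstring concrete, and shows why backward can simply re-apply the op with scatter_dim and gather_dim swapped.)

# Illustrative sketch only: P = world_size, hc = attention heads, hs = head size.
bs, seqlen, hc, hs, P = 2, 16, 8, 64, 4
# scatter_dim=2, gather_dim=1: (bs, seqlen/P, hc, hs) -> (bs, seqlen, hc/P, hs)
mode1_in, mode1_out = (bs, seqlen // P, hc, hs), (bs, seqlen, hc // P, hs)
# scatter_dim=1, gather_dim=2: (bs, seqlen, hc/P, hs) -> (bs, seqlen/P, hc, hs)
mode2_in, mode2_out = (bs, seqlen, hc // P, hs), (bs, seqlen // P, hc, hs)
# The two modes are inverses of each other, so backward just swaps the dims.
assert mode1_in == mode2_out and mode1_out == mode2_in
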
 class DeviceCommunicatorBase:
     """
-    Base class for device-specific communicator.
+    Base class for device-specific communicator with autograd support.
     It can use the `cpu_group` to initialize the communicator.
     If the device has PyTorch integration (PyTorch can recognize its
     communication backend), the `device_group` will also be given.
@@ -33,35 +199,28 @@ def __init__(self,
         self.rank_in_group = dist.get_group_rank(self.cpu_group,
                                                  self.global_rank)

-    def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
-        dist.all_reduce(input_, group=self.device_group)
-        return input_
+    def all_reduce(self,
+                   input_: torch.Tensor,
+                   op: Optional[dist.ReduceOp] = ReduceOp.SUM) -> torch.Tensor:
+        """Performs an all_reduce operation with gradient support."""
+        return DistributedAutograd.AllReduce.apply(self.device_group, input_,
+                                                   op)

     def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        """Performs an all_gather operation with gradient support."""
         if dim < 0:
-            # Convert negative dim to positive.
             dim += input_.dim()
-        input_size = input_.size()
-        # NOTE: we have to use concat-style all-gather here,
-        # stack-style all-gather has compatibility issues with
-        # torch.compile . see https://github.com/pytorch/pytorch/issues/138795
-        output_size = (input_size[0] * self.world_size, ) + input_size[1:]
-        # Allocate output tensor.
-        output_tensor = torch.empty(output_size,
-                                    dtype=input_.dtype,
-                                    device=input_.device)
-        # All-gather.
-        dist.all_gather_into_tensor(output_tensor,
-                                    input_,
-                                    group=self.device_group)
-        # Reshape
-        output_tensor = output_tensor.reshape((self.world_size, ) + input_size)
-        output_tensor = output_tensor.movedim(0, dim)
-        output_tensor = output_tensor.reshape(input_size[:dim] +
-                                              (self.world_size *
-                                               input_size[dim], ) +
-                                              input_size[dim + 1:])
-        return output_tensor
+        return DistributedAutograd.AllGather.apply(self.device_group, input_,
+                                                   self.world_size, dim)
+
+    def all_to_all_4D(self,
+                      input_: torch.Tensor,
+                      scatter_dim: int = 2,
+                      gather_dim: int = 1) -> torch.Tensor:
+        """Performs a 4D all-to-all operation with gradient support."""
+        return DistributedAutograd.AllToAll4D.apply(self.device_group, input_,
+                                                    self.world_size,
+                                                    scatter_dim, gather_dim)

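(Editor's note, not part of the diff: a hedged usage sketch of the new wrappers in a sequence-parallel attention step. Here `communicator` is any initialized DeviceCommunicatorBase subclass and `attention_fn` is a hypothetical stand-in for the real attention call; gradients flow through both all_to_all_4D calls.)

# Illustrative sketch only.
def _sp_attention(communicator, qkv, attention_fn):
    # qkv: (bs, seqlen/P, hc, hs) with the sequence dimension sharded across ranks.
    full_seq = communicator.all_to_all_4D(qkv, scatter_dim=2, gather_dim=1)
    out = attention_fn(full_seq)  # (bs, seqlen, hc/P, hs)
    # Restore the original sharded layout: scatter sequence, gather heads.
    return communicator.all_to_all_4D(out, scatter_dim=1, gather_dim=2)
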
     def gather(self,
                input_: torch.Tensor,
@@ -95,81 +254,6 @@ def gather(self,
             output_tensor = None
         return output_tensor

-    def all_to_all_4D(self,
-                      input_: torch.Tensor,
-                      scatter_dim: int = 2,
-                      gather_dim: int = 1) -> torch.Tensor:
-        """Specialized all-to-all operation for 4D tensors (e.g., for QKV matrices).
-
-        Args:
-            input_ (torch.Tensor): 4D input tensor to be scattered and gathered.
-            scatter_dim (int, optional): Dimension along which to scatter. Defaults to 2.
-            gather_dim (int, optional): Dimension along which to gather. Defaults to 1.
-
-        Returns:
-            torch.Tensor: Output tensor after all-to-all operation.
-        """
-        # Bypass the function if we are using only 1 GPU.
-        if self.world_size == 1:
-            return input_
-
-        assert input_.dim(
-        ) == 4, f"input must be 4D tensor, got {input_.dim()} and shape {input_.shape}"
-
-        if scatter_dim == 2 and gather_dim == 1:
-            # input: (bs, seqlen/P, hc, hs) output: (bs, seqlen, hc/P, hs)
-            bs, shard_seqlen, hc, hs = input_.shape
-            seqlen = shard_seqlen * self.world_size
-            shard_hc = hc // self.world_size
-
-            # Reshape and transpose for scattering
-            input_t = (input_.reshape(bs, shard_seqlen, self.world_size,
-                                      shard_hc, hs).transpose(0,
-                                                              2).contiguous())
-
-            output = torch.empty_like(input_t)
-
-            torch.distributed.all_to_all_single(output,
-                                                input_t,
-                                                group=self.device_group)
-            torch.cuda.synchronize()
-
-            # Reshape and transpose back
-            output = output.reshape(seqlen, bs, shard_hc,
-                                    hs).transpose(0, 1).contiguous().reshape(
-                                        bs, seqlen, shard_hc, hs)
-
-            return output
-
-        elif scatter_dim == 1 and gather_dim == 2:
-            # input: (bs, seqlen, hc/P, hs) output: (bs, seqlen/P, hc, hs)
-            bs, seqlen, shard_hc, hs = input_.shape
-            hc = shard_hc * self.world_size
-            shard_seqlen = seqlen // self.world_size
-
-            # Reshape and transpose for scattering
-            input_t = (input_.reshape(bs, self.world_size, shard_seqlen,
-                                      shard_hc, hs).transpose(0, 3).transpose(
-                                          0, 1).contiguous().reshape(
-                                              self.world_size, shard_hc,
-                                              shard_seqlen, bs, hs))
-            output = torch.empty_like(input_t)
-
-            torch.distributed.all_to_all_single(output,
-                                                input_t,
-                                                group=self.device_group)
-            torch.cuda.synchronize()
-
-            # Reshape and transpose back
-            output = output.reshape(hc, shard_seqlen, bs,
-                                    hs).transpose(0, 2).contiguous().reshape(
-                                        bs, shard_seqlen, hc, hs)
-
-            return output
-        else:
-            raise RuntimeError(
-                "scatter_dim must be 1 or 2 and gather_dim must be 1 or 2")
-
     def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None:
         """Sends a tensor to the destination rank in a non-blocking way"""
         """NOTE: `dst` is the local rank of the destination rank."""