@@ -233,6 +233,21 @@ def post_forward(self, module, output):
         return output[0] if is_tensor else tuple(output)
 
 
+class AllGatherFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, tensor, dim, group):
+        ctx.dim = dim
+        ctx.group = group
+        ctx.world_size = torch.distributed.get_world_size(group)
+        ctx.rank = torch.distributed.get_rank(group)
+        return funcol.all_gather_tensor(tensor, dim, group=group)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        grad_chunks = torch.chunk(grad_output, ctx.world_size, dim=ctx.dim)
+        return grad_chunks[ctx.rank], None, None
+
+
 class EquipartitionSharder:
     @classmethod
     def shard(cls, tensor: torch.Tensor, dim: int, mesh: torch.distributed.device_mesh.DeviceMesh) -> torch.Tensor:
@@ -246,7 +261,7 @@ def shard(cls, tensor: torch.Tensor, dim: int, mesh: torch.distributed.device_me
     @classmethod
     def unshard(cls, tensor: torch.Tensor, dim: int, mesh: torch.distributed.device_mesh.DeviceMesh) -> torch.Tensor:
         tensor = tensor.contiguous()
-        tensor = funcol.all_gather_tensor(tensor, dim, group=mesh.get_group())
+        tensor = AllGatherFunction.apply(tensor, dim, mesh.get_group())
         return tensor
 
 