@@ -751,6 +751,11 @@ def forward(
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
+        attn_mask: Optional[torch.Tensor],
+        dropout_p: float,
+        scale: Optional[float],
+        is_causal: bool,
+        enable_gqa: bool,
         return_lse: bool,
         op: torch.autograd.Function,
     ):
@@ -773,7 +778,7 @@ def forward(
                 value = kv[key.numel() :].reshape_as(value)
                 next_rank = (next_rank + 1) % world_size

-            out, lse = op.apply(query, key, value, None, 0.0, None, False, False, True)
+            out, lse = op.apply(query, key, value, attn_mask, dropout_p, scale, is_causal, enable_gqa, True)

             if parallel_config.convert_to_fp32:
                 out = out.to(torch.float32)
@@ -806,6 +811,11 @@ def forward(
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
+        attn_mask: Optional[torch.Tensor],
+        dropout_p: float,
+        scale: Optional[float],
+        is_causal: bool,
+        enable_gqa: bool,
         return_lse: bool,
         op: torch.autograd.Function,
     ):
@@ -823,7 +833,7 @@ def forward(
         query, key, value = (funcol.all_to_all_single(x, None, None, group=group).wait() for x in (query, key, value))
         query, key, value = (x.flatten(0, 1).permute(1, 0, 2, 3).contiguous() for x in (query, key, value))

-        out = op.apply(query, key, value, None, 0.0, None, False, False, return_lse)
+        out = op.apply(query, key, value, attn_mask, dropout_p, scale, is_causal, enable_gqa, return_lse)
         if return_lse:
             out, lse, *_ = out

@@ -872,9 +882,13 @@ def _templated_context_parallel_attention(
     parallel_config = _AttentionBackendRegistry._parallel_config
     # TODO: add support for unified attention with ring/ulysses degree both being > 1
     if parallel_config.ring_degree > 1:
-        return TemplatedRingAttention.apply(query, key, value, return_lse, op)
+        return TemplatedRingAttention.apply(
+            query, key, value, attn_mask, dropout_p, scale, is_causal, enable_gqa, return_lse, op
+        )
     elif parallel_config.ulysses_degree > 1:
-        return TemplatedUlyssesAttention.apply(query, key, value, return_lse, op)
+        return TemplatedUlyssesAttention.apply(
+            query, key, value, attn_mask, dropout_p, scale, is_causal, enable_gqa, return_lse, op
+        )
     else:
         return op.apply(query, key, value, attn_mask, dropout_p, scale, is_causal, enable_gqa, return_lse)

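For context, the five arguments threaded through TemplatedRingAttention and TemplatedUlyssesAttention above mirror the keyword set of torch.nn.functional.scaled_dot_product_attention. The snippet below is a minimal, standalone sketch of that keyword set on a single device, not part of the patched code: the tensor shapes are made up for illustration, only the scaled_dot_product_attention call itself is a real PyTorch API, and enable_gqa assumes a recent PyTorch release (2.5 or newer).

import torch
import torch.nn.functional as F

# Illustrative shapes only: 8 query heads sharing 2 key/value heads (grouped-query attention).
batch, q_heads, kv_heads, seq_len, head_dim = 2, 8, 2, 16, 64
query = torch.randn(batch, q_heads, seq_len, head_dim)
key = torch.randn(batch, kv_heads, seq_len, head_dim)
value = torch.randn(batch, kv_heads, seq_len, head_dim)

# The same options the patched forward() now passes through to op.apply(...):
# attn_mask, dropout_p, scale, is_causal, enable_gqa.
out = F.scaled_dot_product_attention(
    query,
    key,
    value,
    attn_mask=None,   # optional boolean or additive mask; left None here because is_causal=True
    dropout_p=0.0,    # attention dropout probability
    is_causal=True,   # apply a causal mask instead of an explicit attn_mask
    scale=None,       # None means the default 1/sqrt(head_dim) scaling
    enable_gqa=True,  # allow q_heads to be a multiple of kv_heads
)
print(out.shape)  # torch.Size([2, 8, 16, 64])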