Commit 088d909

apply review suggestions

1 parent 2aec312 commit 088d909

File tree

7 files changed, +112 -69 lines changed

src/diffusers/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -189,6 +189,7 @@
         "CogView4Transformer2DModel",
         "ConsisIDTransformer3DModel",
         "ConsistencyDecoderVAE",
+        "ContextParallelConfig",
         "ControlNetModel",
         "ControlNetUnionModel",
         "ControlNetXSAdapter",
@@ -862,6 +863,7 @@
         CogView4Transformer2DModel,
         ConsisIDTransformer3DModel,
         ConsistencyDecoderVAE,
+        ContextParallelConfig,
         ControlNetModel,
         ControlNetUnionModel,
         ControlNetXSAdapter,
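Editor's note: the effect of these export additions is that the new config class becomes importable from the package root. A minimal sketch, assuming a diffusers build containing this commit:

import torch  # noqa: F401  (diffusers requires torch for this import path)
from diffusers import ContextParallelConfig

# ring_degree/ulysses_degree default to 1 via __post_init__ when left unset.
config = ContextParallelConfig(ring_degree=2)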

src/diffusers/hooks/context_parallel.py

Lines changed: 10 additions & 6 deletions
@@ -20,10 +20,10 @@
 import torch.distributed._functional_collectives as funcol

 from ..models._modeling_parallel import (
+    ContextParallelConfig,
     ContextParallelInput,
     ContextParallelModelPlan,
     ContextParallelOutput,
-    _InternalParallelConfig,
 )
 from ..utils import get_logger
 from ..utils.torch_utils import unwrap_module
@@ -74,11 +74,11 @@ def _get_parameter_from_args_kwargs(self, identifier: str, args=(), kwargs=None)

 def apply_context_parallel(
     module: torch.nn.Module,
-    parallel_config: _InternalParallelConfig,
+    parallel_config: ContextParallelConfig,
     plan: Dict[str, ContextParallelModelPlan],
 ) -> None:
     """Apply context parallel on a model."""
-    logger.debug(f"Applying context parallel with CP mesh: {parallel_config.cp_mesh} and plan: {plan}")
+    logger.debug(f"Applying context parallel with CP mesh: {parallel_config._mesh} and plan: {plan}")

     for module_id, cp_model_plan in plan.items():
         submodule = _get_submodule_by_name(module, module_id)
@@ -122,7 +122,7 @@ def remove_context_parallel(module: torch.nn.Module, plan: Dict[str, ContextPara


 class ContextParallelSplitHook(ModelHook):
-    def __init__(self, metadata: ContextParallelModelPlan, parallel_config: _InternalParallelConfig) -> None:
+    def __init__(self, metadata: ContextParallelModelPlan, parallel_config: ContextParallelConfig) -> None:
         super().__init__()
         self.metadata = metadata
         self.parallel_config = parallel_config
@@ -207,7 +207,7 @@ def _prepare_cp_input(self, x: torch.Tensor, cp_input: ContextParallelInput) ->


 class ContextParallelGatherHook(ModelHook):
-    def __init__(self, metadata: ContextParallelModelPlan, parallel_config: _InternalParallelConfig) -> None:
+    def __init__(self, metadata: ContextParallelModelPlan, parallel_config: ContextParallelConfig) -> None:
         super().__init__()
         self.metadata = metadata
         self.parallel_config = parallel_config
@@ -251,7 +251,11 @@ def backward(ctx, grad_output):
 class EquipartitionSharder:
     @classmethod
     def shard(cls, tensor: torch.Tensor, dim: int, mesh: torch.distributed.device_mesh.DeviceMesh) -> torch.Tensor:
-        assert tensor.size()[dim] % mesh.size() == 0
+        # NOTE: the following assertion does not have to be true in general. We simply enforce it for now
+        # because the alternate case has not yet been tested/required for any model.
+        assert tensor.size()[dim] % mesh.size() == 0, (
+            "Tensor size along dimension to be sharded must be divisible by mesh size"
+        )

         # The following is not fullgraph compatible with Dynamo (fails in DeviceMesh.get_rank)
         # return tensor.chunk(mesh.size(), dim=dim)[mesh.get_rank()]
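Editor's note: for intuition about the assertion added above, equipartition sharding hands each rank one equal-sized chunk along the split dimension, which is only well defined when that dimension divides evenly by the mesh size. A single-process sketch of the same slicing logic, with hypothetical sizes standing in for `mesh.size()` and `mesh.get_rank()`:

import torch

# A 4096-token sequence split across a hypothetical mesh of 4 ranks.
tensor = torch.randn(1, 4096, 64)  # (batch, seq_len, head_dim)
mesh_size, rank = 4, 1             # stand-ins for mesh.size() / mesh.get_rank()

assert tensor.size()[1] % mesh_size == 0  # the precondition the sharder enforces
local_shard = tensor.chunk(mesh_size, dim=1)[rank]
print(local_shard.shape)  # torch.Size([1, 1024, 64])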

src/diffusers/models/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -25,7 +25,7 @@
 _import_structure = {}

 if is_torch_available():
-    _import_structure["_modeling_parallel"] = ["ParallelConfig", "enable_parallelism"]
+    _import_structure["_modeling_parallel"] = ["ContextParallelConfig", "ParallelConfig", "enable_parallelism"]
     _import_structure["adapter"] = ["MultiAdapter", "T2IAdapter"]
     _import_structure["attention_dispatch"] = ["AttentionBackendName", "attention_backend"]
     _import_structure["auto_model"] = ["AutoModel"]
@@ -120,7 +120,7 @@

 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     if is_torch_available():
-        from ._modeling_parallel import ParallelConfig, enable_parallelism
+        from ._modeling_parallel import ContextParallelConfig, ParallelConfig, enable_parallelism
         from .adapter import MultiAdapter, T2IAdapter
         from .attention_dispatch import AttentionBackendName, attention_backend
         from .auto_model import AutoModel

src/diffusers/models/_modeling_parallel.py

Lines changed: 56 additions & 25 deletions
@@ -40,52 +40,82 @@


 @dataclass
-class ParallelConfig:
+class ContextParallelConfig:
+    # Number of GPUs to use for ring attention within a context parallel region
     ring_degree: Optional[int] = None
+    # Number of GPUs to use for ulysses attention within a context parallel region
     ulysses_degree: Optional[int] = None
-
-    def __post_init__(self):
-        if self.ring_degree is None:
-            self.ring_degree = 1
-        if self.ulysses_degree is None:
-            self.ulysses_degree = 1
-
-
-@dataclass
-class _InternalParallelConfig:
-    rank: int
-    world_size: int
-    ring_degree: int
-    ulysses_degree: int
-    device: torch.device
-    cp_mesh: torch.distributed.device_mesh.DeviceMesh
-
     # Whether to convert output and LSE to float32 for ring attention numerical stability
     convert_to_fp32: bool = True
     # TODO: support alltoall
     rotate_method: Literal["allgather", "alltoall"] = "allgather"

+    _rank: int = None
+    _world_size: int = None
+    _device: torch.device = None
+    _mesh: torch.distributed.device_mesh.DeviceMesh = None
     _flattened_mesh: torch.distributed.device_mesh.DeviceMesh = None
     _ring_mesh: torch.distributed.device_mesh.DeviceMesh = None
     _ulysses_mesh: torch.distributed.device_mesh.DeviceMesh = None
     _ring_local_rank: int = None
     _ulysses_local_rank: int = None

     def __post_init__(self):
+        if self.ring_degree is None:
+            self.ring_degree = 1
+        if self.ulysses_degree is None:
+            self.ulysses_degree = 1
+
+    def setup(self, rank: int, world_size: int, device: torch.device, mesh: torch.distributed.device_mesh.DeviceMesh):
+        self._rank = rank
+        self._world_size = world_size
+        self._device = device
+        self._mesh = mesh
+        if self.ring_degree is None:
+            self.ring_degree = 1
+        if self.ulysses_degree is None:
+            self.ulysses_degree = 1
         if self.rotate_method != "allgather":
-            raise ValueError(f"Only rotate_method='allgather' is supported for now, but got {self.rotate_method}.")
+            raise NotImplementedError(
+                f"Only rotate_method='allgather' is supported for now, but got {self.rotate_method}."
+            )
         if self._flattened_mesh is None:
-            self._flattened_mesh = self.cp_mesh._flatten()
+            self._flattened_mesh = self._mesh._flatten()
         if self._ring_mesh is None:
-            self._ring_mesh = self.cp_mesh["ring"]
+            self._ring_mesh = self._mesh["ring"]
         if self._ulysses_mesh is None:
-            self._ulysses_mesh = self.cp_mesh["ulysses"]
+            self._ulysses_mesh = self._mesh["ulysses"]
         if self._ring_local_rank is None:
             self._ring_local_rank = self._ring_mesh.get_local_rank()
         if self._ulysses_local_rank is None:
             self._ulysses_local_rank = self._ulysses_mesh.get_local_rank()


+@dataclass
+class ParallelConfig:
+    context_parallel_config: Optional[ContextParallelConfig] = None
+
+    _rank: int = None
+    _world_size: int = None
+    _device: torch.device = None
+    _cp_mesh: torch.distributed.device_mesh.DeviceMesh = None
+
+    def setup(
+        self,
+        rank: int,
+        world_size: int,
+        device: torch.device,
+        *,
+        cp_mesh: Optional[torch.distributed.device_mesh.DeviceMesh] = None,
+    ):
+        self._rank = rank
+        self._world_size = world_size
+        self._device = device
+        self._cp_mesh = cp_mesh
+        if self.context_parallel_config is not None:
+            self.context_parallel_config.setup(rank, world_size, device, cp_mesh)
+
+
 @dataclass(frozen=True)
 class ContextParallelInput:
     split_dim: int
@@ -145,7 +175,7 @@ def enable_parallelism(model_or_pipeline: Union["DiffusionPipeline", "ModelMixin
         parallelized_components = [
             (name, component)
             for name, component in model_or_pipeline.components.items()
-            if getattr(component, "_internal_parallel_config", None) is not None
+            if getattr(component, "_parallel_config", None) is not None
         ]
         if len(parallelized_components) > 1:
             raise ValueError(
@@ -158,7 +188,7 @@ def enable_parallelism(model_or_pipeline: Union["DiffusionPipeline", "ModelMixin
             )
         _, model_or_pipeline = parallelized_components[0]
     elif isinstance(model_or_pipeline, ModelMixin):
-        if getattr(model_or_pipeline, "_internal_parallel_config", None) is None:
+        if getattr(model_or_pipeline, "_parallel_config", None) is None:
             raise ValueError(
                 "The model is not parallelized. Please ensure the model is parallelized with `.parallelize()` before using this context manager."
             )
@@ -167,8 +197,9 @@ def enable_parallelism(model_or_pipeline: Union["DiffusionPipeline", "ModelMixin
             f"Expected a `DiffusionPipeline` or `ModelMixin` instance, but got {type(model_or_pipeline)}. Please provide a valid model or pipeline."
         )

+    # TODO: needs to be updated when more parallelism strategies are supported
     old_parallel_config = _AttentionBackendRegistry._parallel_config
-    _AttentionBackendRegistry._parallel_config = model_or_pipeline._internal_parallel_config
+    _AttentionBackendRegistry._parallel_config = model_or_pipeline._parallel_config.context_parallel_config

     yield
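Editor's note: taken together, this refactor replaces the flat `_InternalParallelConfig` with a public, nested config: `ParallelConfig` owns an optional `ContextParallelConfig`, and `setup()` threads rank/world-size/device/mesh state down into it. A minimal sketch of how the two dataclasses compose (plain dataclass construction; `setup()` is normally invoked for you by `ModelMixin.enable_parallelism`):

from diffusers.models import ContextParallelConfig, ParallelConfig

# Ring attention over 2 GPUs; ulysses_degree is left unset and defaults to 1 in __post_init__.
cp_config = ContextParallelConfig(ring_degree=2)
config = ParallelConfig(context_parallel_config=cp_config)

# enable_parallelism() later calls config.setup(rank, world_size, device, cp_mesh=...),
# which forwards the distributed state into the nested context-parallel config.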
src/diffusers/models/attention_dispatch.py

Lines changed: 3 additions & 3 deletions
@@ -40,7 +40,7 @@


 if TYPE_CHECKING:
-    from ._modeling_parallel import _InternalParallelConfig
+    from ._modeling_parallel import ContextParallelConfig

 _REQUIRED_FLASH_VERSION = "2.6.3"
 _REQUIRED_SAGE_VERSION = "2.1.1"
@@ -193,7 +193,7 @@ class _AttentionBackendRegistry:
     _supports_context_parallel = {}
     _active_backend = AttentionBackendName(DIFFUSERS_ATTN_BACKEND)
     _checks_enabled = DIFFUSERS_ATTN_CHECKS
-    _parallel_config: Optional["_InternalParallelConfig"] = None
+    _parallel_config: Optional["ContextParallelConfig"] = None

     @classmethod
     def register(
@@ -729,7 +729,7 @@ def _flash_attention_forward_op(

     # flash-attn only returns LSE if dropout_p > 0. So, we need to workaround.
     parallel_config = _AttentionBackendRegistry._parallel_config
-    if grad_enabled or (parallel_config is not None and parallel_config.world_size > 1):
+    if grad_enabled or (parallel_config is not None and parallel_config._world_size > 1):
         dropout_p = dropout_p if dropout_p > 0 else 1e-30

     with torch.set_grad_enabled(grad_enabled):
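Editor's note: the renamed `_world_size` attribute feeds the same LSE workaround as before; flash-attn only returns the log-sum-exp when dropout is active, and ring attention needs the LSE to combine partial attention results across ranks. A standalone sketch of the gate, with stand-in values for the registry state:

grad_enabled = False   # stand-in for the autograd flag at the call site
world_size = 2         # stand-in for parallel_config._world_size after this change
dropout_p = 0.0

# Force flash-attn to return the LSE by using a negligible, effectively-zero dropout.
if grad_enabled or world_size > 1:
    dropout_p = dropout_p if dropout_p > 0 else 1e-30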

src/diffusers/models/modeling_utils.py

Lines changed: 38 additions & 32 deletions
@@ -65,7 +65,7 @@
     populate_model_card,
 )
 from ..utils.torch_utils import empty_device_cache
-from ._modeling_parallel import ContextParallelModelPlan, ParallelConfig, _InternalParallelConfig
+from ._modeling_parallel import ContextParallelConfig, ContextParallelModelPlan, ParallelConfig
 from .model_loading_utils import (
     _caching_allocator_warmup,
     _determine_device_map,
@@ -249,7 +249,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
     _skip_layerwise_casting_patterns = None
     _supports_group_offloading = True
     _repeated_blocks = []
-    _internal_parallel_config = None
+    _parallel_config = None
     _cp_plan = None

     def __init__(self):
@@ -1481,55 +1481,61 @@ def compile_repeated_blocks(self, *args, **kwargs):
             f"Regional compilation failed because {repeated_blocks} classes are not found in the model. "
         )

-    def enable_parallelism(self, *, config: ParallelConfig, cp_plan: Optional[Dict[str, ContextParallelModelPlan]] = None):
+    def enable_parallelism(
+        self,
+        *,
+        config: Union[ParallelConfig, ContextParallelConfig],
+        cp_plan: Optional[Dict[str, ContextParallelModelPlan]] = None,
+    ):
         from ..hooks.context_parallel import apply_context_parallel

         logger.warning(
-            "`parallelize` is an experimental feature. The API may change in the future and breaking changes may be introduced at any time without warning."
+            "`enable_parallelism` is an experimental feature. The API may change in the future and breaking changes may be introduced at any time without warning."
         )

+        if isinstance(config, ContextParallelConfig):
+            config = ParallelConfig(context_parallel_config=config)
+
         if not torch.distributed.is_initialized():
-            raise RuntimeError("torch.distributed must be initialized before calling `parallelize`.")
-        if config.ring_degree < 1 or config.ulysses_degree < 1:
-            raise ValueError("`ring_degree` and `ulysses_degree` must be greater than or equal to 1.")
-        if config.ring_degree > 1 and config.ulysses_degree > 1:
-            raise ValueError(
-                "Unified Ulysses-Ring attention is not yet supported. Please set either `ring_degree` or `ulysses_degree` to 1."
-            )
+            raise RuntimeError("torch.distributed must be initialized before calling `enable_parallelism`.")

         rank = torch.distributed.get_rank()
         world_size = torch.distributed.get_world_size()
-
-        if config.ring_degree * config.ulysses_degree > world_size:
-            raise ValueError(
-                f"The product of `ring_degree` ({config.ring_degree}) and `ulysses_degree` ({config.ulysses_degree}) must not exceed the world size ({world_size})."
-            )
-
         device_type = torch._C._get_accelerator().type
         device_module = torch.get_device_module(device_type)
         device = torch.device(device_type, rank % device_module.device_count())

-        cp_mesh = torch.distributed.device_mesh.init_device_mesh(
-            device_type=device_type,
-            mesh_shape=(config.ring_degree, config.ulysses_degree),
-            mesh_dim_names=("ring", "ulysses"),
-        )
-        parallel_config = _InternalParallelConfig(
-            rank=rank,
-            world_size=world_size,
-            ring_degree=config.ring_degree,
-            ulysses_degree=config.ulysses_degree,
-            device=device,
-            cp_mesh=cp_mesh,
-        )
+        cp_mesh = None
+        if config.context_parallel_config is not None:
+            cp_config = config.context_parallel_config
+            if cp_config.ring_degree < 1 or cp_config.ulysses_degree < 1:
+                raise ValueError("`ring_degree` and `ulysses_degree` must be greater than or equal to 1.")
+            if cp_config.ring_degree > 1 and cp_config.ulysses_degree > 1:
+                raise ValueError(
+                    "Unified Ulysses-Ring attention is not yet supported. Please set either `ring_degree` or `ulysses_degree` to 1."
+                )
+            if cp_config.ring_degree * cp_config.ulysses_degree > world_size:
+                raise ValueError(
+                    f"The product of `ring_degree` ({cp_config.ring_degree}) and `ulysses_degree` ({cp_config.ulysses_degree}) must not exceed the world size ({world_size})."
+                )
+            cp_mesh = torch.distributed.device_mesh.init_device_mesh(
+                device_type=device_type,
+                mesh_shape=(cp_config.ring_degree, cp_config.ulysses_degree),
+                mesh_dim_names=("ring", "ulysses"),
+            )
+
+        config.setup(rank, world_size, device, cp_mesh=cp_mesh)
+
         if cp_plan is None and self._cp_plan is None:
             raise ValueError(
                 "`cp_plan` must be provided either as an argument or set in the model's `_cp_plan` attribute."
             )
         cp_plan = cp_plan if cp_plan is not None else self._cp_plan

-        apply_context_parallel(self, parallel_config, cp_plan)
-        self._internal_parallel_config = parallel_config
+        if config.context_parallel_config is not None:
+            apply_context_parallel(self, config.context_parallel_config, cp_plan)
+
+        self._parallel_config = config

     @classmethod
     def _load_pretrained_model(
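Editor's note: because of the `isinstance` shortcut added above, callers can now hand `enable_parallelism` a bare `ContextParallelConfig`. A minimal usage sketch under stated assumptions: launched with e.g. `torchrun --nproc-per-node 2` on an NCCL-capable setup, and using a hypothetical checkpoint (any `ModelMixin` that defines `_cp_plan`, or receives an explicit `cp_plan`, works the same way):

import torch.distributed as dist
from diffusers import AutoModel, ContextParallelConfig

# enable_parallelism() requires an initialized process group.
dist.init_process_group(backend="nccl")

# Hypothetical checkpoint path; substitute a real transformer that defines _cp_plan.
model = AutoModel.from_pretrained("some/checkpoint", subfolder="transformer")
model.enable_parallelism(config=ContextParallelConfig(ring_degree=2))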

src/diffusers/utils/dummy_pt_objects.py

Lines changed: 1 addition & 1 deletion
@@ -1053,7 +1053,7 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])


-class ParallelConfig(metaclass=DummyObject):
+class ContextParallelConfig(metaclass=DummyObject):
     _backends = ["torch"]

     def __init__(self, *args, **kwargs):