Commit 005e51b

Merge branch 'feature/group-offload-pinning' of https://github.com/bconstantine/diffusers into feature/group-offload-pinning

2 parents 7a2f3f0 + 3ef894d

19 files changed: +2102 -115 lines

docs/source/en/training/distributed_inference.md

Lines changed: 67 additions & 24 deletions
@@ -237,6 +237,8 @@ By selectively loading and unloading the models you need at a given stage and sh
 
 Use [`~ModelMixin.set_attention_backend`] to switch to a more optimized attention backend. Refer to this [table](../optimization/attention_backends#available-backends) for a complete list of available backends.
 
+Most attention backends are compatible with context parallelism. Open an [issue](https://github.com/huggingface/diffusers/issues/new) if a backend is not compatible.
+
 ### Ring Attention
 
 Key (K) and value (V) representations communicate between devices using [Ring Attention](https://huggingface.co/papers/2310.01889). This ensures each split sees every other token's K/V. Each GPU computes attention for its local K/V and passes it to the next GPU in the ring. No single GPU holds the full sequence, which reduces communication latency.
@@ -245,40 +247,60 @@ Pass a [`ContextParallelConfig`] to the `parallel_config` argument of the transf
 
 ```py
 import torch
-from diffusers import AutoModel, QwenImagePipeline, ContextParallelConfig
-
-try:
-    torch.distributed.init_process_group("nccl")
-    rank = torch.distributed.get_rank()
-    device = torch.device("cuda", rank % torch.cuda.device_count())
+from torch import distributed as dist
+from diffusers import DiffusionPipeline, ContextParallelConfig
+
+def setup_distributed():
+    if not dist.is_initialized():
+        dist.init_process_group(backend="nccl")
+    rank = dist.get_rank()
+    device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
-
-    transformer = AutoModel.from_pretrained("Qwen/Qwen-Image", subfolder="transformer", torch_dtype=torch.bfloat16, parallel_config=ContextParallelConfig(ring_degree=2))
-    pipeline = QwenImagePipeline.from_pretrained("Qwen/Qwen-Image", transformer=transformer, torch_dtype=torch.bfloat16, device_map="cuda")
-    pipeline.transformer.set_attention_backend("flash")
+    return device
+
+def main():
+    device = setup_distributed()
+    world_size = dist.get_world_size()
+
+    pipeline = DiffusionPipeline.from_pretrained(
+        "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16, device_map=device
+    )
+    pipeline.transformer.set_attention_backend("_native_cudnn")
+
+    cp_config = ContextParallelConfig(ring_degree=world_size)
+    pipeline.transformer.enable_parallelism(config=cp_config)
 
     prompt = """
     cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
    highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
     """
-
+
     # Must specify generator so all ranks start with same latents (or pass your own)
    generator = torch.Generator().manual_seed(42)
-    image = pipeline(prompt, num_inference_steps=50, generator=generator).images[0]
-
-    if rank == 0:
-        image.save("output.png")
-
-except Exception as e:
-    print(f"An error occurred: {e}")
-    torch.distributed.breakpoint()
-    raise
-
-finally:
-    if torch.distributed.is_initialized():
-        torch.distributed.destroy_process_group()
+    image = pipeline(
+        prompt,
+        guidance_scale=3.5,
+        num_inference_steps=50,
+        generator=generator,
+    ).images[0]
+
+    if dist.get_rank() == 0:
+        image.save("output.png")
+
+    if dist.is_initialized():
+        dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    main()
 ```
 
+The script above needs to be run with a PyTorch-compatible distributed launcher, such as [torchrun](https://docs.pytorch.org/docs/stable/elastic/run.html). Set `--nproc-per-node` to the number of available GPUs.
+
+```shell
+torchrun --nproc-per-node 2 above_script.py
+```
+
 ### Ulysses Attention
 
 [Ulysses Attention](https://huggingface.co/papers/2309.14509) splits a sequence across GPUs and performs an *all-to-all* communication (every device sends/receives data to every other device). Each GPU ends up with all tokens for only a subset of attention heads. Each GPU computes attention locally on all tokens for its head, then performs another all-to-all to regroup results by tokens for the next layer.
@@ -288,5 +310,26 @@ finally:
 Pass the [`ContextParallelConfig`] to [`~ModelMixin.enable_parallelism`].
 
 ```py
+# Depending on the number of GPUs available.
 pipeline.transformer.enable_parallelism(config=ContextParallelConfig(ulysses_degree=2))
+```
+
+### parallel_config
+
+Pass `parallel_config` during model initialization to enable context parallelism.
+
+```py
+CKPT_ID = "black-forest-labs/FLUX.1-dev"
+
+cp_config = ContextParallelConfig(ring_degree=2)
+transformer = AutoModel.from_pretrained(
+    CKPT_ID,
+    subfolder="transformer",
+    torch_dtype=torch.bfloat16,
+    parallel_config=cp_config
+)
+
+pipeline = DiffusionPipeline.from_pretrained(
+    CKPT_ID, transformer=transformer, torch_dtype=torch.bfloat16,
+).to(device)
 ```
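
For reference, the `parallel_config` snippet above is not self-contained (it omits the imports, the process-group setup, and the `device` it moves the pipeline to). A minimal sketch that fills those in, assuming the same NCCL/torchrun launch as the Ring Attention script:

```py
# Minimal sketch (assumptions: torchrun launch, NCCL backend, 2 GPUs as in the docs example).
import torch
from torch import distributed as dist
from diffusers import AutoModel, DiffusionPipeline, ContextParallelConfig

if not dist.is_initialized():
    dist.init_process_group(backend="nccl")
device = torch.device(f"cuda:{dist.get_rank()}")
torch.cuda.set_device(device)

CKPT_ID = "black-forest-labs/FLUX.1-dev"

# Passing parallel_config at load time enables context parallelism without a separate
# enable_parallelism() call.
cp_config = ContextParallelConfig(ring_degree=2)
transformer = AutoModel.from_pretrained(
    CKPT_ID,
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
    parallel_config=cp_config,
)

pipeline = DiffusionPipeline.from_pretrained(
    CKPT_ID, transformer=transformer, torch_dtype=torch.bfloat16
).to(device)
```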

src/diffusers/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -404,6 +404,8 @@
 else:
     _import_structure["modular_pipelines"].extend(
         [
+            "Flux2AutoBlocks",
+            "Flux2ModularPipeline",
             "FluxAutoBlocks",
             "FluxKontextAutoBlocks",
             "FluxKontextModularPipeline",
@@ -1111,6 +1113,8 @@
         from .utils.dummy_torch_and_transformers_objects import *  # noqa F403
     else:
         from .modular_pipelines import (
+            Flux2AutoBlocks,
+            Flux2ModularPipeline,
             FluxAutoBlocks,
             FluxKontextAutoBlocks,
             FluxKontextModularPipeline,

src/diffusers/hooks/group_offloading.py

Lines changed: 71 additions & 27 deletions
@@ -60,6 +60,8 @@ class GroupOffloadingConfig:
     offload_to_disk_path: Optional[str] = None
     stream: Optional[Union[torch.cuda.Stream, torch.Stream]] = None
     block_modules: Optional[List[str]] = None
+    exclude_kwargs: Optional[List[str]] = None
+    module_prefix: Optional[str] = ""
     pin_groups: Optional[Union[str, Callable]] = None
 
 
@@ -156,7 +158,7 @@ def _pinned_memory_tensors(self):
         finally:
             pinned_dict = None
 
-    def _transfer_tensor_to_device(self, tensor, source_tensor):
+    def _transfer_tensor_to_device(self, tensor, source_tensor, default_stream=None):
         tensor.data = source_tensor.to(self.onload_device, non_blocking=self.non_blocking)
         if self.record_stream:
             tensor.data.record_stream(self._torch_accelerator_module.current_stream())
@@ -211,6 +213,7 @@ def _onload_from_memory(self):
             self.stream.synchronize()
 
         context = nullcontext() if self.stream is None else self._torch_accelerator_module.stream(self.stream)
+        default_stream = self._torch_accelerator_module.current_stream() if self.stream is not None else None
         with context:
             if self.stream is not None:
                 with self._pinned_memory_tensors() as pinned_memory:
@@ -308,13 +311,16 @@ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
                 self.next_group.onload_()
 
             should_synchronize = (
-                not self.group.onload_self and self.group.stream is not None and not should_onload_next_group
+                not self.group.onload_self
+                and self.group.stream is not None
+                and not should_onload_next_group
+                and not self.group.record_stream
             )
             if should_synchronize:
                 self.group.stream.synchronize()
 
             args = send_to_device(args, self.group.onload_device, non_blocking=self.group.non_blocking)
-            kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
+            kwargs = self._send_kwargs_to_device(kwargs)
             return args, kwargs
 
         # If the current module is the onload_leader of the group, we onload the group if it is supposed
@@ -329,7 +335,10 @@ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
                 self.next_group.onload_()
 
             should_synchronize = (
-                not self.group.onload_self and self.group.stream is not None and not should_onload_next_group
+                not self.group.onload_self
+                and self.group.stream is not None
+                and not should_onload_next_group
+                and not self.group.record_stream
             )
             if should_synchronize:
                 # If this group didn't onload itself, it means it was asynchronously onloaded by the
@@ -341,7 +350,7 @@ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
                 self.group.stream.synchronize()
 
         args = send_to_device(args, self.group.onload_device, non_blocking=self.group.non_blocking)
-        kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
+        kwargs = self._send_kwargs_to_device(kwargs)
         return args, kwargs
 
     def post_forward(self, module: torch.nn.Module, output):
@@ -352,6 +361,28 @@ def post_forward(self, module: torch.nn.Module, output):
             self.group.offload_()
         return output
 
+    def _is_group_on_device(self) -> bool:
+        tensors = []
+        for group_module in self.group.modules:
+            tensors.extend(list(group_module.parameters()))
+            tensors.extend(list(group_module.buffers()))
+        tensors.extend(self.group.parameters)
+        tensors.extend(self.group.buffers)
+
+        return len(tensors) > 0 and all(t.device == self.group.onload_device for t in tensors)
+
+    def _send_kwargs_to_device(self, kwargs):
+        exclude_kwargs = self.config.exclude_kwargs or []
+        if exclude_kwargs:
+            moved_kwargs = send_to_device(
+                {k: v for k, v in kwargs.items() if k not in exclude_kwargs},
+                self.group.onload_device,
+                non_blocking=self.group.non_blocking,
+            )
+            kwargs.update(moved_kwargs)
+            return kwargs
+        return send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
+
     def _is_group_on_device(self) -> bool:
         tensors = []
         for group_module in self.group.modules:
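
The exclusion logic above exists because `send_to_device` rebuilds containers, which breaks object identity for mutable kwargs. A small sketch of the behavior it works around (assuming accelerate's `send_to_device`; not part of this codebase):

```py
import torch
from accelerate.utils import send_to_device

cache = []  # mutable state a caller expects to keep mutating across forward passes
kwargs = {"hidden_states": torch.ones(2, 4), "cache": cache}

moved = send_to_device(kwargs, "cpu")
print(moved["cache"] is cache)  # False: the list was rebuilt, identity is lost

# Excluding the key (as _send_kwargs_to_device does) keeps the original object.
moved = send_to_device({k: v for k, v in kwargs.items() if k != "cache"}, "cpu")
moved["cache"] = cache
print(moved["cache"] is cache)  # True
```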
@@ -524,6 +555,17 @@ def pre_forward(self, module, *args, **kwargs):
         return args, kwargs
 
 
+def _normalize_pin_groups(pin_groups: Optional[Union[str, Callable]]) -> Optional[Union[str, Callable]]:
+    if isinstance(pin_groups, str):
+        normalized_pin_groups = pin_groups.lower()
+        if normalized_pin_groups not in {"first_last", "all"}:
+            raise ValueError("`pin_groups` must be one of `None`, 'first_last', 'all', or a callable.")
+        return normalized_pin_groups
+    if pin_groups is not None and not callable(pin_groups):
+        raise ValueError("`pin_groups` must be one of `None`, 'first_last', 'all', or a callable.")
+    return pin_groups
+
+
 def apply_group_offloading(
     module: torch.nn.Module,
     onload_device: Union[str, torch.device],
@@ -536,6 +578,7 @@
     low_cpu_mem_usage: bool = False,
     offload_to_disk_path: Optional[str] = None,
     block_modules: Optional[List[str]] = None,
+    exclude_kwargs: Optional[List[str]] = None,
     pin_groups: Optional[Union[str, Callable]] = None,
 ) -> None:
     r"""
@@ -595,11 +638,15 @@
             option only matters when using streamed CPU offloading (i.e. `use_stream=True`). This can be useful when
             the CPU memory is a bottleneck but may counteract the benefits of using streams.
         block_modules (`List[str]`, *optional*):
-            List of module names that should be treated as blocks for offloading. If provided, only these modules
-            will be considered for block-level offloading. If not provided, the default block detection logic will be used.
+            List of module names that should be treated as blocks for offloading. If provided, only these modules will
+            be considered for block-level offloading. If not provided, the default block detection logic will be used.
+        exclude_kwargs (`List[str]`, *optional*):
+            List of kwarg keys that should not be processed by `send_to_device`. This is useful for mutable state like
+            caching lists that need to maintain their object identity across forward passes. If not provided, will be
+            inferred from the module's `_skip_keys` attribute if it exists.
         pin_groups (`"first_last"` or `"all"` or `Callable`, *optional*, defaults to `None`):
-            Optionally keeps selected groups on the onload device permanently. Use `"first_last"` to pin the first
-            and last parameter-bearing groups, `"all"` to pin every parameter-bearing group, or pass a callable that
+            Optionally keeps selected groups on the onload device permanently. Use `"first_last"` to pin the first and
+            last parameter-bearing groups, `"all"` to pin every parameter-bearing group, or pass a callable that
             receives a module (and optionally the module name and index) and returns `True` to pin that group.
 
     Example:
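
For illustration, a minimal usage sketch of the two options documented above, based on the `apply_group_offloading` signature shown in this diff; the excluded kwarg name (`cache_context`) and the checkpoint are placeholders:

```py
import torch
from diffusers import AutoModel
from diffusers.hooks import apply_group_offloading

transformer = AutoModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev", subfolder="transformer", torch_dtype=torch.bfloat16
)

apply_group_offloading(
    transformer,
    onload_device=torch.device("cuda"),
    offload_device=torch.device("cpu"),
    offload_type="block_level",
    num_blocks_per_group=2,
    use_stream=True,
    # Keep the first and last parameter-bearing groups resident on the onload device.
    pin_groups="first_last",
    # Placeholder kwarg key: left untouched by send_to_device so it keeps object identity.
    exclude_kwargs=["cache_context"],
)
```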
@@ -640,19 +687,14 @@
     if offload_type == GroupOffloadingType.BLOCK_LEVEL and num_blocks_per_group is None:
         raise ValueError("`num_blocks_per_group` must be provided when using `offload_type='block_level'.")
 
-    normalized_pin_groups = pin_groups
-    if isinstance(pin_groups, str):
-        normalized_pin_groups = pin_groups.lower()
-        if normalized_pin_groups not in {"first_last", "all"}:
-            raise ValueError("`pin_groups` must be one of `None`, 'first_last', 'all', or a callable.")
-    elif pin_groups is not None and not callable(pin_groups):
-        raise ValueError("`pin_groups` must be one of `None`, 'first_last', 'all', or a callable.")
+    pin_groups = _normalize_pin_groups(pin_groups)
+    _raise_error_if_accelerate_model_or_sequential_hook_present(module)
 
-    pin_groups = normalized_pin_groups
+    if block_modules is None:
+        block_modules = getattr(module, "_group_offload_block_modules", None)
 
-    _raise_error_if_accelerate_model_or_sequential_hook_present(module)
-    registry = HookRegistry.check_if_exists_or_initialize(module)
-    registry._group_offload_pin_groups = pin_groups
+    if exclude_kwargs is None:
+        exclude_kwargs = getattr(module, "_skip_keys", None)
 
     config = GroupOffloadingConfig(
         onload_device=onload_device,
@@ -665,6 +707,8 @@
         low_cpu_mem_usage=low_cpu_mem_usage,
         offload_to_disk_path=offload_to_disk_path,
         block_modules=block_modules,
+        exclude_kwargs=exclude_kwargs,
+        module_prefix="",
         pin_groups=pin_groups,
     )
     _apply_group_offloading(module, config)
@@ -706,7 +750,7 @@ def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOf
 
     for name, submodule in module.named_children():
         # Check if this is an explicitly defined block module
-        if name in block_modules:
+        if block_modules and name in block_modules:
             # Apply block offloading to the specified submodule
             _apply_block_offloading_to_submodule(
                 submodule, name, config, modules_with_group_offloading, matched_module_groups
@@ -802,7 +846,7 @@ def _apply_block_offloading_to_submodule(
         if len(current_modules) == 0:
             continue
 
-        group_id = f"{name}_{i}_{i + len(current_modules) - 1}"
+        group_id = f"{config.module_prefix}{name}_{i}_{i + len(current_modules) - 1}"
         group = ModuleGroup(
             modules=current_modules,
             offload_device=config.offload_device,
@@ -834,7 +878,7 @@
             record_stream=config.record_stream,
             low_cpu_mem_usage=config.low_cpu_mem_usage,
             onload_self=True,
-            group_id=name,
+            group_id=f"{config.module_prefix}{name}",
         )
         matched_module_groups.append(group)
         modules_with_group_offloading.add(name)
@@ -864,7 +908,7 @@ def _apply_group_offloading_leaf_level(module: torch.nn.Module, config: GroupOff
             record_stream=config.record_stream,
             low_cpu_mem_usage=config.low_cpu_mem_usage,
             onload_self=True,
-            group_id=name,
+            group_id=f"{config.module_prefix}{name}",
         )
         _apply_group_offloading_hook(submodule, group, config=config)
         modules_with_group_offloading.add(name)
@@ -911,7 +955,7 @@ def _apply_group_offloading_leaf_level(module: torch.nn.Module, config: GroupOff
             record_stream=config.record_stream,
             low_cpu_mem_usage=config.low_cpu_mem_usage,
             onload_self=True,
-            group_id=name,
+            group_id=f"{config.module_prefix}{name}",
         )
         _apply_group_offloading_hook(parent_module, group, config=config)
 
@@ -966,8 +1010,8 @@ def _apply_lazy_group_offloading_hook(
     if registry.get_hook(_GROUP_OFFLOADING) is None:
         hook = GroupOffloadingHook(group, config=config)
         registry.register_hook(hook, _GROUP_OFFLOADING)
-
-    lazy_prefetch_hook = LazyPrefetchGroupOffloadingHook(pin_groups = config.pin_groups)
+
+    lazy_prefetch_hook = LazyPrefetchGroupOffloadingHook(pin_groups=config.pin_groups)
     registry.register_hook(lazy_prefetch_hook, _LAZY_PREFETCH_GROUP_OFFLOADING)
 
 