Commit 793ddc2

Merge branch 'main' into v0.36.0-post
2 parents: 97da150 + 6708f5c

31 files changed (+4030, -118 lines)

docs/source/en/training/distributed_inference.md

Lines changed: 67 additions & 24 deletions

````diff
@@ -237,6 +237,8 @@ By selectively loading and unloading the models you need at a given stage and sh
 
 Use [`~ModelMixin.set_attention_backend`] to switch to a more optimized attention backend. Refer to this [table](../optimization/attention_backends#available-backends) for a complete list of available backends.
 
+Most attention backends are compatible with context parallelism. Open an [issue](https://github.com/huggingface/diffusers/issues/new) if a backend is not compatible.
+
 ### Ring Attention
 
 Key (K) and value (V) representations communicate between devices using [Ring Attention](https://huggingface.co/papers/2310.01889). This ensures each split sees every other token's K/V. Each GPU computes attention for its local K/V and passes it to the next GPU in the ring. No single GPU holds the full sequence, which reduces communication latency.
@@ -245,40 +247,60 @@ Pass a [`ContextParallelConfig`] to the `parallel_config` argument of the transf
 
 ```py
 import torch
-from diffusers import AutoModel, QwenImagePipeline, ContextParallelConfig
-
-try:
-    torch.distributed.init_process_group("nccl")
-    rank = torch.distributed.get_rank()
-    device = torch.device("cuda", rank % torch.cuda.device_count())
+from torch import distributed as dist
+from diffusers import DiffusionPipeline, ContextParallelConfig
+
+def setup_distributed():
+    if not dist.is_initialized():
+        dist.init_process_group(backend="nccl")
+    rank = dist.get_rank()
+    device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
-
-    transformer = AutoModel.from_pretrained("Qwen/Qwen-Image", subfolder="transformer", torch_dtype=torch.bfloat16, parallel_config=ContextParallelConfig(ring_degree=2))
-    pipeline = QwenImagePipeline.from_pretrained("Qwen/Qwen-Image", transformer=transformer, torch_dtype=torch.bfloat16, device_map="cuda")
-    pipeline.transformer.set_attention_backend("flash")
+    return device
+
+def main():
+    device = setup_distributed()
+    world_size = dist.get_world_size()
+
+    pipeline = DiffusionPipeline.from_pretrained(
+        "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16, device_map=device
+    )
+    pipeline.transformer.set_attention_backend("_native_cudnn")
+
+    cp_config = ContextParallelConfig(ring_degree=world_size)
+    pipeline.transformer.enable_parallelism(config=cp_config)
 
     prompt = """
     cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
     highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
     """
-
+
     # Must specify generator so all ranks start with same latents (or pass your own)
    generator = torch.Generator().manual_seed(42)
-    image = pipeline(prompt, num_inference_steps=50, generator=generator).images[0]
-
-    if rank == 0:
-        image.save("output.png")
-
-except Exception as e:
-    print(f"An error occurred: {e}")
-    torch.distributed.breakpoint()
-    raise
-
-finally:
-    if torch.distributed.is_initialized():
-        torch.distributed.destroy_process_group()
+    image = pipeline(
+        prompt,
+        guidance_scale=3.5,
+        num_inference_steps=50,
+        generator=generator,
+    ).images[0]
+
+    if dist.get_rank() == 0:
+        image.save("output.png")
+
+    if dist.is_initialized():
+        dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    main()
 ```
 
+The script above needs to be run with a distributed launcher, such as [torchrun](https://docs.pytorch.org/docs/stable/elastic/run.html), that is compatible with PyTorch. `--nproc-per-node` is set to the number of GPUs available.
+
+```shell
+torchrun --nproc-per-node 2 above_script.py
+```
+
 ### Ulysses Attention
 
 [Ulysses Attention](https://huggingface.co/papers/2309.14509) splits a sequence across GPUs and performs an *all-to-all* communication (every device sends/receives data to every other device). Each GPU ends up with all tokens for only a subset of attention heads. Each GPU computes attention locally on all tokens for its head, then performs another all-to-all to regroup results by tokens for the next layer.
@@ -288,5 +310,26 @@ finally:
 Pass the [`ContextParallelConfig`] to [`~ModelMixin.enable_parallelism`].
 
 ```py
+# Depending on the number of GPUs available.
 pipeline.transformer.enable_parallelism(config=ContextParallelConfig(ulysses_degree=2))
+```
+
+### parallel_config
+
+Pass `parallel_config` during model initialization to enable context parallelism.
+
+```py
+CKPT_ID = "black-forest-labs/FLUX.1-dev"
+
+cp_config = ContextParallelConfig(ring_degree=2)
+transformer = AutoModel.from_pretrained(
+    CKPT_ID,
+    subfolder="transformer",
+    torch_dtype=torch.bfloat16,
+    parallel_config=cp_config
+)
+
+pipeline = DiffusionPipeline.from_pretrained(
+    CKPT_ID, transformer=transformer, torch_dtype=torch.bfloat16,
+).to(device)
 ```
````
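The diff above only shows the Ulysses variant as a one-liner. As a point of reference, here is a minimal end-to-end sketch of the Ulysses path (not part of the commit): it reuses the `ContextParallelConfig`, `enable_parallelism`, and `set_attention_backend` APIs exactly as documented above, and assumes two GPUs launched with `torchrun --nproc-per-node 2`. The checkpoint and attention backend are illustrative choices.

```py
# Sketch only (not from the commit): Ulysses attention with the APIs shown in the diff above.
# Assumed launch: torchrun --nproc-per-node 2 ulysses_example.py
import torch
from torch import distributed as dist

from diffusers import ContextParallelConfig, DiffusionPipeline


def main():
    if not dist.is_initialized():
        dist.init_process_group(backend="nccl")
    rank = dist.get_rank()
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)

    pipeline = DiffusionPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16, device_map=device
    )
    pipeline.transformer.set_attention_backend("_native_cudnn")

    # ulysses_degree splits attention heads across ranks via all-to-all, as described above.
    pipeline.transformer.enable_parallelism(
        config=ContextParallelConfig(ulysses_degree=dist.get_world_size())
    )

    # Same seed on every rank so all ranks start from the same latents.
    generator = torch.Generator().manual_seed(42)
    image = pipeline(
        "cinematic film still of a cat sipping a margarita in a pool in Palm Springs",
        num_inference_steps=50,
        generator=generator,
    ).images[0]

    if dist.get_rank() == 0:
        image.save("ulysses_output.png")

    if dist.is_initialized():
        dist.destroy_process_group()


if __name__ == "__main__":
    main()
```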

src/diffusers/__init__.py

Lines changed: 8 additions & 0 deletions

```diff
@@ -404,6 +404,8 @@
 else:
     _import_structure["modular_pipelines"].extend(
         [
+            "Flux2AutoBlocks",
+            "Flux2ModularPipeline",
             "FluxAutoBlocks",
             "FluxKontextAutoBlocks",
             "FluxKontextModularPipeline",
@@ -419,6 +421,8 @@
             "Wan22AutoBlocks",
             "WanAutoBlocks",
             "WanModularPipeline",
+            "ZImageAutoBlocks",
+            "ZImageModularPipeline",
         ]
     )
     _import_structure["pipelines"].extend(
@@ -1109,6 +1113,8 @@
         from .utils.dummy_torch_and_transformers_objects import *  # noqa F403
     else:
         from .modular_pipelines import (
+            Flux2AutoBlocks,
+            Flux2ModularPipeline,
             FluxAutoBlocks,
             FluxKontextAutoBlocks,
             FluxKontextModularPipeline,
@@ -1124,6 +1130,8 @@
             Wan22AutoBlocks,
             WanAutoBlocks,
             WanModularPipeline,
+            ZImageAutoBlocks,
+            ZImageModularPipeline,
         )
         from .pipelines import (
             AllegroPipeline,
```
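With the registrations above in place, the new modular-pipeline classes become importable from the package root through diffusers' lazy-import machinery. A quick illustrative check (assuming an installed build that includes this commit):

```py
# Illustrative only: these names are registered in _import_structure and in the runtime
# import block above, so they resolve from the top-level package.
from diffusers import (
    Flux2AutoBlocks,
    Flux2ModularPipeline,
    ZImageAutoBlocks,
    ZImageModularPipeline,
)

print(Flux2ModularPipeline.__name__, ZImageModularPipeline.__name__)
```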

src/diffusers/models/transformers/transformer_prx.py

Lines changed: 30 additions & 5 deletions

```diff
@@ -16,7 +16,6 @@
 
 import torch
 from torch import nn
-from torch.nn.functional import fold, unfold
 
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...utils import logging
@@ -532,7 +531,19 @@ def img2seq(img: torch.Tensor, patch_size: int) -> torch.Tensor:
         Flattened patch sequence of shape `(B, L, C * patch_size * patch_size)`, where `L = (H // patch_size) * (W
         // patch_size)` is the number of patches.
     """
-    return unfold(img, kernel_size=patch_size, stride=patch_size).transpose(1, 2)
+    b, c, h, w = img.shape
+    p = patch_size
+
+    # Reshape to (B, C, H//p, p, W//p, p) separating grid and patch dimensions
+    img = img.reshape(b, c, h // p, p, w // p, p)
+
+    # Permute to (B, H//p, W//p, C, p, p) using einsum
+    # n=batch, c=channels, h=grid_height, p=patch_height, w=grid_width, q=patch_width
+    img = torch.einsum("nchpwq->nhwcpq", img)
+
+    # Flatten to (B, L, C * p * p)
+    img = img.reshape(b, -1, c * p * p)
+    return img
 
 
 def seq2img(seq: torch.Tensor, patch_size: int, shape: torch.Tensor) -> torch.Tensor:
@@ -554,12 +565,26 @@ def seq2img(seq: torch.Tensor, patch_size: int, shape: torch.Tensor) -> torch.Te
         Reconstructed image tensor of shape `(B, C, H, W)`.
     """
     if isinstance(shape, tuple):
-        shape = shape[-2:]
+        h, w = shape[-2:]
     elif isinstance(shape, torch.Tensor):
-        shape = (int(shape[0]), int(shape[1]))
+        h, w = (int(shape[0]), int(shape[1]))
     else:
         raise NotImplementedError(f"shape type {type(shape)} not supported")
-    return fold(seq.transpose(1, 2), shape, kernel_size=patch_size, stride=patch_size)
+
+    b, l, d = seq.shape
+    p = patch_size
+    c = d // (p * p)
+
+    # Reshape back to grid structure: (B, H//p, W//p, C, p, p)
+    seq = seq.reshape(b, h // p, w // p, c, p, p)
+
+    # Permute back to image layout: (B, C, H//p, p, W//p, p)
+    # n=batch, h=grid_height, w=grid_width, c=channels, p=patch_height, q=patch_width
+    seq = torch.einsum("nhwcpq->nchpwq", seq)
+
+    # Final reshape to (B, C, H, W)
+    seq = seq.reshape(b, c, h, w)
+    return seq
 
 
 class PRXTransformer2DModel(ModelMixin, ConfigMixin, AttentionMixin):
```
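The rewritten `img2seq`/`seq2img` are pure reshape-and-permute equivalents of the removed `unfold`/`fold` calls, so the change should be numerically a no-op. A small standalone check of that equivalence (a sketch based on the diff above; the tensor sizes are illustrative):

```py
import torch
from torch.nn.functional import fold, unfold


def img2seq(img: torch.Tensor, patch_size: int) -> torch.Tensor:
    # (B, C, H, W) -> (B, L, C * p * p) with L = (H // p) * (W // p)
    b, c, h, w = img.shape
    p = patch_size
    img = img.reshape(b, c, h // p, p, w // p, p)
    img = torch.einsum("nchpwq->nhwcpq", img)
    return img.reshape(b, -1, c * p * p)


def seq2img(seq: torch.Tensor, patch_size: int, shape: tuple) -> torch.Tensor:
    # (B, L, C * p * p) -> (B, C, H, W)
    b, l, d = seq.shape
    p = patch_size
    h, w = shape[-2:]
    c = d // (p * p)
    seq = seq.reshape(b, h // p, w // p, c, p, p)
    seq = torch.einsum("nhwcpq->nchpwq", seq)
    return seq.reshape(b, c, h, w)


img = torch.randn(2, 4, 32, 32)
seq = img2seq(img, patch_size=2)

# Matches the removed unfold/fold path exactly and round-trips back to the image.
assert torch.equal(seq, unfold(img, kernel_size=2, stride=2).transpose(1, 2))
assert torch.equal(fold(seq.transpose(1, 2), (32, 32), kernel_size=2, stride=2), img)
assert torch.equal(seq2img(seq, patch_size=2, shape=(32, 32)), img)
```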

src/diffusers/modular_pipelines/__init__.py

Lines changed: 10 additions & 0 deletions

```diff
@@ -52,6 +52,10 @@
         "FluxKontextAutoBlocks",
         "FluxKontextModularPipeline",
     ]
+    _import_structure["flux2"] = [
+        "Flux2AutoBlocks",
+        "Flux2ModularPipeline",
+    ]
     _import_structure["qwenimage"] = [
         "QwenImageAutoBlocks",
         "QwenImageModularPipeline",
@@ -60,6 +64,10 @@
         "QwenImageEditPlusModularPipeline",
         "QwenImageEditPlusAutoBlocks",
     ]
+    _import_structure["z_image"] = [
+        "ZImageAutoBlocks",
+        "ZImageModularPipeline",
+    ]
     _import_structure["components_manager"] = ["ComponentsManager"]
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -71,6 +79,7 @@
     else:
         from .components_manager import ComponentsManager
         from .flux import FluxAutoBlocks, FluxKontextAutoBlocks, FluxKontextModularPipeline, FluxModularPipeline
+        from .flux2 import Flux2AutoBlocks, Flux2ModularPipeline
         from .modular_pipeline import (
             AutoPipelineBlocks,
             BlockState,
@@ -91,6 +100,7 @@
         )
         from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline
         from .wan import Wan22AutoBlocks, WanAutoBlocks, WanModularPipeline
+        from .z_image import ZImageAutoBlocks, ZImageModularPipeline
 else:
     import sys
 
```

Lines changed: 111 additions & 0 deletions

```diff
@@ -0,0 +1,111 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["encoders"] = [
+        "Flux2TextEncoderStep",
+        "Flux2RemoteTextEncoderStep",
+        "Flux2VaeEncoderStep",
+    ]
+    _import_structure["before_denoise"] = [
+        "Flux2SetTimestepsStep",
+        "Flux2PrepareLatentsStep",
+        "Flux2RoPEInputsStep",
+        "Flux2PrepareImageLatentsStep",
+    ]
+    _import_structure["denoise"] = [
+        "Flux2LoopDenoiser",
+        "Flux2LoopAfterDenoiser",
+        "Flux2DenoiseLoopWrapper",
+        "Flux2DenoiseStep",
+    ]
+    _import_structure["decoders"] = ["Flux2DecodeStep"]
+    _import_structure["inputs"] = [
+        "Flux2ProcessImagesInputStep",
+        "Flux2TextInputStep",
+    ]
+    _import_structure["modular_blocks"] = [
+        "ALL_BLOCKS",
+        "AUTO_BLOCKS",
+        "REMOTE_AUTO_BLOCKS",
+        "TEXT2IMAGE_BLOCKS",
+        "IMAGE_CONDITIONED_BLOCKS",
+        "Flux2AutoBlocks",
+        "Flux2AutoVaeEncoderStep",
+        "Flux2BeforeDenoiseStep",
+        "Flux2VaeEncoderSequentialStep",
+    ]
+    _import_structure["modular_pipeline"] = ["Flux2ModularPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .before_denoise import (
+            Flux2PrepareImageLatentsStep,
+            Flux2PrepareLatentsStep,
+            Flux2RoPEInputsStep,
+            Flux2SetTimestepsStep,
+        )
+        from .decoders import Flux2DecodeStep
+        from .denoise import (
+            Flux2DenoiseLoopWrapper,
+            Flux2DenoiseStep,
+            Flux2LoopAfterDenoiser,
+            Flux2LoopDenoiser,
+        )
+        from .encoders import (
+            Flux2RemoteTextEncoderStep,
+            Flux2TextEncoderStep,
+            Flux2VaeEncoderStep,
+        )
+        from .inputs import (
+            Flux2ProcessImagesInputStep,
+            Flux2TextInputStep,
+        )
+        from .modular_blocks import (
+            ALL_BLOCKS,
+            AUTO_BLOCKS,
+            IMAGE_CONDITIONED_BLOCKS,
+            REMOTE_AUTO_BLOCKS,
+            TEXT2IMAGE_BLOCKS,
+            Flux2AutoBlocks,
+            Flux2AutoVaeEncoderStep,
+            Flux2BeforeDenoiseStep,
+            Flux2VaeEncoderSequentialStep,
+        )
+        from .modular_pipeline import Flux2ModularPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
```
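This new file (by all appearances the `__init__.py` of the new `flux2` modular-pipelines package, matching the `from .flux2 import ...` hook added earlier; the extraction dropped the file name) follows the standard diffusers lazy-import pattern: at runtime the module is swapped for a `_LazyModule`, and submodules are only imported on first attribute access. An illustrative sketch of what that means in practice, assuming a build containing this commit:

```py
# Illustrative only: importing the package is cheap because it is backed by _LazyModule;
# the torch/transformers-heavy submodules load on first attribute access.
import importlib

flux2 = importlib.import_module("diffusers.modular_pipelines.flux2")
print(type(flux2).__name__)  # _LazyModule

pipeline_cls = flux2.Flux2ModularPipeline  # triggers the import of .modular_pipeline
blocks_cls = flux2.Flux2AutoBlocks        # triggers the import of .modular_blocks
```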
