|
18 | 18 | import numpy as np |
19 | 19 | import torch |
20 | 20 |
|
| 21 | +from ...configuration_utils import FrozenDict |
| 22 | +from ...image_processor import VaeImageProcessor |
21 | 23 | from ...models import AutoencoderKL |
22 | 24 | from ...schedulers import FlowMatchEulerDiscreteScheduler |
23 | 25 | from ...utils import logging |
@@ -182,15 +184,15 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype): |
182 | 184 | return latent_image_ids.to(device=device, dtype=dtype) |
183 | 185 |
|
184 | 186 |
|
185 | | -# Cannot use "# Copied from" because it introduces weird indentation errors. |
186 | | -def _encode_vae_image(vae, image: torch.Tensor, generator: torch.Generator): |
| 187 | +def _encode_vae_image(vae, image: torch.Tensor, generator: torch.Generator, sample_mode: str = "sample"): |
187 | 188 | if isinstance(generator, list): |
188 | 189 | image_latents = [ |
189 | | - retrieve_latents(vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(image.shape[0]) |
| 190 | + retrieve_latents(vae.encode(image[i : i + 1]), generator=generator[i], sample_mode=sample_mode) |
| 191 | + for i in range(image.shape[0]) |
190 | 192 | ] |
191 | 193 | image_latents = torch.cat(image_latents, dim=0) |
192 | 194 | else: |
193 | | - image_latents = retrieve_latents(vae.encode(image), generator=generator) |
| 195 | + image_latents = retrieve_latents(vae.encode(image), generator=generator, sample_mode=sample_mode) |
194 | 196 |
|
195 | 197 | image_latents = (image_latents - vae.config.shift_factor) * vae.config.scaling_factor |
196 | 198 |
|
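Note: the new `sample_mode` argument is simply forwarded to the existing `retrieve_latents` helper, so `"argmax"` takes the mode of the VAE posterior instead of drawing a random sample (useful for a deterministic image-conditioning latent). Paraphrased, that helper behaves roughly like the sketch below:

```python
def retrieve_latents(encoder_output, generator=None, sample_mode="sample"):
    # AutoencoderKL.encode returns an output wrapping a DiagonalGaussianDistribution.
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)  # stochastic draw
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()  # deterministic mode
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    raise AttributeError("Could not access latents of provided encoder_output")
```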
@@ -687,3 +689,213 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip |
687 | 689 | self.set_block_state(state, block_state) |
688 | 690 |
|
689 | 691 | return components, state |
| 692 | + |
| 693 | + |
| 694 | +class FluxKontextPrepareLatentsStep(ModularPipelineBlocks): |
| 695 | + model_name = "flux_kontext" |
| 696 | + |
| 697 | + @property |
| 698 | + def expected_components(self) -> List[ComponentSpec]: |
| 699 | + return [ |
| 700 | + ComponentSpec("vae", AutoencoderKL), |
| 701 | + ComponentSpec( |
| 702 | + "image_processor", |
| 703 | + VaeImageProcessor, |
| 704 | + config=FrozenDict({"vae_scale_factor": 16}), |
| 705 | + default_creation_method="from_config", |
| 706 | + ), |
| 707 | + ] |
| 708 | + |
| 709 | + @property |
| 710 | + def description(self) -> str: |
| 711 | +        return "Prepare latents step that prepares the initial noise latents and the image latents for the image-to-image generation process with Flux Kontext" |
| 712 | + |
| 713 | + @property |
| 714 | + def inputs(self) -> List[InputParam]: |
| 715 | + return [ |
| | +            InputParam("image", description="The image(s) to edit with Flux Kontext."), |
| 716 | +            InputParam("height", type_hint=int), |
| 717 | + InputParam("width", type_hint=int), |
| 718 | + InputParam("max_area", type_hint=int, default=1024**2), |
| 719 | + InputParam("latents", type_hint=Optional[torch.Tensor]), |
| 720 | + InputParam("num_images_per_prompt", type_hint=int, default=1), |
| 721 | + InputParam("generator"), |
| 722 | + InputParam( |
| 723 | + "batch_size", |
| 724 | + required=True, |
| 725 | + type_hint=int, |
| 726 | +                description="Number of prompts; the final batch size of model inputs is `batch_size * num_images_per_prompt`. Can be generated by the input step.", |
| 727 | + ), |
| 728 | + InputParam("dtype", type_hint=torch.dtype, description="The dtype of the model inputs"), |
| 729 | + ] |
| 730 | + |
| 731 | + @property |
| 732 | + def intermediate_outputs(self) -> List[OutputParam]: |
| 733 | + return [ |
| 734 | + OutputParam( |
| 735 | + "latents", type_hint=torch.Tensor, description="The initial latents to use for the denoising process" |
| 736 | + ), |
| 737 | + OutputParam( |
| 738 | + "image_latents", type_hint=torch.Tensor, description="Latents computed from the input image(s)." |
| 739 | + ), |
| 740 | + OutputParam( |
| 741 | + "latent_ids", |
| 742 | + type_hint=torch.Tensor, |
| 743 | + description="IDs computed from the latent sequence needed for RoPE", |
| 744 | + ), |
| 745 | + OutputParam( |
| 746 | + "image_ids", |
| 747 | + type_hint=torch.Tensor, |
| 748 | + description="IDs computed from the image sequence needed for RoPE", |
| 749 | + ), |
| 750 | + ] |
| 751 | + |
| 752 | + @staticmethod |
| 753 | + def check_inputs(components, block_state): |
| 754 | + if (block_state.height is not None and block_state.height % (components.vae_scale_factor * 2) != 0) or ( |
| 755 | + block_state.width is not None and block_state.width % (components.vae_scale_factor * 2) != 0 |
| 756 | + ): |
| 757 | + logger.warning( |
| 758 | +                f"`height` and `width` have to be divisible by {components.vae_scale_factor * 2} but are {block_state.height} and {block_state.width}. Dimensions will be adjusted accordingly." |
| 759 | + ) |
| 760 | + |
| 761 | + @staticmethod |
| 762 | + def preprocess_image( |
| 763 | + image, image_processor: VaeImageProcessor, vae_scale_factor: int, latent_channels: int, _auto_resize=True |
| 764 | + ): |
| 765 | + from ...pipelines.flux.pipeline_flux_kontext import PREFERRED_KONTEXT_RESOLUTIONS |
| 766 | + |
| 767 | + if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == latent_channels): |
| 768 | + multiple_of = vae_scale_factor * 2 |
| 769 | + img = image[0] if isinstance(image, list) else image |
| 770 | + image_height, image_width = image_processor.get_default_height_width(img) |
| 771 | + aspect_ratio = image_width / image_height |
| 772 | + if _auto_resize: |
| 773 | + # Kontext is trained on specific resolutions, using one of them is recommended |
| 774 | + _, image_width, image_height = min( |
| 775 | + (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_KONTEXT_RESOLUTIONS |
| 776 | + ) |
| 777 | + image_width = image_width // multiple_of * multiple_of |
| 778 | + image_height = image_height // multiple_of * multiple_of |
| 779 | + image = image_processor.resize(image, image_height, image_width) |
| 780 | + image = image_processor.preprocess(image, image_height, image_width) |
| 781 | + return image |
| 782 | + |
| 783 | + @staticmethod |
| 784 | + def prepare_latents( |
| 785 | + comp, |
| 786 | + image, |
| 787 | + batch_size, |
| 788 | + num_channels_latents, |
| 789 | + height, |
| 790 | + width, |
| 791 | + dtype, |
| 792 | + device, |
| 793 | + generator, |
| 794 | + latents=None, |
| 795 | + ): |
| 796 | +        # The packing helpers are standalone functions in this module rather than pipeline |
| 797 | +        # methods, so the Flux `prepare_latents` cannot be reused via "# Copied from ..." |
| 798 | +        # (e.g. `comp._pack_latents()` does not exist here); the relevant logic is reproduced below. |
| 799 | + |
| 800 | + # VAE applies 8x compression on images but we must also account for packing which requires |
| 801 | + # latent height and width to be divisible by 2. |
| 802 | + height = 2 * (int(height) // (comp.vae_scale_factor * 2)) |
| 803 | + width = 2 * (int(width) // (comp.vae_scale_factor * 2)) |
| 804 | + shape = (batch_size, num_channels_latents, height, width) |
| 805 | + |
| 806 | + image_latents = image_ids = None |
| 807 | + if image is not None: |
| 808 | + image = image.to(device=device, dtype=dtype) |
| 809 | + if image.shape[1] != num_channels_latents: |
| 810 | +                image_latents = _encode_vae_image(vae=comp.vae, image=image, generator=generator, sample_mode="argmax") |
| 811 | + else: |
| 812 | + image_latents = image |
| 813 | + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: |
| 814 | + # expand init_latents for batch_size |
| 815 | + additional_image_per_prompt = batch_size // image_latents.shape[0] |
| 816 | + image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0) |
| 817 | + elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: |
| 818 | + raise ValueError( |
| 819 | + f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." |
| 820 | + ) |
| 821 | + else: |
| 822 | + image_latents = torch.cat([image_latents], dim=0) |
| 823 | + |
| 824 | + image_latent_height, image_latent_width = image_latents.shape[2:] |
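| | +            # `_pack_latents` folds each 2x2 latent patch into the channel dimension, |
| | +            # turning (B, C, H, W) latents into a (B, H/2 * W/2, 4 * C) token sequence for the transformer. |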
| 825 | + image_latents = _pack_latents( |
| 826 | + image_latents, batch_size, num_channels_latents, image_latent_height, image_latent_width |
| 827 | + ) |
| 828 | + image_ids = _prepare_latent_image_ids( |
| 829 | + batch_size, image_latent_height // 2, image_latent_width // 2, device, dtype |
| 830 | + ) |
| 831 | + # image ids are the same as latent ids with the first dimension set to 1 instead of 0 |
| 832 | + image_ids[..., 0] = 1 |
| 833 | + |
| 834 | + latent_ids = _prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype) |
| 835 | + |
| 836 | + if latents is None: |
| 837 | + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) |
| 838 | + latents = _pack_latents(latents, batch_size, num_channels_latents, height, width) |
| 839 | + else: |
| 840 | + latents = latents.to(device=device, dtype=dtype) |
| 841 | + |
| 842 | + return latents, image_latents, latent_ids, image_ids |
| 843 | + |
| 844 | + @torch.no_grad() |
| 845 | + def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState: |
| 846 | + block_state = self.get_block_state(state) |
| 847 | + |
| 848 | + block_state.height = block_state.height or components.default_height |
| 849 | + block_state.width = block_state.width or components.default_width |
| 850 | + block_state.device = components._execution_device |
| 851 | + block_state.dtype = torch.bfloat16 # TODO: okay to hardcode this? |
| 852 | + block_state.num_channels_latents = components.num_channels_latents |
| 853 | + |
| 854 | + self.check_inputs(components, block_state) |
| 855 | + |
| 856 | + # Adjust height and width if needed. |
| 857 | + max_area = block_state.max_area |
| 858 | + original_height, original_width = block_state.height, block_state.width |
| 859 | + aspect_ratio = original_width / original_height |
| 860 | + width = round((max_area * aspect_ratio) ** 0.5) |
| 861 | + height = round((max_area / aspect_ratio) ** 0.5) |
| 862 | + |
| 863 | + multiple_of = components.vae_scale_factor * 2 |
| 864 | + width = width // multiple_of * multiple_of |
| 865 | + height = height // multiple_of * multiple_of |
| 866 | + |
| 867 | + if height != original_height or width != original_width: |
| 868 | + logger.warning( |
| 869 | + f"Generation `height` and `width` have been adjusted to {height} and {width} to fit the model requirements." |
| 870 | + ) |
| 871 | + block_state.height = height |
| 872 | + block_state.width = width |
| 873 | + |
| 874 | + # Process input image(s). |
| 875 | +        # `_auto_resize` is always True here; since it is a private argument, it is not exposed as a block input. |
| 876 | + image = block_state.image |
| 877 | + block_state.image = self.preprocess_image( |
| 878 | + image=image, |
| 879 | + image_processor=components.image_processor, |
| 880 | + vae_scale_factor=components.vae_scale_factor, |
| 881 | + latent_channels=components.num_channels_latents, |
| 882 | + ) |
| 883 | + |
| 884 | + batch_size = block_state.batch_size * block_state.num_images_per_prompt |
| 885 | + block_state.latents, block_state.image_latents, block_state.latent_ids, block_state.image_ids = ( |
| 886 | + self.prepare_latents( |
| 887 | +                components, |
| | +                block_state.image, |
| 888 | + batch_size, |
| 889 | + block_state.num_channels_latents, |
| 890 | + block_state.height, |
| 891 | + block_state.width, |
| 892 | + block_state.dtype, |
| 893 | + block_state.device, |
| 894 | + block_state.generator, |
| 895 | + block_state.latents, |
| 896 | + ) |
| 897 | + ) |
| 898 | + |
| 899 | + self.set_block_state(state, block_state) |
| 900 | + |
| 901 | + return components, state |
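For reference, the `max_area` resolution adjustment performed in `__call__` boils down to the following standalone arithmetic (hypothetical helper name; assumes Flux's usual `vae_scale_factor` of 8, so sizes snap to multiples of 16):

```python
def fit_to_max_area(height: int, width: int, max_area: int = 1024**2, vae_scale_factor: int = 8):
    # Preserve the requested aspect ratio while capping the pixel count at `max_area`,
    # then snap both sides down to a multiple of `vae_scale_factor * 2` so the latents
    # can later be packed into 2x2 patches.
    aspect_ratio = width / height
    width = round((max_area * aspect_ratio) ** 0.5)
    height = round((max_area / aspect_ratio) ** 0.5)
    multiple_of = vae_scale_factor * 2
    return height // multiple_of * multiple_of, width // multiple_of * multiple_of


# e.g. a 800x1000 request is adjusted to 912x1136, which stays under 1024**2 pixels.
print(fit_to_max_area(800, 1000))  # (912, 1136)
```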