
Commit 046bf9e

start

committed
1 parent ba2ba90 commit 046bf9e

File tree

1 file changed: +279 −23 lines

src/diffusers/modular_pipelines/flux/before_denoise.py

Lines changed: 279 additions & 23 deletions
@@ -13,11 +13,12 @@
 # limitations under the License.
 
 import inspect
-from typing import List, Optional, Union
+from typing import Any, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
 
+from ...models import AutoencoderKL
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import logging
 from ...utils.torch_utils import randn_tensor
@@ -103,6 +104,61 @@ def calculate_shift(
     return mu
 
 
+def prepare_latents_img2img(
+    vae, scheduler, image, timestep, batch_size, num_channels_latents, height, width, dtype, device, generator
+):
+    if isinstance(generator, list) and len(generator) != batch_size:
+        raise ValueError(
+            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+        )
+
+    vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
+    latent_channels = vae.config.latent_channels
+
+    # VAE applies 8x compression on images but we must also account for packing which requires
+    # latent height and width to be divisible by 2.
+    height = 2 * (int(height) // (vae_scale_factor * 2))
+    width = 2 * (int(width) // (vae_scale_factor * 2))
+    shape = (batch_size, num_channels_latents, height, width)
+    latent_image_ids = _prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
+
+    image = image.to(device=device, dtype=dtype)
+    if image.shape[1] != latent_channels:
+        image_latents = _encode_vae_image(vae=vae, image=image, generator=generator)
+    else:
+        image_latents = image
+    if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
+        # expand init_latents for batch_size
+        additional_image_per_prompt = batch_size // image_latents.shape[0]
+        image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
+    elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
+        raise ValueError(
+            f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
+        )
+    else:
+        image_latents = torch.cat([image_latents], dim=0)
+
+    noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+    latents = scheduler.scale_noise(image_latents, timestep, noise)
+    latents = _pack_latents(latents, batch_size, num_channels_latents, height, width)
+    return latents, latent_image_ids
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
 def _pack_latents(latents, batch_size, num_channels_latents, height, width):
     latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
     latents = latents.permute(0, 2, 4, 1, 3, 5)
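
Note on the sizing logic in prepare_latents_img2img above: the height/width rounding keeps the latent grid divisible by 2 so that _pack_latents can fold 2x2 latent patches into tokens. A minimal standalone sketch (values assumed: 1024x1024 input, vae_scale_factor 8, 16 latent channels, the Flux defaults):

import torch

vae_scale_factor = 8
height_px, width_px = 1024, 1024

# Round latent dims down to a multiple of 2, as in prepare_latents_img2img.
latent_h = 2 * (int(height_px) // (vae_scale_factor * 2))  # 128
latent_w = 2 * (int(width_px) // (vae_scale_factor * 2))   # 128

# _pack_latents folds each 2x2 latent patch into one token of 4 * channels features.
channels = 16
latents = torch.randn(1, channels, latent_h, latent_w)
packed = latents.view(1, channels, latent_h // 2, 2, latent_w // 2, 2)
packed = packed.permute(0, 2, 4, 1, 3, 5).reshape(1, (latent_h // 2) * (latent_w // 2), channels * 4)
print(packed.shape)  # torch.Size([1, 4096, 64]); 4096 is the packed image_seq_len used for the shift computation
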
@@ -125,6 +181,44 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
     return latent_image_ids.to(device=device, dtype=dtype)
 
 
+# Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image with self.vae->vae
+def _encode_vae_image(vae, image: torch.Tensor, generator: torch.Generator):
+    if isinstance(generator, list):
+        image_latents = [
+            retrieve_latents(vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(image.shape[0])
+        ]
+        image_latents = torch.cat(image_latents, dim=0)
+    else:
+        image_latents = retrieve_latents(vae.encode(image), generator=generator)
+
+    image_latents = (image_latents - vae.config.shift_factor) * vae.config.scaling_factor
+
+    return image_latents
+
+
+def _get_timesteps_and_optionals(transformer, scheduler, latents, num_inference_steps, guidance_scale, sigmas, device):
+    image_seq_len = latents.shape[1]
+
+    sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
+    if hasattr(scheduler.config, "use_flow_sigmas") and scheduler.config.use_flow_sigmas:
+        sigmas = None
+    mu = calculate_shift(
+        image_seq_len,
+        scheduler.config.get("base_image_seq_len", 256),
+        scheduler.config.get("max_image_seq_len", 4096),
+        scheduler.config.get("base_shift", 0.5),
+        scheduler.config.get("max_shift", 1.15),
+    )
+    timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps, device, sigmas=sigmas, mu=mu)
+    if transformer.config.guidance_embeds:
+        guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
+        guidance = guidance.expand(latents.shape[0])
+    else:
+        guidance = None
+
+    return timesteps, num_inference_steps, sigmas, guidance
+
+
 class FluxInputStep(PipelineBlock):
     model_name = "flux"
 
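
_get_timesteps_and_optionals above feeds the packed sequence length into calculate_shift (defined earlier in this file), which linearly interpolates the flow-matching shift mu between base_shift and max_shift. A rough standalone sketch of that interpolation, using the default values the scheduler-config lookups above fall back to:

def sketch_calculate_shift(image_seq_len, base_seq_len=256, max_seq_len=4096, base_shift=0.5, max_shift=1.15):
    # Linear interpolation: 256 packed tokens -> mu 0.5, 4096 packed tokens -> mu 1.15.
    slope = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    intercept = base_shift - slope * base_seq_len
    return image_seq_len * slope + intercept

print(sketch_calculate_shift(4096))  # ~1.15 for a 1024x1024 image (4096 packed tokens)
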
@@ -264,34 +358,103 @@ def intermediate_outputs(self) -> List[OutputParam]:
     def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
         block_state = self.get_block_state(state)
         block_state.device = components._execution_device
-        scheduler = components.scheduler
 
-        latents = block_state.latents
-        image_seq_len = latents.shape[1]
+        scheduler = components.scheduler
+        transformer = components.transformer
 
-        num_inference_steps = block_state.num_inference_steps
-        sigmas = block_state.sigmas
-        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
-        if hasattr(scheduler.config, "use_flow_sigmas") and scheduler.config.use_flow_sigmas:
-            sigmas = None
-        block_state.sigmas = sigmas
-        mu = calculate_shift(
-            image_seq_len,
-            scheduler.config.get("base_image_seq_len", 256),
-            scheduler.config.get("max_image_seq_len", 4096),
-            scheduler.config.get("base_shift", 0.5),
-            scheduler.config.get("max_shift", 1.15),
+        timesteps, num_inference_steps, sigmas, guidance = _get_timesteps_and_optionals(
+            transformer,
+            scheduler,
+            block_state.latents,
+            block_state.num_inference_steps,
+            block_state.guidance_scale,
+            block_state.sigmas,
+            block_state.device,
         )
-        block_state.timesteps, block_state.num_inference_steps = retrieve_timesteps(
-            scheduler, block_state.num_inference_steps, block_state.device, sigmas=block_state.sigmas, mu=mu
+        block_state.timesteps = timesteps
+        block_state.num_inference_steps = num_inference_steps
+        block_state.sigmas = sigmas
+        block_state.guidance = guidance
+
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class FluxImg2ImgSetTimestepsStep(PipelineBlock):
+    model_name = "flux"
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
+
+    @property
+    def description(self) -> str:
+        return "Step that sets the scheduler's timesteps for inference"
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam("num_inference_steps", default=50),
+            InputParam("timesteps"),
+            InputParam("sigmas"),
+            InputParam("guidance_scale", default=3.5),
+            InputParam("latents", type_hint=torch.Tensor),
+            InputParam("num_images_per_prompt", default=1),
+        ]
+
+    @property
+    def intermediate_inputs(self) -> List[str]:
+        return [
+            InputParam(
+                "latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
+            )
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam("timesteps", type_hint=torch.Tensor, description="The timesteps to use for inference"),
+            OutputParam(
+                "num_inference_steps",
+                type_hint=int,
+                description="The number of denoising steps to perform at inference time",
+            ),
+            OutputParam(
+                "latent_timestep",
+                type_hint=torch.Tensor,
+                description="The timestep that represents the initial noise level for image-to-image generation",
+            ),
+            OutputParam("guidance", type_hint=torch.Tensor, description="Optional guidance to be used."),
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        block_state.device = components._execution_device
+
+        scheduler = components.scheduler
+        transformer = components.transformer
+
+        timesteps, num_inference_steps, sigmas, guidance = _get_timesteps_and_optionals(
+            transformer,
+            scheduler,
+            block_state.latents,
+            block_state.num_inference_steps,
+            block_state.guidance_scale,
+            block_state.sigmas,
+            block_state.device,
         )
-        if components.transformer.config.guidance_embeds:
-            guidance = torch.full([1], block_state.guidance_scale, device=block_state.device, dtype=torch.float32)
-            guidance = guidance.expand(latents.shape[0])
-        else:
-            guidance = None
+        block_state.timesteps = timesteps
+        block_state.num_inference_steps = num_inference_steps
+        block_state.sigmas = sigmas
         block_state.guidance = guidance
 
+        batch_size = block_state.latents.shape[0]
+        block_state.latent_timestep = timesteps[:1].repeat(batch_size * block_state.num_images_per_prompt)
+
         self.set_block_state(state, block_state)
         return components, state
 
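
latent_timestep here is simply the first scheduler timestep repeated per generated image; the img2img prepare-latents block added below hands it to scheduler.scale_noise together with fresh noise. For flow-matching schedulers that call amounts to a sigma-weighted blend of noise and the encoded reference image; a rough illustration with an assumed starting sigma:

import torch

sigma = 0.7                                    # assumed starting noise level, for illustration only
image_latents = torch.randn(1, 16, 128, 128)   # stand-in for the encoded reference image
noise = torch.randn_like(image_latents)

# Roughly what FlowMatchEulerDiscreteScheduler.scale_noise does at that timestep:
noisy_latents = sigma * noise + (1.0 - sigma) * image_latents
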
@@ -418,3 +581,96 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
         self.set_block_state(state, block_state)
 
         return components, state
+
+
+class FluxLImg2ImgPrepareLatentsStep(PipelineBlock):
+    model_name = "flux"
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [ComponentSpec("vae", AutoencoderKL), ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
+
+    @property
+    def description(self) -> str:
+        return "Step that prepares the latents for the image-to-image generation process"
+
+    @property
+    def inputs(self) -> List[Tuple[str, Any]]:
+        return [
+            InputParam("height", type_hint=int),
+            InputParam("width", type_hint=int),
+            InputParam("latents", type_hint=Optional[torch.Tensor]),
+            InputParam("num_images_per_prompt", type_hint=int, default=1),
+        ]
+
+    @property
+    def intermediate_inputs(self) -> List[InputParam]:
+        return [
+            InputParam("generator"),
+            InputParam(
+                "image_latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The latents representing the reference image for image-to-image/inpainting generation. Can be generated in vae_encode step.",
+            ),
+            InputParam(
+                "latent_timestep",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The timestep that represents the initial noise level for image-to-image/inpainting generation. Can be generated in set_timesteps step.",
+            ),
+            InputParam(
+                "batch_size",
+                required=True,
+                type_hint=int,
+                description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.",
+            ),
+            InputParam("dtype", required=True, type_hint=torch.dtype, description="The dtype of the model inputs"),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                "latents", type_hint=torch.Tensor, description="The initial latents to use for the denoising process"
+            ),
+            OutputParam(
+                "latent_image_ids",
+                type_hint=torch.Tensor,
+                description="IDs computed from the image sequence needed for RoPE",
+            ),
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+
+        block_state.height = block_state.height or components.default_height
+        block_state.width = block_state.width or components.default_width
+        block_state.device = components._execution_device
+        block_state.dtype = torch.bfloat16  # TODO: okay to hardcode this?
+        block_state.num_channels_latents = components.num_channels_latents
+        block_state.dtype = block_state.dtype if block_state.dtype is not None else components.vae.dtype
+
+        # TODO: implement `check_inputs`
+
+        if block_state.latents is None:
+            block_state.latents, block_state.latent_image_ids = prepare_latents_img2img(
+                components.vae,
+                components.scheduler,
+                block_state.image_latents,
+                block_state.latent_timestep,
+                block_state.batch_size * block_state.num_images_per_prompt,
+                block_state.num_channels_latents,
+                block_state.height,
+                block_state.width,
+                block_state.dtype,
+                block_state.device,
+                block_state.generator,
+            )
+
+        self.set_block_state(state, block_state)
+
+        return components, state
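
Taken together, the img2img path added by this commit is: a set_timesteps block that also emits latent_timestep, followed by a prepare-latents block that noises the encoded reference image. A loose wiring sketch; everything other than the two block classes above (the import path, the from_blocks_dict helper, and the step names) is an assumption about the modular-pipelines API, not something this diff adds:

# Hypothetical composition sketch; scaffolding names are assumptions, not part of this commit.
from diffusers.modular_pipelines import SequentialPipelineBlocks  # assumed import path

blocks = SequentialPipelineBlocks.from_blocks_dict(
    {
        "set_timesteps": FluxImg2ImgSetTimestepsStep(),        # emits timesteps, sigmas, guidance, latent_timestep
        "prepare_latents": FluxLImg2ImgPrepareLatentsStep(),   # consumes image_latents + latent_timestep
    }
)
# A vae_encode step (not part of this diff) would still need to populate image_latents first.
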
