fix

sayakpaul · sayakpaul · commit 7c7e8a44bcf2 · 2025-10-07T13:02:57.000+05:30
diff --git a/src/diffusers/modular_pipelines/flux/before_denoise.py b/src/diffusers/modular_pipelines/flux/before_denoise.py
@@ -398,6 +398,7 @@ def prepare_latents(
                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
             )
 
+        # TODO: move packing latents code to a patchifier
         latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
         latents = _pack_latents(latents, batch_size, num_channels_latents, height, width)
 
@@ -436,12 +437,13 @@ class FluxImg2ImgPrepareLatentsStep(ModularPipelineBlocks):
 
     @property
     def description(self) -> str:
-        return "Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps, prepare_latents. Both noise and image latents should alreadybe patchified."
+        return ("Step that adds noise to image latents for image-to-image. Should be run after `set_timesteps`,"
+                " `prepare_latents`. Both noise and image latents should already be patchified.")
 
     @property
     def expected_components(self) -> List[ComponentSpec]:
         return [
-            ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler),
+            ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)
         ]
 
     @property
@@ -521,9 +523,9 @@ def description(self) -> str:
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="image_height", required=True),
-            InputParam(name="image_width", required=True),
-            InputParam(name="prompt_embeds"),
+            InputParam(name="height", required=True),
+            InputParam(name="width", required=True),
+            InputParam(name="prompt_embeds")
         ]
 
     @property
@@ -552,8 +554,8 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip
             device=prompt_embeds.device, dtype=prompt_embeds.dtype
         )
 
-        height = 2 * (int(block_state.image_height) // (components.vae_scale_factor * 2))
-        width = 2 * (int(block_state.image_width) // (components.vae_scale_factor * 2))
+        height = 2 * (int(block_state.height) // (components.vae_scale_factor * 2))
+        width = 2 * (int(block_state.width) // (components.vae_scale_factor * 2))
         block_state.img_ids = FluxPipeline._prepare_latent_image_ids(None, height // 2, width // 2, device, dtype)
 
         self.set_block_state(state, block_state)
diff --git a/src/diffusers/modular_pipelines/flux/denoise.py b/src/diffusers/modular_pipelines/flux/denoise.py
@@ -76,18 +76,17 @@ def inputs(self) -> List[Tuple[str, Any]]:
                 description="Pooled prompt embeddings",
             ),
             InputParam(
-                "text_ids",
+                "txt_ids",
                 required=True,
                 type_hint=torch.Tensor,
                 description="IDs computed from text sequence needed for RoPE",
             ),
             InputParam(
-                "latent_image_ids",
+                "img_ids",
                 required=True,
                 type_hint=torch.Tensor,
                 description="IDs computed from image sequence needed for RoPE",
             ),
-            # TODO: guidance
         ]
 
     @torch.no_grad()
@@ -101,8 +100,8 @@ def __call__(
             encoder_hidden_states=block_state.prompt_embeds,
             pooled_projections=block_state.pooled_prompt_embeds,
             joint_attention_kwargs=block_state.joint_attention_kwargs,
-            txt_ids=block_state.text_ids,
-            img_ids=block_state.latent_image_ids,
+            txt_ids=block_state.txt_ids,
+            img_ids=block_state.img_ids,
             return_dict=False,
         )[0]
         block_state.noise_pred = noise_pred
diff --git a/src/diffusers/modular_pipelines/flux/encoders.py b/src/diffusers/modular_pipelines/flux/encoders.py
@@ -204,15 +204,13 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip
         dtype = components.vae.dtype
 
         image = getattr(block_state, self._image_input_name)
+        image = image.to(device=device, dtype=dtype)
 
         # Encode image into latents
         image_latents = encode_vae_image(
             image=image,
             vae=components.vae,
-            generator=block_state.generator,
-            device=device,
-            dtype=dtype,
-            latent_channels=components.num_channels_latents,
+            generator=block_state.generator
         )
         setattr(block_state, self._image_latents_output_name, image_latents)
 
@@ -412,7 +410,6 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip
             prompt_embeds=None,
             pooled_prompt_embeds=None,
             device=block_state.device,
-            num_images_per_prompt=1,  # TODO: hardcoded for now.
             max_sequence_length=block_state.max_sequence_length,
             lora_scale=block_state.text_encoder_lora_scale,
         )
diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks.py b/src/diffusers/modular_pipelines/flux/modular_blocks.py
@@ -18,21 +18,43 @@
 from .before_denoise import (
     FluxImg2ImgPrepareLatentsStep,
     FluxImg2ImgSetTimestepsStep,
-    FluxInputStep,
     FluxPrepareLatentsStep,
     FluxSetTimestepsStep,
 )
 from .decoders import FluxDecodeStep
 from .denoise import FluxDenoiseStep
-from .encoders import FluxTextEncoderStep, FluxVaeEncoderStep
+from .encoders import FluxTextEncoderStep, FluxVaeEncoderDynamicStep
+from .before_denoise import FluxRoPEInputsStep
+from .inputs import FluxTextInputStep, FluxInputsDynamicStep
+
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
 # vae encoder (run before before_denoise)
+from .encoders import FluxProcessImagesInputStep
+
+FluxImg2ImgVaeEncoderBlocks = InsertableDict(
+    [
+        ("preprocess", FluxProcessImagesInputStep()),
+        ("encode", FluxVaeEncoderDynamicStep()),
+    ]
+)
+
+class FluxImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
+    model_name = "flux"
+
+    block_classes = FluxImg2ImgVaeEncoderBlocks.values()
+    block_names = FluxImg2ImgVaeEncoderBlocks.keys()
+
+    @property
+    def description(self) -> str:
+        return "Vae encoder step that preprocess and encode the image inputs into their latent representations."
+
+
 class FluxAutoVaeEncoderStep(AutoPipelineBlocks):
-    block_classes = [FluxVaeEncoderStep]
+    block_classes = [FluxImg2ImgVaeEncoderStep]
     block_names = ["img2img"]
     block_trigger_inputs = ["image"]
 
@@ -41,44 +63,49 @@ def description(self):
         return (
             "Vae encoder step that encode the image inputs into their latent representations.\n"
             + "This is an auto pipeline block that works for img2img tasks.\n"
-            + " - `FluxVaeEncoderStep` (img2img) is used when only `image` is provided."
-            + " - if `image` is provided, step will be skipped."
+            + " - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided."
+            + " - if `image` is not provided, step will be skipped."
         )
 
 
-# before_denoise: text2img, img2img
-class FluxBeforeDenoiseStep(SequentialPipelineBlocks):
-    block_classes = [
-        FluxInputStep,
-        FluxPrepareLatentsStep,
-        FluxSetTimestepsStep,
+
+# before_denoise: text2img
+FluxBeforeDenoiseBlocks = InsertableDict(
+    [
+        ("prepare_latents", FluxPrepareLatentsStep()),
+        ("set_timesteps", FluxSetTimestepsStep()),
+        ("prepare_rope_inputs", FluxRoPEInputsStep())
     ]
-    block_names = ["input", "prepare_latents", "set_timesteps"]
+)
+
+class FluxBeforeDenoiseStep(SequentialPipelineBlocks):
+    block_classes = FluxBeforeDenoiseBlocks.values()
+    block_names = FluxBeforeDenoiseBlocks.keys()
 
     @property
     def description(self):
         return (
-            "Before denoise step that prepare the inputs for the denoise step.\n"
-            + "This is a sequential pipeline blocks:\n"
-            + " - `FluxInputStep` is used to adjust the batch size of the model inputs\n"
-            + " - `FluxPrepareLatentsStep` is used to prepare the latents\n"
-            + " - `FluxSetTimestepsStep` is used to set the timesteps\n"
+            "Before denoise step that prepares the inputs for the denoise step in text-to-image generation."
         )
 
 
 # before_denoise: img2img
+FluxImg2ImgBeforeDenoiseBlocks = InsertableDict(
+    [
+        ("prepare_latents", FluxPrepareLatentsStep()),
+        ("set_timesteps", FluxImg2ImgSetTimestepsStep()),
+        ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()),
+        ("prepare_rope_inputs", FluxRoPEInputsStep())
+    ]
+)
 class FluxImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks):
-    block_classes = [FluxInputStep, FluxImg2ImgSetTimestepsStep, FluxImg2ImgPrepareLatentsStep]
-    block_names = ["input", "set_timesteps", "prepare_latents"]
+    block_classes = FluxImg2ImgBeforeDenoiseBlocks.values()
+    block_names = FluxImg2ImgBeforeDenoiseBlocks.keys()
 
     @property
     def description(self):
         return (
-            "Before denoise step that prepare the inputs for the denoise step for img2img task.\n"
-            + "This is a sequential pipeline blocks:\n"
-            + " - `FluxInputStep` is used to adjust the batch size of the model inputs\n"
-            + " - `FluxImg2ImgSetTimestepsStep` is used to set the timesteps\n"
-            + " - `FluxImg2ImgPrepareLatentsStep` is used to prepare the latents\n"
+            "Before denoise step that prepare the inputs for the denoise step for img2img task."
         )
 
 
@@ -113,7 +140,7 @@ def description(self) -> str:
         )
 
 
-# decode: all task (text2img, img2img, inpainting)
+# decode: all tasks (text2img, img2img)
 class FluxAutoDecodeStep(AutoPipelineBlocks):
     block_classes = [FluxDecodeStep]
     block_names = ["non-inpaint"]
@@ -124,32 +151,73 @@ def description(self):
         return "Decode step that decode the denoised latents into image outputs.\n - `FluxDecodeStep`"
 
 
+# inputs: text2image/img2img
+FluxImg2ImgBlocks = InsertableDict(
+    [
+        ("text_inputs", FluxTextInputStep()),
+        ("additional_inputs", FluxInputsDynamicStep())
+    ]
+)
+
+class FluxImg2ImgInputStep(SequentialPipelineBlocks):
+    model_name = "flux"
+    block_classes = FluxImg2ImgBlocks.values()
+    block_names = FluxImg2ImgBlocks.keys()
+
+    @property
+    def description(self):
+        return ("Input step that prepares the inputs for the img2img denoising step. It:\n"
+                " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n"
+                " - update height/width based `image_latents`, patchify `image_latents`.")
+
+
+class FluxImageAutoInputStep(AutoPipelineBlocks):
+    block_classes = [FluxImg2ImgInputStep, FluxTextInputStep]
+    block_names = ["img2img", "text2image"]
+    block_trigger_inputs = ["image_latents", None]
+
+    @property
+    def description(self):
+        return (
+            "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n"
+            " This is an auto pipeline block that works for text2image/img2img tasks.\n"
+            " - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n"
+            " - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided.\n"
+        )
+
+
 class FluxCoreDenoiseStep(SequentialPipelineBlocks):
-    block_classes = [FluxInputStep, FluxAutoBeforeDenoiseStep, FluxAutoDenoiseStep]
+    model_name = "flux"
+    block_classes = [FluxImageAutoInputStep, FluxAutoBeforeDenoiseStep, FluxAutoDenoiseStep]
     block_names = ["input", "before_denoise", "denoise"]
 
     @property
     def description(self):
         return (
             "Core step that performs the denoising process. \n"
-            + " - `FluxInputStep` (input) standardizes the inputs for the denoising step.\n"
+            + " - `FluxImageAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
             + " - `FluxAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
             + " - `FluxAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
-            + "This step support text-to-image and image-to-image tasks for Flux:\n"
+            + "This step supports text-to-image and image-to-image tasks for Flux:\n"
             + " - for image-to-image generation, you need to provide `image_latents`\n"
-            + " - for text-to-image generation, all you need to provide is prompt embeddings"
+            + " - for text-to-image generation, all you need to provide is prompt embeddings."
         )
 
 
-# text2image
-class FluxAutoBlocks(SequentialPipelineBlocks):
-    block_classes = [
-        FluxTextEncoderStep,
-        FluxAutoVaeEncoderStep,
-        FluxCoreDenoiseStep,
-        FluxAutoDecodeStep,
+# Auto blocks (text2image and img2img)
+AUTO_BLOCKS = InsertableDict(
+    [
+        ("text_encoder", FluxTextEncoderStep()),
+        ("image_encoder", FluxAutoVaeEncoderStep()),
+        ("denoise", FluxCoreDenoiseStep()),
+        ("decode", FluxDecodeStep())
     ]
-    block_names = ["text_encoder", "image_encoder", "denoise", "decode"]
+)
+class FluxAutoBlocks(SequentialPipelineBlocks):
+    model_name = "flux"
+
+    block_classes = AUTO_BLOCKS.values()
+    block_names = AUTO_BLOCKS.keys()
 
     @property
     def description(self):
@@ -162,35 +230,28 @@ def description(self):
 
 TEXT2IMAGE_BLOCKS = InsertableDict(
     [
-        ("text_encoder", FluxTextEncoderStep),
-        ("input", FluxInputStep),
-        ("prepare_latents", FluxPrepareLatentsStep),
-        ("set_timesteps", FluxSetTimestepsStep),
-        ("denoise", FluxDenoiseStep),
-        ("decode", FluxDecodeStep),
+        ("text_encoder", FluxTextEncoderStep()),
+        ("input", FluxTextInputStep()),
+        ("prepare_latents", FluxPrepareLatentsStep()),
+        ("set_timesteps", FluxSetTimestepsStep()),
+        ("prepare_rope_inputs", FluxRoPEInputsStep()),
+        ("denoise", FluxDenoiseStep()),
+        ("decode", FluxDecodeStep()),
     ]
 )
 
 IMAGE2IMAGE_BLOCKS = InsertableDict(
     [
-        ("text_encoder", FluxTextEncoderStep),
-        ("image_encoder", FluxVaeEncoderStep),
-        ("input", FluxInputStep),
-        ("set_timesteps", FluxImg2ImgSetTimestepsStep),
-        ("prepare_latents", FluxImg2ImgPrepareLatentsStep),
-        ("denoise", FluxDenoiseStep),
-        ("decode", FluxDecodeStep),
+        ("text_encoder", FluxTextEncoderStep()),
+        ("vae_encoder", FluxVaeEncoderDynamicStep()),
+        ("input", FluxImg2ImgInputStep()),
+        ("prepare_latents", FluxPrepareLatentsStep()),
+        ("set_timesteps", FluxImg2ImgSetTimestepsStep()),
+        ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()),
+        ("prepare_rope_inputs", FluxRoPEInputsStep()),
+        ("denoise", FluxDenoiseStep()),
+        ("decode", FluxDecodeStep()),
     ]
 )
 
-AUTO_BLOCKS = InsertableDict(
-    [
-        ("text_encoder", FluxTextEncoderStep),
-        ("image_encoder", FluxAutoVaeEncoderStep),
-        ("denoise", FluxCoreDenoiseStep),
-        ("decode", FluxAutoDecodeStep),
-    ]
-)
-
-
 ALL_BLOCKS = {"text2image": TEXT2IMAGE_BLOCKS, "img2img": IMAGE2IMAGE_BLOCKS, "auto": AUTO_BLOCKS}