make flux ready for mellon (huggingface#12419)

sayakpaul · asomoza · web-flow · commit 7f3e9b8695e8 · 2025-10-06T13:15:54.000+05:30
* make flux ready for mellon

* up

* Apply suggestions from code review

Co-authored-by: Álvaro Somoza &lt;asomoza@users.noreply.github.com&gt;

---------

Co-authored-by: Álvaro Somoza &lt;asomoza@users.noreply.github.com&gt;
diff --git a/src/diffusers/modular_pipelines/flux/before_denoise.py b/src/diffusers/modular_pipelines/flux/before_denoise.py
@@ -252,11 +252,13 @@ def inputs(self) -> List[InputParam]:
             InputParam(
                 "prompt_embeds",
                 required=True,
+                kwargs_type="denoiser_input_fields",
                 type_hint=torch.Tensor,
                 description="Pre-generated text embeddings. Can be generated from text_encoder step.",
             ),
             InputParam(
                 "pooled_prompt_embeds",
+                kwargs_type="denoiser_input_fields",
                 type_hint=torch.Tensor,
                 description="Pre-generated pooled text embeddings. Can be generated from text_encoder step.",
             ),
@@ -279,11 +281,13 @@ def intermediate_outputs(self) -> List[str]:
             OutputParam(
                 "prompt_embeds",
                 type_hint=torch.Tensor,
+                kwargs_type="denoiser_input_fields",
                 description="text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "pooled_prompt_embeds",
                 type_hint=torch.Tensor,
+                kwargs_type="denoiser_input_fields",
                 description="pooled text embeddings used to guide the image generation",
             ),
             # TODO: support negative embeddings?
diff --git a/src/diffusers/modular_pipelines/flux/encoders.py b/src/diffusers/modular_pipelines/flux/encoders.py
@@ -181,6 +181,7 @@ def inputs(self) -> List[InputParam]:
         return [
             InputParam("prompt"),
             InputParam("prompt_2"),
+            InputParam("max_sequence_length", type_hint=int, default=512, required=False),
             InputParam("joint_attention_kwargs"),
         ]
 
@@ -189,16 +190,19 @@ def intermediate_outputs(self) -> List[OutputParam]:
         return [
             OutputParam(
                 "prompt_embeds",
+                kwargs_type="denoiser_input_fields",
                 type_hint=torch.Tensor,
                 description="text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "pooled_prompt_embeds",
+                kwargs_type="denoiser_input_fields",
                 type_hint=torch.Tensor,
                 description="pooled text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "text_ids",
+                kwargs_type="denoiser_input_fields",
                 type_hint=torch.Tensor,
                 description="ids from the text sequence for RoPE",
             ),
@@ -404,6 +408,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip
             pooled_prompt_embeds=None,
             device=block_state.device,
             num_images_per_prompt=1,  # TODO: hardcoded for now.
+            max_sequence_length=block_state.max_sequence_length,
             lora_scale=block_state.text_encoder_lora_scale,
         )
 
diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks.py b/src/diffusers/modular_pipelines/flux/modular_blocks.py
@@ -84,9 +84,9 @@ def description(self):
 
 # before_denoise: all task (text2img, img2img)
 class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks):
-    block_classes = [FluxBeforeDenoiseStep, FluxImg2ImgBeforeDenoiseStep]
-    block_names = ["text2image", "img2img"]
-    block_trigger_inputs = [None, "image_latents"]
+    block_classes = [FluxImg2ImgBeforeDenoiseStep, FluxBeforeDenoiseStep]
+    block_names = ["img2img", "text2image"]
+    block_trigger_inputs = ["image_latents", None]
 
     @property
     def description(self):
@@ -124,16 +124,32 @@ def description(self):
         return "Decode step that decode the denoised latents into image outputs.\n - `FluxDecodeStep`"
 
 
+class FluxCoreDenoiseStep(SequentialPipelineBlocks):
+    block_classes = [FluxInputStep, FluxAutoBeforeDenoiseStep, FluxAutoDenoiseStep]
+    block_names = ["input", "before_denoise", "denoise"]
+
+    @property
+    def description(self):
+        return (
+            "Core step that performs the denoising process. \n"
+            + " - `FluxInputStep` (input) standardizes the inputs for the denoising step.\n"
+            + " - `FluxAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+            + " - `FluxAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
+            + "This step support text-to-image and image-to-image tasks for Flux:\n"
+            + " - for image-to-image generation, you need to provide `image_latents`\n"
+            + " - for text-to-image generation, all you need to provide is prompt embeddings"
+        )
+
+
 # text2image
 class FluxAutoBlocks(SequentialPipelineBlocks):
     block_classes = [
         FluxTextEncoderStep,
         FluxAutoVaeEncoderStep,
-        FluxAutoBeforeDenoiseStep,
-        FluxAutoDenoiseStep,
+        FluxCoreDenoiseStep,
         FluxAutoDecodeStep,
     ]
-    block_names = ["text_encoder", "image_encoder", "before_denoise", "denoise", "decoder"]
+    block_names = ["text_encoder", "image_encoder", "denoise", "decode"]
 
     @property
     def description(self):
@@ -171,8 +187,7 @@ def description(self):
     [
         ("text_encoder", FluxTextEncoderStep),
         ("image_encoder", FluxAutoVaeEncoderStep),
-        ("before_denoise", FluxAutoBeforeDenoiseStep),
-        ("denoise", FluxAutoDenoiseStep),
+        ("denoise", FluxCoreDenoiseStep),
         ("decode", FluxAutoDecodeStep),
     ]
 )