up

sayakpaul · sayakpaul · commit 0484e77ed10c · 2025-07-26T15:33:09.000+05:30
diff --git a/src/diffusers/modular_pipelines/flux/before_denoise.py b/src/diffusers/modular_pipelines/flux/before_denoise.py
@@ -233,7 +233,24 @@ def description(self) -> str:
 
     @property
     def inputs(self) -> List[InputParam]:
-        return [InputParam("num_inference_steps", default=50), InputParam("timesteps"), InputParam("sigmas")]
+        return [
+            InputParam("num_inference_steps", default=50),
+            InputParam("timesteps"),
+            InputParam("sigmas"),
+            InputParam("guidance_scale", default=3.5),  # filled into the `guidance` tensor in __call__
+            InputParam("latents", type_hint=torch.Tensor),
+        ]
+
+    @property
+    def intermediate_inputs(self) -> List[InputParam]:
+        return [
+            InputParam(
+                "latents",
+                required=True,  # must already be present in the pipeline state
+                type_hint=torch.Tensor,
+                description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
+            ),
+        ]
 
     @property
     def intermediate_outputs(self) -> List[OutputParam]:
@@ -244,6 +261,7 @@ def intermediate_outputs(self) -> List[OutputParam]:
                 type_hint=int,
                 description="The number of denoising steps to perform at inference time",
             ),
+            OutputParam("guidance", type_hint=torch.Tensor, description="Optional guidance to be used.")
         ]
 
     @torch.no_grad()
@@ -271,6 +289,12 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip
         block_state.timesteps, block_state.num_inference_steps = retrieve_timesteps(
             scheduler, block_state.num_inference_steps, block_state.device, sigmas=block_state.sigmas, mu=mu
         )
+        if components.transformer.config.guidance_embeds:
+            guidance = torch.full([1], block_state.guidance_scale, device=block_state.device, dtype=torch.float32)
+            guidance = guidance.expand(latents.shape[0])
+        else:
+            guidance = None
+        block_state.guidance = guidance
 
         self.set_block_state(state, block_state)
         return components, state
@@ -314,8 +338,12 @@ def intermediate_outputs(self) -> List[OutputParam]:
         return [
             OutputParam(
                 "latents", type_hint=torch.Tensor, description="The initial latents to use for the denoising process"
+            ),
+            OutputParam(
+                "latent_image_ids", type_hint=torch.Tensor, description="IDs computed from the image sequence needed for RoPE"
             )
         ]
+        
 
     @staticmethod
     def check_inputs(components, block_state):
@@ -378,7 +406,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip
 
         self.check_inputs(components, block_state)
 
-        block_state.latents = self.prepare_latents(
+        block_state.latents, block_state.latent_image_ids = self.prepare_latents(
             components,
             block_state.batch_size * block_state.num_images_per_prompt,
             block_state.num_channels_latents,
@@ -389,7 +417,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip
             block_state.generator,
             block_state.latents,
         )
-
+        
         self.set_block_state(state, block_state)
 
         return components, state
diff --git a/src/diffusers/modular_pipelines/flux/denoise.py b/src/diffusers/modular_pipelines/flux/denoise.py
@@ -19,6 +19,8 @@
 from ...models import FluxTransformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import logging
+from ...configuration_utils import FrozenDict
+from ...guiders import ClassifierFreeGuidance
 from ..modular_pipeline import (
     BlockState,
     LoopSequentialPipelineBlocks,
@@ -37,7 +39,9 @@ class FluxLoopDenoiser(PipelineBlock):
 
     @property
     def expected_components(self) -> List[ComponentSpec]:
-        return [ComponentSpec("transformer", FluxTransformer2DModel)]
+        return [
+            ComponentSpec("transformer", FluxTransformer2DModel)
+        ]
 
     @property
     def description(self) -> str:
@@ -49,9 +53,7 @@ def description(self) -> str:
 
     @property
     def inputs(self) -> List[Tuple[str, Any]]:
-        return [
-            InputParam("attention_kwargs"),
-        ]
+        return [InputParam("joint_attention_kwargs")]
 
     @property
     def intermediate_inputs(self) -> List[str]:
@@ -63,10 +65,34 @@ def intermediate_inputs(self) -> List[str]:
                 description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
             ),
             InputParam(
-                "num_inference_steps",
-                required=True,
-                type_hint=int,
-                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
+                "guidance",
+                required=False,  # set to None when the transformer has no guidance embeds
+                type_hint=torch.Tensor,
+                description="Guidance scale as a tensor",
+            ),
+            InputParam(
+                "prompt_embeds",
+                required=True,
+                type_hint=torch.Tensor,
+                description="Prompt embeddings",
+            ),
+            InputParam(
+                "pooled_prompt_embeds",
+                required=True,
+                type_hint=torch.Tensor,
+                description="Pooled prompt embeddings",
+            ),
+            InputParam(
+                "text_ids",
+                required=True,
+                type_hint=torch.Tensor,
+                description="IDs computed from text sequence needed for RoPE",
+            ),
+            InputParam(
+                "latent_image_ids",
+                required=True,
+                type_hint=torch.Tensor,
+                description="IDs computed from image sequence needed for RoPE",
             ),
             # TODO: guidance
         ]
@@ -78,9 +104,10 @@ def __call__(
         noise_pred = components.transformer(
             hidden_states=block_state.latents,
             timestep=t.flatten() / 1000,
+            guidance=block_state.guidance,
             encoder_hidden_states=block_state.prompt_embeds,
             pooled_projections=block_state.pooled_prompt_embeds,
-            attention_kwargs=block_state.attention_kwargs,
+            joint_attention_kwargs=block_state.joint_attention_kwargs,
             txt_ids=block_state.text_ids,
             img_ids=block_state.latent_image_ids,
             return_dict=False,
@@ -96,7 +123,7 @@ class FluxLoopAfterDenoiser(PipelineBlock):
     @property
     def expected_components(self) -> List[ComponentSpec]:
         return [
-            ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler),
+            ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)
         ]
 
     @property
@@ -113,9 +140,7 @@ def inputs(self) -> List[Tuple[str, Any]]:
 
     @property
     def intermediate_inputs(self) -> List[str]:
-        return [
-            InputParam("generator"),
-        ]
+        return [InputParam("generator")]
 
     @property
     def intermediate_outputs(self) -> List[OutputParam]:
@@ -129,7 +154,6 @@ def __call__(self, components: FluxModularPipeline, block_state: BlockState, i:
             block_state.noise_pred,
             t,
             block_state.latents,
-            **block_state.scheduler_step_kwargs,
             return_dict=False,
         )[0]
 
@@ -199,9 +223,9 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip
 class FluxDenoiseStep(FluxDenoiseLoopWrapper):
     block_classes = [
         FluxLoopDenoiser,
-        FluxLoopAfterDenoiser,
+        FluxLoopAfterDenoiser
     ]
-    block_names = ["before_denoiser", "denoiser", "after_denoiser"]
+    block_names = ["denoiser", "after_denoiser"]
 
     @property
     def description(self) -> str:
diff --git a/src/diffusers/modular_pipelines/flux/encoders.py b/src/diffusers/modular_pipelines/flux/encoders.py
@@ -294,6 +294,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip
             else None
         )
         (block_state.prompt_embeds, block_state.pooled_prompt_embeds, block_state.text_ids) = self.encode_prompt(
+            components,
             prompt=block_state.prompt,
             prompt_2=None,
             prompt_embeds=None,
diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks.py b/src/diffusers/modular_pipelines/flux/modular_blocks.py
@@ -28,26 +28,26 @@
 class FluxBeforeDenoiseStep(SequentialPipelineBlocks):
     block_classes = [
         FluxInputStep,
-        FluxSetTimestepsStep,
         FluxPrepareLatentsStep,
+        FluxSetTimestepsStep,
     ]
-    block_names = ["input", "set_timesteps", "prepare_latents"]
+    block_names = ["input", "prepare_latents", "set_timesteps"]
 
     @property
     def description(self):
         return (
             "Before denoise step that prepare the inputs for the denoise step.\n"
             + "This is a sequential pipeline blocks:\n"
             + " - `FluxInputStep` is used to adjust the batch size of the model inputs\n"
-            + " - `FluxSetTimestepsStep` is used to set the timesteps\n"
             + " - `FluxPrepareLatentsStep` is used to prepare the latents\n"
+            + " - `FluxSetTimestepsStep` is used to set the timesteps\n"
         )
 
 
-# before_denoise: all task (text2vid,)
+# before_denoise: all tasks (text2image,)
 class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks):
     block_classes = [
-        FluxBeforeDenoiseStep,
+        FluxBeforeDenoiseStep
     ]
     block_names = ["text2image"]
     block_trigger_inputs = [None]
@@ -114,8 +114,10 @@ def description(self):
     [
         ("text_encoder", FluxTextEncoderStep),
         ("input", FluxInputStep),
-        ("set_timesteps", FluxSetTimestepsStep),
         ("prepare_latents", FluxPrepareLatentsStep),
+        # Setting it after preparation of latents because we rely on `latents`
+        # to calculate `img_seq_len` for `shift`.
+        ("set_timesteps", FluxSetTimestepsStep),
         ("denoise", FluxDenoiseStep),
         ("decode", FluxDecodeStep),
     ]
diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py
@@ -1665,7 +1665,7 @@ def get_block_state(self, state: PipelineState) -> dict:
             if input_param.name:
                 value = state.get_intermediate(input_param.name)
                 if input_param.required and value is None:
-                    raise ValueError(f"Required intermediate input '{input_param.name}' is missing")
+                    raise ValueError(f"Required intermediate input '{input_param.name}' is missing.")
                 elif value is not None or (value is None and input_param.name not in data):
                     data[input_param.name] = value
             elif input_param.kwargs_type:

Original file line number	Diff line number	Diff line change
`@@ -294,6 +294,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip`
`294`	`294`	`else None`
`295`	`295`	`)`
`296`	`296`	`(block_state.prompt_embeds, block_state.pooled_prompt_embeds, block_state.text_ids) = self.encode_prompt(`
	`297`	`+ components,`
`297`	`298`	`prompt=block_state.prompt,`
`298`	`299`	`prompt_2=None,`
`299`	`300`	`prompt_embeds=None,`