
Commit 0496a69

up

1 parent 0484e77 commit 0496a69

File tree: 5 files changed, 33 additions and 48 deletions

src/diffusers/modular_pipelines/flux/before_denoise.py

Lines changed: 20 additions & 21 deletions
@@ -105,26 +105,26 @@ def calculate_shift(

 # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._pack_latents
 def _pack_latents(latents, batch_size, num_channels_latents, height, width):
-    latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
-    latents = latents.permute(0, 2, 4, 1, 3, 5)
-    latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
+    latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
+    latents = latents.permute(0, 2, 4, 1, 3, 5)
+    latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)

-    return latents
+    return latents


 # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
 def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
-    latent_image_ids = torch.zeros(height, width, 3)
-    latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
-    latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
+    latent_image_ids = torch.zeros(height, width, 3)
+    latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
+    latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]

-    latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+    latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape

-    latent_image_ids = latent_image_ids.reshape(
-        latent_image_id_height * latent_image_id_width, latent_image_id_channels
-    )
+    latent_image_ids = latent_image_ids.reshape(
+        latent_image_id_height * latent_image_id_width, latent_image_id_channels
+    )

-    return latent_image_ids.to(device=device, dtype=dtype)
+    return latent_image_ids.to(device=device, dtype=dtype)


 class FluxInputStep(PipelineBlock):
@@ -180,13 +180,11 @@ def intermediate_outputs(self) -> List[str]:
             OutputParam(
                 "prompt_embeds",
                 type_hint=torch.Tensor,
-                # kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields
                 description="text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "pooled_prompt_embeds",
                 type_hint=torch.Tensor,
-                # kwargs_type="guider_input_fields", # already in intermedites state but declare here again for guider_input_fields
                 description="pooled text embeddings used to guide the image generation",
             ),
             # TODO: support negative embeddings?
@@ -235,10 +233,10 @@ def description(self) -> str:
     def inputs(self) -> List[InputParam]:
         return [
             InputParam("num_inference_steps", default=50),
-            InputParam("timesteps"),
+            InputParam("timesteps"),
             InputParam("sigmas"),
             InputParam("guidance_scale", default=3.5),
-            InputParam("latents", type_hint=torch.Tensor)
+            InputParam("latents", type_hint=torch.Tensor),
         ]

     @property
@@ -261,7 +259,7 @@ def intermediate_outputs(self) -> List[OutputParam]:
                 type_hint=int,
                 description="The number of denoising steps to perform at inference time",
             ),
-            OutputParam("guidance", type_hint=torch.Tensor, description="Optional guidance to be used.")
+            OutputParam("guidance", type_hint=torch.Tensor, description="Optional guidance to be used."),
         ]

     @torch.no_grad()
@@ -340,10 +338,11 @@ def intermediate_outputs(self) -> List[OutputParam]:
                 "latents", type_hint=torch.Tensor, description="The initial latents to use for the denoising process"
             ),
             OutputParam(
-                "latent_image_ids", type_hint=torch.Tensor, description="IDs computed from the image sequence needed for RoPE"
-            )
+                "latent_image_ids",
+                type_hint=torch.Tensor,
+                description="IDs computed from the image sequence needed for RoPE",
+            ),
         ]
-

     @staticmethod
     def check_inputs(components, block_state):
@@ -417,7 +416,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip
             block_state.generator,
             block_state.latents,
         )
-
+
         self.set_block_state(state, block_state)

         return components, state
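Most of the changes above are formatting-only, but the two helpers touched here do the geometric heavy lifting for Flux. A minimal sketch of what they compute, assuming torch is installed and using hypothetical sizes (16 latent channels, a 64x64 latent grid):

import torch

# Hypothetical sizes: a 16-channel, 64x64 latent grid.
batch_size, num_channels_latents, height, width = 1, 16, 64, 64
latents = torch.randn(batch_size, num_channels_latents, height, width)

# _pack_latents folds each 2x2 spatial patch into the channel axis:
# (B, C, H, W) -> (B, (H//2) * (W//2), C * 4).
packed = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
packed = packed.permute(0, 2, 4, 1, 3, 5)
packed = packed.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
print(packed.shape)  # torch.Size([1, 1024, 64])

# _prepare_latent_image_ids builds one (zero, row, col) position id per packed
# patch for RoPE; callers pass the halved grid size, hence height // 2 here.
ids = torch.zeros(height // 2, width // 2, 3)
ids[..., 1] += torch.arange(height // 2)[:, None]
ids[..., 2] += torch.arange(width // 2)[None, :]
ids = ids.reshape((height // 2) * (width // 2), 3)
print(ids.shape)  # torch.Size([1024, 3]), one id row per packed patch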

src/diffusers/modular_pipelines/flux/decoders.py

Lines changed: 9 additions & 9 deletions
@@ -31,19 +31,19 @@

 # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._unpack_latents
 def _unpack_latents(latents, height, width, vae_scale_factor):
-    batch_size, num_patches, channels = latents.shape
+    batch_size, num_patches, channels = latents.shape

-    # VAE applies 8x compression on images but we must also account for packing which requires
-    # latent height and width to be divisible by 2.
-    height = 2 * (int(height) // (vae_scale_factor * 2))
-    width = 2 * (int(width) // (vae_scale_factor * 2))
+    # VAE applies 8x compression on images but we must also account for packing which requires
+    # latent height and width to be divisible by 2.
+    height = 2 * (int(height) // (vae_scale_factor * 2))
+    width = 2 * (int(width) // (vae_scale_factor * 2))

-    latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
-    latents = latents.permute(0, 3, 1, 4, 2, 5)
+    latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
+    latents = latents.permute(0, 3, 1, 4, 2, 5)

-    latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
+    latents = latents.reshape(batch_size, channels // (2 * 2), height, width)

-    return latents
+    return latents


 class FluxDecodeStep(PipelineBlock):
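These hunks are likewise whitespace fixes, but _unpack_latents is the exact inverse of _pack_latents above. A sketch of the shape round trip, assuming torch, a Flux-style vae_scale_factor of 8, and a hypothetical 512x512 target image:

import torch

batch_size = 1
vae_scale_factor = 8            # assumed: Flux's VAE compresses 8x per side
height_px, width_px = 512, 512  # hypothetical target image size

# Packed latents as produced by _pack_latents: (B, num_patches, channels).
packed = torch.randn(batch_size, 1024, 64)
channels = packed.shape[-1]

# Recover a latent grid size that is divisible by 2, as the packing requires.
height = 2 * (int(height_px) // (vae_scale_factor * 2))  # 64
width = 2 * (int(width_px) // (vae_scale_factor * 2))    # 64

# Undo the 2x2 patch folding: (B, N, C*4) -> (B, C, H, W).
unpacked = packed.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
unpacked = unpacked.permute(0, 3, 1, 4, 2, 5)
unpacked = unpacked.reshape(batch_size, channels // (2 * 2), height, width)
print(unpacked.shape)  # torch.Size([1, 16, 64, 64]), ready for the VAE decoder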

src/diffusers/modular_pipelines/flux/denoise.py

Lines changed: 3 additions & 12 deletions
@@ -19,8 +19,6 @@
 from ...models import FluxTransformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import logging
-from ...configuration_utils import FrozenDict
-from ...guiders import ClassifierFreeGuidance
 from ..modular_pipeline import (
     BlockState,
     LoopSequentialPipelineBlocks,
@@ -39,9 +37,7 @@ class FluxLoopDenoiser(PipelineBlock):

     @property
     def expected_components(self) -> List[ComponentSpec]:
-        return [
-            ComponentSpec("transformer", FluxTransformer2DModel)
-        ]
+        return [ComponentSpec("transformer", FluxTransformer2DModel)]

     @property
     def description(self) -> str:
@@ -122,9 +118,7 @@ class FluxLoopAfterDenoiser(PipelineBlock):

     @property
     def expected_components(self) -> List[ComponentSpec]:
-        return [
-            ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)
-        ]
+        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]

     @property
     def description(self) -> str:
@@ -221,10 +215,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip


 class FluxDenoiseStep(FluxDenoiseLoopWrapper):
-    block_classes = [
-        FluxLoopDenoiser,
-        FluxLoopAfterDenoiser
-    ]
+    block_classes = [FluxLoopDenoiser, FluxLoopAfterDenoiser]
     block_names = ["denoiser", "after_denoiser"]

     @property
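The cleanup above collapses the block lists onto single lines without changing behavior: FluxDenoiseStep still runs its "denoiser" and "after_denoiser" sub-blocks in order on every loop iteration. A toy illustration of that loop-wrapper composition pattern (plain Python, not the diffusers API; all names below are hypothetical):

# Toy sketch of the loop-composition pattern: a wrapper that instantiates its
# declared sub-blocks and runs them in order on every denoising iteration.
# Illustration only -- the real FluxDenoiseLoopWrapper carries pipeline state.
class ToyLoopWrapper:
    block_classes = []
    block_names = []

    def __init__(self):
        self.blocks = {name: cls() for name, cls in zip(self.block_names, self.block_classes)}

    def run(self, state, num_steps):
        for step in range(num_steps):
            for name in self.block_names:
                state = self.blocks[name](state, step)
        return state


class ToyDenoiser:
    def __call__(self, state, step):
        state["noise_pred"] = f"prediction at step {step}"
        return state


class ToyAfterDenoiser:
    def __call__(self, state, step):
        state["latents"] = f"scheduler update at step {step}"
        return state


class ToyDenoiseStep(ToyLoopWrapper):
    block_classes = [ToyDenoiser, ToyAfterDenoiser]
    block_names = ["denoiser", "after_denoiser"]


print(ToyDenoiseStep().run(state={}, num_steps=2))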

src/diffusers/modular_pipelines/flux/encoders.py

Lines changed: 0 additions & 3 deletions
@@ -84,19 +84,16 @@ def intermediate_outputs(self) -> List[OutputParam]:
             OutputParam(
                 "prompt_embeds",
                 type_hint=torch.Tensor,
-                # kwargs_type="guider_input_fields",
                 description="text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "pooled_prompt_embeds",
                 type_hint=torch.Tensor,
-                # kwargs_type="guider_input_fields",
                 description="pooled text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "text_ids",
                 type_hint=torch.Tensor,
-                # kwargs_type="guider_input_fields",
                 description="ids from the text sequence for RoPE",
             ),
         ]

src/diffusers/modular_pipelines/flux/modular_blocks.py

Lines changed: 1 addition & 3 deletions
@@ -46,9 +46,7 @@ def description(self):

 # before_denoise: all task (text2vid,)
 class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks):
-    block_classes = [
-        FluxBeforeDenoiseStep
-    ]
+    block_classes = [FluxBeforeDenoiseStep]
     block_names = ["text2image"]
     block_trigger_inputs = [None]
