
Commit f72763c

add support for qwen edit
1 parent 49e683f commit f72763c

File tree

5 files changed: +418 -104 lines changed


src/diffusers/modular_pipelines/qwenimage/before_denoise.py

Lines changed: 152 additions & 3 deletions
@@ -18,6 +18,11 @@
 import torch
 import inspect
 
+from ...image_processor import VaeImageProcessor
+from ...configuration_utils import FrozenDict
+
+from ...pipelines.qwenimage.pipeline_qwenimage_edit import calculate_dimensions
+
 from ..modular_pipeline import ModularPipelineBlocks, PipelineState
 from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
 from .modular_pipeline import QwenImageModularPipeline
@@ -110,6 +115,41 @@ def pack_latents(latents, batch_size, num_channels_latents, height, width):
     return latents
 
 
+class QwenImageImageResizeStep(ModularPipelineBlocks):
+    model_name = "qwenimage"
+
+    @property
+    def description(self) -> str:
+        return "Image Resize step that resizes the image to the target area while maintaining the aspect ratio"
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec("image_processor", VaeImageProcessor, config=FrozenDict({"vae_scale_factor": 16}), default_creation_method="from_config"),
+        ]
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam(name="image", required=True, type_hint=torch.Tensor, description="The image to resize"),
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
+        block_state = self.get_block_state(state)
+
+
+        if not isinstance(block_state.image, list):
+            block_state.image = [block_state.image]
+
+        image_width, image_height = block_state.image[0].size
+        calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, image_width / image_height)
+
+        block_state.image = components.image_processor.resize(block_state.image, height=calculated_height, width=calculated_width)
+        self.set_block_state(state, block_state)
+        return components, state
+
+
 class QwenImageInputStep(ModularPipelineBlocks):
 
     model_name = "qwenimage"
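For context, `calculate_dimensions` (imported above from `pipeline_qwenimage_edit`) picks a width/height pair whose product stays close to the target area while preserving the input aspect ratio, so a 3:2 photo is resized toward a ~1024x1024 pixel budget rather than squashed. A minimal sketch of that idea, assuming snapping to multiples of 32; `sketch_calculate_dimensions` is a hypothetical name and the exact rounding in diffusers may differ:

import math

def sketch_calculate_dimensions(target_area: int, ratio: float, multiple: int = 32):
    # solve w * h = target_area subject to w / h = ratio, then snap to `multiple`
    width = math.sqrt(target_area * ratio)
    height = width / ratio
    return round(width / multiple) * multiple, round(height / multiple) * multiple

print(sketch_calculate_dimensions(1024 * 1024, 3 / 2))  # (1248, 832)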
@@ -123,6 +163,7 @@ def description(self) -> str:
             "All input tensors are expected to have either batch_size=1 or match the batch_size\n"
             "of prompt_embeds. The tensors will be duplicated across the batch dimension to\n"
             "have a final batch_size of batch_size * num_images_per_prompt."
+            " 3. If `image_latents` is provided and `height` and `width` are not provided, it will update the `height` and `width` parameters."
         )
 
     @property
@@ -133,7 +174,9 @@ def inputs(self) -> List[InputParam]:
             InputParam(name="prompt_embeds_mask", required=True, kwargs_type="guider_input_fields"),
             InputParam(name="negative_prompt_embeds", kwargs_type="guider_input_fields"),
             InputParam(name="negative_prompt_embeds_mask", kwargs_type="guider_input_fields"),
-
+            InputParam(name="image_latents"),
+            InputParam(name="height"),
+            InputParam(name="width"),
         ]
 
     @property
@@ -152,7 +195,7 @@ def intermediate_outputs(self) -> List[str]:
         ]
 
     @staticmethod
-    def check_inputs(prompt_embeds, prompt_embeds_mask, negative_prompt_embeds, negative_prompt_embeds_mask):
+    def check_inputs(prompt_embeds, prompt_embeds_mask, negative_prompt_embeds, negative_prompt_embeds_mask, image_latents):
 
         if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None:
             raise ValueError("`negative_prompt_embeds_mask` is required when `negative_prompt_embeds` is not None")
@@ -168,6 +211,9 @@ def check_inputs(prompt_embeds, prompt_embeds_mask, negative_prompt_embeds, nega
 
         elif negative_prompt_embeds_mask is not None and negative_prompt_embeds_mask.shape[0] != prompt_embeds.shape[0]:
             raise ValueError("`negative_prompt_embeds_mask` must have the same batch size as `prompt_embeds`")
+
+        if image_latents is not None and image_latents.shape[0] != 1 and image_latents.shape[0] != prompt_embeds.shape[0]:
+            raise ValueError(f"`image_latents` must have batch size 1 or {prompt_embeds.shape[0]}, but got {image_latents.shape[0]}")
 
 
 
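The rule enforced by the new `image_latents` check: reference latents may either be shared across the batch (batch size 1) or supplied per prompt. A quick illustration under assumed shapes (embedding dimensions are illustrative):

import torch

prompt_embeds = torch.randn(4, 77, 3584)  # batch size 4
for b in (1, 4, 2):
    image_latents = torch.randn(b, 16, 1, 64, 64)
    ok = image_latents.shape[0] in (1, prompt_embeds.shape[0])
    print(b, "accepted" if ok else "raises ValueError")  # 1 and 4 pass, 2 raises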
@@ -180,6 +226,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -
             prompt_embeds_mask=block_state.prompt_embeds_mask,
             negative_prompt_embeds=block_state.negative_prompt_embeds,
             negative_prompt_embeds_mask=block_state.negative_prompt_embeds_mask,
+            image_latents=block_state.image_latents,
         )
 
         block_state.batch_size = block_state.prompt_embeds.shape[0]
@@ -204,7 +251,20 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -
             block_state.negative_prompt_embeds_mask = block_state.negative_prompt_embeds_mask.repeat(1, block_state.num_images_per_prompt, 1)
             block_state.negative_prompt_embeds_mask = block_state.negative_prompt_embeds_mask.view(
                 block_state.batch_size * block_state.num_images_per_prompt, seq_len)
-
+
+        if block_state.image_latents is not None:
+            final_batch_size = block_state.batch_size * block_state.num_images_per_prompt
+            block_state.image_latents = block_state.image_latents.repeat(
+                final_batch_size // block_state.image_latents.shape[0], 1, 1, 1, 1
+            )
+
+            height_image_latent, width_image_latent = block_state.image_latents.shape[3:]
+
+            if block_state.height is None:
+                block_state.height = height_image_latent * components.vae_scale_factor
+            if block_state.width is None:
+                block_state.width = width_image_latent * components.vae_scale_factor
+
         self.set_block_state(state, block_state)
 
         return components, state
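The batch expansion above relies on the earlier check: since `image_latents` arrives with batch size 1 or `batch_size`, integer division yields a whole repeat count. A standalone sketch of the same arithmetic, using the (B, C, T, H, W) latent layout implied by the five `repeat` arguments (shapes illustrative):

import torch

batch_size, num_images_per_prompt = 2, 2
image_latents = torch.randn(1, 16, 1, 64, 64)  # one shared reference latent

final_batch_size = batch_size * num_images_per_prompt
# repeat tiles the whole batch: 1 latent -> 4 copies (a batch of 4 would repeat once, i.e. no-op)
image_latents = image_latents.repeat(final_batch_size // image_latents.shape[0], 1, 1, 1, 1)
assert image_latents.shape[0] == final_batch_size

# when height/width are not given, fall back to the latent grid times the VAE scale factor
vae_scale_factor = 16  # assumed value, consistent with the ComponentSpec above
height = image_latents.shape[3] * vae_scale_factor  # 1024
width = image_latents.shape[4] * vae_scale_factor   # 1024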
@@ -312,6 +372,43 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -
         return components, state
 
 
+class QwenImagePrepareImageLatentsStep(ModularPipelineBlocks):
+
+    model_name = "qwenimage"
+
+    @property
+    def description(self) -> str:
+        return "Prepare image latents step that packs the reference image latents for the denoising process"
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam(name="image_latents", required=True, type_hint=torch.Tensor, description="The latents representing the reference image, can be generated in vae encoder step"),
+        ]
+
+
+    @torch.no_grad()
+    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
+
+        block_state = self.get_block_state(state)
+
+        height_image_latent, width_image_latent = block_state.image_latents.shape[3:]
+
+        block_state.image_latents = pack_latents(
+            latents=block_state.image_latents,
+            batch_size=block_state.image_latents.shape[0],
+            num_channels_latents=components.num_channels_latents,
+            height=height_image_latent,
+            width=width_image_latent,
+        )
+
+
+        self.set_block_state(state, block_state)
+
+        return components, state
+
+
+
 
 class QwenImageSetTimestepsStep(ModularPipelineBlocks):
 
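`pack_latents` (defined earlier in this file) flattens each 2x2 patch of the latent grid into a single token, the sequence layout the transformer consumes. A re-implementation sketch for illustration, matching the packing used by the standard QwenImage pipelines and assuming the (B, C, T=1, H, W) layout used above:

import torch

def sketch_pack_latents(latents, batch_size, num_channels_latents, height, width):
    # (B, C, 1, H, W) -> (B, C, H/2, 2, W/2, 2): carve the grid into 2x2 patches
    latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
    # move patch coordinates ahead of channels, then flatten each patch into one token
    latents = latents.permute(0, 2, 4, 1, 3, 5)
    return latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)

packed = sketch_pack_latents(torch.randn(1, 16, 1, 64, 64), 1, 16, 64, 64)
print(packed.shape)  # torch.Size([1, 1024, 64]): 32*32 tokens, each of dim 16*4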
@@ -410,6 +507,58 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -
         )
 
 
+        self.set_block_state(state, block_state)
+
+        return components, state
+
+
+class QwenImageEditPrepareAdditionalInputsStep(ModularPipelineBlocks):
+
+    model_name = "qwenimage"
+
+    @property
+    def description(self) -> str:
+        return "Step that prepares the additional inputs for the image editing generation process"
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam(name="batch_size", required=True),
+            InputParam(name="image", required=True, type_hint=torch.Tensor, description="The resized image input"),
+            InputParam(name="height", required=True),
+            InputParam(name="width", required=True),
+            InputParam(name="prompt_embeds_mask"),
+            InputParam(name="negative_prompt_embeds_mask"),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(name="img_shapes", type_hint=List[List[Tuple[int, int, int]]], description="The shapes of the image latents, used for RoPE calculation"),
+            OutputParam(name="txt_seq_lens", kwargs_type="guider_input_fields", type_hint=List[int], description="The sequence lengths of the prompt embeds, used for RoPE calculation"),
+            OutputParam(name="negative_txt_seq_lens", kwargs_type="guider_input_fields", type_hint=List[int], description="The sequence lengths of the negative prompt embeds, used for RoPE calculation"),
+        ]
+
+    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
+
+        block_state = self.get_block_state(state)
+
+        image = block_state.image[0] if isinstance(block_state.image, list) else block_state.image
+        image_width, image_height = image.size
+
+        block_state.img_shapes = [
+            [
+                (1, block_state.height // components.vae_scale_factor // 2, block_state.width // components.vae_scale_factor // 2),
+                (1, image_height // components.vae_scale_factor // 2, image_width // components.vae_scale_factor // 2),
+            ]
+        ] * block_state.batch_size
+
+        block_state.txt_seq_lens = block_state.prompt_embeds_mask.sum(dim=1).tolist() if block_state.prompt_embeds_mask is not None else None
+        block_state.negative_txt_seq_lens = (
+            block_state.negative_prompt_embeds_mask.sum(dim=1).tolist() if block_state.negative_prompt_embeds_mask is not None else None
+        )
+
+
         self.set_block_state(state, block_state)
 
         return components, state
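Each batch entry thus carries two (frame, height, width) triples: one for the latents being denoised and one for the reference-image latents, so RoPE positions cover the concatenated token sequence used in the denoise loop below. A worked example under assumed sizes:

vae_scale_factor = 16  # assumed
height, width = 1024, 1024             # target generation size
image_height, image_width = 832, 1248  # resized reference image
batch_size = 2

img_shapes = [
    [
        (1, height // vae_scale_factor // 2, width // vae_scale_factor // 2),
        (1, image_height // vae_scale_factor // 2, image_width // vae_scale_factor // 2),
    ]
] * batch_size
print(img_shapes[0])  # [(1, 32, 32), (1, 26, 39)]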

src/diffusers/modular_pipelines/qwenimage/denoise.py

Lines changed: 125 additions & 2 deletions
@@ -52,6 +52,34 @@ def inputs(self) -> List[InputParam]:
     def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
         # one timestep
         block_state.timestep = t.expand(block_state.latents.shape[0]).to(block_state.latents.dtype)
+        block_state.latent_model_input = block_state.latents
+        return components, block_state
+
+
+class QwenImageEditLoopBeforeDenoiser(ModularPipelineBlocks):
+    model_name = "qwenimage"
+
+    @property
+    def description(self) -> str:
+        return (
+            "Step within the denoising loop that prepares the latent input for the denoiser. "
+            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
+            "object (e.g. `QwenImageDenoiseLoopWrapper`)"
+        )
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam("latents", required=True, type_hint=torch.Tensor, description="The initial latents to use for the denoising process. Can be generated in prepare_latent step."),
+            InputParam("image_latents", required=True, type_hint=torch.Tensor, description="The initial image latents to use for the denoising process. Can be encoded in vae_encoder step and packed in prepare_image_latents step."),
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
+        # one timestep
+
+        block_state.latent_model_input = torch.cat([block_state.latents, block_state.image_latents], dim=1)
+        block_state.timestep = t.expand(block_state.latents.shape[0]).to(block_state.latents.dtype)
         return components, block_state
 
 
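The edit variant differs from the text-to-image loop only in its model input: the packed reference-image tokens are appended to the noise latents along the sequence dimension (dim=1), the transformer attends over both, and the denoiser step later slices the prediction back to the noise-token range. A shape-level sketch with illustrative sizes:

import torch

latents = torch.randn(2, 1024, 64)        # (batch, noise tokens, packed channels)
image_latents = torch.randn(2, 1014, 64)  # (batch, reference tokens, packed channels)

# concatenate along the token axis, not the channel axis
latent_model_input = torch.cat([latents, image_latents], dim=1)
print(latent_model_input.shape)  # torch.Size([2, 2038, 64])

# downstream, only the first latents.size(1) tokens are kept as the prediction
noise_pred = latent_model_input[:, : latents.size(1)]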
@@ -107,7 +135,7 @@ def __call__(self, components: QwenImageModularPipeline, block_state: BlockState
 
             # YiYi TODO: add cache context
             guider_state_batch.noise_pred = components.transformer(
-                hidden_states=block_state.latents,
+                hidden_states=block_state.latent_model_input,
                 timestep=block_state.timestep / 1000,
                 img_shapes=block_state.img_shapes,
                 attention_kwargs=block_state.attention_kwargs,
@@ -128,7 +156,80 @@ def __call__(self, components: QwenImageModularPipeline, block_state: BlockState
         return components, block_state
 
 
-
+class QwenImageEditLoopDenoiser(ModularPipelineBlocks):
+    model_name = "qwenimage"
+
+    @property
+    def description(self) -> str:
+        return (
+            "Step within the denoising loop that denoises the latent input with the denoiser. "
+            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
+            "object (e.g. `QwenImageDenoiseLoopWrapper`)"
+        )
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec(
+                "guider",
+                ClassifierFreeGuidance,
+                config=FrozenDict({"guidance_scale": 4.0}),
+                default_creation_method="from_config",
+            ),
+            ComponentSpec("transformer", QwenImageTransformer2DModel),
+        ]
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam("attention_kwargs"),
+            InputParam("latents", required=True, type_hint=torch.Tensor, description="The latents to use for the denoising process. Can be generated in prepare_latents step."),
+            InputParam("num_inference_steps", required=True, type_hint=int, description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step."),
+            InputParam(kwargs_type="guider_input_fields", description="All conditional model inputs that need to be prepared with guider: e.g. prompt_embeds, negative_prompt_embeds, etc."),
+            InputParam("img_shapes", required=True, type_hint=List[Tuple[int, int]], description="The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step."),
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
+
+        guider_input_fields = {
+            "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"),
+            "encoder_hidden_states_mask": ("prompt_embeds_mask", "negative_prompt_embeds_mask"),
+            "txt_seq_lens": ("txt_seq_lens", "negative_txt_seq_lens"),
+        }
+
+        components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)
+        guider_state = components.guider.prepare_inputs(block_state, guider_input_fields)
+
+        for guider_state_batch in guider_state:
+            components.guider.prepare_models(components.transformer)
+            cond_kwargs = guider_state_batch.as_dict()
+            cond_kwargs = {k: v for k, v in cond_kwargs.items() if k in guider_input_fields}
+
+            # YiYi TODO: add cache context
+            guider_state_batch.noise_pred = components.transformer(
+                hidden_states=block_state.latent_model_input,
+                timestep=block_state.timestep / 1000,
+                img_shapes=block_state.img_shapes,
+                attention_kwargs=block_state.attention_kwargs,
+                return_dict=False,
+                **cond_kwargs,
+            )[0]
+
+            components.guider.cleanup_models(components.transformer)
+
+        guider_output = components.guider(guider_state)
+
+        pred = guider_output.pred[:, : block_state.latents.size(1)]
+        pred_cond = guider_output.pred_cond[:, : block_state.latents.size(1)]
+
+        # apply guidance rescale
+        pred_cond_norm = torch.norm(pred_cond, dim=-1, keepdim=True)
+        pred_norm = torch.norm(pred, dim=-1, keepdim=True)
+        block_state.noise_pred = pred * (pred_cond_norm / pred_norm)
+
+
+        return components, block_state
 
 
 class QwenImageLoopAfterDenoiser(ModularPipelineBlocks):
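The norm-based rescale at the end of the loop keeps the direction of the guided prediction but restores the per-token magnitude of the conditional branch, a common remedy for CFG over-saturation: noise_pred = pred * (||pred_cond|| / ||pred||), with norms taken over the channel dimension. A standalone sketch:

import torch

def rescale_guided_pred(pred: torch.Tensor, pred_cond: torch.Tensor) -> torch.Tensor:
    # match each token's magnitude to that of the conditional branch
    pred_cond_norm = torch.norm(pred_cond, dim=-1, keepdim=True)
    pred_norm = torch.norm(pred, dim=-1, keepdim=True)
    return pred * (pred_cond_norm / pred_norm)

guided = torch.randn(2, 1024, 64) * 3.0  # CFG output with inflated norms
cond = torch.randn(2, 1024, 64)          # conditional-only output
rescaled = rescale_guided_pred(guided, cond)
print(torch.allclose(rescaled.norm(dim=-1), cond.norm(dim=-1), atol=1e-4))  # True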
@@ -237,4 +338,26 @@ def description(self) -> str:
             " - `QwenImageLoopDenoiser`\n"
             " - `QwenImageLoopAfterDenoiser`\n"
             "This block supports text2img tasks."
+        )
+
+
+# composing the denoising loops
+class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper):
+    block_classes = [
+        QwenImageEditLoopBeforeDenoiser,
+        QwenImageEditLoopDenoiser,
+        QwenImageLoopAfterDenoiser,
+    ]
+    block_names = ["before_denoiser", "denoiser", "after_denoiser"]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Denoise step that iteratively denoises the latents. \n"
+            "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method \n"
+            "At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
+            " - `QwenImageEditLoopBeforeDenoiser`\n"
+            " - `QwenImageEditLoopDenoiser`\n"
+            " - `QwenImageLoopAfterDenoiser`\n"
+            "This block supports text2img and img2img tasks."
         )
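For orientation, `QwenImageDenoiseLoopWrapper` owns the timestep loop and calls the composed sub-blocks in order at every step; `QwenImageEditDenoiseStep` only swaps in the edit-aware before/denoise blocks. A hedged sketch of the loop shape (illustrative only; the real logic lives in `QwenImageDenoiseLoopWrapper.__call__`):

# illustrative pseudo-loop, not the actual diffusers implementation
for i, t in enumerate(timesteps):
    # runs QwenImageEditLoopBeforeDenoiser, QwenImageEditLoopDenoiser,
    # then QwenImageLoopAfterDenoiser, threading block_state through
    for block in sub_blocks:
        components, block_state = block(components, block_state, i, t)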
