
Commit 4483400

up up
1 parent 0a9f7f9 commit 4483400

7 files changed: +170 additions, -212 deletions


src/diffusers/image_processor.py

Lines changed: 11 additions & 4 deletions
@@ -887,15 +887,21 @@ def __init__(
     def preprocess(
         self,
         image: PIL.Image.Image,
-        mask: PIL.Image.Image,
-        height:int,
-        width:int,
+        mask: PIL.Image.Image = None,
+        height:int = None,
+        width:int = None,
         padding_mask_crop:Optional[int] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Preprocess the image and mask.
         """
-
+        if mask is None and padding_mask_crop is not None:
+            raise ValueError("mask must be provided if padding_mask_crop is provided")
+
+        # if mask is None, same behavior as regular image processor
+        if mask is None:
+            return self._image_processor.preprocess(image, height=height, width=width)
+
         if padding_mask_crop is not None:
             crops_coords = self._image_processor.get_crop_region(
                 mask, width, height, pad=padding_mask_crop

@@ -913,6 +919,7 @@ def preprocess(
                 resize_mode=resize_mode,
             )
 
+
         processed_mask = self._mask_processor.preprocess(
             mask,
             height=height,
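
With mask now optional, the same processor covers both inpainting and plain image preprocessing. A minimal sketch of the two call paths, assuming this preprocess lives on diffusers' InpaintProcessor and that the constructor accepts vae_scale_factor (both are assumptions; adjust to the actual class):

import PIL.Image
from diffusers.image_processor import InpaintProcessor  # assumed import path

processor = InpaintProcessor(vae_scale_factor=16)  # assumed constructor args
image = PIL.Image.new("RGB", (1024, 1024))
mask = PIL.Image.new("L", (1024, 1024))

# With a mask: preprocesses both and returns (image_tensor, mask_tensor) as before.
image_t, mask_t = processor.preprocess(image, mask, height=1024, width=1024)

# Without a mask: falls through to the regular image processor and returns only the image tensor.
image_t = processor.preprocess(image, height=1024, width=1024)

# padding_mask_crop without a mask now raises ValueError:
# processor.preprocess(image, padding_mask_crop=32)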

src/diffusers/modular_pipelines/qwenimage/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -26,6 +26,7 @@
         "ALL_BLOCKS",
         "CONTROLNET_BLOCKS",
         "TEXT2IMAGE_BLOCKS",
+        "INPAINT_BLOCKS",
     ]
     _import_structure["modular_pipeline"] = ["QwenImageModularPipeline"]
 

@@ -43,6 +44,7 @@
         ALL_BLOCKS,
         CONTROLNET_BLOCKS,
         TEXT2IMAGE_BLOCKS,
+        INPAINT_BLOCKS,
     )
     from .modular_pipeline import QwenImageModularPipeline
 else:
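
The package now re-exports INPAINT_BLOCKS alongside the existing presets. A small usage sketch, assuming the preset exposes a dict-like interface as the other *_BLOCKS presets do:

from diffusers.modular_pipelines.qwenimage import INPAINT_BLOCKS

# Inspect the inpaint preset the same way as TEXT2IMAGE_BLOCKS or CONTROLNET_BLOCKS.
for name, block_cls in INPAINT_BLOCKS.items():  # assumed dict-like interface
    print(name, block_cls)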

src/diffusers/modular_pipelines/qwenimage/before_denoise.py

Lines changed: 5 additions & 13 deletions
@@ -129,7 +129,7 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
 
     @property
     def description(self) -> str:
-        return "Prepare latents step that prepares the latents for the text-to-image generation process"
+        return "Prepare initial random noise for the generation process"
 
     @property
     def inputs(self) -> List[InputParam]:
@@ -358,7 +358,7 @@ class QwenImagePackLatentsDynamicStep(ModularPipelineBlocks):
 
     @property
     def description(self) -> str:
-        return "Step that pachify the latents inputs. Should be used with outputs from vae encoder step. If height and width are not provided, It will be updated based on the height and width of the latents."
+        return "Step that patchifies the latents inputs. Should be used with outputs from the VAE encoder step."
 
     @property
     def inputs(self) -> List[InputParam]:
@@ -367,8 +367,6 @@ def inputs(self) -> List[InputParam]:
             additional_inputs.append(InputParam(name=input_name))
 
         return [
-            InputParam(name="height"),
-            InputParam(name="width"),
             InputParam(name="num_images_per_prompt", default=1),
             InputParam(
                 name="batch_size",

@@ -378,11 +376,10 @@
             ),
         ] + additional_inputs
 
-    def __init__(self, input_names: List[str] = ["image_latents"], update_height_width: bool = True):
+    def __init__(self, input_names: List[str] = ["image_latents"]):
         if not isinstance(input_names, list):
             input_names = [input_names]
         self._latents_input_names = input_names
-        self._update_height_width = update_height_width
         super().__init__()
 
     @staticmethod

@@ -425,11 +422,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -
                 width=width_latents,
             )
 
-            if self._update_height_width and block_state.height is None:
-                block_state.height = height_latents * components.vae_scale_factor
-            if self._update_height_width and block_state.width is None:
-                block_state.width = width_latents * components.vae_scale_factor
-
             setattr(block_state, input_name, latents_input)
 
         self.set_block_state(state, block_state)

@@ -580,7 +572,7 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks):
 
     @property
     def description(self) -> str:
-        return "Step that prepares the additional inputs for the text-to-image generation process"
+        return "Step that prepares the RoPE inputs for the denoising process"
 
     @property
     def inputs(self) -> List[InputParam]:

@@ -641,7 +633,7 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
 
     @property
     def description(self) -> str:
-        return "Step that prepares the additional inputs for the text-to-image generation process"
+        return "Step that prepares the RoPE inputs for the text-to-image generation process. This is used in QwenImage Edit."
 
     @property
     def inputs(self) -> List[InputParam]:
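
After this change, QwenImagePackLatentsDynamicStep no longer accepts update_height_width and no longer derives height/width from the latents; it only patchifies the named inputs. A construction sketch under those assumptions ("mask_latents" is an illustrative second input name, not from this commit):

from diffusers.modular_pipelines.qwenimage.before_denoise import (  # assumed import path
    QwenImagePackLatentsDynamicStep,
)

pack_step = QwenImagePackLatentsDynamicStep(input_names=["image_latents", "mask_latents"])
# height/width are gone from this block's inputs; resolution is now resolved
# upstream (see the VAE encoder step changes below in this commit).
print([p.name for p in pack_step.inputs])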

src/diffusers/modular_pipelines/qwenimage/denoise.py

Lines changed: 0 additions & 1 deletion
@@ -610,5 +610,4 @@ def description(self) -> str:
             " - `QwenImageEditLoopBeforeDenoiser`\n"
             " - `QwenImageEditLoopDenoiser`\n"
             " - `QwenImageLoopAfterDenoiser`\n"
-            "This block supports text2img and img2img tasks."
         )

src/diffusers/modular_pipelines/qwenimage/encoders.py

Lines changed: 32 additions & 5 deletions
@@ -414,10 +414,14 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
 class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
     model_name = "qwenimage"
 
-    def __init__(self, input_name: str = "image", output_name: str = "image_latents", include_image_processor: bool = True):
+    def __init__(self, input_name: str = "image", output_name: str = "image_latents", include_image_processor: bool = True, **image_processor_kwargs):
+        if not include_image_processor and len(image_processor_kwargs) > 0:
+            logger.warning(f"these kwargs will be ignored: {image_processor_kwargs} since image_processor is not used in this block")
+
         self._image_input_name = input_name
         self._image_latents_output_name = output_name
         self._include_image_processor = include_image_processor
+        self._image_processor_kwargs = image_processor_kwargs
         super().__init__()
 
     @property
@@ -430,22 +434,28 @@ def expected_components(self) -> List[ComponentSpec]:
             ComponentSpec("vae", AutoencoderKLQwenImage),
         ]
         if self._include_image_processor:
+            image_processor_config = {"vae_scale_factor": 16}
+            image_processor_config.update(self._image_processor_kwargs)
             components.append(
                 ComponentSpec(
-                    "image_processor",
+                    f"{self._image_input_name}_processor",
                     VaeImageProcessor,
-                    config=FrozenDict({"vae_scale_factor": 16}),
+                    config=FrozenDict(image_processor_config),
                     default_creation_method="from_config",
                 )
             )
         return components
 
     @property
     def inputs(self) -> List[InputParam]:
-        return [
+        inputs = [
             InputParam(self._image_input_name, required=True),
             InputParam("generator"),
         ]
+        if self._include_image_processor:
+            inputs.append(InputParam("height"))
+            inputs.append(InputParam("width"))
+        return inputs
 
     @property
     def intermediate_outputs(self) -> List[OutputParam]:
@@ -457,6 +467,14 @@ def intermediate_outputs(self) -> List[OutputParam]:
             )
         ]
 
+    @staticmethod
+    def check_inputs(height, width, vae_scale_factor):
+        if height is not None and height % (vae_scale_factor * 2) != 0:
+            raise ValueError(f"Height must be divisible by {vae_scale_factor * 2} but is {height}")
+
+        if width is not None and width % (vae_scale_factor * 2) != 0:
+            raise ValueError(f"Width must be divisible by {vae_scale_factor * 2} but is {width}")
+
     @torch.no_grad()
     def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
         block_state = self.get_block_state(state)
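
The new check_inputs validates resolution against twice the VAE scale factor: with the component default vae_scale_factor=16, height and width must be multiples of 32. A worked illustration:

vae_scale_factor = 16
multiple = vae_scale_factor * 2   # 32
print(1024 % multiple == 0)       # True: 1024 = 32 * 32, passes
print(1000 % multiple == 0)       # False: check_inputs would raise ValueError
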
@@ -467,7 +485,16 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -
         image = getattr(block_state, self._image_input_name)
 
         if self._include_image_processor:
-            image = components.image_processor.preprocess(image)
+            image_processor = getattr(components, f"{self._image_input_name}_processor")
+            self.check_inputs(block_state.height, block_state.width, components.vae_scale_factor)
+
+            if not image_processor.config.do_resize and (block_state.height is not None or block_state.width is not None):
+                logger.warning(f"height and width are provided but image_processor.config.do_resize is False, these will be ignored")
+
+            height = block_state.height or components.default_height
+            width = block_state.width or components.default_width
+            image = image_processor.preprocess(image, height=height, width=width)
+
         image = image.unsqueeze(2)
         image = image.to(device=device, dtype=dtype)
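
Together, these changes let one pipeline register several independently configured VAE-encode blocks: the processor component is named after the image input, and **image_processor_kwargs flow into its VaeImageProcessor config. A hedged construction sketch (the control_image names are illustrative, not from this commit):

from diffusers.modular_pipelines.qwenimage.encoders import (  # assumed import path
    QwenImageVaeEncoderDynamicStep,
)

# Registers a "control_image_processor" component instead of the old fixed
# "image_processor" name, with do_resize forwarded into the processor config.
control_step = QwenImageVaeEncoderDynamicStep(
    input_name="control_image",
    output_name="control_image_latents",
    do_resize=False,
)

# With include_image_processor=False, stray processor kwargs are ignored with a warning.
bare_step = QwenImageVaeEncoderDynamicStep(include_image_processor=False, do_resize=False)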
