
Commit a562806

Commit message: style
1 parent 4483400 commit a562806


8 files changed, +175 −137 lines changed

src/diffusers/image_processor.py

Lines changed: 21 additions & 27 deletions
@@ -842,11 +842,12 @@ class InpaintProcessor(ConfigMixin):
     """
     Image processor for inpainting image and mask.
     """
+
     config_name = CONFIG_NAME

     @register_to_config
     def __init__(
-        self,
+        self,
         do_resize: bool = True,
         vae_scale_factor: int = 8,
         vae_latent_channels: int = 4,
@@ -855,42 +856,40 @@ def __init__(
         do_normalize: bool = True,
         do_binarize: bool = False,
         do_convert_grayscale: bool = False,
-        mask_do_normalize: bool = False,
-        mask_do_binarize: bool = True,
+        mask_do_normalize: bool = False,
+        mask_do_binarize: bool = True,
         mask_do_convert_grayscale: bool = True,
-    ):
-
+    ):
         super().__init__()

         self._image_processor = VaeImageProcessor(
             do_resize=do_resize,
-            vae_scale_factor=vae_scale_factor,
+            vae_scale_factor=vae_scale_factor,
             vae_latent_channels=vae_latent_channels,
             resample=resample,
             reducing_gap=reducing_gap,
             do_normalize=do_normalize,
             do_binarize=do_binarize,
             do_convert_grayscale=do_convert_grayscale,
-        )
+        )
         self._mask_processor = VaeImageProcessor(
             do_resize=do_resize,
-            vae_scale_factor=vae_scale_factor,
+            vae_scale_factor=vae_scale_factor,
             vae_latent_channels=vae_latent_channels,
             resample=resample,
             reducing_gap=reducing_gap,
-            do_normalize=mask_do_normalize,
-            do_binarize=mask_do_binarize,
-            do_convert_grayscale=mask_do_convert_grayscale,
-        )
+            do_normalize=mask_do_normalize,
+            do_binarize=mask_do_binarize,
+            do_convert_grayscale=mask_do_convert_grayscale,
+        )

-
     def preprocess(
         self,
         image: PIL.Image.Image,
         mask: PIL.Image.Image = None,
-        height:int = None,
-        width:int = None,
-        padding_mask_crop:Optional[int] = None,
+        height: int = None,
+        width: int = None,
+        padding_mask_crop: Optional[int] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Preprocess the image and mask.
@@ -903,14 +902,12 @@ def preprocess(
             return self._image_processor.preprocess(image, height=height, width=width)

         if padding_mask_crop is not None:
-            crops_coords = self._image_processor.get_crop_region(
-                mask, width, height, pad=padding_mask_crop
-            )
+            crops_coords = self._image_processor.get_crop_region(mask, width, height, pad=padding_mask_crop)
             resize_mode = "fill"
         else:
             crops_coords = None
             resize_mode = "default"
-
+
         processed_image = self._image_processor.preprocess(
             image,
             height=height,
@@ -919,7 +916,6 @@ def preprocess(
             resize_mode=resize_mode,
         )

-
         processed_mask = self._mask_processor.preprocess(
             mask,
             height=height,
@@ -928,7 +924,6 @@ def preprocess(
             crops_coords=crops_coords,
         )

-
         if crops_coords is not None:
             postprocessing_kwargs = {
                 "crops_coords": crops_coords,
@@ -944,7 +939,6 @@ def preprocess(

         return processed_image, processed_mask, postprocessing_kwargs

-
     def postprocess(
         self,
         image: torch.Tensor,
@@ -965,10 +959,10 @@ def postprocess(
             raise ValueError("original_image and original_mask must be provided if crops_coords is provided")

         elif crops_coords is not None:
-            image = [self._image_processor.apply_overlay(
-                original_mask, original_image, i, crops_coords
-            ) for i in image]
-
+            image = [
+                self._image_processor.apply_overlay(original_mask, original_image, i, crops_coords) for i in image
+            ]
+
         return image
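
For orientation, the refactored InpaintProcessor wraps one VaeImageProcessor for the image and one for the mask, and preprocess hands back the kwargs that postprocess needs to paste an inpainted crop back when padding_mask_crop is set. A minimal usage sketch, not part of the commit; the import path, file names, and the final **postprocessing_kwargs call are assumptions inferred from the hunks above:

import PIL.Image
from diffusers.image_processor import InpaintProcessor  # path assumed from the file location

processor = InpaintProcessor(vae_scale_factor=8)

image = PIL.Image.open("photo.png").convert("RGB")  # illustrative file names
mask = PIL.Image.open("mask.png").convert("L")

# Returns tensors plus the kwargs postprocess needs to overlay the inpainted
# region onto the original image when padding_mask_crop is used.
processed_image, processed_mask, postprocessing_kwargs = processor.preprocess(
    image, mask, height=1024, width=1024, padding_mask_crop=32
)

# ... run the denoising pipeline on processed_image / processed_mask ...
# Assumed round-trip: the kwargs carry crops_coords plus the original image and mask.
# images = processor.postprocess(decoded_tensor, **postprocessing_kwargs)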

src/diffusers/modular_pipelines/qwenimage/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -25,8 +25,8 @@
     _import_structure["modular_blocks"] = [
         "ALL_BLOCKS",
         "CONTROLNET_BLOCKS",
-        "TEXT2IMAGE_BLOCKS",
         "INPAINT_BLOCKS",
+        "TEXT2IMAGE_BLOCKS",
     ]
     _import_structure["modular_pipeline"] = ["QwenImageModularPipeline"]

@@ -43,8 +43,8 @@
     from .modular_blocks import (
         ALL_BLOCKS,
         CONTROLNET_BLOCKS,
-        TEXT2IMAGE_BLOCKS,
         INPAINT_BLOCKS,
+        TEXT2IMAGE_BLOCKS,
     )
     from .modular_pipeline import QwenImageModularPipeline
 else:
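
These names feed diffusers' lazy-import machinery, so the change above is purely a re-sort. A rough sketch of the pattern, simplified and not the literal contents of this __init__.py:

# Simplified sketch of the lazy-import pattern used by diffusers __init__ files
# (meant to live inside a package __init__.py, not run as a standalone script).
# _import_structure maps submodule -> exported names; attributes resolve on first access.
import sys
from typing import TYPE_CHECKING

_import_structure = {
    "modular_blocks": ["ALL_BLOCKS", "CONTROLNET_BLOCKS", "INPAINT_BLOCKS", "TEXT2IMAGE_BLOCKS"],
    "modular_pipeline": ["QwenImageModularPipeline"],
}

if TYPE_CHECKING:
    # eager imports so type checkers see the real symbols
    from .modular_blocks import ALL_BLOCKS, CONTROLNET_BLOCKS, INPAINT_BLOCKS, TEXT2IMAGE_BLOCKS
    from .modular_pipeline import QwenImageModularPipeline
else:
    from diffusers.utils import _LazyModule  # helper that defers the actual imports

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)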

src/diffusers/modular_pipelines/qwenimage/before_denoise.py

Lines changed: 69 additions & 40 deletions
@@ -18,10 +18,7 @@
 import numpy as np
 import torch

-from ...configuration_utils import FrozenDict
-from ...image_processor import VaeImageProcessor
 from ...models import QwenImageControlNetModel, QwenImageMultiControlNetModel
-from ...pipelines.qwenimage.pipeline_qwenimage_edit import calculate_dimensions
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils.torch_utils import randn_tensor, unwrap_module
 from ..modular_pipeline import ModularPipelineBlocks, PipelineState
@@ -243,45 +240,62 @@ def expected_components(self) -> List[ComponentSpec]:
         return [
             ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler),
         ]
-
+
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="latents", required=True, type_hint=torch.Tensor, description="The initial random noised, can be generated in prepare latent step."),
-            InputParam(name="image_latents", required=True, type_hint=torch.Tensor, description="The image latents to use for the denoising process. Can be generated in vae encoder + pack latents step."),
-            InputParam(name="timesteps", required=True, type_hint=torch.Tensor, description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",),
-            InputParam(name="batch_size", required=True, type_hint=int, description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in expand textinput step."),
+            InputParam(
+                name="latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The initial random noised, can be generated in prepare latent step.",
+            ),
+            InputParam(
+                name="image_latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The image latents to use for the denoising process. Can be generated in vae encoder + pack latents step.",
+            ),
+            InputParam(
+                name="timesteps",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
+            ),
+            InputParam(
+                name="batch_size",
+                required=True,
+                type_hint=int,
+                description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in expand textinput step.",
+            ),
             InputParam(name="num_images_per_prompt", required=True),
         ]

     @property
     def intermediate_outputs(self) -> List[OutputParam]:
         return [
-            OutputParam(name="initial_noise", type_hint=torch.Tensor, description="The initial random noised used for inpainting denoising."),
+            OutputParam(
+                name="initial_noise",
+                type_hint=torch.Tensor,
+                description="The initial random noised used for inpainting denoising.",
+            ),
         ]
-
-
+
     @staticmethod
     def check_inputs(image_latents, latents, batch_size):
-
         if image_latents.shape[0] != batch_size:
             raise ValueError(
                 f"`image_latents` must have have batch size {batch_size}, but got {image_latents.shape[0]}"
             )

         if image_latents.ndim != 3:
             raise ValueError(f"`image_latents` must have 3 dimensions (patchified), but got {image_latents.ndim}")
-
-
+
         if latents.shape[0] != batch_size:
-            raise ValueError(
-                f"`latents` must have have batch size {batch_size}, but got {latents.shape[0]}"
-            )
-
-
+            raise ValueError(f"`latents` must have have batch size {batch_size}, but got {latents.shape[0]}")
+
     @torch.no_grad()
     def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
-
         block_state = self.get_block_state(state)
         final_batch_size = block_state.batch_size * block_state.num_images_per_prompt

@@ -290,43 +304,52 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
             latents=block_state.latents,
             batch_size=final_batch_size,
         )
-
+
         # prepare latent timestep
         latent_timestep = block_state.timesteps[:1].repeat(final_batch_size)
-
+
         # make copy of initial_noise
         block_state.initial_noise = block_state.latents

         # scale noise
-        block_state.latents = components.scheduler.scale_noise(block_state.image_latents, latent_timestep, block_state.latents)
+        block_state.latents = components.scheduler.scale_noise(
+            block_state.image_latents, latent_timestep, block_state.latents
+        )

         self.set_block_state(state, block_state)
-
-        return components, state
+
+        return components, state


 class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks):
     model_name = "qwenimage"
-
+
     @property
     def description(self) -> str:
         return "Step that create the mask latents for the inpainting process. Should be run with the pachify latents step."
-
+
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="mask_image", required=True, type_hint=torch.Tensor, description="The mask to use for the inpainting process."),
+            InputParam(
+                name="mask_image",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The mask to use for the inpainting process.",
+            ),
             InputParam(name="height", required=True),
             InputParam(name="width", required=True),
             InputParam(name="dtype", required=True),
         ]
-
+
     @property
     def intermediate_outputs(self) -> List[OutputParam]:
         return [
-            OutputParam(name="mask", type_hint=torch.Tensor, description="The mask to use for the inpainting process."),
+            OutputParam(
+                name="mask", type_hint=torch.Tensor, description="The mask to use for the inpainting process."
+            ),
         ]
-
+
     @torch.no_grad()
     def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
         block_state = self.get_block_state(state)
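
The scale_noise call reformatted above is what seeds the inpainting latents: the packed image latents are blended with fresh noise at the first timestep, and the untouched noise is kept as initial_noise for later masked re-noising. A standalone sketch of that blend; the linear form below is an assumption about how the flow-match scheduler mixes sample and noise, not the scheduler's actual code:

import torch

def blend_latents(image_latents: torch.Tensor, noise: torch.Tensor, sigma: float) -> torch.Tensor:
    # sigma near 1.0 -> mostly noise (strong edit); sigma near 0.0 -> mostly the source image
    return sigma * noise + (1.0 - sigma) * image_latents

# packed (batch, seq_len, channels) latents; shapes are illustrative
image_latents = torch.randn(1, 4096, 64)
noise = torch.randn_like(image_latents)

initial_noise = noise                                      # kept so masked regions can be re-noised per step
latents = blend_latents(image_latents, noise, sigma=0.9)   # starting point handed to the denoise loop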
@@ -342,14 +365,14 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
         block_state.mask = torch.nn.functional.interpolate(
             block_state.mask_image,
             size=(height_latents, width_latents),
-        )
+        )

         block_state.mask = block_state.mask.unsqueeze(2)
         block_state.mask = block_state.mask.repeat(1, components.num_channels_latents, 1, 1, 1)
         block_state.mask = block_state.mask.to(device=device, dtype=block_state.dtype)
-
+
         self.set_block_state(state, block_state)
-
+
         return components, state

@@ -381,14 +404,14 @@ def __init__(self, input_names: List[str] = ["image_latents"]):
             input_names = [input_names]
         self._latents_input_names = input_names
         super().__init__()
-
+
     @staticmethod
     def check_input_shape(latents_input, latents_input_name, batch_size):
         if latents_input is not None and latents_input.shape[0] != 1 and latents_input.shape[0] != batch_size:
             raise ValueError(
                 f"`{latents_input_name}` must have have batch size 1 or {batch_size}, but got {latents_input.shape[0]}"
             )
-
+
         if latents_input.ndim != 5 and latents_input.ndim != 4:
             raise ValueError(f"`{latents_input_name}` must have 4 or 5 dimensions, but got {latents_input.ndim}")

@@ -526,11 +549,12 @@ def inputs(self) -> List[InputParam]:
     def intermediate_outputs(self) -> List[OutputParam]:
         return [
             OutputParam(
-                name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process. Can be generated in set_timesteps step."
+                name="timesteps",
+                type_hint=torch.Tensor,
+                description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
             ),
         ]

-
     def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
         block_state = self.get_block_state(state)

@@ -609,9 +633,14 @@ def intermediate_outputs(self) -> List[OutputParam]:
     def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
         block_state = self.get_block_state(state)

-
         block_state.img_shapes = [
-            [(1, block_state.height // components.vae_scale_factor // 2, block_state.width // components.vae_scale_factor // 2)]
+            [
+                (
+                    1,
+                    block_state.height // components.vae_scale_factor // 2,
+                    block_state.width // components.vae_scale_factor // 2,
+                )
+            ]
             * block_state.batch_size
         ]
         block_state.txt_seq_lens = (
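
The img_shapes expression reformatted above records the packed latent grid per prompt: the pixel size divided by the VAE scale factor and then halved again by the 2x2 patchification. A tiny arithmetic sketch with illustrative values:

# Sketch of the img_shapes computation; 1024px with vae_scale_factor=8 and 2x2 patches -> a 64x64 grid.
height, width = 1024, 1024
vae_scale_factor = 8
batch_size = 2

img_shapes = [
    [(1, height // vae_scale_factor // 2, width // vae_scale_factor // 2)]
    * batch_size
]
print(img_shapes)  # [[(1, 64, 64), (1, 64, 64)]]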

src/diffusers/modular_pipelines/qwenimage/decoders.py

Lines changed: 5 additions & 3 deletions
@@ -103,7 +103,7 @@ def intermediate_outputs(self) -> List[str]:
     def check_inputs(output_type):
         if output_type not in ["pil", "np", "pt"]:
             raise ValueError(f"Invalid output_type: {output_type}")
-
+
     def __init__(self, include_image_processor: bool = True):
         self._include_image_processor = include_image_processor
         super().__init__()
@@ -118,7 +118,9 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
         block_state.width = block_state.width or components.default_width

         # YiYi Notes: remove support for output_type = "latents', we can just skip decode/encode step in modular
-        block_state.latents = unpack_latents(block_state.latents, block_state.height, block_state.width, components.vae_scale_factor)
+        block_state.latents = unpack_latents(
+            block_state.latents, block_state.height, block_state.width, components.vae_scale_factor
+        )
         block_state.latents = block_state.latents.to(components.vae.dtype)

         latents_mean = (
@@ -131,7 +133,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
         ).to(block_state.latents.device, block_state.latents.dtype)
         block_state.latents = block_state.latents / latents_std + latents_mean
         block_state.images = components.vae.decode(block_state.latents, return_dict=False)[0][:, :, 0]
-
+
         if self._include_image_processor:
             block_state.images = components.image_processor.postprocess(
                 block_state.images, output_type=block_state.output_type
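
The decode step above un-packs the latents, undoes the VAE's per-channel normalization with latents / latents_std + latents_mean, and only then calls vae.decode. A short sketch of that de-normalization; the channel count, the statistics, and the assumption that latents_std stores a reciprocal scale are illustrative, with the real values coming from the VAE config:

import torch

num_channels = 16
latents = torch.randn(1, num_channels, 1, 64, 64)  # (batch, channels, frame, height, width); shapes illustrative

latents_mean = torch.zeros(1, num_channels, 1, 1, 1)       # per-channel mean from the VAE config
latents_std = 2.0 * torch.ones(1, num_channels, 1, 1, 1)   # assumed to hold 1 / sigma, so dividing rescales

# mirrors the line in the diff: undo (z - mean) * (1 / sigma)
denormalized = latents / latents_std + latents_mean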
