diff --git a/docs/source/en/api/image_processor.md b/docs/source/en/api/image_processor.md index 3e75af026d7e..82d1837b0b50 100644 --- a/docs/source/en/api/image_processor.md +++ b/docs/source/en/api/image_processor.md @@ -20,6 +20,12 @@ All pipelines with [`VaeImageProcessor`] accept PIL Image, PyTorch tensor, or Nu [[autodoc]] image_processor.VaeImageProcessor +## InpaintProcessor + +The [`InpaintProcessor`] accepts `mask` and `image` inputs and processes them together. Optionally, it can accept `padding_mask_crop` and apply a mask overlay. + +[[autodoc]] image_processor.InpaintProcessor + ## VaeImageProcessorLDM3D The [`VaeImageProcessorLDM3D`] accepts RGB and depth inputs and returns RGB and depth outputs. diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 762ae3846a7d..518e8d9a9f10 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -372,6 +372,10 @@ [ "FluxAutoBlocks", "FluxModularPipeline", + "QwenImageAutoBlocks", + "QwenImageEditAutoBlocks", + "QwenImageEditModularPipeline", + "QwenImageModularPipeline", "StableDiffusionXLAutoBlocks", "StableDiffusionXLModularPipeline", "WanAutoBlocks", @@ -1017,6 +1021,10 @@ from .modular_pipelines import ( FluxAutoBlocks, FluxModularPipeline, + QwenImageAutoBlocks, + QwenImageEditAutoBlocks, + QwenImageEditModularPipeline, + QwenImageModularPipeline, StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline, WanAutoBlocks, diff --git a/src/diffusers/hooks/_helpers.py b/src/diffusers/hooks/_helpers.py index b7a74be2e5b2..f6e5bdd52d1f 100644 --- a/src/diffusers/hooks/_helpers.py +++ b/src/diffusers/hooks/_helpers.py @@ -108,6 +108,7 @@ def _register_attention_processors_metadata(): from ..models.attention_processor import AttnProcessor2_0 from ..models.transformers.transformer_cogview4 import CogView4AttnProcessor from ..models.transformers.transformer_flux import FluxAttnProcessor + from ..models.transformers.transformer_qwenimage import QwenDoubleStreamAttnProcessor2_0 from ..models.transformers.transformer_wan import WanAttnProcessor2_0 # AttnProcessor2_0 @@ -140,6 +141,14 @@ def _register_attention_processors_metadata(): metadata=AttentionProcessorMetadata(skip_processor_output_fn=_skip_proc_output_fn_Attention_FluxAttnProcessor), ) + # QwenDoubleStreamAttnProcessor2 + AttentionProcessorRegistry.register( + model_class=QwenDoubleStreamAttnProcessor2_0, + metadata=AttentionProcessorMetadata( + skip_processor_output_fn=_skip_proc_output_fn_Attention_QwenDoubleStreamAttnProcessor2_0 + ), + ) + def _register_transformer_blocks_metadata(): from ..models.attention import BasicTransformerBlock @@ -298,4 +307,5 @@ def _skip_attention___ret___hidden_states___encoder_hidden_states(self, *args, * _skip_proc_output_fn_Attention_WanAttnProcessor2_0 = _skip_attention___ret___hidden_states # not sure what this is yet.
_skip_proc_output_fn_Attention_FluxAttnProcessor = _skip_attention___ret___hidden_states +_skip_proc_output_fn_Attention_QwenDoubleStreamAttnProcessor2_0 = _skip_attention___ret___hidden_states # fmt: on diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 6a3cf77a7df7..0e3082eada8a 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -523,6 +523,7 @@ def resize( size=(height, width), ) image = self.pt_to_numpy(image) + return image def binarize(self, image: PIL.Image.Image) -> PIL.Image.Image: @@ -838,6 +839,137 @@ def apply_overlay( return image +class InpaintProcessor(ConfigMixin): + """ + Image processor for inpainting image and mask. + """ + + config_name = CONFIG_NAME + + @register_to_config + def __init__( + self, + do_resize: bool = True, + vae_scale_factor: int = 8, + vae_latent_channels: int = 4, + resample: str = "lanczos", + reducing_gap: int = None, + do_normalize: bool = True, + do_binarize: bool = False, + do_convert_grayscale: bool = False, + mask_do_normalize: bool = False, + mask_do_binarize: bool = True, + mask_do_convert_grayscale: bool = True, + ): + super().__init__() + + self._image_processor = VaeImageProcessor( + do_resize=do_resize, + vae_scale_factor=vae_scale_factor, + vae_latent_channels=vae_latent_channels, + resample=resample, + reducing_gap=reducing_gap, + do_normalize=do_normalize, + do_binarize=do_binarize, + do_convert_grayscale=do_convert_grayscale, + ) + self._mask_processor = VaeImageProcessor( + do_resize=do_resize, + vae_scale_factor=vae_scale_factor, + vae_latent_channels=vae_latent_channels, + resample=resample, + reducing_gap=reducing_gap, + do_normalize=mask_do_normalize, + do_binarize=mask_do_binarize, + do_convert_grayscale=mask_do_convert_grayscale, + ) + + def preprocess( + self, + image: PIL.Image.Image, + mask: PIL.Image.Image = None, + height: int = None, + width: int = None, + padding_mask_crop: Optional[int] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Preprocess the image and mask. 
+ """ + if mask is None and padding_mask_crop is not None: + raise ValueError("mask must be provided if padding_mask_crop is provided") + + # if mask is None, same behavior as regular image processor + if mask is None: + return self._image_processor.preprocess(image, height=height, width=width) + + if padding_mask_crop is not None: + crops_coords = self._image_processor.get_crop_region(mask, width, height, pad=padding_mask_crop) + resize_mode = "fill" + else: + crops_coords = None + resize_mode = "default" + + processed_image = self._image_processor.preprocess( + image, + height=height, + width=width, + crops_coords=crops_coords, + resize_mode=resize_mode, + ) + + processed_mask = self._mask_processor.preprocess( + mask, + height=height, + width=width, + resize_mode=resize_mode, + crops_coords=crops_coords, + ) + + if crops_coords is not None: + postprocessing_kwargs = { + "crops_coords": crops_coords, + "original_image": image, + "original_mask": mask, + } + else: + postprocessing_kwargs = { + "crops_coords": None, + "original_image": None, + "original_mask": None, + } + + return processed_image, processed_mask, postprocessing_kwargs + + def postprocess( + self, + image: torch.Tensor, + output_type: str = "pil", + original_image: Optional[PIL.Image.Image] = None, + original_mask: Optional[PIL.Image.Image] = None, + crops_coords: Optional[Tuple[int, int, int, int]] = None, + ) -> Tuple[PIL.Image.Image, PIL.Image.Image]: + """ + Postprocess the image, optionally apply mask overlay + """ + image = self._image_processor.postprocess( + image, + output_type=output_type, + ) + # optionally apply the mask overlay + if crops_coords is not None and (original_image is None or original_mask is None): + raise ValueError("original_image and original_mask must be provided if crops_coords is provided") + + elif crops_coords is not None and output_type != "pil": + raise ValueError("output_type must be 'pil' if crops_coords is provided") + + elif crops_coords is not None: + image = [ + self._image_processor.apply_overlay(original_mask, original_image, i, crops_coords) for i in image + ] + + return image + + class VaeImageProcessorLDM3D(VaeImageProcessor): """ Image processor for VAE LDM3D. 
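For reference, here is a minimal usage sketch of the new `InpaintProcessor` added above. It is not part of the diff: the synthetic gray image, the synthetic mask, and the random `decoded` tensor are placeholders standing in for real user inputs and a real pipeline's decoded output.

```python
import torch
from PIL import Image

from diffusers.image_processor import InpaintProcessor

processor = InpaintProcessor(vae_scale_factor=8)

# Placeholder inputs: a gray 1024x1024 image and a mask marking the center square.
image = Image.new("RGB", (1024, 1024), color=(128, 128, 128))
mask = Image.new("L", (1024, 1024), color=0)
mask.paste(255, (256, 256, 768, 768))

# With a mask, preprocess returns the image tensor, the mask tensor, and the kwargs
# postprocess later needs to paste the generated crop back onto the original image.
processed_image, processed_mask, postprocess_kwargs = processor.preprocess(
    image, mask=mask, height=1024, width=1024, padding_mask_crop=32
)

# Stand-in for the denoising + VAE decode step of an inpainting pipeline.
decoded = torch.rand(1, 3, 1024, 1024)

# Because padding_mask_crop was set, postprocess applies the mask overlay so that
# only the masked region of the original image is replaced.
result = processor.postprocess(decoded, output_type="pil", **postprocess_kwargs)[0]
```

When `mask` is `None`, `preprocess` behaves like a plain `VaeImageProcessor.preprocess` and returns only the image tensor; the three-tuple and the overlay in `postprocess` only come into play for inpainting with `padding_mask_crop`.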
diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index 68d707f9e047..65c22b349b1c 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py @@ -47,6 +47,12 @@ _import_structure["stable_diffusion_xl"] = ["StableDiffusionXLAutoBlocks", "StableDiffusionXLModularPipeline"] _import_structure["wan"] = ["WanAutoBlocks", "WanModularPipeline"] _import_structure["flux"] = ["FluxAutoBlocks", "FluxModularPipeline"] + _import_structure["qwenimage"] = [ + "QwenImageAutoBlocks", + "QwenImageModularPipeline", + "QwenImageEditModularPipeline", + "QwenImageEditAutoBlocks", + ] _import_structure["components_manager"] = ["ComponentsManager"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -68,6 +74,12 @@ SequentialPipelineBlocks, ) from .modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, InsertableDict, OutputParam + from .qwenimage import ( + QwenImageAutoBlocks, + QwenImageEditAutoBlocks, + QwenImageEditModularPipeline, + QwenImageModularPipeline, + ) from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline from .wan import WanAutoBlocks, WanModularPipeline else: diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 3918679c1613..b55805849645 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -56,6 +56,8 @@ ("stable-diffusion-xl", "StableDiffusionXLModularPipeline"), ("wan", "WanModularPipeline"), ("flux", "FluxModularPipeline"), + ("qwenimage", "QwenImageModularPipeline"), + ("qwenimage-edit", "QwenImageEditModularPipeline"), ] ) @@ -64,6 +66,8 @@ ("StableDiffusionXLModularPipeline", "StableDiffusionXLAutoBlocks"), ("WanModularPipeline", "WanAutoBlocks"), ("FluxModularPipeline", "FluxAutoBlocks"), + ("QwenImageModularPipeline", "QwenImageAutoBlocks"), + ("QwenImageEditModularPipeline", "QwenImageEditAutoBlocks"), ] ) @@ -133,8 +137,8 @@ def __getattr__(self, name): Allow attribute access to intermediate values. If an attribute is not found in the object, look for it in the intermediates dict. """ - if name in self.intermediates: - return self.intermediates[name] + if name in self.values: + return self.values[name] raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'") def __repr__(self): @@ -548,8 +552,11 @@ class AutoPipelineBlocks(ModularPipelineBlocks): def __init__(self): sub_blocks = InsertableDict() - for block_name, block_cls in zip(self.block_names, self.block_classes): - sub_blocks[block_name] = block_cls() + for block_name, block in zip(self.block_names, self.block_classes): + if inspect.isclass(block): + sub_blocks[block_name] = block() + else: + sub_blocks[block_name] = block self.sub_blocks = sub_blocks if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)): raise ValueError( @@ -830,7 +837,9 @@ def expected_configs(self): return expected_configs @classmethod - def from_blocks_dict(cls, blocks_dict: Dict[str, Any]) -> "SequentialPipelineBlocks": + def from_blocks_dict( + cls, blocks_dict: Dict[str, Any], description: Optional[str] = None + ) -> "SequentialPipelineBlocks": """Creates a SequentialPipelineBlocks instance from a dictionary of blocks. 
Args: @@ -852,12 +861,19 @@ def from_blocks_dict(cls, blocks_dict: Dict[str, Any]) -> "SequentialPipelineBlo instance.block_classes = [block.__class__ for block in sub_blocks.values()] instance.block_names = list(sub_blocks.keys()) instance.sub_blocks = sub_blocks + + if description is not None: + instance.description = description + return instance def __init__(self): sub_blocks = InsertableDict() - for block_name, block_cls in zip(self.block_names, self.block_classes): - sub_blocks[block_name] = block_cls() + for block_name, block in zip(self.block_names, self.block_classes): + if inspect.isclass(block): + sub_blocks[block_name] = block() + else: + sub_blocks[block_name] = block self.sub_blocks = sub_blocks def _get_inputs(self): @@ -1280,8 +1296,11 @@ def outputs(self) -> List[str]: def __init__(self): sub_blocks = InsertableDict() - for block_name, block_cls in zip(self.block_names, self.block_classes): - sub_blocks[block_name] = block_cls() + for block_name, block in zip(self.block_names, self.block_classes): + if inspect.isclass(block): + sub_blocks[block_name] = block() + else: + sub_blocks[block_name] = block self.sub_blocks = sub_blocks @classmethod diff --git a/src/diffusers/modular_pipelines/qwenimage/__init__.py b/src/diffusers/modular_pipelines/qwenimage/__init__.py new file mode 100644 index 000000000000..81cf515730ef --- /dev/null +++ b/src/diffusers/modular_pipelines/qwenimage/__init__.py @@ -0,0 +1,75 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["encoders"] = ["QwenImageTextEncoderStep"] + _import_structure["modular_blocks"] = [ + "ALL_BLOCKS", + "AUTO_BLOCKS", + "CONTROLNET_BLOCKS", + "EDIT_AUTO_BLOCKS", + "EDIT_BLOCKS", + "EDIT_INPAINT_BLOCKS", + "IMAGE2IMAGE_BLOCKS", + "INPAINT_BLOCKS", + "TEXT2IMAGE_BLOCKS", + "QwenImageAutoBlocks", + "QwenImageEditAutoBlocks", + ] + _import_structure["modular_pipeline"] = ["QwenImageEditModularPipeline", "QwenImageModularPipeline"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + else: + from .encoders import ( + QwenImageTextEncoderStep, + ) + from .modular_blocks import ( + ALL_BLOCKS, + AUTO_BLOCKS, + CONTROLNET_BLOCKS, + EDIT_AUTO_BLOCKS, + EDIT_BLOCKS, + EDIT_INPAINT_BLOCKS, + IMAGE2IMAGE_BLOCKS, + INPAINT_BLOCKS, + TEXT2IMAGE_BLOCKS, + QwenImageAutoBlocks, + QwenImageEditAutoBlocks, + ) + from .modular_pipeline import QwenImageEditModularPipeline, QwenImageModularPipeline +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py new file mode 
100644 index 000000000000..738a1e5d151d --- /dev/null +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -0,0 +1,727 @@ +# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ...models import QwenImageControlNetModel, QwenImageMultiControlNetModel +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils.torch_utils import randn_tensor, unwrap_module +from ..modular_pipeline import ModularPipelineBlocks, PipelineState +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam +from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier + + +# Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.calculate_shift +def calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.15, +): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +# modified from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps +def get_timesteps(scheduler, num_inference_steps, strength): + # get the original timestep using init_timestep + init_timestep = min(num_inference_steps * strength, num_inference_steps) + + t_start = int(max(num_inference_steps - init_timestep, 0)) + timesteps = scheduler.timesteps[t_start * scheduler.order :] + if hasattr(scheduler, "set_begin_index"): + scheduler.set_begin_index(t_start * scheduler.order) + + return timesteps, num_inference_steps - t_start + + +# Prepare Latents steps + + +class QwenImagePrepareLatentsStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return "Prepare initial random noise for the generation process" + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam(name="height"), + InputParam(name="width"), + InputParam(name="num_images_per_prompt", default=1), + InputParam(name="generator"), + InputParam( + name="batch_size", + required=True, + type_hint=int, + description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. 
Can be generated in input step.", + ), + InputParam( + name="dtype", + required=True, + type_hint=torch.dtype, + description="The dtype of the model inputs, can be generated in input step.", + ), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="latents", + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process", + ), + ] + + @staticmethod + def check_inputs(height, width, vae_scale_factor): + if height is not None and height % (vae_scale_factor * 2) != 0: + raise ValueError(f"Height must be divisible by {vae_scale_factor * 2} but is {height}") + + if width is not None and width % (vae_scale_factor * 2) != 0: + raise ValueError(f"Width must be divisible by {vae_scale_factor * 2} but is {width}") + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + self.check_inputs( + height=block_state.height, + width=block_state.width, + vae_scale_factor=components.vae_scale_factor, + ) + + device = components._execution_device + batch_size = block_state.batch_size * block_state.num_images_per_prompt + + # we can update the height and width here since they are used to generate the initial latents + block_state.height = block_state.height or components.default_height + block_state.width = block_state.width or components.default_width + + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. + latent_height = 2 * (int(block_state.height) // (components.vae_scale_factor * 2)) + latent_width = 2 * (int(block_state.width) // (components.vae_scale_factor * 2)) + + shape = (batch_size, components.num_channels_latents, 1, latent_height, latent_width) + if isinstance(block_state.generator, list) and len(block_state.generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(block_state.generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + block_state.latents = randn_tensor( + shape, generator=block_state.generator, device=device, dtype=block_state.dtype + ) + block_state.latents = components.pachifier.pack_latents(block_state.latents) + + self.set_block_state(state, block_state) + + return components, state + + +class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return "Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps, prepare_latents. Both noise and image latents should already be patchified." + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The initial random noise. Can be generated in the prepare latents step.", + ), + InputParam( + name="image_latents", + required=True, + type_hint=torch.Tensor, + description="The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.", + ), + InputParam( + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process.
Can be generated in set_timesteps step.", + ), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="initial_noise", + type_hint=torch.Tensor, + description="The initial random noise used for inpainting denoising.", + ), + ] + + @staticmethod + def check_inputs(image_latents, latents): + if image_latents.shape[0] != latents.shape[0]: + raise ValueError( + f"`image_latents` must have the same batch size as `latents`, but got {image_latents.shape[0]} and {latents.shape[0]}" + ) + + if image_latents.ndim != 3: + raise ValueError(f"`image_latents` must have 3 dimensions (patchified), but got {image_latents.ndim}") + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + self.check_inputs( + image_latents=block_state.image_latents, + latents=block_state.latents, + ) + + # prepare latent timestep + latent_timestep = block_state.timesteps[:1].repeat(block_state.latents.shape[0]) + + # make copy of initial_noise + block_state.initial_noise = block_state.latents + + # scale noise + block_state.latents = components.scheduler.scale_noise( + block_state.image_latents, latent_timestep, block_state.latents + ) + + self.set_block_state(state, block_state) + + return components, state + + +class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return "Step that creates mask latents from preprocessed mask_image by interpolating to latent space." + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam( + name="processed_mask_image", + required=True, + type_hint=torch.Tensor, + description="The processed mask to use for the inpainting process.", + ), + InputParam(name="height", required=True), + InputParam(name="width", required=True), + InputParam(name="dtype", required=True), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="mask", type_hint=torch.Tensor, description="The mask to use for the inpainting process." + ), + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + device = components._execution_device + + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2.
+ + height_latents = 2 * (int(block_state.height) // (components.vae_scale_factor * 2)) + width_latents = 2 * (int(block_state.width) // (components.vae_scale_factor * 2)) + + block_state.mask = torch.nn.functional.interpolate( + block_state.processed_mask_image, + size=(height_latents, width_latents), + ) + + block_state.mask = block_state.mask.unsqueeze(2) + block_state.mask = block_state.mask.repeat(1, components.num_channels_latents, 1, 1, 1) + block_state.mask = block_state.mask.to(device=device, dtype=block_state.dtype) + + block_state.mask = components.pachifier.pack_latents(block_state.mask) + + self.set_block_state(state, block_state) + + return components, state + + +# Set Timesteps steps + + +class QwenImageSetTimestepsStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return "Step that sets the scheduler's timesteps for text-to-image generation. Should be run after prepare latents step." + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam(name="num_inference_steps", default=50), + InputParam(name="sigmas"), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The latents to use for the denoising process, used to calculate the image sequence length.", + ), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process" + ), + ] + + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + device = components._execution_device + sigmas = ( + np.linspace(1.0, 1 / block_state.num_inference_steps, block_state.num_inference_steps) + if block_state.sigmas is None + else block_state.sigmas + ) + + mu = calculate_shift( + image_seq_len=block_state.latents.shape[1], + base_seq_len=components.scheduler.config.get("base_image_seq_len", 256), + max_seq_len=components.scheduler.config.get("max_image_seq_len", 4096), + base_shift=components.scheduler.config.get("base_shift", 0.5), + max_shift=components.scheduler.config.get("max_shift", 1.15), + ) + block_state.timesteps, block_state.num_inference_steps = retrieve_timesteps( + scheduler=components.scheduler, + num_inference_steps=block_state.num_inference_steps, + device=device, + sigmas=sigmas, + mu=mu, + ) + + components.scheduler.set_begin_index(0) + + self.set_block_state(state, block_state) + + return components, state + + +class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return "Step that sets the scheduler's timesteps for image-to-image generation and inpainting. Should be run after prepare latents step."
+ + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam(name="num_inference_steps", default=50), + InputParam(name="sigmas"), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The latents to use for the denoising process, used to calculate the image sequence length.", + ), + InputParam(name="strength", default=0.9), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="timesteps", + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", + ), + ] + + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + device = components._execution_device + sigmas = ( + np.linspace(1.0, 1 / block_state.num_inference_steps, block_state.num_inference_steps) + if block_state.sigmas is None + else block_state.sigmas + ) + + mu = calculate_shift( + image_seq_len=block_state.latents.shape[1], + base_seq_len=components.scheduler.config.get("base_image_seq_len", 256), + max_seq_len=components.scheduler.config.get("max_image_seq_len", 4096), + base_shift=components.scheduler.config.get("base_shift", 0.5), + max_shift=components.scheduler.config.get("max_shift", 1.15), + ) + block_state.timesteps, block_state.num_inference_steps = retrieve_timesteps( + scheduler=components.scheduler, + num_inference_steps=block_state.num_inference_steps, + device=device, + sigmas=sigmas, + mu=mu, + ) + + block_state.timesteps, block_state.num_inference_steps = get_timesteps( + scheduler=components.scheduler, + num_inference_steps=block_state.num_inference_steps, + strength=block_state.strength, + ) + + self.set_block_state(state, block_state) + + return components, state + + +# other inputs for denoiser + +## RoPE inputs for denoiser + + +class QwenImageRoPEInputsStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return ( + "Step that prepares the RoPE inputs for the denoising process. 
Should be placed after the prepare_latents step" + ) + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam(name="batch_size", required=True), + InputParam(name="height", required=True), + InputParam(name="width", required=True), + InputParam(name="prompt_embeds_mask"), + InputParam(name="negative_prompt_embeds_mask"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="img_shapes", + type_hint=List[List[Tuple[int, int, int]]], + description="The shapes of the image latents, used for RoPE calculation", + ), + OutputParam( + name="txt_seq_lens", + kwargs_type="denoiser_input_fields", + type_hint=List[int], + description="The sequence lengths of the prompt embeds, used for RoPE calculation", + ), + OutputParam( + name="negative_txt_seq_lens", + kwargs_type="denoiser_input_fields", + type_hint=List[int], + description="The sequence lengths of the negative prompt embeds, used for RoPE calculation", + ), + ] + + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + block_state.img_shapes = [ + [ + ( + 1, + block_state.height // components.vae_scale_factor // 2, + block_state.width // components.vae_scale_factor // 2, + ) + ] + * block_state.batch_size + ] + block_state.txt_seq_lens = ( + block_state.prompt_embeds_mask.sum(dim=1).tolist() if block_state.prompt_embeds_mask is not None else None + ) + block_state.negative_txt_seq_lens = ( + block_state.negative_prompt_embeds_mask.sum(dim=1).tolist() + if block_state.negative_prompt_embeds_mask is not None + else None + ) + + self.set_block_state(state, block_state) + + return components, state + + +class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return "Step that prepares the RoPE inputs for the denoising process. This is used in QwenImage Edit.
Should be placed after the prepare_latents step" + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam(name="batch_size", required=True), + InputParam( + name="resized_image", required=True, type_hint=torch.Tensor, description="The resized image input" + ), + InputParam(name="height", required=True), + InputParam(name="width", required=True), + InputParam(name="prompt_embeds_mask"), + InputParam(name="negative_prompt_embeds_mask"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="img_shapes", + type_hint=List[List[Tuple[int, int, int]]], + description="The shapes of the image latents, used for RoPE calculation", + ), + OutputParam( + name="txt_seq_lens", + kwargs_type="denoiser_input_fields", + type_hint=List[int], + description="The sequence lengths of the prompt embeds, used for RoPE calculation", + ), + OutputParam( + name="negative_txt_seq_lens", + kwargs_type="denoiser_input_fields", + type_hint=List[int], + description="The sequence lengths of the negative prompt embeds, used for RoPE calculation", + ), + ] + + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + # for edit, image size can be different from the target size (height/width) + image = ( + block_state.resized_image[0] if isinstance(block_state.resized_image, list) else block_state.resized_image + ) + image_width, image_height = image.size + + block_state.img_shapes = [ + [ + ( + 1, + block_state.height // components.vae_scale_factor // 2, + block_state.width // components.vae_scale_factor // 2, + ), + (1, image_height // components.vae_scale_factor // 2, image_width // components.vae_scale_factor // 2), + ] + ] * block_state.batch_size + + block_state.txt_seq_lens = ( + block_state.prompt_embeds_mask.sum(dim=1).tolist() if block_state.prompt_embeds_mask is not None else None + ) + block_state.negative_txt_seq_lens = ( + block_state.negative_prompt_embeds_mask.sum(dim=1).tolist() + if block_state.negative_prompt_embeds_mask is not None + else None + ) + + self.set_block_state(state, block_state) + + return components, state + + +## ControlNet inputs for denoiser +class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("controlnet", QwenImageControlNetModel), + ] + + @property + def description(self) -> str: + return "Step that prepares inputs for controlnet. Insert before the Denoise Step, after the set_timesteps step." + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("control_guidance_start", default=0.0), + InputParam("control_guidance_end", default=1.0), + InputParam("controlnet_conditioning_scale", default=1.0), + InputParam("control_image_latents", required=True), + InputParam( + "timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process.
Can be generated in set_timesteps step.", + ), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam("controlnet_keep", type_hint=List[float], description="The controlnet keep values"), + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + controlnet = unwrap_module(components.controlnet) + + # control_guidance_start/control_guidance_end (align format) + if not isinstance(block_state.control_guidance_start, list) and isinstance( + block_state.control_guidance_end, list + ): + block_state.control_guidance_start = len(block_state.control_guidance_end) * [ + block_state.control_guidance_start + ] + elif not isinstance(block_state.control_guidance_end, list) and isinstance( + block_state.control_guidance_start, list + ): + block_state.control_guidance_end = len(block_state.control_guidance_start) * [ + block_state.control_guidance_end + ] + elif not isinstance(block_state.control_guidance_start, list) and not isinstance( + block_state.control_guidance_end, list + ): + mult = ( + len(block_state.control_image_latents) if isinstance(controlnet, QwenImageMultiControlNetModel) else 1 + ) + block_state.control_guidance_start, block_state.control_guidance_end = ( + mult * [block_state.control_guidance_start], + mult * [block_state.control_guidance_end], + ) + + # controlnet_conditioning_scale (align format) + if isinstance(controlnet, QwenImageMultiControlNetModel) and isinstance( + block_state.controlnet_conditioning_scale, float + ): + block_state.controlnet_conditioning_scale = [block_state.controlnet_conditioning_scale] * mult + + # controlnet_keep + block_state.controlnet_keep = [] + for i in range(len(block_state.timesteps)): + keeps = [ + 1.0 - float(i / len(block_state.timesteps) < s or (i + 1) / len(block_state.timesteps) > e) + for s, e in zip(block_state.control_guidance_start, block_state.control_guidance_end) + ] + block_state.controlnet_keep.append(keeps[0] if isinstance(controlnet, QwenImageControlNetModel) else keeps) + + self.set_block_state(state, block_state) + + return components, state diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py new file mode 100644 index 000000000000..6c82fe989e55 --- /dev/null +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -0,0 +1,203 @@ +# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import List, Union + +import numpy as np +import PIL +import torch + +from ...configuration_utils import FrozenDict +from ...image_processor import InpaintProcessor, VaeImageProcessor +from ...models import AutoencoderKLQwenImage +from ...utils import logging +from ..modular_pipeline import ModularPipelineBlocks, PipelineState +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam +from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier + + +logger = logging.get_logger(__name__) + + +class QwenImageDecoderStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return "Step that decodes the latents to images" + + @property + def expected_components(self) -> List[ComponentSpec]: + components = [ + ComponentSpec("vae", AutoencoderKLQwenImage), + ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"), + ] + + return components + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam(name="height", required=True), + InputParam(name="width", required=True), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The latents to decode, can be generated in the denoise step", + ), + ] + + @property + def intermediate_outputs(self) -> List[str]: + return [ + OutputParam( + "images", + type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]], + description="The generated images, can be a PIL.Image.Image, torch.Tensor or a numpy array", + ) + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + # YiYi Notes: remove support for output_type = "latents', we can just skip decode/encode step in modular + block_state.latents = components.pachifier.unpack_latents( + block_state.latents, block_state.height, block_state.width + ) + block_state.latents = block_state.latents.to(components.vae.dtype) + + latents_mean = ( + torch.tensor(components.vae.config.latents_mean) + .view(1, components.vae.config.z_dim, 1, 1, 1) + .to(block_state.latents.device, block_state.latents.dtype) + ) + latents_std = 1.0 / torch.tensor(components.vae.config.latents_std).view( + 1, components.vae.config.z_dim, 1, 1, 1 + ).to(block_state.latents.device, block_state.latents.dtype) + block_state.latents = block_state.latents / latents_std + latents_mean + block_state.images = components.vae.decode(block_state.latents, return_dict=False)[0][:, :, 0] + + self.set_block_state(state, block_state) + return components, state + + +class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return "postprocess the generated image" + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec( + "image_processor", + VaeImageProcessor, + config=FrozenDict({"vae_scale_factor": 16}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("images", required=True, description="the generated image from decoders step"), + InputParam( + name="output_type", + default="pil", + type_hint=str, + description="The type of the output images, can be 'pil', 'np', 'pt'", + ), + ] + + @staticmethod + def check_inputs(output_type): + if output_type not in ["pil", "np", "pt"]: + raise ValueError(f"Invalid output_type: {output_type}") + + @torch.no_grad() + def 
__call__(self, components: QwenImageModularPipeline, state: PipelineState): + block_state = self.get_block_state(state) + + self.check_inputs(block_state.output_type) + + block_state.images = components.image_processor.postprocess( + image=block_state.images, + output_type=block_state.output_type, + ) + + self.set_block_state(state, block_state) + return components, state + + +class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return "postprocess the generated image, optionally apply the mask overlay to the original image." + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec( + "image_mask_processor", + InpaintProcessor, + config=FrozenDict({"vae_scale_factor": 16}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("images", required=True, description="the generated image from decoders step"), + InputParam( + name="output_type", + default="pil", + type_hint=str, + description="The type of the output images, can be 'pil', 'np', 'pt'", + ), + InputParam("mask_overlay_kwargs"), + ] + + @staticmethod + def check_inputs(output_type, mask_overlay_kwargs): + if output_type not in ["pil", "np", "pt"]: + raise ValueError(f"Invalid output_type: {output_type}") + + if mask_overlay_kwargs and output_type != "pil": + raise ValueError("only output_type 'pil' is supported for mask overlay") + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState): + block_state = self.get_block_state(state) + + self.check_inputs(block_state.output_type, block_state.mask_overlay_kwargs) + + if block_state.mask_overlay_kwargs is None: + mask_overlay_kwargs = {} + else: + mask_overlay_kwargs = block_state.mask_overlay_kwargs + + block_state.images = components.image_mask_processor.postprocess( + image=block_state.images, + **mask_overlay_kwargs, + ) + + self.set_block_state(state, block_state) + return components, state diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py new file mode 100644 index 000000000000..d0704ee6e071 --- /dev/null +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -0,0 +1,668 @@ +# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +from typing import List, Tuple + +import torch + +from ...configuration_utils import FrozenDict +from ...guiders import ClassifierFreeGuidance +from ...models import QwenImageControlNetModel, QwenImageTransformer2DModel +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import logging +from ..modular_pipeline import BlockState, LoopSequentialPipelineBlocks, ModularPipelineBlocks, PipelineState +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam +from .modular_pipeline import QwenImageModularPipeline + + +logger = logging.get_logger(__name__) + + +class QwenImageLoopBeforeDenoiser(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return ( + "step within the denoising loop that prepares the latent input for the denoiser. " + "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` " + "object (e.g. `QwenImageDenoiseLoopWrapper`)" + ) + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam( + "latents", + required=True, + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.", + ), + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor): + # one timestep + block_state.timestep = t.expand(block_state.latents.shape[0]).to(block_state.latents.dtype) + block_state.latent_model_input = block_state.latents + return components, block_state + + +class QwenImageEditLoopBeforeDenoiser(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return ( + "step within the denoising loop that prepares the latent input for the denoiser. " + "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` " + "object (e.g. `QwenImageDenoiseLoopWrapper`)" + ) + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam( + "latents", + required=True, + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.", + ), + InputParam( + "image_latents", + required=True, + type_hint=torch.Tensor, + description="The initial image latents to use for the denoising process. Can be encoded in vae_encoder step and packed in prepare_image_latents step.", + ), + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor): + # one timestep + + block_state.latent_model_input = torch.cat([block_state.latents, block_state.image_latents], dim=1) + block_state.timestep = t.expand(block_state.latents.shape[0]).to(block_state.latents.dtype) + return components, block_state + + +class QwenImageLoopBeforeDenoiserControlNet(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec( + "guider", + ClassifierFreeGuidance, + config=FrozenDict({"guidance_scale": 4.0}), + default_creation_method="from_config", + ), + ComponentSpec("controlnet", QwenImageControlNetModel), + ] + + @property + def description(self) -> str: + return ( + "step within the denoising loop that runs the controlnet before the denoiser. " + "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` " + "object (e.g. 
`QwenImageDenoiseLoopWrapper`)" + ) + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam( + "control_image_latents", + required=True, + type_hint=torch.Tensor, + description="The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.", + ), + InputParam( + "controlnet_conditioning_scale", + type_hint=float, + description="The controlnet conditioning scale value to use for the denoising process. Can be generated in prepare_controlnet_inputs step.", + ), + InputParam( + "controlnet_keep", + required=True, + type_hint=List[float], + description="The controlnet keep values to use for the denoising process. Can be generated in prepare_controlnet_inputs step.", + ), + InputParam( + "num_inference_steps", + required=True, + type_hint=int, + description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", + ), + InputParam( + kwargs_type="denoiser_input_fields", + description=( + "All conditional model inputs for the denoiser. " + "It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens." + ), + ), + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: int): + # cond_scale for the timestep (controlnet input) + if isinstance(block_state.controlnet_keep[i], list): + block_state.cond_scale = [ + c * s for c, s in zip(block_state.controlnet_conditioning_scale, block_state.controlnet_keep[i]) + ] + else: + controlnet_cond_scale = block_state.controlnet_conditioning_scale + if isinstance(controlnet_cond_scale, list): + controlnet_cond_scale = controlnet_cond_scale[0] + block_state.cond_scale = controlnet_cond_scale * block_state.controlnet_keep[i] + + # run controlnet for the guidance batch + controlnet_block_samples = components.controlnet( + hidden_states=block_state.latent_model_input, + controlnet_cond=block_state.control_image_latents, + conditioning_scale=block_state.cond_scale, + timestep=block_state.timestep / 1000, + img_shapes=block_state.img_shapes, + encoder_hidden_states=block_state.prompt_embeds, + encoder_hidden_states_mask=block_state.prompt_embeds_mask, + txt_seq_lens=block_state.txt_seq_lens, + return_dict=False, + ) + + block_state.additional_cond_kwargs["controlnet_block_samples"] = controlnet_block_samples + + return components, block_state + + +class QwenImageLoopDenoiser(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return ( + "step within the denoising loop that denoise the latent input for the denoiser. " + "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` " + "object (e.g. `QwenImageDenoiseLoopWrapper`)" + ) + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec( + "guider", + ClassifierFreeGuidance, + config=FrozenDict({"guidance_scale": 4.0}), + default_creation_method="from_config", + ), + ComponentSpec("transformer", QwenImageTransformer2DModel), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("attention_kwargs"), + InputParam( + "latents", + required=True, + type_hint=torch.Tensor, + description="The latents to use for the denoising process. Can be generated in prepare_latents step.", + ), + InputParam( + "num_inference_steps", + required=True, + type_hint=int, + description="The number of inference steps to use for the denoising process. 
Can be generated in set_timesteps step.", + ), + InputParam( + kwargs_type="denoiser_input_fields", + description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", + ), + InputParam( + "img_shapes", + required=True, + type_hint=List[Tuple[int, int]], + description="The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.", + ), + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor): + guider_input_fields = { + "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"), + "encoder_hidden_states_mask": ("prompt_embeds_mask", "negative_prompt_embeds_mask"), + "txt_seq_lens": ("txt_seq_lens", "negative_txt_seq_lens"), + } + + components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t) + guider_state = components.guider.prepare_inputs(block_state, guider_input_fields) + + for guider_state_batch in guider_state: + components.guider.prepare_models(components.transformer) + cond_kwargs = guider_state_batch.as_dict() + cond_kwargs = {k: v for k, v in cond_kwargs.items() if k in guider_input_fields} + + # YiYi TODO: add cache context + guider_state_batch.noise_pred = components.transformer( + hidden_states=block_state.latent_model_input, + timestep=block_state.timestep / 1000, + img_shapes=block_state.img_shapes, + attention_kwargs=block_state.attention_kwargs, + return_dict=False, + **cond_kwargs, + **block_state.additional_cond_kwargs, + )[0] + + components.guider.cleanup_models(components.transformer) + + guider_output = components.guider(guider_state) + + # apply guidance rescale + pred_cond_norm = torch.norm(guider_output.pred_cond, dim=-1, keepdim=True) + pred_norm = torch.norm(guider_output.pred, dim=-1, keepdim=True) + block_state.noise_pred = guider_output.pred * (pred_cond_norm / pred_norm) + + return components, block_state + + +class QwenImageEditLoopDenoiser(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return ( + "step within the denoising loop that denoise the latent input for the denoiser. " + "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` " + "object (e.g. `QwenImageDenoiseLoopWrapper`)" + ) + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec( + "guider", + ClassifierFreeGuidance, + config=FrozenDict({"guidance_scale": 4.0}), + default_creation_method="from_config", + ), + ComponentSpec("transformer", QwenImageTransformer2DModel), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("attention_kwargs"), + InputParam( + "latents", + required=True, + type_hint=torch.Tensor, + description="The latents to use for the denoising process. Can be generated in prepare_latents step.", + ), + InputParam( + "num_inference_steps", + required=True, + type_hint=int, + description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", + ), + InputParam( + kwargs_type="denoiser_input_fields", + description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", + ), + InputParam( + "img_shapes", + required=True, + type_hint=List[Tuple[int, int]], + description="The shape of the image latents for RoPE calculation. 
Can be generated in prepare_additional_inputs step.", + ), + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor): + guider_input_fields = { + "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"), + "encoder_hidden_states_mask": ("prompt_embeds_mask", "negative_prompt_embeds_mask"), + "txt_seq_lens": ("txt_seq_lens", "negative_txt_seq_lens"), + } + + components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t) + guider_state = components.guider.prepare_inputs(block_state, guider_input_fields) + + for guider_state_batch in guider_state: + components.guider.prepare_models(components.transformer) + cond_kwargs = guider_state_batch.as_dict() + cond_kwargs = {k: v for k, v in cond_kwargs.items() if k in guider_input_fields} + + # YiYi TODO: add cache context + guider_state_batch.noise_pred = components.transformer( + hidden_states=block_state.latent_model_input, + timestep=block_state.timestep / 1000, + img_shapes=block_state.img_shapes, + attention_kwargs=block_state.attention_kwargs, + return_dict=False, + **cond_kwargs, + **block_state.additional_cond_kwargs, + )[0] + + components.guider.cleanup_models(components.transformer) + + guider_output = components.guider(guider_state) + + pred = guider_output.pred[:, : block_state.latents.size(1)] + pred_cond = guider_output.pred_cond[:, : block_state.latents.size(1)] + + # apply guidance rescale + pred_cond_norm = torch.norm(pred_cond, dim=-1, keepdim=True) + pred_norm = torch.norm(pred, dim=-1, keepdim=True) + block_state.noise_pred = pred * (pred_cond_norm / pred_norm) + + return components, block_state + + +class QwenImageLoopAfterDenoiser(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return ( + "step within the denoising loop that updates the latents. " + "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` " + "object (e.g. `QwenImageDenoiseLoopWrapper`)" + ) + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam("latents", type_hint=torch.Tensor, description="The denoised latents."), + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor): + latents_dtype = block_state.latents.dtype + block_state.latents = components.scheduler.step( + block_state.noise_pred, + t, + block_state.latents, + return_dict=False, + )[0] + + if block_state.latents.dtype != latents_dtype: + if torch.backends.mps.is_available(): + # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 + block_state.latents = block_state.latents.to(latents_dtype) + + return components, block_state + + +class QwenImageLoopAfterDenoiserInpaint(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return ( + "step within the denoising loop that updates the latents using mask and image_latents for inpainting. " + "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` " + "object (e.g. 
`QwenImageDenoiseLoopWrapper`)" + ) + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam( + "mask", + required=True, + type_hint=torch.Tensor, + description="The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.", + ), + InputParam( + "image_latents", + required=True, + type_hint=torch.Tensor, + description="The image latents to use for the inpainting process. Can be generated in inpaint prepare latents step.", + ), + InputParam( + "initial_noise", + required=True, + type_hint=torch.Tensor, + description="The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.", + ), + InputParam( + "timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", + ), + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor): + block_state.init_latents_proper = block_state.image_latents + if i < len(block_state.timesteps) - 1: + block_state.noise_timestep = block_state.timesteps[i + 1] + block_state.init_latents_proper = components.scheduler.scale_noise( + block_state.init_latents_proper, torch.tensor([block_state.noise_timestep]), block_state.initial_noise + ) + + block_state.latents = ( + 1 - block_state.mask + ) * block_state.init_latents_proper + block_state.mask * block_state.latents + + return components, block_state + + +class QwenImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return ( + "Pipeline block that iteratively denoise the latents over `timesteps`. " + "The specific steps with each iteration can be customized with `sub_blocks` attributes" + ) + + @property + def loop_expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler), + ] + + @property + def loop_inputs(self) -> List[InputParam]: + return [ + InputParam( + "timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", + ), + InputParam( + "num_inference_steps", + required=True, + type_hint=int, + description="The number of inference steps to use for the denoising process. 
Can be generated in set_timesteps step.", + ), + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + block_state.num_warmup_steps = max( + len(block_state.timesteps) - block_state.num_inference_steps * components.scheduler.order, 0 + ) + + block_state.additional_cond_kwargs = {} + + with self.progress_bar(total=block_state.num_inference_steps) as progress_bar: + for i, t in enumerate(block_state.timesteps): + components, block_state = self.loop_step(components, block_state, i=i, t=t) + if i == len(block_state.timesteps) - 1 or ( + (i + 1) > block_state.num_warmup_steps and (i + 1) % components.scheduler.order == 0 + ): + progress_bar.update() + + self.set_block_state(state, block_state) + + return components, state + + +# composing the denoising loops +class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): + block_classes = [ + QwenImageLoopBeforeDenoiser, + QwenImageLoopDenoiser, + QwenImageLoopAfterDenoiser, + ] + block_names = ["before_denoiser", "denoiser", "after_denoiser"] + + @property + def description(self) -> str: + return ( + "Denoise step that iteratively denoise the latents. \n" + "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method \n" + "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n" + " - `QwenImageLoopBeforeDenoiser`\n" + " - `QwenImageLoopDenoiser`\n" + " - `QwenImageLoopAfterDenoiser`\n" + "This block supports text2image and image2image tasks for QwenImage." + ) + + +# composing the inpainting denoising loops +class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): + block_classes = [ + QwenImageLoopBeforeDenoiser, + QwenImageLoopDenoiser, + QwenImageLoopAfterDenoiser, + QwenImageLoopAfterDenoiserInpaint, + ] + block_names = ["before_denoiser", "denoiser", "after_denoiser", "after_denoiser_inpaint"] + + @property + def description(self) -> str: + return ( + "Denoise step that iteratively denoise the latents. \n" + "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method \n" + "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n" + " - `QwenImageLoopBeforeDenoiser`\n" + " - `QwenImageLoopDenoiser`\n" + " - `QwenImageLoopAfterDenoiser`\n" + " - `QwenImageLoopAfterDenoiserInpaint`\n" + "This block supports inpainting tasks for QwenImage." + ) + + +# composing the controlnet denoising loops +class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): + block_classes = [ + QwenImageLoopBeforeDenoiser, + QwenImageLoopBeforeDenoiserControlNet, + QwenImageLoopDenoiser, + QwenImageLoopAfterDenoiser, + ] + block_names = ["before_denoiser", "before_denoiser_controlnet", "denoiser", "after_denoiser"] + + @property + def description(self) -> str: + return ( + "Denoise step that iteratively denoise the latents. \n" + "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method \n" + "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n" + " - `QwenImageLoopBeforeDenoiser`\n" + " - `QwenImageLoopBeforeDenoiserControlNet`\n" + " - `QwenImageLoopDenoiser`\n" + " - `QwenImageLoopAfterDenoiser`\n" + "This block supports text2img/img2img tasks with controlnet for QwenImage." 
+ ) + + +# composing the controlnet denoising loops +class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): + block_classes = [ + QwenImageLoopBeforeDenoiser, + QwenImageLoopBeforeDenoiserControlNet, + QwenImageLoopDenoiser, + QwenImageLoopAfterDenoiser, + QwenImageLoopAfterDenoiserInpaint, + ] + block_names = [ + "before_denoiser", + "before_denoiser_controlnet", + "denoiser", + "after_denoiser", + "after_denoiser_inpaint", + ] + + @property + def description(self) -> str: + return ( + "Denoise step that iteratively denoise the latents. \n" + "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method \n" + "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n" + " - `QwenImageLoopBeforeDenoiser`\n" + " - `QwenImageLoopBeforeDenoiserControlNet`\n" + " - `QwenImageLoopDenoiser`\n" + " - `QwenImageLoopAfterDenoiser`\n" + " - `QwenImageLoopAfterDenoiserInpaint`\n" + "This block supports inpainting tasks with controlnet for QwenImage." + ) + + +# composing the denoising loops +class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): + block_classes = [ + QwenImageEditLoopBeforeDenoiser, + QwenImageEditLoopDenoiser, + QwenImageLoopAfterDenoiser, + ] + block_names = ["before_denoiser", "denoiser", "after_denoiser"] + + @property + def description(self) -> str: + return ( + "Denoise step that iteratively denoise the latents. \n" + "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method \n" + "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n" + " - `QwenImageEditLoopBeforeDenoiser`\n" + " - `QwenImageEditLoopDenoiser`\n" + " - `QwenImageLoopAfterDenoiser`\n" + "This block supports QwenImage Edit." + ) + + +class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): + block_classes = [ + QwenImageEditLoopBeforeDenoiser, + QwenImageEditLoopDenoiser, + QwenImageLoopAfterDenoiser, + QwenImageLoopAfterDenoiserInpaint, + ] + block_names = ["before_denoiser", "denoiser", "after_denoiser", "after_denoiser_inpaint"] + + @property + def description(self) -> str: + return ( + "Denoise step that iteratively denoise the latents. \n" + "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method \n" + "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n" + " - `QwenImageEditLoopBeforeDenoiser`\n" + " - `QwenImageEditLoopDenoiser`\n" + " - `QwenImageLoopAfterDenoiser`\n" + " - `QwenImageLoopAfterDenoiserInpaint`\n" + "This block supports inpainting tasks for QwenImage Edit." + ) diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py new file mode 100644 index 000000000000..280fa6a152c4 --- /dev/null +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -0,0 +1,857 @@ +# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
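Before moving on to the encoder blocks, note how the denoiser loop blocks above combine the guider output: they do not use the combined prediction directly, but rescale it so that its per-token norm matches the conditional branch. A minimal, self-contained sketch of that rescale (tensor shapes are illustrative, not taken from this PR):

```python
import torch

# Illustrative shapes: (batch, packed sequence length, channels)
pred = torch.randn(1, 4096, 64)       # guider output (e.g. the CFG-combined prediction)
pred_cond = torch.randn(1, 4096, 64)  # prediction from the conditional branch

# Keep the direction of `pred` but match the per-token norm of `pred_cond`;
# this mirrors what the loop stores in `block_state.noise_pred`.
pred_cond_norm = torch.norm(pred_cond, dim=-1, keepdim=True)
pred_norm = torch.norm(pred, dim=-1, keepdim=True)
noise_pred = pred * (pred_cond_norm / pred_norm)
```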
+ +from typing import Dict, List, Optional, Union + +import PIL +import torch +from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor + +from ...configuration_utils import FrozenDict +from ...guiders import ClassifierFreeGuidance +from ...image_processor import InpaintProcessor, VaeImageProcessor, is_valid_image, is_valid_image_imagelist +from ...models import AutoencoderKLQwenImage, QwenImageControlNetModel, QwenImageMultiControlNetModel +from ...pipelines.qwenimage.pipeline_qwenimage_edit import calculate_dimensions +from ...utils import logging +from ...utils.torch_utils import unwrap_module +from ..modular_pipeline import ModularPipelineBlocks, PipelineState +from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam +from .modular_pipeline import QwenImageModularPipeline + + +logger = logging.get_logger(__name__) + + +def _extract_masked_hidden(hidden_states: torch.Tensor, mask: torch.Tensor): + bool_mask = mask.bool() + valid_lengths = bool_mask.sum(dim=1) + selected = hidden_states[bool_mask] + split_result = torch.split(selected, valid_lengths.tolist(), dim=0) + return split_result + + +def get_qwen_prompt_embeds( + text_encoder, + tokenizer, + prompt: Union[str, List[str]] = None, + prompt_template_encode: str = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n", + prompt_template_encode_start_idx: int = 34, + tokenizer_max_length: int = 1024, + device: Optional[torch.device] = None, +): + prompt = [prompt] if isinstance(prompt, str) else prompt + + template = prompt_template_encode + drop_idx = prompt_template_encode_start_idx + txt = [template.format(e) for e in prompt] + txt_tokens = tokenizer( + txt, max_length=tokenizer_max_length + drop_idx, padding=True, truncation=True, return_tensors="pt" + ).to(device) + encoder_hidden_states = text_encoder( + input_ids=txt_tokens.input_ids, + attention_mask=txt_tokens.attention_mask, + output_hidden_states=True, + ) + hidden_states = encoder_hidden_states.hidden_states[-1] + + split_hidden_states = _extract_masked_hidden(hidden_states, txt_tokens.attention_mask) + split_hidden_states = [e[drop_idx:] for e in split_hidden_states] + attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states] + max_seq_len = max([e.size(0) for e in split_hidden_states]) + prompt_embeds = torch.stack( + [torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states] + ) + encoder_attention_mask = torch.stack( + [torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list] + ) + + prompt_embeds = prompt_embeds.to(device=device) + + return prompt_embeds, encoder_attention_mask + + +def get_qwen_prompt_embeds_edit( + text_encoder, + processor, + prompt: Union[str, List[str]] = None, + image: Optional[torch.Tensor] = None, + prompt_template_encode: str = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n", + prompt_template_encode_start_idx: int = 64, + device: Optional[torch.device] = None, +): + prompt = [prompt] if isinstance(prompt, str) else prompt + + template = prompt_template_encode + drop_idx = prompt_template_encode_start_idx + txt = [template.format(e) for e in prompt] + + model_inputs = processor( + text=txt, + images=image, + padding=True, + return_tensors="pt", + ).to(device) + + outputs = text_encoder( + input_ids=model_inputs.input_ids, + attention_mask=model_inputs.attention_mask, + pixel_values=model_inputs.pixel_values, + image_grid_thw=model_inputs.image_grid_thw, + output_hidden_states=True, + ) + + hidden_states = outputs.hidden_states[-1] + split_hidden_states = _extract_masked_hidden(hidden_states, model_inputs.attention_mask) + split_hidden_states = [e[drop_idx:] for e in split_hidden_states] + attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states] + max_seq_len = max([e.size(0) for e in split_hidden_states]) + prompt_embeds = torch.stack( + [torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states] + ) + encoder_attention_mask = torch.stack( + [torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list] + ) + + prompt_embeds = prompt_embeds.to(device=device) + + return prompt_embeds, encoder_attention_mask + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Modified from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline._encode_vae_image +def encode_vae_image( + image: torch.Tensor, + vae: AutoencoderKLQwenImage, + generator: torch.Generator, + device: torch.device, + dtype: torch.dtype, + latent_channels: int = 16, + sample_mode: str = "argmax", +): + if not isinstance(image, torch.Tensor): + raise ValueError(f"Expected image to be a tensor, got {type(image)}.") + + # preprocessed image should be a 4D tensor: batch_size, num_channels, height, width + if image.dim() == 4: + image = image.unsqueeze(2) + elif image.dim() != 5: + raise ValueError(f"Expected image dims 4 or 5, got {image.dim()}.") + + image = image.to(device=device, dtype=dtype) + + if isinstance(generator, list): + image_latents = [ + retrieve_latents(vae.encode(image[i : i + 1]), generator=generator[i], sample_mode=sample_mode) + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = retrieve_latents(vae.encode(image), generator=generator, sample_mode=sample_mode) + latents_mean = ( + torch.tensor(vae.config.latents_mean) + .view(1, latent_channels, 1, 1, 1) + .to(image_latents.device, image_latents.dtype) + ) + latents_std = ( + torch.tensor(vae.config.latents_std) + .view(1, latent_channels, 1, 1, 1) 
+ .to(image_latents.device, image_latents.dtype) + ) + image_latents = (image_latents - latents_mean) / latents_std + + return image_latents + + +class QwenImageEditResizeDynamicStep(ModularPipelineBlocks): + model_name = "qwenimage" + + def __init__(self, input_name: str = "image", output_name: str = "resized_image"): + """Create a configurable step for resizing images to the target area (1024 * 1024) while maintaining the aspect ratio. + + This block resizes an input image tensor and exposes the resized result under configurable input and output + names. Use this when you need to wire the resize step to different image fields (e.g., "image", + "control_image") + + Args: + input_name (str, optional): Name of the image field to read from the + pipeline state. Defaults to "image". + output_name (str, optional): Name of the resized image field to write + back to the pipeline state. Defaults to "resized_image". + """ + if not isinstance(input_name, str) or not isinstance(output_name, str): + raise ValueError( + f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}" + ) + self._image_input_name = input_name + self._resized_image_output_name = output_name + super().__init__() + + @property + def description(self) -> str: + return f"Image Resize step that resize the {self._image_input_name} to the target area (1024 * 1024) while maintaining the aspect ratio." + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec( + "image_resize_processor", + VaeImageProcessor, + config=FrozenDict({"vae_scale_factor": 16}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam( + name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" + ), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images" + ), + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState): + block_state = self.get_block_state(state) + + images = getattr(block_state, self._image_input_name) + + if not is_valid_image_imagelist(images): + raise ValueError(f"Images must be image or list of images but are {type(images)}") + + if is_valid_image(images): + images = [images] + + image_width, image_height = images[0].size + calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, image_width / image_height) + + resized_images = [ + components.image_resize_processor.resize(image, height=calculated_height, width=calculated_width) + for image in images + ] + + setattr(block_state, self._resized_image_output_name, resized_images) + self.set_block_state(state, block_state) + return components, state + + +class QwenImageTextEncoderStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return "Text Encoder step that generate text_embeddings to guide the image generation" + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("text_encoder", Qwen2_5_VLForConditionalGeneration, description="The text encoder to use"), + ComponentSpec("tokenizer", Qwen2Tokenizer, description="The tokenizer to use"), + ComponentSpec( + "guider", + ClassifierFreeGuidance, + config=FrozenDict({"guidance_scale": 4.0}), + default_creation_method="from_config", + ), + ] + + @property + 
def expected_configs(self) -> List[ConfigSpec]: + return [ + ConfigSpec( + name="prompt_template_encode", + default="<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n", + ), + ConfigSpec(name="prompt_template_encode_start_idx", default=34), + ConfigSpec(name="tokenizer_max_length", default=1024), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"), + InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"), + InputParam( + name="max_sequence_length", type_hint=int, description="The max sequence length to use", default=1024 + ), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="prompt_embeds", + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="The prompt embeddings", + ), + OutputParam( + name="prompt_embeds_mask", + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="The encoder attention mask", + ), + OutputParam( + name="negative_prompt_embeds", + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="The negative prompt embeddings", + ), + OutputParam( + name="negative_prompt_embeds_mask", + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="The negative prompt embeddings mask", + ), + ] + + @staticmethod + def check_inputs(prompt, negative_prompt, max_sequence_length): + if not isinstance(prompt, str) and not isinstance(prompt, list): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if ( + negative_prompt is not None + and not isinstance(negative_prompt, str) + and not isinstance(negative_prompt, list) + ): + raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}") + + if max_sequence_length is not None and max_sequence_length > 1024: + raise ValueError(f"`max_sequence_length` cannot be greater than 1024 but is {max_sequence_length}") + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState): + block_state = self.get_block_state(state) + + device = components._execution_device + self.check_inputs(block_state.prompt, block_state.negative_prompt, block_state.max_sequence_length) + + block_state.prompt_embeds, block_state.prompt_embeds_mask = get_qwen_prompt_embeds( + components.text_encoder, + components.tokenizer, + prompt=block_state.prompt, + prompt_template_encode=components.config.prompt_template_encode, + prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + tokenizer_max_length=components.config.tokenizer_max_length, + device=device, + ) + + block_state.prompt_embeds = block_state.prompt_embeds[:, : block_state.max_sequence_length] + block_state.prompt_embeds_mask = block_state.prompt_embeds_mask[:, : block_state.max_sequence_length] + + if components.requires_unconditional_embeds: + negative_prompt = block_state.negative_prompt or "" + block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = get_qwen_prompt_embeds( + components.text_encoder, + components.tokenizer, + prompt=negative_prompt, + prompt_template_encode=components.config.prompt_template_encode, + 
prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + tokenizer_max_length=components.config.tokenizer_max_length, + device=device, + ) + block_state.negative_prompt_embeds = block_state.negative_prompt_embeds[ + :, : block_state.max_sequence_length + ] + block_state.negative_prompt_embeds_mask = block_state.negative_prompt_embeds_mask[ + :, : block_state.max_sequence_length + ] + + self.set_block_state(state, block_state) + return components, state + + +class QwenImageEditTextEncoderStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return "Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image generation" + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("text_encoder", Qwen2_5_VLForConditionalGeneration), + ComponentSpec("processor", Qwen2VLProcessor), + ComponentSpec( + "guider", + ClassifierFreeGuidance, + config=FrozenDict({"guidance_scale": 4.0}), + default_creation_method="from_config", + ), + ] + + @property + def expected_configs(self) -> List[ConfigSpec]: + return [ + ConfigSpec( + name="prompt_template_encode", + default="<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n", + ), + ConfigSpec(name="prompt_template_encode_start_idx", default=64), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"), + InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"), + InputParam( + name="resized_image", + required=True, + type_hint=torch.Tensor, + description="The image prompt to encode, should be resized using resize step", + ), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="prompt_embeds", + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="The prompt embeddings", + ), + OutputParam( + name="prompt_embeds_mask", + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="The encoder attention mask", + ), + OutputParam( + name="negative_prompt_embeds", + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="The negative prompt embeddings", + ), + OutputParam( + name="negative_prompt_embeds_mask", + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="The negative prompt embeddings mask", + ), + ] + + @staticmethod + def check_inputs(prompt, negative_prompt): + if not isinstance(prompt, str) and not isinstance(prompt, list): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if ( + negative_prompt is not None + and not isinstance(negative_prompt, str) + and not isinstance(negative_prompt, list) + ): + raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}") + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState): + block_state = self.get_block_state(state) + + 
self.check_inputs(block_state.prompt, block_state.negative_prompt) + + device = components._execution_device + + block_state.prompt_embeds, block_state.prompt_embeds_mask = get_qwen_prompt_embeds_edit( + components.text_encoder, + components.processor, + prompt=block_state.prompt, + image=block_state.resized_image, + prompt_template_encode=components.config.prompt_template_encode, + prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + device=device, + ) + + if components.requires_unconditional_embeds: + negative_prompt = block_state.negative_prompt or "" + block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = get_qwen_prompt_embeds_edit( + components.text_encoder, + components.processor, + prompt=negative_prompt, + image=block_state.resized_image, + prompt_template_encode=components.config.prompt_template_encode, + prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + device=device, + ) + + self.set_block_state(state, block_state) + return components, state + + +class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return "Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images can be resized first using QwenImageEditResizeDynamicStep." + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec( + "image_mask_processor", + InpaintProcessor, + config=FrozenDict({"vae_scale_factor": 16}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("mask_image", required=True), + InputParam("resized_image"), + InputParam("image"), + InputParam("height"), + InputParam("width"), + InputParam("padding_mask_crop"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam(name="processed_image"), + OutputParam(name="processed_mask_image"), + OutputParam( + name="mask_overlay_kwargs", + type_hint=Dict, + description="The kwargs for the postprocess step to apply the mask overlay", + ), + ] + + @staticmethod + def check_inputs(height, width, vae_scale_factor): + if height is not None and height % (vae_scale_factor * 2) != 0: + raise ValueError(f"Height must be divisible by {vae_scale_factor * 2} but is {height}") + + if width is not None and width % (vae_scale_factor * 2) != 0: + raise ValueError(f"Width must be divisible by {vae_scale_factor * 2} but is {width}") + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState): + block_state = self.get_block_state(state) + + if block_state.resized_image is None and block_state.image is None: + raise ValueError("resized_image and image cannot be None at the same time") + + if block_state.resized_image is None: + image = block_state.image + self.check_inputs( + height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor + ) + height = block_state.height or components.default_height + width = block_state.width or components.default_width + else: + width, height = block_state.resized_image[0].size + image = block_state.resized_image + + block_state.processed_image, block_state.processed_mask_image, block_state.mask_overlay_kwargs = ( + components.image_mask_processor.preprocess( + image=image, + mask=block_state.mask_image, + height=height, + width=width, + padding_mask_crop=block_state.padding_mask_crop, + ) + ) + 
+ self.set_block_state(state, block_state) + return components, state + + +class QwenImageProcessImagesInputStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return "Image Preprocess step. Images can be resized first using QwenImageEditResizeDynamicStep." + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec( + "image_processor", + VaeImageProcessor, + config=FrozenDict({"vae_scale_factor": 16}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("resized_image"), + InputParam("image"), + InputParam("height"), + InputParam("width"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam(name="processed_image"), + ] + + @staticmethod + def check_inputs(height, width, vae_scale_factor): + if height is not None and height % (vae_scale_factor * 2) != 0: + raise ValueError(f"Height must be divisible by {vae_scale_factor * 2} but is {height}") + + if width is not None and width % (vae_scale_factor * 2) != 0: + raise ValueError(f"Width must be divisible by {vae_scale_factor * 2} but is {width}") + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState): + block_state = self.get_block_state(state) + + if block_state.resized_image is None and block_state.image is None: + raise ValueError("resized_image and image cannot be None at the same time") + + if block_state.resized_image is None: + image = block_state.image + self.check_inputs( + height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor + ) + height = block_state.height or components.default_height + width = block_state.width or components.default_width + else: + width, height = block_state.resized_image[0].size + image = block_state.resized_image + + block_state.processed_image = components.image_processor.preprocess( + image=image, + height=height, + width=width, + ) + + self.set_block_state(state, block_state) + return components, state + + +class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks): + model_name = "qwenimage" + + def __init__( + self, + input_name: str = "processed_image", + output_name: str = "image_latents", + ): + """Initialize a VAE encoder step for converting images to latent representations. + + Both the input and output names are configurable so this block can be configured to process to different image + inputs (e.g., "processed_image" -> "image_latents", "processed_control_image" -> "control_image_latents"). + + Args: + input_name (str, optional): Name of the input image tensor. Defaults to "processed_image". + Examples: "processed_image" or "processed_control_image" + output_name (str, optional): Name of the output latent tensor. Defaults to "image_latents". 
+ Examples: "image_latents" or "control_image_latents" + + Examples: + # Basic usage with default settings (includes image processor) QwenImageVaeEncoderDynamicStep() + + # Custom input/output names for control image QwenImageVaeEncoderDynamicStep( + input_name="processed_control_image", output_name="control_image_latents" + ) + """ + self._image_input_name = input_name + self._image_latents_output_name = output_name + super().__init__() + + @property + def description(self) -> str: + return f"Dynamic VAE Encoder step that converts {self._image_input_name} into latent representations {self._image_latents_output_name}.\n" + + @property + def expected_components(self) -> List[ComponentSpec]: + components = [ + ComponentSpec("vae", AutoencoderKLQwenImage), + ] + return components + + @property + def inputs(self) -> List[InputParam]: + inputs = [ + InputParam(self._image_input_name, required=True), + InputParam("generator"), + ] + return inputs + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + self._image_latents_output_name, + type_hint=torch.Tensor, + description="The latents representing the reference image", + ) + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + device = components._execution_device + dtype = components.vae.dtype + + image = getattr(block_state, self._image_input_name) + + # Encode image into latents + image_latents = encode_vae_image( + image=image, + vae=components.vae, + generator=block_state.generator, + device=device, + dtype=dtype, + latent_channels=components.num_channels_latents, + ) + + setattr(block_state, self._image_latents_output_name, image_latents) + + self.set_block_state(state, block_state) + + return components, state + + +class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return "VAE Encoder step that converts `control_image` into latent representations control_image_latents.\n" + + @property + def expected_components(self) -> List[ComponentSpec]: + components = [ + ComponentSpec("vae", AutoencoderKLQwenImage), + ComponentSpec("controlnet", QwenImageControlNetModel), + ComponentSpec( + "control_image_processor", + VaeImageProcessor, + config=FrozenDict({"vae_scale_factor": 16}), + default_creation_method="from_config", + ), + ] + return components + + @property + def inputs(self) -> List[InputParam]: + inputs = [ + InputParam("control_image", required=True), + InputParam("height"), + InputParam("width"), + InputParam("generator"), + ] + return inputs + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + "control_image_latents", + type_hint=torch.Tensor, + description="The latents representing the control image", + ) + ] + + @staticmethod + def check_inputs(height, width, vae_scale_factor): + if height is not None and height % (vae_scale_factor * 2) != 0: + raise ValueError(f"Height must be divisible by {vae_scale_factor * 2} but is {height}") + + if width is not None and width % (vae_scale_factor * 2) != 0: + raise ValueError(f"Width must be divisible by {vae_scale_factor * 2} but is {width}") + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + self.check_inputs(block_state.height, block_state.width, components.vae_scale_factor) + + device = 
components._execution_device + dtype = components.vae.dtype + + height = block_state.height or components.default_height + width = block_state.width or components.default_width + + controlnet = unwrap_module(components.controlnet) + if isinstance(controlnet, QwenImageMultiControlNetModel) and not isinstance(block_state.control_image, list): + block_state.control_image = [block_state.control_image] + + if isinstance(controlnet, QwenImageMultiControlNetModel): + block_state.control_image_latents = [] + for control_image_ in block_state.control_image: + control_image_ = components.control_image_processor.preprocess( + image=control_image_, + height=height, + width=width, + ) + + control_image_latents_ = encode_vae_image( + image=control_image_, + vae=components.vae, + generator=block_state.generator, + device=device, + dtype=dtype, + latent_channels=components.num_channels_latents, + sample_mode="sample", + ) + block_state.control_image_latents.append(control_image_latents_) + + elif isinstance(controlnet, QwenImageControlNetModel): + control_image = components.control_image_processor.preprocess( + image=block_state.control_image, + height=height, + width=width, + ) + block_state.control_image_latents = encode_vae_image( + image=control_image, + vae=components.vae, + generator=block_state.generator, + device=device, + dtype=dtype, + latent_channels=components.num_channels_latents, + sample_mode="sample", + ) + + else: + raise ValueError( + f"Expected controlnet to be a QwenImageControlNetModel or QwenImageMultiControlNetModel, got {type(controlnet)}" + ) + + self.set_block_state(state, block_state) + + return components, state diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py new file mode 100644 index 000000000000..2b787c823865 --- /dev/null +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -0,0 +1,431 @@ +# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Tuple + +import torch + +from ...models import QwenImageMultiControlNetModel +from ..modular_pipeline import ModularPipelineBlocks, PipelineState +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam +from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier + + +def repeat_tensor_to_batch_size( + input_name: str, + input_tensor: torch.Tensor, + batch_size: int, + num_images_per_prompt: int = 1, +) -> torch.Tensor: + """Repeat tensor elements to match the final batch size. + + This function expands a tensor's batch dimension to match the final batch size (batch_size * num_images_per_prompt) + by repeating each element along dimension 0. + + The input tensor must have batch size 1 or batch_size. 
The function will: + - If batch size is 1: repeat each element (batch_size * num_images_per_prompt) times + - If batch size equals batch_size: repeat each element num_images_per_prompt times + + Args: + input_name (str): Name of the input tensor (used for error messages) + input_tensor (torch.Tensor): The tensor to repeat. Must have batch size 1 or batch_size. + batch_size (int): The base batch size (number of prompts) + num_images_per_prompt (int, optional): Number of images to generate per prompt. Defaults to 1. + + Returns: + torch.Tensor: The repeated tensor with final batch size (batch_size * num_images_per_prompt) + + Raises: + ValueError: If input_tensor is not a torch.Tensor or has invalid batch size + + Examples: + tensor = torch.tensor([[1, 2, 3]]) # shape: [1, 3] repeated = repeat_tensor_to_batch_size("image", tensor, + batch_size=2, num_images_per_prompt=2) repeated # tensor([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]) - shape: + [4, 3] + + tensor = torch.tensor([[1, 2, 3], [4, 5, 6]]) # shape: [2, 3] repeated = repeat_tensor_to_batch_size("image", + tensor, batch_size=2, num_images_per_prompt=2) repeated # tensor([[1, 2, 3], [1, 2, 3], [4, 5, 6], [4, 5, 6]]) + - shape: [4, 3] + """ + # make sure input is a tensor + if not isinstance(input_tensor, torch.Tensor): + raise ValueError(f"`{input_name}` must be a tensor") + + # make sure input tensor e.g. image_latents has batch size 1 or batch_size same as prompts + if input_tensor.shape[0] == 1: + repeat_by = batch_size * num_images_per_prompt + elif input_tensor.shape[0] == batch_size: + repeat_by = num_images_per_prompt + else: + raise ValueError( + f"`{input_name}` must have have batch size 1 or {batch_size}, but got {input_tensor.shape[0]}" + ) + + # expand the tensor to match the batch_size * num_images_per_prompt + input_tensor = input_tensor.repeat_interleave(repeat_by, dim=0) + + return input_tensor + + +def calculate_dimension_from_latents(latents: torch.Tensor, vae_scale_factor: int) -> Tuple[int, int]: + """Calculate image dimensions from latent tensor dimensions. + + This function converts latent space dimensions to image space dimensions by multiplying the latent height and width + by the VAE scale factor. + + Args: + latents (torch.Tensor): The latent tensor. Must have 4 or 5 dimensions. + Expected shapes: [batch, channels, height, width] or [batch, channels, frames, height, width] + vae_scale_factor (int): The scale factor used by the VAE to compress images. + Typically 8 for most VAEs (image is 8x larger than latents in each dimension) + + Returns: + Tuple[int, int]: The calculated image dimensions as (height, width) + + Raises: + ValueError: If latents tensor doesn't have 4 or 5 dimensions + + """ + # make sure the latents are not packed + if latents.ndim != 4 and latents.ndim != 5: + raise ValueError(f"unpacked latents must have 4 or 5 dimensions, but got {latents.ndim}") + + latent_height, latent_width = latents.shape[-2:] + + height = latent_height * vae_scale_factor + width = latent_width * vae_scale_factor + + return height, width + + +class QwenImageTextInputsStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + summary_section = ( + "Text input processing step that standardizes text embeddings for the pipeline.\n" + "This step:\n" + " 1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n" + " 2. 
Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt)" + ) + + # Placement guidance + placement_section = "\n\nThis block should be placed after all encoder steps to process the text embeddings before they are used in subsequent pipeline steps." + + return summary_section + placement_section + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam(name="num_images_per_prompt", default=1), + InputParam(name="prompt_embeds", required=True, kwargs_type="denoiser_input_fields"), + InputParam(name="prompt_embeds_mask", required=True, kwargs_type="denoiser_input_fields"), + InputParam(name="negative_prompt_embeds", kwargs_type="denoiser_input_fields"), + InputParam(name="negative_prompt_embeds_mask", kwargs_type="denoiser_input_fields"), + ] + + @property + def intermediate_outputs(self) -> List[str]: + return [ + OutputParam( + "batch_size", + type_hint=int, + description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt", + ), + OutputParam( + "dtype", + type_hint=torch.dtype, + description="Data type of model tensor inputs (determined by `prompt_embeds`)", + ), + ] + + @staticmethod + def check_inputs( + prompt_embeds, + prompt_embeds_mask, + negative_prompt_embeds, + negative_prompt_embeds_mask, + ): + if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None: + raise ValueError("`negative_prompt_embeds_mask` is required when `negative_prompt_embeds` is not None") + + if negative_prompt_embeds is None and negative_prompt_embeds_mask is not None: + raise ValueError("cannot pass `negative_prompt_embeds_mask` without `negative_prompt_embeds`") + + if prompt_embeds_mask.shape[0] != prompt_embeds.shape[0]: + raise ValueError("`prompt_embeds_mask` must have the same batch size as `prompt_embeds`") + + elif negative_prompt_embeds is not None and negative_prompt_embeds.shape[0] != prompt_embeds.shape[0]: + raise ValueError("`negative_prompt_embeds` must have the same batch size as `prompt_embeds`") + + elif ( + negative_prompt_embeds_mask is not None and negative_prompt_embeds_mask.shape[0] != prompt_embeds.shape[0] + ): + raise ValueError("`negative_prompt_embeds_mask` must have the same batch size as `prompt_embeds`") + + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + self.check_inputs( + prompt_embeds=block_state.prompt_embeds, + prompt_embeds_mask=block_state.prompt_embeds_mask, + negative_prompt_embeds=block_state.negative_prompt_embeds, + negative_prompt_embeds_mask=block_state.negative_prompt_embeds_mask, + ) + + block_state.batch_size = block_state.prompt_embeds.shape[0] + block_state.dtype = block_state.prompt_embeds.dtype + + _, seq_len, _ = block_state.prompt_embeds.shape + + block_state.prompt_embeds = block_state.prompt_embeds.repeat(1, block_state.num_images_per_prompt, 1) + block_state.prompt_embeds = block_state.prompt_embeds.view( + block_state.batch_size * block_state.num_images_per_prompt, seq_len, -1 + ) + + block_state.prompt_embeds_mask = block_state.prompt_embeds_mask.repeat(1, block_state.num_images_per_prompt, 1) + block_state.prompt_embeds_mask = block_state.prompt_embeds_mask.view( + block_state.batch_size * block_state.num_images_per_prompt, seq_len + ) + + if block_state.negative_prompt_embeds is not None: + _, seq_len, _ = block_state.negative_prompt_embeds.shape + block_state.negative_prompt_embeds = block_state.negative_prompt_embeds.repeat( 
+ 1, block_state.num_images_per_prompt, 1 + ) + block_state.negative_prompt_embeds = block_state.negative_prompt_embeds.view( + block_state.batch_size * block_state.num_images_per_prompt, seq_len, -1 + ) + + block_state.negative_prompt_embeds_mask = block_state.negative_prompt_embeds_mask.repeat( + 1, block_state.num_images_per_prompt, 1 + ) + block_state.negative_prompt_embeds_mask = block_state.negative_prompt_embeds_mask.view( + block_state.batch_size * block_state.num_images_per_prompt, seq_len + ) + + self.set_block_state(state, block_state) + + return components, state + + +class QwenImageInputsDynamicStep(ModularPipelineBlocks): + model_name = "qwenimage" + + def __init__( + self, + image_latent_inputs: List[str] = ["image_latents"], + additional_batch_inputs: List[str] = [], + ): + """Initialize a configurable step that standardizes the inputs for the denoising step. It:\n" + + This step handles multiple common tasks to prepare inputs for the denoising step: + 1. For encoded image latents, use it update height/width if None, patchifies, and expands batch size + 2. For additional_batch_inputs: Only expands batch dimensions to match final batch size + + This is a dynamic block that allows you to configure which inputs to process. + + Args: + image_latent_inputs (List[str], optional): Names of image latent tensors to process. + These will be used to determine height/width, patchified, and batch-expanded. Can be a single string or + list of strings. Defaults to ["image_latents"]. Examples: ["image_latents"], ["control_image_latents"] + additional_batch_inputs (List[str], optional): + Names of additional conditional input tensors to expand batch size. These tensors will only have their + batch dimensions adjusted to match the final batch size. Can be a single string or list of strings. + Defaults to []. Examples: ["processed_mask_image"] + + Examples: + # Configure to process image_latents (default behavior) QwenImageInputsDynamicStep() + + # Configure to process multiple image latent inputs + QwenImageInputsDynamicStep(image_latent_inputs=["image_latents", "control_image_latents"]) + + # Configure to process image latents and additional batch inputs QwenImageInputsDynamicStep( + image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"] + ) + """ + if not isinstance(image_latent_inputs, list): + image_latent_inputs = [image_latent_inputs] + if not isinstance(additional_batch_inputs, list): + additional_batch_inputs = [additional_batch_inputs] + + self._image_latent_inputs = image_latent_inputs + self._additional_batch_inputs = additional_batch_inputs + super().__init__() + + @property + def description(self) -> str: + # Functionality section + summary_section = ( + "Input processing step that:\n" + " 1. For image latent inputs: Updates height/width if None, patchifies latents, and expands batch size\n" + " 2. For additional batch inputs: Expands batch dimensions to match final batch size" + ) + + # Inputs info + inputs_info = "" + if self._image_latent_inputs or self._additional_batch_inputs: + inputs_info = "\n\nConfigured inputs:" + if self._image_latent_inputs: + inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}" + if self._additional_batch_inputs: + inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}" + + # Placement guidance + placement_section = "\n\nThis block should be placed after the encoder steps and the text input step." 
+ + return summary_section + inputs_info + placement_section + + @property + def inputs(self) -> List[InputParam]: + inputs = [ + InputParam(name="num_images_per_prompt", default=1), + InputParam(name="batch_size", required=True), + InputParam(name="height"), + InputParam(name="width"), + ] + + # Add image latent inputs + for image_latent_input_name in self._image_latent_inputs: + inputs.append(InputParam(name=image_latent_input_name)) + + # Add additional batch inputs + for input_name in self._additional_batch_inputs: + inputs.append(InputParam(name=input_name)) + + return inputs + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"), + ] + + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + # Process image latent inputs (height/width calculation, patchify, and batch expansion) + for image_latent_input_name in self._image_latent_inputs: + image_latent_tensor = getattr(block_state, image_latent_input_name) + if image_latent_tensor is None: + continue + + # 1. Calculate height/width from latents + height, width = calculate_dimension_from_latents(image_latent_tensor, components.vae_scale_factor) + block_state.height = block_state.height or height + block_state.width = block_state.width or width + + # 2. Patchify the image latent tensor + image_latent_tensor = components.pachifier.pack_latents(image_latent_tensor) + + # 3. Expand batch size + image_latent_tensor = repeat_tensor_to_batch_size( + input_name=image_latent_input_name, + input_tensor=image_latent_tensor, + num_images_per_prompt=block_state.num_images_per_prompt, + batch_size=block_state.batch_size, + ) + + setattr(block_state, image_latent_input_name, image_latent_tensor) + + # Process additional batch inputs (only batch expansion) + for input_name in self._additional_batch_inputs: + input_tensor = getattr(block_state, input_name) + if input_tensor is None: + continue + + # Only expand batch size + input_tensor = repeat_tensor_to_batch_size( + input_name=input_name, + input_tensor=input_tensor, + num_images_per_prompt=block_state.num_images_per_prompt, + batch_size=block_state.batch_size, + ) + + setattr(block_state, input_name, input_tensor) + + self.set_block_state(state, block_state) + return components, state + + +class QwenImageControlNetInputsStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return "prepare the `control_image_latents` for controlnet. Insert after all the other inputs steps." + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam(name="control_image_latents", required=True), + InputParam(name="batch_size", required=True), + InputParam(name="num_images_per_prompt", default=1), + InputParam(name="height"), + InputParam(name="width"), + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + if isinstance(components.controlnet, QwenImageMultiControlNetModel): + control_image_latents = [] + # loop through each control_image_latents + for i, control_image_latents_ in enumerate(block_state.control_image_latents): + # 1. 
update height/width if not provided + height, width = calculate_dimension_from_latents(control_image_latents_, components.vae_scale_factor) + block_state.height = block_state.height or height + block_state.width = block_state.width or width + + # 2. pack + control_image_latents_ = components.pachifier.pack_latents(control_image_latents_) + + # 3. repeat to match the batch size + control_image_latents_ = repeat_tensor_to_batch_size( + input_name=f"control_image_latents[{i}]", + input_tensor=control_image_latents_, + num_images_per_prompt=block_state.num_images_per_prompt, + batch_size=block_state.batch_size, + ) + + control_image_latents.append(control_image_latents_) + + block_state.control_image_latents = control_image_latents + + else: + # 1. update height/width if not provided + height, width = calculate_dimension_from_latents( + block_state.control_image_latents, components.vae_scale_factor + ) + block_state.height = block_state.height or height + block_state.width = block_state.width or width + + # 2. pack + block_state.control_image_latents = components.pachifier.pack_latents(block_state.control_image_latents) + + # 3. repeat to match the batch size + block_state.control_image_latents = repeat_tensor_to_batch_size( + input_name="control_image_latents", + input_tensor=block_state.control_image_latents, + num_images_per_prompt=block_state.num_images_per_prompt, + batch_size=block_state.batch_size, + ) + + block_state.control_image_latents = block_state.control_image_latents + + self.set_block_state(state, block_state) + + return components, state diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py new file mode 100644 index 000000000000..a01c742fcf68 --- /dev/null +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py @@ -0,0 +1,841 @@ +# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
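The input steps defined in `inputs.py` above all funnel batch expansion through `repeat_tensor_to_batch_size`. A minimal sketch of its behavior, assuming the import path follows this PR's file layout (`src/diffusers/modular_pipelines/qwenimage/inputs.py`):

```python
import torch

from diffusers.modular_pipelines.qwenimage.inputs import repeat_tensor_to_batch_size

# Two prompts, three images per prompt -> final batch size of 6.
image_latents = torch.randn(2, 1024, 64)  # batch size equals the number of prompts
out = repeat_tensor_to_batch_size("image_latents", image_latents, batch_size=2, num_images_per_prompt=3)
print(out.shape)  # torch.Size([6, 1024, 64]) -- each prompt's latents repeated 3 times (repeat_interleave)

# A batch-size-1 tensor is broadcast to batch_size * num_images_per_prompt.
single_latent = torch.randn(1, 1024, 64)
out = repeat_tensor_to_batch_size("image_latents", single_latent, batch_size=2, num_images_per_prompt=3)
print(out.shape)  # torch.Size([6, 1024, 64])
```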
+ +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import InsertableDict +from .before_denoise import ( + QwenImageControlNetBeforeDenoiserStep, + QwenImageCreateMaskLatentsStep, + QwenImageEditRoPEInputsStep, + QwenImagePrepareLatentsStep, + QwenImagePrepareLatentsWithStrengthStep, + QwenImageRoPEInputsStep, + QwenImageSetTimestepsStep, + QwenImageSetTimestepsWithStrengthStep, +) +from .decoders import QwenImageDecoderStep, QwenImageInpaintProcessImagesOutputStep, QwenImageProcessImagesOutputStep +from .denoise import ( + QwenImageControlNetDenoiseStep, + QwenImageDenoiseStep, + QwenImageEditDenoiseStep, + QwenImageEditInpaintDenoiseStep, + QwenImageInpaintControlNetDenoiseStep, + QwenImageInpaintDenoiseStep, + QwenImageLoopBeforeDenoiserControlNet, +) +from .encoders import ( + QwenImageControlNetVaeEncoderStep, + QwenImageEditResizeDynamicStep, + QwenImageEditTextEncoderStep, + QwenImageInpaintProcessImagesInputStep, + QwenImageProcessImagesInputStep, + QwenImageTextEncoderStep, + QwenImageVaeEncoderDynamicStep, +) +from .inputs import QwenImageControlNetInputsStep, QwenImageInputsDynamicStep, QwenImageTextInputsStep + + +logger = logging.get_logger(__name__) + +# 1. QwenImage + +## 1.1 QwenImage/text2image + +#### QwenImage/decode +#### (standard decode step works for most tasks except for inpaint) +QwenImageDecodeBlocks = InsertableDict( + [ + ("decode", QwenImageDecoderStep()), + ("postprocess", QwenImageProcessImagesOutputStep()), + ] +) + + +class QwenImageDecodeStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageDecodeBlocks.values() + block_names = QwenImageDecodeBlocks.keys() + + @property + def description(self): + return "Decode step that decodes the latents to images and postprocess the generated image." + + +#### QwenImage/text2image presets +TEXT2IMAGE_BLOCKS = InsertableDict( + [ + ("text_encoder", QwenImageTextEncoderStep()), + ("input", QwenImageTextInputsStep()), + ("prepare_latents", QwenImagePrepareLatentsStep()), + ("set_timesteps", QwenImageSetTimestepsStep()), + ("prepare_rope_inputs", QwenImageRoPEInputsStep()), + ("denoise", QwenImageDenoiseStep()), + ("decode", QwenImageDecodeStep()), + ] +) + + +## 1.2 QwenImage/inpaint + +#### QwenImage/inpaint vae encoder +QwenImageInpaintVaeEncoderBlocks = InsertableDict( + [ + ( + "preprocess", + QwenImageInpaintProcessImagesInputStep, + ), # image, mask_image -> processed_image, processed_mask_image, mask_overlay_kwargs + ("encode", QwenImageVaeEncoderDynamicStep()), # processed_image -> image_latents + ] +) + + +class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageInpaintVaeEncoderBlocks.values() + block_names = QwenImageInpaintVaeEncoderBlocks.keys() + + @property + def description(self) -> str: + return ( + "This step is used for processing image and mask inputs for inpainting tasks. It:\n" + " - Resizes the image to the target size, based on `height` and `width`.\n" + " - Processes and updates `image` and `mask_image`.\n" + " - Creates `image_latents`." 
+ ) + + +#### QwenImage/inpaint inputs +QwenImageInpaintInputBlocks = InsertableDict( + [ + ("text_inputs", QwenImageTextInputsStep()), # default step to process text embeddings + ( + "additional_inputs", + QwenImageInputsDynamicStep( + image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"] + ), + ), + ] +) + + +class QwenImageInpaintInputStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageInpaintInputBlocks.values() + block_names = QwenImageInpaintInputBlocks.keys() + + @property + def description(self): + return "Input step that prepares the inputs for the inpainting denoising step. It:\n" + " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents` and `processed_mask_image`).\n" + " - update height/width based on `image_latents`, patchify `image_latents`." + + +# QwenImage/inpaint prepare latents +QwenImageInpaintPrepareLatentsBlocks = InsertableDict( + [ + ("add_noise_to_latents", QwenImagePrepareLatentsWithStrengthStep()), + ("create_mask_latents", QwenImageCreateMaskLatentsStep()), + ] +) + + +class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageInpaintPrepareLatentsBlocks.values() + block_names = QwenImageInpaintPrepareLatentsBlocks.keys() + + @property + def description(self) -> str: + return ( + "This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:\n" + " - Add noise to the image latents to create the latents input for the denoiser.\n" + " - Create the patchified latents `mask` based on the processed mask image.\n" + ) + + +#### QwenImage/inpaint decode +QwenImageInpaintDecodeBlocks = InsertableDict( + [ + ("decode", QwenImageDecoderStep()), + ("postprocess", QwenImageInpaintProcessImagesOutputStep()), + ] +) + + +class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageInpaintDecodeBlocks.values() + block_names = QwenImageInpaintDecodeBlocks.keys() + + @property + def description(self): + return "Decode step that decodes the latents to images and postprocess the generated image, and optionally apply the mask overlay to the original image." + + +#### QwenImage/inpaint presets +INPAINT_BLOCKS = InsertableDict( + [ + ("text_encoder", QwenImageTextEncoderStep()), + ("vae_encoder", QwenImageInpaintVaeEncoderStep()), + ("input", QwenImageInpaintInputStep()), + ("prepare_latents", QwenImagePrepareLatentsStep()), + ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), + ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), + ("prepare_rope_inputs", QwenImageRoPEInputsStep()), + ("denoise", QwenImageInpaintDenoiseStep()), + ("decode", QwenImageInpaintDecodeStep()), + ] +) + + +## 1.3 QwenImage/img2img + +#### QwenImage/img2img vae encoder +QwenImageImg2ImgVaeEncoderBlocks = InsertableDict( + [ + ("preprocess", QwenImageProcessImagesInputStep()), + ("encode", QwenImageVaeEncoderDynamicStep()), + ] +) + + +class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): + model_name = "qwenimage" + + block_classes = QwenImageImg2ImgVaeEncoderBlocks.values() + block_names = QwenImageImg2ImgVaeEncoderBlocks.keys() + + @property + def description(self) -> str: + return "Vae encoder step that preprocess and encode the image inputs into their latent representations."
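+# Data flow for the VAE encoder steps above (illustrative summary, not enforced here): `image` is
+# resized/normalized by the preprocess step into `processed_image`, which the encode step turns into
+# `image_latents` at 1/vae_scale_factor (8x by default) of the pixel resolution; the pachifier later
+# packs these latents into 2x2 patches for the transformer.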
+ + +#### QwenImage/img2img inputs +QwenImageImg2ImgInputBlocks = InsertableDict( + [ + ("text_inputs", QwenImageTextInputsStep()), # default step to process text embeddings + ("additional_inputs", QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])), + ] +) + + +class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageImg2ImgInputBlocks.values() + block_names = QwenImageImg2ImgInputBlocks.keys() + + @property + def description(self): + return "Input step that prepares the inputs for the img2img denoising step. It:\n" + " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n" + " - update height/width based `image_latents`, patchify `image_latents`." + + +#### QwenImage/img2img presets +IMAGE2IMAGE_BLOCKS = InsertableDict( + [ + ("text_encoder", QwenImageTextEncoderStep()), + ("vae_encoder", QwenImageImg2ImgVaeEncoderStep()), + ("input", QwenImageImg2ImgInputStep()), + ("prepare_latents", QwenImagePrepareLatentsStep()), + ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), + ("prepare_img2img_latents", QwenImagePrepareLatentsWithStrengthStep()), + ("prepare_rope_inputs", QwenImageRoPEInputsStep()), + ("denoise", QwenImageDenoiseStep()), + ("decode", QwenImageDecodeStep()), + ] +) + + +## 1.4 QwenImage/controlnet + +#### QwenImage/controlnet presets +CONTROLNET_BLOCKS = InsertableDict( + [ + ("controlnet_vae_encoder", QwenImageControlNetVaeEncoderStep()), # vae encoder step for control_image + ("controlnet_inputs", QwenImageControlNetInputsStep()), # additional input step for controlnet + ( + "controlnet_before_denoise", + QwenImageControlNetBeforeDenoiserStep(), + ), # before denoise step (after set_timesteps step) + ( + "controlnet_denoise_loop_before", + QwenImageLoopBeforeDenoiserControlNet(), + ), # controlnet loop step (insert before the denoiseloop_denoiser) + ] +) + + +## 1.5 QwenImage/auto encoders + + +#### for inpaint and img2img tasks +class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): + block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep] + block_names = ["inpaint", "img2img"] + block_trigger_inputs = ["mask_image", "image"] + + @property + def description(self): + return ( + "Vae encoder step that encode the image inputs into their latent representations.\n" + + "This is an auto pipeline block.\n" + + " - `QwenImageInpaintVaeEncoderStep` (inpaint) is used when `mask_image` is provided.\n" + + " - `QwenImageImg2ImgVaeEncoderStep` (img2img) is used when `image` is provided.\n" + + " - if `mask_image` or `image` is not provided, step will be skipped." + ) + + +# for controlnet tasks +class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): + block_classes = [QwenImageControlNetVaeEncoderStep] + block_names = ["controlnet"] + block_trigger_inputs = ["control_image"] + + @property + def description(self): + return ( + "Vae encoder step that encode the image inputs into their latent representations.\n" + + "This is an auto pipeline block.\n" + + " - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.\n" + + " - if `control_image` is not provided, step will be skipped." 
+ ) + + +## 1.6 QwenImage/auto inputs + + +# text2image/inpaint/img2img +class QwenImageAutoInputStep(AutoPipelineBlocks): + block_classes = [QwenImageInpaintInputStep, QwenImageImg2ImgInputStep, QwenImageTextInputsStep] + block_names = ["inpaint", "img2img", "text2image"] + block_trigger_inputs = ["processed_mask_image", "image_latents", None] + + @property + def description(self): + return ( + "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n" + " This is an auto pipeline block that works for text2image/inpaint/img2img tasks.\n" + + " - `QwenImageInpaintInputStep` (inpaint) is used when `processed_mask_image` is provided.\n" + + " - `QwenImageImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n" + + " - `QwenImageTextInputsStep` (text2image) is used when both `processed_mask_image` and `image_latents` are not provided.\n" + ) + + +# controlnet +class QwenImageOptionalControlNetInputStep(AutoPipelineBlocks): + block_classes = [QwenImageControlNetInputsStep] + block_names = ["controlnet"] + block_trigger_inputs = ["control_image_latents"] + + @property + def description(self): + return ( + "Controlnet input step that prepare the control_image_latents input.\n" + + "This is an auto pipeline block.\n" + + " - `QwenImageControlNetInputsStep` (controlnet) is used when `control_image_latents` is provided.\n" + + " - if `control_image_latents` is not provided, step will be skipped." + ) + + +## 1.7 QwenImage/auto before denoise step +# compose the steps into a BeforeDenoiseStep for text2image/img2img/inpaint tasks before combine into an auto step + +# QwenImage/text2image before denoise +QwenImageText2ImageBeforeDenoiseBlocks = InsertableDict( + [ + ("prepare_latents", QwenImagePrepareLatentsStep()), + ("set_timesteps", QwenImageSetTimestepsStep()), + ("prepare_rope_inputs", QwenImageRoPEInputsStep()), + ] +) + + +class QwenImageText2ImageBeforeDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageText2ImageBeforeDenoiseBlocks.values() + block_names = QwenImageText2ImageBeforeDenoiseBlocks.keys() + + @property + def description(self): + return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for text2image task." + + +# QwenImage/inpaint before denoise +QwenImageInpaintBeforeDenoiseBlocks = InsertableDict( + [ + ("prepare_latents", QwenImagePrepareLatentsStep()), + ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), + ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), + ("prepare_rope_inputs", QwenImageRoPEInputsStep()), + ] +) + + +class QwenImageInpaintBeforeDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageInpaintBeforeDenoiseBlocks.values() + block_names = QwenImageInpaintBeforeDenoiseBlocks.keys() + + @property + def description(self): + return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task." 
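+# How the Auto* blocks in sections 1.5-1.6 (and 1.8-1.9 below) pick a branch, conceptually: the
+# sub-blocks are checked in order and the first one whose trigger input is present in the pipeline
+# state runs; a `None` trigger acts as the fallback. This is only a sketch of the dispatch rule,
+# the actual selection logic lives in `AutoPipelineBlocks`:
+#
+#     for name, trigger in zip(block_names, block_trigger_inputs):
+#         if trigger is None or state_value_is_provided(trigger):   # pseudocode helpers
+#             run_sub_block(name)
+#             break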
+ + +# QwenImage/img2img before denoise +QwenImageImg2ImgBeforeDenoiseBlocks = InsertableDict( + [ + ("prepare_latents", QwenImagePrepareLatentsStep()), + ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), + ("prepare_img2img_latents", QwenImagePrepareLatentsWithStrengthStep()), + ("prepare_rope_inputs", QwenImageRoPEInputsStep()), + ] +) + + +class QwenImageImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageImg2ImgBeforeDenoiseBlocks.values() + block_names = QwenImageImg2ImgBeforeDenoiseBlocks.keys() + + @property + def description(self): + return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." + + +# auto before_denoise step for text2image, inpaint, img2img tasks +class QwenImageAutoBeforeDenoiseStep(AutoPipelineBlocks): + block_classes = [ + QwenImageInpaintBeforeDenoiseStep, + QwenImageImg2ImgBeforeDenoiseStep, + QwenImageText2ImageBeforeDenoiseStep, + ] + block_names = ["inpaint", "img2img", "text2image"] + block_trigger_inputs = ["processed_mask_image", "image_latents", None] + + @property + def description(self): + return ( + "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n" + + "This is an auto pipeline block that works for text2img, inpainting, img2img tasks.\n" + + " - `QwenImageInpaintBeforeDenoiseStep` (inpaint) is used when `processed_mask_image` is provided.\n" + + " - `QwenImageImg2ImgBeforeDenoiseStep` (img2img) is used when `image_latents` is provided.\n" + + " - `QwenImageText2ImageBeforeDenoiseStep` (text2image) is used when both `processed_mask_image` and `image_latents` are not provided.\n" + ) + + +# auto before_denoise step for controlnet tasks +class QwenImageOptionalControlNetBeforeDenoiseStep(AutoPipelineBlocks): + block_classes = [QwenImageControlNetBeforeDenoiserStep] + block_names = ["controlnet"] + block_trigger_inputs = ["control_image_latents"] + + @property + def description(self): + return ( + "Controlnet before denoise step that prepare the controlnet input.\n" + + "This is an auto pipeline block.\n" + + " - `QwenImageControlNetBeforeDenoiserStep` (controlnet) is used when `control_image_latents` is provided.\n" + + " - if `control_image_latents` is not provided, step will be skipped." + ) + + +## 1.8 QwenImage/auto denoise + + +# auto denoise step for controlnet tasks: works for all tasks with controlnet +class QwenImageControlNetAutoDenoiseStep(AutoPipelineBlocks): + block_classes = [QwenImageInpaintControlNetDenoiseStep, QwenImageControlNetDenoiseStep] + block_names = ["inpaint_denoise", "denoise"] + block_trigger_inputs = ["mask", None] + + @property + def description(self): + return ( + "Controlnet step during the denoising process. 
\n" + " This is an auto pipeline block that works for inpaint and text2image/img2img tasks with controlnet.\n" + + " - `QwenImageInpaintControlNetDenoiseStep` (inpaint) is used when `mask` is provided.\n" + + " - `QwenImageControlNetDenoiseStep` (text2image/img2img) is used when `mask` is not provided.\n" + ) + + +# auto denoise step for everything: works for all tasks with or without controlnet +class QwenImageAutoDenoiseStep(AutoPipelineBlocks): + block_classes = [ + QwenImageControlNetAutoDenoiseStep, + QwenImageInpaintDenoiseStep, + QwenImageDenoiseStep, + ] + block_names = ["controlnet_denoise", "inpaint_denoise", "denoise"] + block_trigger_inputs = ["control_image_latents", "mask", None] + + @property + def description(self): + return ( + "Denoise step that iteratively denoise the latents. \n" + " This is an auto pipeline block that works for inpaint/text2image/img2img tasks. It also works with controlnet\n" + + " - `QwenImageControlNetAutoDenoiseStep` (controlnet) is used when `control_image_latents` is provided.\n" + + " - `QwenImageInpaintDenoiseStep` (inpaint) is used when `mask` is provided and `control_image_latents` is not provided.\n" + + " - `QwenImageDenoiseStep` (text2image/img2img) is used when `mask` is not provided and `control_image_latents` is not provided.\n" + ) + + +## 1.9 QwenImage/auto decode +# auto decode step for inpaint and text2image tasks + + +class QwenImageAutoDecodeStep(AutoPipelineBlocks): + block_classes = [QwenImageInpaintDecodeStep, QwenImageDecodeStep] + block_names = ["inpaint_decode", "decode"] + block_trigger_inputs = ["mask", None] + + @property + def description(self): + return ( + "Decode step that decode the latents into images. \n" + " This is an auto pipeline block that works for inpaint/text2image/img2img tasks, for both QwenImage and QwenImage-Edit.\n" + + " - `QwenImageInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n" + + " - `QwenImageDecodeStep` (text2image/img2img) is used when `mask` is not provided.\n" + ) + + +## 1.10 QwenImage/auto block & presets +AUTO_BLOCKS = InsertableDict( + [ + ("text_encoder", QwenImageTextEncoderStep()), + ("vae_encoder", QwenImageAutoVaeEncoderStep()), + ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()), + ("input", QwenImageAutoInputStep()), + ("controlnet_input", QwenImageOptionalControlNetInputStep()), + ("before_denoise", QwenImageAutoBeforeDenoiseStep()), + ("controlnet_before_denoise", QwenImageOptionalControlNetBeforeDenoiseStep()), + ("denoise", QwenImageAutoDenoiseStep()), + ("decode", QwenImageAutoDecodeStep()), + ] +) + + +class QwenImageAutoBlocks(SequentialPipelineBlocks): + model_name = "qwenimage" + + block_classes = AUTO_BLOCKS.values() + block_names = AUTO_BLOCKS.keys() + + @property + def description(self): + return ( + "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n" + + "- for image-to-image generation, you need to provide `image`\n" + + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" + + "- to run the controlnet workflow, you need to provide `control_image`\n" + + "- for text-to-image generation, all you need to provide is `prompt`" + ) + + +# 2. 
QwenImage-Edit + +## 2.1 QwenImage-Edit/edit + +#### QwenImage-Edit/edit vl encoder: take both image and text prompts +QwenImageEditVLEncoderBlocks = InsertableDict( + [ + ("resize", QwenImageEditResizeDynamicStep()), + ("encode", QwenImageEditTextEncoderStep()), + ] +) + + +class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageEditVLEncoderBlocks.values() + block_names = QwenImageEditVLEncoderBlocks.keys() + + @property + def description(self) -> str: + return "QwenImage-Edit VL encoder step that encode the image and text prompts together." + + +#### QwenImage-Edit/edit vae encoder +QwenImageEditVaeEncoderBlocks = InsertableDict( + [ + ("resize", QwenImageEditResizeDynamicStep()), # edit has a different resize step + ("preprocess", QwenImageProcessImagesInputStep()), # resized_image -> processed_image + ("encode", QwenImageVaeEncoderDynamicStep()), # processed_image -> image_latents + ] +) + + +class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageEditVaeEncoderBlocks.values() + block_names = QwenImageEditVaeEncoderBlocks.keys() + + @property + def description(self) -> str: + return "Vae encoder step that encode the image inputs into their latent representations." + + +#### QwenImage-Edit/edit input +QwenImageEditInputBlocks = InsertableDict( + [ + ("text_inputs", QwenImageTextInputsStep()), # default step to process text embeddings + ("additional_inputs", QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])), + ] +) + + +class QwenImageEditInputStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageEditInputBlocks.values() + block_names = QwenImageEditInputBlocks.keys() + + @property + def description(self): + return "Input step that prepares the inputs for the edit denoising step. It:\n" + " - make sure the text embeddings have consistent batch size as well as the additional inputs: \n" + " - `image_latents`.\n" + " - update height/width based on `image_latents`, patchify `image_latents`." + + +#### QwenImage/edit presets +EDIT_BLOCKS = InsertableDict( + [ + ("text_encoder", QwenImageEditVLEncoderStep()), + ("vae_encoder", QwenImageEditVaeEncoderStep()), + ("input", QwenImageEditInputStep()), + ("prepare_latents", QwenImagePrepareLatentsStep()), + ("set_timesteps", QwenImageSetTimestepsStep()), + ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), + ("denoise", QwenImageEditDenoiseStep()), + ("decode", QwenImageDecodeStep()), + ] +) + + +## 2.2 QwenImage-Edit/edit inpaint + +#### QwenImage-Edit/edit inpaint vae encoder: the difference from regular inpaint is the resize step +QwenImageEditInpaintVaeEncoderBlocks = InsertableDict( + [ + ("resize", QwenImageEditResizeDynamicStep()), # image -> resized_image + ( + "preprocess", + QwenImageInpaintProcessImagesInputStep, + ), # resized_image, mask_image -> processed_image, processed_mask_image, mask_overlay_kwargs + ( + "encode", + QwenImageVaeEncoderDynamicStep(input_name="processed_image", output_name="image_latents"), + ), # processed_image -> image_latents + ] +) + + +class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageEditInpaintVaeEncoderBlocks.values() + block_names = QwenImageEditInpaintVaeEncoderBlocks.keys() + + @property + def description(self) -> str: + return ( + "This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks.
It:\n" + " - resize the image for target area (1024 * 1024) while maintaining the aspect ratio.\n" + " - process the resized image and mask image.\n" + " - create image latents." + ) + + +#### QwenImage-Edit/edit inpaint presets +EDIT_INPAINT_BLOCKS = InsertableDict( + [ + ("text_encoder", QwenImageEditVLEncoderStep()), + ("vae_encoder", QwenImageEditInpaintVaeEncoderStep()), + ("input", QwenImageInpaintInputStep()), + ("prepare_latents", QwenImagePrepareLatentsStep()), + ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), + ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), + ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), + ("denoise", QwenImageEditInpaintDenoiseStep()), + ("decode", QwenImageInpaintDecodeStep()), + ] +) + + +## 2.3 QwenImage-Edit/auto encoders + + +class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks): + block_classes = [ + QwenImageEditInpaintVaeEncoderStep, + QwenImageEditVaeEncoderStep, + ] + block_names = ["edit_inpaint", "edit"] + block_trigger_inputs = ["mask_image", "image"] + + @property + def description(self): + return ( + "Vae encoder step that encode the image inputs into their latent representations. \n" + " This is an auto pipeline block that works for edit and edit_inpaint tasks.\n" + + " - `QwenImageEditInpaintVaeEncoderStep` (edit_inpaint) is used when `mask_image` is provided.\n" + + " - `QwenImageEditVaeEncoderStep` (edit) is used when `image` is provided.\n" + + " - if `mask_image` or `image` is not provided, step will be skipped." + ) + + +## 2.4 QwenImage-Edit/auto inputs +class QwenImageEditAutoInputStep(AutoPipelineBlocks): + block_classes = [QwenImageInpaintInputStep, QwenImageEditInputStep] + block_names = ["edit_inpaint", "edit"] + block_trigger_inputs = ["processed_mask_image", "image"] + + @property + def description(self): + return ( + "Input step that prepares the inputs for the edit denoising step.\n" + + " It is an auto pipeline block that works for edit and edit_inpaint tasks.\n" + + " - `QwenImageInpaintInputStep` (edit_inpaint) is used when `processed_mask_image` is provided.\n" + + " - `QwenImageEditInputStep` (edit) is used when `image_latents` is provided.\n" + + " - if `processed_mask_image` or `image_latents` is not provided, step will be skipped." + ) + + +## 2.5 QwenImage-Edit/auto before denoise +# compose the steps into a BeforeDenoiseStep for edit and edit_inpaint tasks before combine into an auto step + +#### QwenImage-Edit/edit before denoise +QwenImageEditBeforeDenoiseBlocks = InsertableDict( + [ + ("prepare_latents", QwenImagePrepareLatentsStep()), + ("set_timesteps", QwenImageSetTimestepsStep()), + ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), + ] +) + + +class QwenImageEditBeforeDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageEditBeforeDenoiseBlocks.values() + block_names = QwenImageEditBeforeDenoiseBlocks.keys() + + @property + def description(self): + return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for edit task." 
+ + +#### QwenImage-Edit/edit inpaint before denoise +QwenImageEditInpaintBeforeDenoiseBlocks = InsertableDict( + [ + ("prepare_latents", QwenImagePrepareLatentsStep()), + ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()), + ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), + ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), + ] +) + + +class QwenImageEditInpaintBeforeDenoiseStep(SequentialPipelineBlocks): + model_name = "qwenimage" + block_classes = QwenImageEditInpaintBeforeDenoiseBlocks.values() + block_names = QwenImageEditInpaintBeforeDenoiseBlocks.keys() + + @property + def description(self): + return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for edit inpaint task." + + +# auto before_denoise step for edit and edit_inpaint tasks +class QwenImageEditAutoBeforeDenoiseStep(AutoPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = [ + QwenImageEditInpaintBeforeDenoiseStep, + QwenImageEditBeforeDenoiseStep, + ] + block_names = ["edit_inpaint", "edit"] + block_trigger_inputs = ["processed_mask_image", "image_latents"] + + @property + def description(self): + return ( + "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n" + + "This is an auto pipeline block that works for edit (img2img) and edit inpaint tasks.\n" + + " - `QwenImageEditInpaintBeforeDenoiseStep` (edit_inpaint) is used when `processed_mask_image` is provided.\n" + + " - `QwenImageEditBeforeDenoiseStep` (edit) is used when `image_latents` is provided and `processed_mask_image` is not provided.\n" + + " - if `image_latents` or `processed_mask_image` is not provided, step will be skipped." + ) + + +## 2.6 QwenImage-Edit/auto denoise + + +class QwenImageEditAutoDenoiseStep(AutoPipelineBlocks): + model_name = "qwenimage-edit" + + block_classes = [QwenImageEditInpaintDenoiseStep, QwenImageEditDenoiseStep] + block_names = ["inpaint_denoise", "denoise"] + block_trigger_inputs = ["processed_mask_image", "image_latents"] + + @property + def description(self): + return ( + "Denoise step that iteratively denoise the latents. \n" + + "This block supports edit (img2img) and edit inpaint tasks for QwenImage Edit. \n" + + " - `QwenImageEditInpaintDenoiseStep` (inpaint) is used when `processed_mask_image` is provided.\n" + + " - `QwenImageEditDenoiseStep` (img2img) is used when `image_latents` is provided.\n" + + " - if `processed_mask_image` or `image_latents` is not provided, step will be skipped." + ) + + +## 2.7 QwenImage-Edit/auto blocks & presets + +EDIT_AUTO_BLOCKS = InsertableDict( + [ + ("text_encoder", QwenImageEditVLEncoderStep()), + ("vae_encoder", QwenImageEditAutoVaeEncoderStep()), + ("input", QwenImageEditAutoInputStep()), + ("before_denoise", QwenImageEditAutoBeforeDenoiseStep()), + ("denoise", QwenImageEditAutoDenoiseStep()), + ("decode", QwenImageAutoDecodeStep()), + ] +) + + +class QwenImageEditAutoBlocks(SequentialPipelineBlocks): + model_name = "qwenimage-edit" + block_classes = EDIT_AUTO_BLOCKS.values() + block_names = EDIT_AUTO_BLOCKS.keys() + + @property + def description(self): + return ( + "Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.\n" + + "- for edit (img2img) generation, you need to provide `image`\n" + + "- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" + ) + + +# 3. 
all block presets supported in QwenImage & QwenImage-Edit + + +ALL_BLOCKS = { + "text2image": TEXT2IMAGE_BLOCKS, + "img2img": IMAGE2IMAGE_BLOCKS, + "edit": EDIT_BLOCKS, + "edit_inpaint": EDIT_INPAINT_BLOCKS, + "inpaint": INPAINT_BLOCKS, + "controlnet": CONTROLNET_BLOCKS, + "auto": AUTO_BLOCKS, + "edit_auto": EDIT_AUTO_BLOCKS, +} diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py new file mode 100644 index 000000000000..fe9757f41bcc --- /dev/null +++ b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py @@ -0,0 +1,202 @@ +# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import QwenImageLoraLoaderMixin +from ..modular_pipeline import ModularPipeline + + +class QwenImagePachifier(ConfigMixin): + """ + A class to pack and unpack latents for QwenImage. + """ + + config_name = "config.json" + + @register_to_config + def __init__( + self, + patch_size: int = 2, + ): + super().__init__() + + def pack_latents(self, latents): + if latents.ndim != 4 and latents.ndim != 5: + raise ValueError(f"Latents must have 4 or 5 dimensions, but got {latents.ndim}") + + if latents.ndim == 4: + latents = latents.unsqueeze(2) + + batch_size, num_channels_latents, num_latent_frames, latent_height, latent_width = latents.shape + patch_size = self.config.patch_size + + if latent_height % patch_size != 0 or latent_width % patch_size != 0: + raise ValueError( + f"Latent height and width must be divisible by {patch_size}, but got {latent_height} and {latent_width}" + ) + + latents = latents.view( + batch_size, + num_channels_latents, + latent_height // patch_size, + patch_size, + latent_width // patch_size, + patch_size, + ) + latents = latents.permute( + 0, 2, 4, 1, 3, 5 + ) # Batch_size, num_patches_height, num_patches_width, num_channels_latents, patch_size, patch_size + latents = latents.reshape( + batch_size, + (latent_height // patch_size) * (latent_width // patch_size), + num_channels_latents * patch_size * patch_size, + ) + + return latents + + def unpack_latents(self, latents, height, width, vae_scale_factor=8): + if latents.ndim != 3: + raise ValueError(f"Latents must have 3 dimensions, but got {latents.ndim}") + + batch_size, num_patches, channels = latents.shape + patch_size = self.config.patch_size + + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. 
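+ # e.g. with height=width=1024, vae_scale_factor=8 and patch_size=2 (the defaults here):
+ #   latent height = latent width = 2 * (1024 // 16) = 128, so the packed sequence has
+ #   64 * 64 = 4096 patches with 16 * 2 * 2 = 64 channels per patch.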
+ height = patch_size * (int(height) // (vae_scale_factor * patch_size)) + width = patch_size * (int(width) // (vae_scale_factor * patch_size)) + + latents = latents.view( + batch_size, + height // patch_size, + width // patch_size, + channels // (patch_size * patch_size), + patch_size, + patch_size, + ) + latents = latents.permute(0, 3, 1, 4, 2, 5) + + latents = latents.reshape(batch_size, channels // (patch_size * patch_size), 1, height, width) + + return latents + + +class QwenImageModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin): + """ + A ModularPipeline for QwenImage. + + + + This is an experimental feature and is likely to change in the future. + + + """ + + @property + def default_height(self): + return self.default_sample_size * self.vae_scale_factor + + @property + def default_width(self): + return self.default_sample_size * self.vae_scale_factor + + @property + def default_sample_size(self): + return 128 + + @property + def vae_scale_factor(self): + vae_scale_factor = 8 + if hasattr(self, "vae") and self.vae is not None: + vae_scale_factor = 2 ** len(self.vae.temperal_downsample) + return vae_scale_factor + + @property + def num_channels_latents(self): + num_channels_latents = 16 + if hasattr(self, "transformer") and self.transformer is not None: + num_channels_latents = self.transformer.config.in_channels // 4 + return num_channels_latents + + @property + def is_guidance_distilled(self): + is_guidance_distilled = False + if hasattr(self, "transformer") and self.transformer is not None: + is_guidance_distilled = self.transformer.config.guidance_embeds + return is_guidance_distilled + + @property + def requires_unconditional_embeds(self): + requires_unconditional_embeds = False + + if hasattr(self, "guider") and self.guider is not None: + requires_unconditional_embeds = self.guider._enabled and self.guider.num_conditions > 1 + + return requires_unconditional_embeds + + +class QwenImageEditModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin): + """ + A ModularPipeline for QwenImage-Edit. + + + + This is an experimental feature and is likely to change in the future. + + + """ + + # YiYi TODO: qwen edit should not provide default height/width, should be derived from the resized input image (after adjustment) produced by the resize step. 
+ @property + def default_height(self): + return self.default_sample_size * self.vae_scale_factor + + @property + def default_width(self): + return self.default_sample_size * self.vae_scale_factor + + @property + def default_sample_size(self): + return 128 + + @property + def vae_scale_factor(self): + vae_scale_factor = 8 + if hasattr(self, "vae") and self.vae is not None: + vae_scale_factor = 2 ** len(self.vae.temperal_downsample) + return vae_scale_factor + + @property + def num_channels_latents(self): + num_channels_latents = 16 + if hasattr(self, "transformer") and self.transformer is not None: + num_channels_latents = self.transformer.config.in_channels // 4 + return num_channels_latents + + @property + def is_guidance_distilled(self): + is_guidance_distilled = False + if hasattr(self, "transformer") and self.transformer is not None: + is_guidance_distilled = self.transformer.config.guidance_embeds + return is_guidance_distilled + + @property + def requires_unconditional_embeds(self): + requires_unconditional_embeds = False + + if hasattr(self, "guider") and self.guider is not None: + requires_unconditional_embeds = self.guider._enabled and self.guider.num_conditions > 1 + + return requires_unconditional_embeds diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py index 0ee37f520135..e84f5cad1ab4 100644 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/modular_pipeline.py @@ -76,6 +76,7 @@ def vae_scale_factor(self): vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) return vae_scale_factor + # YiYi TODO: change to num_channels_latents @property def num_channels_unet(self): num_channels_unet = 4 diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py index ebabf179954b..880984eeb8a0 100644 --- a/src/diffusers/pipelines/auto_pipeline.py +++ b/src/diffusers/pipelines/auto_pipeline.py @@ -91,6 +91,14 @@ StableDiffusionXLPAGPipeline, ) from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline +from .qwenimage import ( + QwenImageControlNetPipeline, + QwenImageEditInpaintPipeline, + QwenImageEditPipeline, + QwenImageImg2ImgPipeline, + QwenImageInpaintPipeline, + QwenImagePipeline, +) from .sana import SanaPipeline from .stable_cascade import StableCascadeCombinedPipeline, StableCascadeDecoderPipeline from .stable_diffusion import ( @@ -150,6 +158,8 @@ ("cogview3", CogView3PlusPipeline), ("cogview4", CogView4Pipeline), ("cogview4-control", CogView4ControlPipeline), + ("qwenimage", QwenImagePipeline), + ("qwenimage-controlnet", QwenImageControlNetPipeline), ] ) @@ -174,6 +184,8 @@ ("flux-controlnet", FluxControlNetImg2ImgPipeline), ("flux-control", FluxControlImg2ImgPipeline), ("flux-kontext", FluxKontextPipeline), + ("qwenimage", QwenImageImg2ImgPipeline), + ("qwenimage-edit", QwenImageEditPipeline), ] ) @@ -195,6 +207,8 @@ ("flux-controlnet", FluxControlNetInpaintPipeline), ("flux-control", FluxControlInpaintPipeline), ("stable-diffusion-pag", StableDiffusionPAGInpaintPipeline), + ("qwenimage", QwenImageInpaintPipeline), + ("qwenimage-edit", QwenImageEditInpaintPipeline), ] ) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 91eefc5c10e0..cd4d965e57dc 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ 
b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -32,6 +32,66 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class QwenImageAutoBlocks(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class QwenImageEditAutoBlocks(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class QwenImageEditModularPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class QwenImageModularPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class StableDiffusionXLAutoBlocks(metaclass=DummyObject): _backends = ["torch", "transformers"]