diff --git a/examples/text_to_image/requirements.txt b/examples/text_to_image/requirements.txt index be05fe3fcdc5..c3ffa42f0edc 100644 --- a/examples/text_to_image/requirements.txt +++ b/examples/text_to_image/requirements.txt @@ -5,4 +5,4 @@ datasets>=2.19.1 ftfy tensorboard Jinja2 -peft>=0.17.0 +peft==0.7.0 diff --git a/examples/text_to_image/requirements_sdxl.txt b/examples/text_to_image/requirements_sdxl.txt index 4dacc26ce4bb..64cbc9205fd0 100644 --- a/examples/text_to_image/requirements_sdxl.txt +++ b/examples/text_to_image/requirements_sdxl.txt @@ -5,4 +5,4 @@ ftfy tensorboard Jinja2 datasets -peft>=0.17.0 \ No newline at end of file +peft==0.7.0 \ No newline at end of file diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 686e8d99dabf..8867250deda8 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -390,8 +390,6 @@ "QwenImageAutoBlocks", "QwenImageEditAutoBlocks", "QwenImageEditModularPipeline", - "QwenImageEditPlusAutoBlocks", - "QwenImageEditPlusModularPipeline", "QwenImageModularPipeline", "StableDiffusionXLAutoBlocks", "StableDiffusionXLModularPipeline", @@ -1054,8 +1052,6 @@ QwenImageAutoBlocks, QwenImageEditAutoBlocks, QwenImageEditModularPipeline, - QwenImageEditPlusAutoBlocks, - QwenImageEditPlusModularPipeline, QwenImageModularPipeline, StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline, diff --git a/src/diffusers/models/transformers/transformer_ltx.py b/src/diffusers/models/transformers/transformer_ltx.py index 685c73c07c75..9f3840690d81 100644 --- a/src/diffusers/models/transformers/transformer_ltx.py +++ b/src/diffusers/models/transformers/transformer_ltx.py @@ -353,9 +353,7 @@ def forward( norm_hidden_states = self.norm1(hidden_states) num_ada_params = self.scale_shift_table.shape[0] - ada_values = self.scale_shift_table[None, None].to(temb.device) + temb.reshape( - batch_size, temb.size(1), num_ada_params, -1 - ) + ada_values = self.scale_shift_table[None, None] + temb.reshape(batch_size, temb.size(1), num_ada_params, -1) shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ada_values.unbind(dim=2) norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa diff --git a/src/diffusers/models/transformers/transformer_wan.py b/src/diffusers/models/transformers/transformer_wan.py index dd75fb124f1a..25c055fb563c 100644 --- a/src/diffusers/models/transformers/transformer_wan.py +++ b/src/diffusers/models/transformers/transformer_wan.py @@ -682,12 +682,12 @@ def forward( # 5. Output norm, projection & unpatchify if temb.ndim == 3: # batch_size, seq_len, inner_dim (wan 2.2 ti2v) - shift, scale = (self.scale_shift_table.unsqueeze(0).to(temb.device) + temb.unsqueeze(2)).chunk(2, dim=2) + shift, scale = (self.scale_shift_table.unsqueeze(0) + temb.unsqueeze(2)).chunk(2, dim=2) shift = shift.squeeze(2) scale = scale.squeeze(2) else: # batch_size, inner_dim - shift, scale = (self.scale_shift_table.to(temb.device) + temb.unsqueeze(1)).chunk(2, dim=1) + shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1) # Move the shift and scale tensors to the same device as hidden_states. 
# When using multi-GPU inference via accelerate these will be on the diff --git a/src/diffusers/models/transformers/transformer_wan_vace.py b/src/diffusers/models/transformers/transformer_wan_vace.py index 30c38c244ad8..e5a9c7e0a659 100644 --- a/src/diffusers/models/transformers/transformer_wan_vace.py +++ b/src/diffusers/models/transformers/transformer_wan_vace.py @@ -103,7 +103,7 @@ def forward( control_hidden_states = control_hidden_states + hidden_states shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = ( - self.scale_shift_table.to(temb.device) + temb.float() + self.scale_shift_table + temb.float() ).chunk(6, dim=1) # 1. Self-attention @@ -361,7 +361,7 @@ def forward( hidden_states = hidden_states + control_hint * scale # 6. Output norm, projection & unpatchify - shift, scale = (self.scale_shift_table.to(temb.device) + temb.unsqueeze(1)).chunk(2, dim=1) + shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1) # Move the shift and scale tensors to the same device as hidden_states. # When using multi-GPU inference via accelerate these will be on the diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index 2e590594af71..d470d7220d73 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py @@ -46,14 +46,13 @@ ] _import_structure["stable_diffusion_xl"] = ["StableDiffusionXLAutoBlocks", "StableDiffusionXLModularPipeline"] _import_structure["wan"] = ["WanAutoBlocks", "WanModularPipeline"] - _import_structure["flux"] = ["FluxAutoBlocks", "FluxModularPipeline"] + _import_structure["flux"] = ["FluxAutoBlocks", "FluxKontextModularPipeline", "FluxModularPipeline"] _import_structure["qwenimage"] = [ "QwenImageAutoBlocks", "QwenImageModularPipeline", "QwenImageEditModularPipeline", "QwenImageEditAutoBlocks", - "QwenImageEditPlusModularPipeline", - "QwenImageEditPlusAutoBlocks", ] _import_structure["components_manager"] = ["ComponentsManager"] @@ -65,7 +64,7 @@ from ..utils.dummy_pt_objects import * # noqa F403 else: from .components_manager import ComponentsManager - from .flux import FluxAutoBlocks, FluxModularPipeline + from .flux import FluxAutoBlocks, FluxKontextModularPipeline, FluxModularPipeline from .modular_pipeline import ( AutoPipelineBlocks, BlockState, @@ -80,8 +79,6 @@ QwenImageAutoBlocks, QwenImageEditAutoBlocks, QwenImageEditModularPipeline, - QwenImageEditPlusAutoBlocks, - QwenImageEditPlusModularPipeline, QwenImageModularPipeline, ) from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline diff --git a/src/diffusers/modular_pipelines/flux/__init__.py b/src/diffusers/modular_pipelines/flux/__init__.py index 2891edf79041..ca10a633bc0d 100644 --- a/src/diffusers/modular_pipelines/flux/__init__.py +++ b/src/diffusers/modular_pipelines/flux/__init__.py @@ -24,15 +24,19 @@ _import_structure["encoders"] = ["FluxTextEncoderStep"] _import_structure["modular_blocks"] = [ "ALL_BLOCKS", + "ALL_BLOCKS_KONTEXT", "AUTO_BLOCKS", + "AUTO_BLOCKS_KONTEXT", "TEXT2IMAGE_BLOCKS", "FluxAutoBeforeDenoiseStep", "FluxAutoBlocks", - "FluxAutoBlocks", "FluxAutoDecodeStep", "FluxAutoDenoiseStep", + "FluxKontextAutoBeforeDenoiseStep", + "FluxKontextAutoBlocks", + "FluxKontextAutoDenoiseStep", ] - _import_structure["modular_pipeline"] = ["FluxModularPipeline"] + _import_structure["modular_pipeline"] = ["FluxKontextModularPipeline", "FluxModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: try: @@ -44,14 +48,18 @@ from .encoders
import FluxTextEncoderStep from .modular_blocks import ( ALL_BLOCKS, + ALL_BLOCKS_KONTEXT, AUTO_BLOCKS, + AUTO_BLOCKS_KONTEXT, TEXT2IMAGE_BLOCKS, FluxAutoBeforeDenoiseStep, FluxAutoBlocks, FluxAutoDecodeStep, FluxAutoDenoiseStep, + FluxKontextAutoBeforeDenoiseStep, + FluxKontextAutoDenoiseStep, ) - from .modular_pipeline import FluxModularPipeline + from .modular_pipeline import FluxKontextModularPipeline, FluxModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/flux/before_denoise.py b/src/diffusers/modular_pipelines/flux/before_denoise.py index 95858fbf6eb0..eca5dcd919ce 100644 --- a/src/diffusers/modular_pipelines/flux/before_denoise.py +++ b/src/diffusers/modular_pipelines/flux/before_denoise.py @@ -17,14 +17,17 @@ import numpy as np import torch +from PIL import Image +from ...configuration_utils import FrozenDict +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import logging from ...utils.torch_utils import randn_tensor from ..modular_pipeline import ModularPipelineBlocks, PipelineState from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam -from .modular_pipeline import FluxModularPipeline +from .modular_pipeline import FluxKontextModularPipeline, FluxModularPipeline logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -182,15 +185,15 @@ def _prepare_latent_image_ids(batch_size, height, width, device, dtype): return latent_image_ids.to(device=device, dtype=dtype) -# Cannot use "# Copied from" because it introduces weird indentation errors. -def _encode_vae_image(vae, image: torch.Tensor, generator: torch.Generator): +def _encode_vae_image(vae, image: torch.Tensor, generator: torch.Generator, sample_mode: str = "sample"): if isinstance(generator, list): image_latents = [ - retrieve_latents(vae.encode(image[i : i + 1]), generator=generator[i]) for i in range(image.shape[0]) + retrieve_latents(vae.encode(image[i : i + 1]), generator=generator[i], sample_mode=sample_mode) + for i in range(image.shape[0]) ] image_latents = torch.cat(image_latents, dim=0) else: - image_latents = retrieve_latents(vae.encode(image), generator=generator) + image_latents = retrieve_latents(vae.encode(image), generator=generator, sample_mode=sample_mode) image_latents = (image_latents - vae.config.shift_factor) * vae.config.scaling_factor @@ -252,13 +255,11 @@ def inputs(self) -> List[InputParam]: InputParam( "prompt_embeds", required=True, - kwargs_type="denoiser_input_fields", type_hint=torch.Tensor, description="Pre-generated text embeddings. Can be generated from text_encoder step.", ), InputParam( "pooled_prompt_embeds", - kwargs_type="denoiser_input_fields", type_hint=torch.Tensor, description="Pre-generated pooled text embeddings. Can be generated from text_encoder step.", ), @@ -281,13 +282,11 @@ def intermediate_outputs(self) -> List[str]: OutputParam( "prompt_embeds", type_hint=torch.Tensor, - kwargs_type="denoiser_input_fields", description="text embeddings used to guide the image generation", ), OutputParam( "pooled_prompt_embeds", type_hint=torch.Tensor, - kwargs_type="denoiser_input_fields", description="pooled text embeddings used to guide the image generation", ), # TODO: support negative embeddings? 
@@ -321,6 +320,141 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip return components, state +class FluxKontextInputStep(ModularPipelineBlocks): + model_name = "flux_kontext" + + @property + def description(self) -> str: + return ( + "Input processing step that:\n" + " 1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n" + " 2. Adjusts input tensor shapes based on `batch_size` (number of prompts) and `num_images_per_prompt`\n\n" + "All input tensors are expected to have either batch_size=1 or match the batch_size\n" + "of prompt_embeds. The tensors will be duplicated across the batch dimension to\n" + "have a final batch_size of batch_size * num_images_per_prompt.\n" + " 3. Processes the input `image`." + ) + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec( + "image_processor", + VaeImageProcessor, + config=FrozenDict({"vae_scale_factor": 16}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("num_images_per_prompt", default=1), + InputParam( + "prompt_embeds", + required=True, + type_hint=torch.Tensor, + description="Pre-generated text embeddings. Can be generated from text_encoder step.", + ), + InputParam( + "pooled_prompt_embeds", + type_hint=torch.Tensor, + description="Pre-generated pooled text embeddings. Can be generated from text_encoder step.", + ), + InputParam( + "image", + required=False, + type_hint=Union[Image.Image, torch.Tensor], + description="Input image/image latents to perform denoising.", + ), + ] + + @property + def intermediate_outputs(self) -> List[str]: + return [ + OutputParam( + "batch_size", + type_hint=int, + description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt", + ), + OutputParam( + "dtype", + type_hint=torch.dtype, + description="Data type of model tensor inputs (determined by `prompt_embeds`)", + ), + OutputParam( + "prompt_embeds", + type_hint=torch.Tensor, + description="text embeddings used to guide the image generation", + ), + OutputParam( + "pooled_prompt_embeds", + type_hint=torch.Tensor, + description="pooled text embeddings used to guide the image generation", + ), + OutputParam( + "image", + type_hint=torch.Tensor, + description="Processed image/image latents.", + ), + ] + + def check_inputs(self, components, block_state): + if block_state.prompt_embeds is not None and block_state.pooled_prompt_embeds is not None: + if block_state.prompt_embeds.shape[0] != block_state.pooled_prompt_embeds.shape[0]: + raise ValueError( + "`prompt_embeds` and `pooled_prompt_embeds` must have the same batch size when passed directly, but" + f" got: `prompt_embeds` {block_state.prompt_embeds.shape} != `pooled_prompt_embeds`" + f" {block_state.pooled_prompt_embeds.shape}." 
+ ) + + @staticmethod + def preprocess_image( + image, image_processor: VaeImageProcessor, vae_scale_factor: int, latent_channels: int, _auto_resize=True + ) -> torch.Tensor: + from ...pipelines.flux.pipeline_flux_kontext import PREFERRED_KONTEXT_RESOLUTIONS + + if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == latent_channels): + multiple_of = vae_scale_factor * 2 + img = image[0] if isinstance(image, list) else image + image_height, image_width = image_processor.get_default_height_width(img) + aspect_ratio = image_width / image_height + if _auto_resize: + # Kontext is trained on specific resolutions, using one of them is recommended + _, image_width, image_height = min( + (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_KONTEXT_RESOLUTIONS + ) + image_width = image_width // multiple_of * multiple_of + image_height = image_height // multiple_of * multiple_of + image = image_processor.resize(image, image_height, image_width) + image = image_processor.preprocess(image, image_height, image_width) + return image + + @torch.no_grad() + def __call__(self, components: FluxKontextModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + self.check_inputs(components, block_state) + + block_state.batch_size = block_state.prompt_embeds.shape[0] + block_state.dtype = block_state.prompt_embeds.dtype + + _, seq_len, _ = block_state.prompt_embeds.shape + block_state.prompt_embeds = block_state.prompt_embeds.repeat(1, block_state.num_images_per_prompt, 1) + block_state.prompt_embeds = block_state.prompt_embeds.view( + block_state.batch_size * block_state.num_images_per_prompt, seq_len, -1 + ) + # TODO: `_auto_resize` is currently forced to True. Since it's private anyway, I thought of not adding it. + block_state.image = self.preprocess_image( + image=block_state.image, + image_processor=components.image_processor, + vae_scale_factor=components.vae_scale_factor, + latent_channels=components.num_channels_latents, + ) + self.set_block_state(state, block_state) + + return components, state + + class FluxSetTimestepsStep(ModularPipelineBlocks): model_name = "flux" @@ -692,3 +826,175 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip self.set_block_state(state, block_state) return components, state + + +class FluxKontextPrepareLatentsStep(ModularPipelineBlocks): + model_name = "flux_kontext" + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ComponentSpec("vae", AutoencoderKL)] + + @property + def description(self) -> str: + return "Step that prepares the latents for the image-to-image generation process with Flux Kontext." + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("height", type_hint=int), + InputParam("width", type_hint=int), + InputParam("image", type_hint=Union[Image.Image, torch.Tensor], required=False), + InputParam("max_area", type_hint=int, default=1024**2), + InputParam("latents", type_hint=Optional[torch.Tensor]), + InputParam("num_images_per_prompt", type_hint=int, default=1), + InputParam("generator"), + InputParam( + "batch_size", + required=True, + type_hint=int, + description="Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. 
Can be generated in input step.", + ), + InputParam("dtype", type_hint=torch.dtype, description="The dtype of the model inputs"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + "latents", type_hint=torch.Tensor, description="The initial latents to use for the denoising process" + ), + OutputParam( + "image_latents", type_hint=torch.Tensor, description="Latents computed from the input image(s)." + ), + OutputParam( + "latent_ids", + type_hint=torch.Tensor, + description="IDs computed from the latent sequence needed for RoPE", + ), + OutputParam( + "image_ids", + type_hint=torch.Tensor, + description="IDs computed from the image sequence needed for RoPE", + ), + ] + + @staticmethod + def check_inputs(components, block_state): + if (block_state.height is not None and block_state.height % (components.vae_scale_factor * 2) != 0) or ( + block_state.width is not None and block_state.width % (components.vae_scale_factor * 2) != 0 + ): + logger.warning( + f"`height` and `width` have to be divisible by {components.vae_scale_factor} but are {block_state.height} and {block_state.width}." + ) + + @staticmethod + def prepare_latents( + comp, + image, + batch_size, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + ): + # Couldn't use the `prepare_latents` method directly from Flux because I decided to copy over + # the packing methods here. So, for example, `comp._pack_latents()` won't work if we were + # to go with the "# Copied from ..." approach. Or maybe there's a way? + + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. + height = 2 * (int(height) // (comp.vae_scale_factor * 2)) + width = 2 * (int(width) // (comp.vae_scale_factor * 2)) + shape = (batch_size, num_channels_latents, height, width) + + image_latents = image_ids = None + if image is not None: + image = image.to(device=device, dtype=dtype) + if image.shape[1] != num_channels_latents: + image_latents = _encode_vae_image(vae=comp.vae, image=image, generator=generator, sample_mode="argmax") + else: + image_latents = image + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: + # expand init_latents for batch_size + additional_image_per_prompt = batch_size // image_latents.shape[0] + image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0) + elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." 
+ ) + else: + image_latents = torch.cat([image_latents], dim=0) + + image_latent_height, image_latent_width = image_latents.shape[2:] + image_latents = _pack_latents( + image_latents, batch_size, num_channels_latents, image_latent_height, image_latent_width + ) + image_ids = _prepare_latent_image_ids( + batch_size, image_latent_height // 2, image_latent_width // 2, device, dtype + ) + # image ids are the same as latent ids with the first dimension set to 1 instead of 0 + image_ids[..., 0] = 1 + + latent_ids = _prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + latents = _pack_latents(latents, batch_size, num_channels_latents, height, width) + else: + latents = latents.to(device=device, dtype=dtype) + + return latents, image_latents, latent_ids, image_ids + + @torch.no_grad() + def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + block_state.height = block_state.height or components.default_height + block_state.width = block_state.width or components.default_width + block_state.device = components._execution_device + block_state.dtype = torch.bfloat16 # TODO: okay to hardcode this? + block_state.num_channels_latents = components.num_channels_latents + + self.check_inputs(components, block_state) + + # Adjust height and width if needed. + max_area = block_state.max_area + original_height, original_width = block_state.height, block_state.width + aspect_ratio = original_width / original_height + width = round((max_area * aspect_ratio) ** 0.5) + height = round((max_area / aspect_ratio) ** 0.5) + + multiple_of = components.vae_scale_factor * 2 + width = width // multiple_of * multiple_of + height = height // multiple_of * multiple_of + + if height != original_height or width != original_width: + logger.warning( + f"Generation `height` and `width` have been adjusted to {height} and {width} to fit the model requirements." 
+ ) + block_state.height = height + block_state.width = width + + batch_size = block_state.batch_size * block_state.num_images_per_prompt + block_state.latents, block_state.image_latents, block_state.latent_ids, block_state.image_ids = ( + self.prepare_latents( + components, + block_state.image, + batch_size, + block_state.num_channels_latents, + block_state.height, + block_state.width, + block_state.dtype, + block_state.device, + block_state.generator, + block_state.latents, + ) + ) + + self.set_block_state(state, block_state) + + return components, state diff --git a/src/diffusers/modular_pipelines/flux/denoise.py b/src/diffusers/modular_pipelines/flux/denoise.py index ffa0a4456f5d..3ce0dc35b100 100644 --- a/src/diffusers/modular_pipelines/flux/denoise.py +++ b/src/diffusers/modular_pipelines/flux/denoise.py @@ -26,7 +26,7 @@ PipelineState, ) from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam -from .modular_pipeline import FluxModularPipeline +from .modular_pipeline import FluxKontextModularPipeline, FluxModularPipeline logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -110,6 +110,106 @@ def __call__( return components, block_state +class FluxKontextLoopDenoiser(ModularPipelineBlocks): + model_name = "flux_kontext" + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ComponentSpec("transformer", FluxTransformer2DModel)] + + @property + def description(self) -> str: + return ( + "Step within the denoising loop that denoise the latents for Flux Kontext. " + "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` " + "object (e.g. `FluxDenoiseLoopWrapper`)" + ) + + @property + def inputs(self) -> List[Tuple[str, Any]]: + return [ + InputParam("joint_attention_kwargs"), + InputParam( + "latents", + required=True, + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.", + ), + InputParam( + "image_latents", + type_hint=torch.Tensor, + description="Image latents to use for the denoising process. 
Can be generated in prepare_latent step.", + ), + InputParam( + "guidance", + required=True, + type_hint=torch.Tensor, + description="Guidance scale as a tensor", + ), + InputParam( + "prompt_embeds", + required=True, + type_hint=torch.Tensor, + description="Prompt embeddings", + ), + InputParam( + "pooled_prompt_embeds", + required=True, + type_hint=torch.Tensor, + description="Pooled prompt embeddings", + ), + InputParam( + "text_ids", + required=True, + type_hint=torch.Tensor, + description="IDs computed from text sequence needed for RoPE", + ), + InputParam( + "latent_ids", + required=True, + type_hint=torch.Tensor, + description="IDs computed from latent sequence needed for RoPE", + ), + InputParam( + "image_ids", + type_hint=torch.Tensor, + description="IDs computed from image sequence needed for RoPE", + ), + ] + + @torch.no_grad() + def __call__( + self, components: FluxKontextModularPipeline, block_state: BlockState, i: int, t: torch.Tensor + ) -> PipelineState: + latent_ids = block_state.latent_ids + image_ids = block_state.image_ids + if image_ids is not None: + latent_ids = torch.cat([latent_ids, image_ids], dim=0) # dim 0 is sequence dimension + + latents = block_state.latents + latent_model_input = latents + image_latents = block_state.image_latents + if image_latents is not None: + latent_model_input = torch.cat([latent_model_input, image_latents], dim=1) + + timestep = t.expand(latents.shape[0]).to(latents.dtype) + noise_pred = components.transformer( + hidden_states=latent_model_input, + timestep=timestep / 1000, + guidance=block_state.guidance, + encoder_hidden_states=block_state.prompt_embeds, + pooled_projections=block_state.pooled_prompt_embeds, + joint_attention_kwargs=block_state.joint_attention_kwargs, + txt_ids=block_state.text_ids, + img_ids=latent_ids, + return_dict=False, + )[0] + noise_pred = noise_pred[:, : latents.size(1)] + block_state.noise_pred = noise_pred + + return components, block_state + + class FluxLoopAfterDenoiser(ModularPipelineBlocks): model_name = "flux" @@ -225,3 +325,19 @@ def description(self) -> str: " - `FluxLoopAfterDenoiser`\n" "This block supports both text2image and img2img tasks." ) + + +class FluxKontextDenoiseStep(FluxDenoiseLoopWrapper): + block_classes = [FluxKontextLoopDenoiser, FluxLoopAfterDenoiser] + block_names = ["denoiser", "after_denoiser"] + + @property + def description(self) -> str: + return ( + "Denoise step that iteratively denoise the latents. \n" + "Its loop logic is defined in `FluxDenoiseLoopWrapper.__call__` method \n" + "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n" + " - `FluxKontextLoopDenoiser`\n" + " - `FluxLoopAfterDenoiser`\n" + "This block supports both text2image and img2img tasks." 
+ ) diff --git a/src/diffusers/modular_pipelines/flux/encoders.py b/src/diffusers/modular_pipelines/flux/encoders.py index 16ddecbadb4f..8c49990280ac 100644 --- a/src/diffusers/modular_pipelines/flux/encoders.py +++ b/src/diffusers/modular_pipelines/flux/encoders.py @@ -181,7 +181,6 @@ def inputs(self) -> List[InputParam]: return [ InputParam("prompt"), InputParam("prompt_2"), - InputParam("max_sequence_length", type_hint=int, default=512, required=False), InputParam("joint_attention_kwargs"), ] @@ -190,19 +189,16 @@ def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( "prompt_embeds", - kwargs_type="denoiser_input_fields", type_hint=torch.Tensor, description="text embeddings used to guide the image generation", ), OutputParam( "pooled_prompt_embeds", - kwargs_type="denoiser_input_fields", type_hint=torch.Tensor, description="pooled text embeddings used to guide the image generation", ), OutputParam( "text_ids", - kwargs_type="denoiser_input_fields", type_hint=torch.Tensor, description="ids from the text sequence for RoPE", ), @@ -408,7 +404,6 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip pooled_prompt_embeds=None, device=block_state.device, num_images_per_prompt=1, # TODO: hardcoded for now. - max_sequence_length=block_state.max_sequence_length, lora_scale=block_state.text_encoder_lora_scale, ) diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks.py b/src/diffusers/modular_pipelines/flux/modular_blocks.py index ca4f993a11fe..6ce5e3eaeaf0 100644 --- a/src/diffusers/modular_pipelines/flux/modular_blocks.py +++ b/src/diffusers/modular_pipelines/flux/modular_blocks.py @@ -19,11 +19,13 @@ FluxImg2ImgPrepareLatentsStep, FluxImg2ImgSetTimestepsStep, FluxInputStep, + FluxKontextInputStep, + FluxKontextPrepareLatentsStep, FluxPrepareLatentsStep, FluxSetTimestepsStep, ) from .decoders import FluxDecodeStep -from .denoise import FluxDenoiseStep +from .denoise import FluxDenoiseStep, FluxKontextDenoiseStep from .encoders import FluxTextEncoderStep, FluxVaeEncoderStep @@ -46,7 +48,7 @@ def description(self): ) -# before_denoise: text2img, img2img +# before_denoise: text2img class FluxBeforeDenoiseStep(SequentialPipelineBlocks): block_classes = [ FluxInputStep, @@ -66,6 +68,26 @@ def description(self): ) +# before_denoise: text2img, img2img (for Kontext) +class FluxKontextBeforeDenoiseStep(SequentialPipelineBlocks): + block_classes = [ + FluxInputStep, + FluxKontextPrepareLatentsStep, + FluxSetTimestepsStep, + ] + block_names = ["input", "prepare_latents", "set_timesteps"] + + @property + def description(self): + return ( + "Before denoise step that prepare the inputs for the denoise step in Flux Kontext.\n" + + "This is a sequential pipeline blocks:\n" + + " - `FluxInputStep` is used to adjust the batch size of the model inputs\n" + + " - `FluxKontextPrepareLatentsStep` is used to prepare the latents\n" + + " - `FluxSetTimestepsStep` is used to set the timesteps\n" + ) + + # before_denoise: img2img class FluxImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks): block_classes = [FluxInputStep, FluxImg2ImgSetTimestepsStep, FluxImg2ImgPrepareLatentsStep] @@ -84,9 +106,9 @@ def description(self): # before_denoise: all task (text2img, img2img) class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks): - block_classes = [FluxImg2ImgBeforeDenoiseStep, FluxBeforeDenoiseStep] - block_names = ["img2img", "text2image"] - block_trigger_inputs = ["image_latents", None] + block_classes = [FluxBeforeDenoiseStep, FluxImg2ImgBeforeDenoiseStep] + 
block_names = ["text2image", "img2img"] + block_trigger_inputs = [None, "image_latents"] @property def description(self): @@ -98,7 +120,24 @@ def description(self): ) -# denoise: text2image +# flux kontext (both text2img and img2img) +class FluxKontextAutoBeforeDenoiseStep(AutoPipelineBlocks): + # Kontext should follow `FluxBeforeDenoiseStep` when T2I mode is on. + block_classes = [FluxBeforeDenoiseStep, FluxKontextBeforeDenoiseStep] + block_names = ["text2image", "img2img"] + block_trigger_inputs = [None, "image_latents"] + + @property + def description(self): + return ( + "Before denoise step that prepare the inputs for the denoise step.\n" + + "This is an auto pipeline block that works for text2image.\n" + + " - `FluxBeforeDenoiseStep` (text2img) is used when only `image_latents` is None.\n" + + " - `FluxKontextBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n" + ) + + +# denoise: text2image, img2img class FluxAutoDenoiseStep(AutoPipelineBlocks): block_classes = [FluxDenoiseStep] block_names = ["denoise"] @@ -113,7 +152,21 @@ def description(self) -> str: ) -# decode: all task (text2img, img2img, inpainting) +class FluxKontextAutoDenoiseStep(AutoPipelineBlocks): + block_classes = [FluxKontextDenoiseStep] + block_names = ["denoise"] + block_trigger_inputs = [None] + + @property + def description(self) -> str: + return ( + "Denoise step that iteratively denoise the latents for Flux Kontext. " + "This is a auto pipeline block that works for text2image and img2img tasks." + " - `FluxDenoiseStep` (denoise) for text2image and img2img tasks." + ) + + +# decode: all task (text2img, img2img) class FluxAutoDecodeStep(AutoPipelineBlocks): block_classes = [FluxDecodeStep] block_names = ["non-inpaint"] @@ -124,37 +177,40 @@ def description(self): return "Decode step that decode the denoised latents into image outputs.\n - `FluxDecodeStep`" -class FluxCoreDenoiseStep(SequentialPipelineBlocks): - block_classes = [FluxInputStep, FluxAutoBeforeDenoiseStep, FluxAutoDenoiseStep] - block_names = ["input", "before_denoise", "denoise"] +# text2image, img2img +class FluxAutoBlocks(SequentialPipelineBlocks): + block_classes = [ + FluxTextEncoderStep, + FluxAutoVaeEncoderStep, + FluxAutoBeforeDenoiseStep, + FluxAutoDenoiseStep, + FluxAutoDecodeStep, + ] + block_names = ["text_encoder", "image_encoder", "before_denoise", "denoise", "decoder"] @property def description(self): return ( - "Core step that performs the denoising process. 
\n" - + " - `FluxInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `FluxAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `FluxAutoDenoiseStep` (denoise) iteratively denoises the latents.\n" - + "This step support text-to-image and image-to-image tasks for Flux:\n" - + " - for image-to-image generation, you need to provide `image_latents`\n" - + " - for text-to-image generation, all you need to provide is prompt embeddings" + "Auto Modular pipeline for text-to-image and image-to-image using Flux.\n" + + "- for text-to-image generation, all you need to provide is `prompt`\n" + + "- for image-to-image generation, you need to provide either `image` or `image_latents`" ) -# text2image -class FluxAutoBlocks(SequentialPipelineBlocks): +# text2image, img2img +class FluxKontextAutoBlocks(SequentialPipelineBlocks): block_classes = [ FluxTextEncoderStep, - FluxAutoVaeEncoderStep, - FluxCoreDenoiseStep, + FluxKontextAutoBeforeDenoiseStep, + FluxKontextAutoDenoiseStep, FluxAutoDecodeStep, ] - block_names = ["text_encoder", "image_encoder", "denoise", "decode"] + block_names = ["text_encoder", "before_denoise", "denoise", "decoder"] @property def description(self): return ( - "Auto Modular pipeline for text-to-image and image-to-image using Flux.\n" + "Auto Modular pipeline for text-to-image and image-to-image using Flux Kontext.\n" + "- for text-to-image generation, all you need to provide is `prompt`\n" + "- for image-to-image generation, you need to provide either `image` or `image_latents`" ) @@ -187,10 +243,22 @@ def description(self): [ ("text_encoder", FluxTextEncoderStep), ("image_encoder", FluxAutoVaeEncoderStep), - ("denoise", FluxCoreDenoiseStep), + ("before_denoise", FluxAutoBeforeDenoiseStep), + ("denoise", FluxAutoDenoiseStep), ("decode", FluxAutoDecodeStep), ] ) +AUTO_BLOCKS_KONTEXT = InsertableDict( + [ + ("text_encoder", FluxTextEncoderStep), + ("input", FluxKontextInputStep), + ("prepare_latents", FluxKontextPrepareLatentsStep), + ("set_timesteps", FluxSetTimestepsStep), + ("denoise", FluxKontextDenoiseStep), + ("decode", FluxDecodeStep), + ] +) ALL_BLOCKS = {"text2image": TEXT2IMAGE_BLOCKS, "img2img": IMAGE2IMAGE_BLOCKS, "auto": AUTO_BLOCKS} +ALL_BLOCKS_KONTEXT = {"auto": AUTO_BLOCKS_KONTEXT} diff --git a/src/diffusers/modular_pipelines/flux/modular_pipeline.py b/src/diffusers/modular_pipelines/flux/modular_pipeline.py index 563b0333431f..385478cc01ab 100644 --- a/src/diffusers/modular_pipelines/flux/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/flux/modular_pipeline.py @@ -55,3 +55,18 @@ def num_channels_latents(self): if getattr(self, "transformer", None): num_channels_latents = self.transformer.config.in_channels // 4 return num_channels_latents + + +class FluxKontextModularPipeline(FluxModularPipeline): + """ + A ModularPipeline for Flux Kontext. + + + + This is an experimental feature and is likely to change in the future. 
+ + + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index e543bf0bb3af..0f9f663fc039 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -59,7 +59,6 @@ ("flux", "FluxModularPipeline"), ("qwenimage", "QwenImageModularPipeline"), ("qwenimage-edit", "QwenImageEditModularPipeline"), - ("qwenimage-edit-plus", "QwenImageEditPlusModularPipeline"), ] ) @@ -1629,8 +1629,7 @@ def from_pretrained( blocks = ModularPipelineBlocks.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs ) - except EnvironmentError as e: - logger.debug(f"EnvironmentError: {e}") + except EnvironmentError: blocks = None cache_dir = kwargs.pop("cache_dir", None) diff --git a/src/diffusers/modular_pipelines/qwenimage/__init__.py b/src/diffusers/modular_pipelines/qwenimage/__init__.py index ae4ec4799fbc..81cf515730ef 100644 --- a/src/diffusers/modular_pipelines/qwenimage/__init__.py +++ b/src/diffusers/modular_pipelines/qwenimage/__init__.py @@ -29,20 +29,13 @@ "EDIT_AUTO_BLOCKS", "EDIT_BLOCKS", "EDIT_INPAINT_BLOCKS", - "EDIT_PLUS_AUTO_BLOCKS", - "EDIT_PLUS_BLOCKS", "IMAGE2IMAGE_BLOCKS", "INPAINT_BLOCKS", "TEXT2IMAGE_BLOCKS", "QwenImageAutoBlocks", "QwenImageEditAutoBlocks", - "QwenImageEditPlusAutoBlocks", - ] - _import_structure["modular_pipeline"] = [ - "QwenImageEditModularPipeline", - "QwenImageEditPlusModularPipeline", - "QwenImageModularPipeline", ] + _import_structure["modular_pipeline"] = ["QwenImageEditModularPipeline", "QwenImageModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: try: @@ -61,20 +54,13 @@ EDIT_AUTO_BLOCKS, EDIT_BLOCKS, EDIT_INPAINT_BLOCKS, - EDIT_PLUS_AUTO_BLOCKS, - EDIT_PLUS_BLOCKS, IMAGE2IMAGE_BLOCKS, INPAINT_BLOCKS, TEXT2IMAGE_BLOCKS, QwenImageAutoBlocks, QwenImageEditAutoBlocks, - QwenImageEditPlusAutoBlocks, - ) - from .modular_pipeline import ( - QwenImageEditModularPipeline, - QwenImageEditPlusModularPipeline, - QwenImageModularPipeline, ) + from .modular_pipeline import QwenImageEditModularPipeline, QwenImageModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index fdec95dc506e..606236cfe91b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -203,6 +203,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - block_state.latents = components.pachifier.pack_latents(block_state.latents) self.set_block_state(state, block_state) + return components, state
diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 04fb3fdc947b..2ab83a03ee55 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -128,61 +128,6 @@ def get_qwen_prompt_embeds_edit( return prompt_embeds, encoder_attention_mask -def get_qwen_prompt_embeds_edit_plus( - text_encoder, - processor, - prompt: Union[str, List[str]] = None, - image: Optional[Union[torch.Tensor, List[PIL.Image.Image], PIL.Image.Image]] = None, - prompt_template_encode: str = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n", - img_template_encode: str = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>", - prompt_template_encode_start_idx: int = 64, - device: Optional[torch.device] = None, -): - prompt = [prompt] if isinstance(prompt, str) else prompt - if isinstance(image, list): - base_img_prompt = "" - for i, img in enumerate(image): - base_img_prompt += img_template_encode.format(i + 1) - elif image is not None: - base_img_prompt = img_template_encode.format(1) - else: - base_img_prompt = "" - - template = prompt_template_encode - - drop_idx = prompt_template_encode_start_idx - txt = [template.format(base_img_prompt + e) for e in prompt] - - model_inputs = processor( - text=txt, - images=image, - padding=True, - return_tensors="pt", - ).to(device) - outputs = text_encoder( - input_ids=model_inputs.input_ids, - attention_mask=model_inputs.attention_mask, - pixel_values=model_inputs.pixel_values, - image_grid_thw=model_inputs.image_grid_thw, - output_hidden_states=True, - ) - - hidden_states = outputs.hidden_states[-1] - split_hidden_states = _extract_masked_hidden(hidden_states, model_inputs.attention_mask) - split_hidden_states = [e[drop_idx:] for e in split_hidden_states] - attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states] - max_seq_len = max([e.size(0) for e in split_hidden_states]) - prompt_embeds = torch.stack( - [torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states] - ) - encoder_attention_mask = torch.stack( - [torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list] - ) - - prompt_embeds = prompt_embeds.to(device=device) - return prompt_embeds, encoder_attention_mask - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents def retrieve_latents( encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" @@ -321,83 +266,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state -class QwenImageEditPlusResizeDynamicStep(QwenImageEditResizeDynamicStep): - model_name = "qwenimage" - - def __init__( - self, - input_name: str = "image", - output_name: str = "resized_image", - vae_image_output_name: str = "vae_image", - ): -
"""Create a configurable step for resizing images to the target area (1024 * 1024) while maintaining the aspect ratio. - - This block resizes an input image or a list input images and exposes the resized result under configurable - input and output names. Use this when you need to wire the resize step to different image fields (e.g., - "image", "control_image") - - Args: - input_name (str, optional): Name of the image field to read from the - pipeline state. Defaults to "image". - output_name (str, optional): Name of the resized image field to write - back to the pipeline state. Defaults to "resized_image". - vae_image_output_name (str, optional): Name of the image field - to write back to the pipeline state. This is used by the VAE encoder step later on. QwenImage Edit Plus - processes the input image(s) differently for the VL and the VAE. - """ - if not isinstance(input_name, str) or not isinstance(output_name, str): - raise ValueError( - f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}" - ) - self.condition_image_size = 384 * 384 - self._image_input_name = input_name - self._resized_image_output_name = output_name - self._vae_image_output_name = vae_image_output_name - super().__init__() - - @property - def intermediate_outputs(self) -> List[OutputParam]: - return super().intermediate_outputs + [ - OutputParam( - name=self._vae_image_output_name, - type_hint=List[PIL.Image.Image], - description="The images to be processed which will be further used by the VAE encoder.", - ), - ] - - @torch.no_grad() - def __call__(self, components: QwenImageModularPipeline, state: PipelineState): - block_state = self.get_block_state(state) - - images = getattr(block_state, self._image_input_name) - - if not is_valid_image_imagelist(images): - raise ValueError(f"Images must be image or list of images but are {type(images)}") - - if ( - not isinstance(images, torch.Tensor) - and isinstance(images, PIL.Image.Image) - and not isinstance(images, list) - ): - images = [images] - - # TODO (sayakpaul): revisit this when the inputs are `torch.Tensor`s - condition_images = [] - vae_images = [] - for img in images: - image_width, image_height = img.size - condition_width, condition_height, _ = calculate_dimensions( - self.condition_image_size, image_width / image_height - ) - condition_images.append(components.image_resize_processor.resize(img, condition_height, condition_width)) - vae_images.append(img) - - setattr(block_state, self._resized_image_output_name, condition_images) - setattr(block_state, self._vae_image_output_name, vae_images) - self.set_block_state(state, block_state) - return components, state - - class QwenImageTextEncoderStep(ModularPipelineBlocks): model_name = "qwenimage" @@ -643,61 +511,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state -class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep): - model_name = "qwenimage" - - @property - def expected_configs(self) -> List[ConfigSpec]: - return [ - ConfigSpec( - name="prompt_template_encode", - default="<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n", - ), - ConfigSpec( - name="img_template_encode", - default="Picture {}: <|vision_start|><|image_pad|><|vision_end|>", - ), - ConfigSpec(name="prompt_template_encode_start_idx", default=64), - ] - - @torch.no_grad() - def __call__(self, components: QwenImageModularPipeline, state: PipelineState): - block_state = self.get_block_state(state) - - self.check_inputs(block_state.prompt, block_state.negative_prompt) - - device = components._execution_device - - block_state.prompt_embeds, block_state.prompt_embeds_mask = get_qwen_prompt_embeds_edit_plus( - components.text_encoder, - components.processor, - prompt=block_state.prompt, - image=block_state.resized_image, - prompt_template_encode=components.config.prompt_template_encode, - img_template_encode=components.config.img_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, - device=device, - ) - - if components.requires_unconditional_embeds: - negative_prompt = block_state.negative_prompt or " " - block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = ( - get_qwen_prompt_embeds_edit_plus( - components.text_encoder, - components.processor, - prompt=negative_prompt, - image=block_state.resized_image, - prompt_template_encode=components.config.prompt_template_encode, - img_template_encode=components.config.img_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, - device=device, - ) - ) - - self.set_block_state(state, block_state) - return components, state - - class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): model_name = "qwenimage" @@ -799,7 +612,12 @@ def expected_components(self) -> List[ComponentSpec]: @property def inputs(self) -> List[InputParam]: - return [InputParam("resized_image"), InputParam("image"), InputParam("height"), InputParam("width")] + return [ + InputParam("resized_image"), + InputParam("image"), + InputParam("height"), + InputParam("width"), + ] @property def intermediate_outputs(self) -> List[OutputParam]: @@ -843,47 +661,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState): return components, state -class QwenImageEditPlusProcessImagesInputStep(QwenImageProcessImagesInputStep): - model_name = "qwenimage-edit-plus" - vae_image_size = 1024 * 1024 - - @property - def description(self) -> str: - return "Image Preprocess step for QwenImage Edit Plus. Unlike QwenImage Edit, QwenImage Edit Plus doesn't use the same resized image for further preprocessing." 
- - @property - def inputs(self) -> List[InputParam]: - return [InputParam("vae_image"), InputParam("image"), InputParam("height"), InputParam("width")] - - @torch.no_grad() - def __call__(self, components: QwenImageModularPipeline, state: PipelineState): - block_state = self.get_block_state(state) - - if block_state.vae_image is None and block_state.image is None: - raise ValueError("`vae_image` and `image` cannot be None at the same time") - - if block_state.vae_image is None: - image = block_state.image - self.check_inputs( - height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor - ) - height = block_state.height or components.default_height - width = block_state.width or components.default_width - block_state.processed_image = components.image_processor.preprocess( - image=image, height=height, width=width - ) - else: - width, height = block_state.vae_image[0].size - image = block_state.vae_image - - block_state.processed_image = components.image_processor.preprocess( - image=image, height=height, width=width - ) - - self.set_block_state(state, block_state) - return components, state - - class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks): model_name = "qwenimage" @@ -961,6 +738,7 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) - dtype=dtype, latent_channels=components.num_channels_latents, ) + setattr(block_state, self._image_latents_output_name, image_latents) self.set_block_state(state, block_state) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py index 83bfcb3da4fd..9126766cc202 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py @@ -37,9 +37,6 @@ ) from .encoders import ( QwenImageControlNetVaeEncoderStep, - QwenImageEditPlusProcessImagesInputStep, - QwenImageEditPlusResizeDynamicStep, - QwenImageEditPlusTextEncoderStep, QwenImageEditResizeDynamicStep, QwenImageEditTextEncoderStep, QwenImageInpaintProcessImagesInputStep, @@ -875,151 +872,7 @@ def description(self): ) -#################### QwenImage Edit Plus ##################### - -# 3. QwenImage-Edit Plus - -## 3.1 QwenImage-Edit Plus / edit - -#### QwenImage-Edit Plus vl encoder: take both image and text prompts -QwenImageEditPlusVLEncoderBlocks = InsertableDict( - [ - ("resize", QwenImageEditPlusResizeDynamicStep()), - ("encode", QwenImageEditPlusTextEncoderStep()), - ] -) - - -class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditPlusVLEncoderBlocks.values() - block_names = QwenImageEditPlusVLEncoderBlocks.keys() - - @property - def description(self) -> str: - return "QwenImage-Edit Plus VL encoder step that encode the image an text prompts together." 
- - -#### QwenImage-Edit Plus vae encoder -QwenImageEditPlusVaeEncoderBlocks = InsertableDict( - [ - ("resize", QwenImageEditPlusResizeDynamicStep()), # edit plus has a different resize step - ("preprocess", QwenImageEditPlusProcessImagesInputStep()), # vae_image -> processed_image - ("encode", QwenImageVaeEncoderDynamicStep()), # processed_image -> image_latents - ] -) - - -class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): - model_name = "qwenimage" - block_classes = QwenImageEditPlusVaeEncoderBlocks.values() - block_names = QwenImageEditPlusVaeEncoderBlocks.keys() - - @property - def description(self) -> str: - return "Vae encoder step that encode the image inputs into their latent representations." - - -#### QwenImage Edit Plus presets -EDIT_PLUS_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageEditPlusVLEncoderStep()), - ("vae_encoder", QwenImageEditPlusVaeEncoderStep()), - ("input", QwenImageEditInputStep()), - ("prepare_latents", QwenImagePrepareLatentsStep()), - ("set_timesteps", QwenImageSetTimestepsStep()), - ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), - ("denoise", QwenImageEditDenoiseStep()), - ("decode", QwenImageDecodeStep()), - ] -) - - -# auto before_denoise step for edit tasks -class QwenImageEditPlusAutoBeforeDenoiseStep(AutoPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = [QwenImageEditBeforeDenoiseStep] - block_names = ["edit"] - block_trigger_inputs = ["image_latents"] - - @property - def description(self): - return ( - "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n" - + "This is an auto pipeline block that works for edit (img2img) task.\n" - + " - `QwenImageEditBeforeDenoiseStep` (edit) is used when `image_latents` is provided and `processed_mask_image` is not provided.\n" - + " - if `image_latents` is not provided, step will be skipped." - ) - - -## 3.2 QwenImage-Edit Plus/auto encoders - - -class QwenImageEditPlusAutoVaeEncoderStep(AutoPipelineBlocks): - block_classes = [ - QwenImageEditPlusVaeEncoderStep, - ] - block_names = ["edit"] - block_trigger_inputs = ["image"] - - @property - def description(self): - return ( - "Vae encoder step that encode the image inputs into their latent representations. \n" - " This is an auto pipeline block that works for edit task.\n" - + " - `QwenImageEditPlusVaeEncoderStep` (edit) is used when `image` is provided.\n" - + " - if `image` is not provided, step will be skipped." - ) - - -## 3.3 QwenImage-Edit/auto blocks & presets - - -class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = [ - QwenImageEditAutoInputStep, - QwenImageEditPlusAutoBeforeDenoiseStep, - QwenImageEditAutoDenoiseStep, - ] - block_names = ["input", "before_denoise", "denoise"] - - @property - def description(self): - return ( - "Core step that performs the denoising process. 
\n" - + " - `QwenImageEditAutoInputStep` (input) standardizes the inputs for the denoising step.\n" - + " - `QwenImageEditPlusAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n" - + " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n" - + "This step support edit (img2img) workflow for QwenImage Edit Plus:\n" - + " - When `image_latents` is provided, it will be used for edit (img2img) task.\n" - ) - - -EDIT_PLUS_AUTO_BLOCKS = InsertableDict( - [ - ("text_encoder", QwenImageEditPlusVLEncoderStep()), - ("vae_encoder", QwenImageEditPlusAutoVaeEncoderStep()), - ("denoise", QwenImageEditPlusCoreDenoiseStep()), - ("decode", QwenImageAutoDecodeStep()), - ] -) - - -class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): - model_name = "qwenimage-edit-plus" - block_classes = EDIT_PLUS_AUTO_BLOCKS.values() - block_names = EDIT_PLUS_AUTO_BLOCKS.keys() - - @property - def description(self): - return ( - "Auto Modular pipeline for edit (img2img) and edit tasks using QwenImage-Edit Plus.\n" - + "- for edit (img2img) generation, you need to provide `image`\n" - ) - - -# 3. all block presets supported in QwenImage, QwenImage-Edit, QwenImage-Edit Plus +# 3. all block presets supported in QwenImage & QwenImage-Edit ALL_BLOCKS = { @@ -1027,10 +880,8 @@ def description(self): "img2img": IMAGE2IMAGE_BLOCKS, "edit": EDIT_BLOCKS, "edit_inpaint": EDIT_INPAINT_BLOCKS, - "edit_plus": EDIT_PLUS_BLOCKS, "inpaint": INPAINT_BLOCKS, "controlnet": CONTROLNET_BLOCKS, "auto": AUTO_BLOCKS, "edit_auto": EDIT_AUTO_BLOCKS, - "edit_plus_auto": EDIT_PLUS_AUTO_BLOCKS, } diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py index d9e30864f660..7200169923a5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_pipeline.py @@ -196,13 +196,3 @@ def requires_unconditional_embeds(self): requires_unconditional_embeds = self.guider._enabled and self.guider.num_conditions > 1 return requires_unconditional_embeds - - -class QwenImageEditPlusModularPipeline(QwenImageEditModularPipeline): - """ - A ModularPipeline for QwenImage-Edit Plus. - - > [!WARNING] > This is an experimental feature and is likely to change in the future. 
- """ - - default_blocks_name = "QwenImageEditPlusAutoBlocks" diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py index 8a32d4c367a3..d265bfdcaf3d 100644 --- a/src/diffusers/pipelines/auto_pipeline.py +++ b/src/diffusers/pipelines/auto_pipeline.py @@ -95,7 +95,6 @@ QwenImageControlNetPipeline, QwenImageEditInpaintPipeline, QwenImageEditPipeline, - QwenImageEditPlusPipeline, QwenImageImg2ImgPipeline, QwenImageInpaintPipeline, QwenImagePipeline, @@ -187,7 +186,6 @@ ("flux-kontext", FluxKontextPipeline), ("qwenimage", QwenImageImg2ImgPipeline), ("qwenimage-edit", QwenImageEditPipeline), - ("qwenimage-edit-plus", QwenImageEditPlusPipeline), ] ) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index cf8037796488..bb8fea8c8a8b 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -77,36 +77,6 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) -class QwenImageEditPlusAutoBlocks(metaclass=DummyObject): - _backends = ["torch", "transformers"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch", "transformers"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - -class QwenImageEditPlusModularPipeline(metaclass=DummyObject): - _backends = ["torch", "transformers"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch", "transformers"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - class QwenImageModularPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"]
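A rough usage sketch of the Flux Kontext auto blocks introduced in this diff. The calls below (`init_pipeline`, `load_default_components`, `output="images"`) follow the conventions of the existing modular-pipeline examples rather than anything confirmed by this patch, and the checkpoint id is illustrative; treat the method names, the repo id, and the prompt/image routing as assumptions.

```python
# Hypothetical sketch, not part of the diff: exercising the new FluxKontextAutoBlocks.
# Method names and the checkpoint id follow existing modular-pipeline examples and are
# assumptions; a modular component-spec repo may be required instead of the raw
# checkpoint id used here.
import torch

from diffusers.modular_pipelines.flux import FluxKontextAutoBlocks

blocks = FluxKontextAutoBlocks()
pipe = blocks.init_pipeline("black-forest-labs/FLUX.1-Kontext-dev")  # assumed repo id
pipe.load_default_components(torch_dtype=torch.bfloat16)

# Text-to-image: with no image latents present, the text2image branch is selected.
image = pipe(prompt="a watercolor fox in a birch forest", output="images")[0]

# Editing (img2img): passing an image is intended to route execution through the
# Kontext before-denoise and denoise branches, which trigger on image latents.
edited = pipe(prompt="make it snowy", image=image, output="images")[0]
```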