4 changes: 4 additions & 0 deletions src/diffusers/__init__.py
@@ -390,6 +390,8 @@
"QwenImageAutoBlocks",
"QwenImageEditAutoBlocks",
"QwenImageEditModularPipeline",
"QwenImageEditPlusAutoBlocks",
"QwenImageEditPlusModularPipeline",
"QwenImageModularPipeline",
"StableDiffusionXLAutoBlocks",
"StableDiffusionXLModularPipeline",
@@ -1052,6 +1054,8 @@
QwenImageAutoBlocks,
QwenImageEditAutoBlocks,
QwenImageEditModularPipeline,
QwenImageEditPlusAutoBlocks,
QwenImageEditPlusModularPipeline,
QwenImageModularPipeline,
StableDiffusionXLAutoBlocks,
StableDiffusionXLModularPipeline,
4 changes: 4 additions & 0 deletions src/diffusers/modular_pipelines/__init__.py
@@ -52,6 +52,8 @@
"QwenImageModularPipeline",
"QwenImageEditModularPipeline",
"QwenImageEditAutoBlocks",
"QwenImageEditPlusModularPipeline",
"QwenImageEditPlusAutoBlocks",
]
_import_structure["components_manager"] = ["ComponentsManager"]

@@ -78,6 +80,8 @@
QwenImageAutoBlocks,
QwenImageEditAutoBlocks,
QwenImageEditModularPipeline,
QwenImageEditPlusAutoBlocks,
QwenImageEditPlusModularPipeline,
QwenImageModularPipeline,
)
from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline
4 changes: 3 additions & 1 deletion src/diffusers/modular_pipelines/modular_pipeline.py
@@ -59,6 +59,7 @@
("flux", "FluxModularPipeline"),
("qwenimage", "QwenImageModularPipeline"),
("qwenimage-edit", "QwenImageEditModularPipeline"),
("qwenimage-edit-plus", "QwenImageEditPlusModularPipeline"),
]
)

@@ -1628,7 +1629,8 @@ def from_pretrained(
blocks = ModularPipelineBlocks.from_pretrained(
pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
)
except EnvironmentError:
except EnvironmentError as e:
logger.debug(f"EnvironmentError: {e}")
Member (Author) commented: Hopefully this is okay?
blocks = None

cache_dir = kwargs.pop("cache_dir", None)
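With the new `qwenimage-edit-plus` entry in the auto class mapping, a modular repo tagged with that name resolves to `QwenImageEditPlusModularPipeline`. A minimal loading sketch, assuming a hypothetical repo id (the call pattern mirrors the existing modular pipelines; nothing beyond the class name comes from this diff):

```python
# Minimal sketch: the class name is real, the checkpoint id below
# is a placeholder, not a real repo.
from diffusers.modular_pipelines import QwenImageEditPlusModularPipeline

pipe = QwenImageEditPlusModularPipeline.from_pretrained(
    "your-org/qwenimage-edit-plus-modular"  # placeholder repo id
)
```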
18 changes: 16 additions & 2 deletions src/diffusers/modular_pipelines/qwenimage/__init__.py
@@ -29,13 +29,20 @@
"EDIT_AUTO_BLOCKS",
"EDIT_BLOCKS",
"EDIT_INPAINT_BLOCKS",
"EDIT_PLUS_AUTO_BLOCKS",
"EDIT_PLUS_BLOCKS",
"IMAGE2IMAGE_BLOCKS",
"INPAINT_BLOCKS",
"TEXT2IMAGE_BLOCKS",
"QwenImageAutoBlocks",
"QwenImageEditAutoBlocks",
"QwenImageEditPlusAutoBlocks",
]
_import_structure["modular_pipeline"] = [
"QwenImageEditModularPipeline",
"QwenImageEditPlusModularPipeline",
"QwenImageModularPipeline",
]
_import_structure["modular_pipeline"] = ["QwenImageEditModularPipeline", "QwenImageModularPipeline"]

if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
try:
@@ -54,13 +61,20 @@
EDIT_AUTO_BLOCKS,
EDIT_BLOCKS,
EDIT_INPAINT_BLOCKS,
EDIT_PLUS_AUTO_BLOCKS,
EDIT_PLUS_BLOCKS,
IMAGE2IMAGE_BLOCKS,
INPAINT_BLOCKS,
TEXT2IMAGE_BLOCKS,
QwenImageAutoBlocks,
QwenImageEditAutoBlocks,
QwenImageEditPlusAutoBlocks,
)
from .modular_pipeline import (
QwenImageEditModularPipeline,
QwenImageEditPlusModularPipeline,
QwenImageModularPipeline,
)
from .modular_pipeline import QwenImageEditModularPipeline, QwenImageModularPipeline
else:
import sys

3 changes: 1 addition & 2 deletions src/diffusers/modular_pipelines/qwenimage/before_denoise.py
@@ -203,7 +203,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -
block_state.latents = components.pachifier.pack_latents(block_state.latents)

self.set_block_state(state, block_state)

return components, state


@@ -571,7 +570,7 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):

@property
def description(self) -> str:
return "Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be place after prepare_latents step"
return "Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be placed after prepare_latents step"

@property
def inputs(self) -> List[InputParam]:
236 changes: 229 additions & 7 deletions src/diffusers/modular_pipelines/qwenimage/encoders.py
@@ -128,6 +128,61 @@ def get_qwen_prompt_embeds_edit(
return prompt_embeds, encoder_attention_mask


def get_qwen_prompt_embeds_edit_plus(
text_encoder,
processor,
prompt: Union[str, List[str]] = None,
image: Optional[Union[torch.Tensor, List[PIL.Image.Image], PIL.Image.Image]] = None,
prompt_template_encode: str = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
img_template_encode: str = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>",
prompt_template_encode_start_idx: int = 64,
device: Optional[torch.device] = None,
):
prompt = [prompt] if isinstance(prompt, str) else prompt
if isinstance(image, list):
base_img_prompt = ""
for i, img in enumerate(image):
base_img_prompt += img_template_encode.format(i + 1)
elif image is not None:
base_img_prompt = img_template_encode.format(1)
else:
base_img_prompt = ""

template = prompt_template_encode

drop_idx = prompt_template_encode_start_idx
txt = [template.format(base_img_prompt + e) for e in prompt]

model_inputs = processor(
text=txt,
images=image,
padding=True,
return_tensors="pt",
).to(device)
outputs = text_encoder(
input_ids=model_inputs.input_ids,
attention_mask=model_inputs.attention_mask,
pixel_values=model_inputs.pixel_values,
image_grid_thw=model_inputs.image_grid_thw,
output_hidden_states=True,
)

hidden_states = outputs.hidden_states[-1]
split_hidden_states = _extract_masked_hidden(hidden_states, model_inputs.attention_mask)
split_hidden_states = [e[drop_idx:] for e in split_hidden_states]
attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states]
max_seq_len = max([e.size(0) for e in split_hidden_states])
prompt_embeds = torch.stack(
[torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states]
)
encoder_attention_mask = torch.stack(
[torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list]
)

prompt_embeds = prompt_embeds.to(device=device)
return prompt_embeds, encoder_attention_mask
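
To make the prompt assembly above concrete, here is a self-contained sketch (string formatting only, no model call) of what `txt` looks like for two input images; the system text is abbreviated, and the 64-token `prompt_template_encode_start_idx` later drops that fixed prefix from the hidden states:

```python
# Sketch of the multi-image prompt assembly in
# get_qwen_prompt_embeds_edit_plus; system text abbreviated.
img_template = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>"
prompt_template = (
    "<|im_start|>system\n...system instructions...<|im_end|>\n"
    "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
)

# One numbered "Picture i:" slot per image, prepended to the user text.
base_img_prompt = "".join(img_template.format(i + 1) for i in range(2))
txt = [prompt_template.format(base_img_prompt + p) for p in ["Make the sky green."]]
print(txt[0])
# The tokenized system prefix occupies the first drop_idx (= 64)
# positions, which are sliced off the last hidden state before
# zero-padding each sequence to the batch max length.
```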


# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
@@ -266,6 +321,83 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
return components, state


class QwenImageEditPlusResizeDynamicStep(QwenImageEditResizeDynamicStep):
model_name = "qwenimage"

def __init__(
self,
input_name: str = "image",
output_name: str = "resized_image",
vae_image_output_name: str = "vae_image",
):
"""Create a configurable step for resizing images to the target area (1024 * 1024) while maintaining the aspect ratio.

This block resizes an input image or a list input images and exposes the resized result under configurable
input and output names. Use this when you need to wire the resize step to different image fields (e.g.,
"image", "control_image")

Args:
input_name (str, optional): Name of the image field to read from the
pipeline state. Defaults to "image".
output_name (str, optional): Name of the resized image field to write
back to the pipeline state. Defaults to "resized_image".
vae_image_output_name (str, optional): Name of the image field
to write back to the pipeline state. This is used by the VAE encoder step later on. QwenImage Edit Plus
processes the input image(s) differently for the VL and the VAE.
"""
if not isinstance(input_name, str) or not isinstance(output_name, str):
raise ValueError(
f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}"
)
self.condition_image_size = 384 * 384
self._image_input_name = input_name
self._resized_image_output_name = output_name
self._vae_image_output_name = vae_image_output_name
super().__init__()

@property
def intermediate_outputs(self) -> List[OutputParam]:
return super().intermediate_outputs + [
OutputParam(
name=self._vae_image_output_name,
type_hint=List[PIL.Image.Image],
description="The images to be processed which will be further used by the VAE encoder.",
),
]

@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
block_state = self.get_block_state(state)

images = getattr(block_state, self._image_input_name)

if not is_valid_image_imagelist(images):
raise ValueError(f"Images must be image or list of images but are {type(images)}")

if (
not isinstance(images, torch.Tensor)
and isinstance(images, PIL.Image.Image)
and not isinstance(images, list)
):
images = [images]

# TODO (sayakpaul): revisit this when the inputs are `torch.Tensor`s
condition_images = []
vae_images = []
for img in images:
image_width, image_height = img.size
condition_width, condition_height, _ = calculate_dimensions(
self.condition_image_size, image_width / image_height
)
condition_images.append(components.image_resize_processor.resize(img, condition_height, condition_width))
vae_images.append(img)

setattr(block_state, self._resized_image_output_name, condition_images)
setattr(block_state, self._vae_image_output_name, vae_images)
self.set_block_state(state, block_state)
return components, state
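
For intuition on the condition-image resize: the step targets a 384 * 384 pixel area while keeping the aspect ratio, and passes the untouched originals along for the VAE. A toy approximation of the arithmetic follows; the multiple-of-32 rounding is an assumption for illustration, not necessarily the exact rule used by `calculate_dimensions`:

```python
import math

def approx_condition_dims(target_area: int, aspect: float, multiple: int = 32):
    """Width/height whose product is ~target_area at the given aspect ratio,
    snapped to a multiple of 32 (assumed granularity, for illustration)."""
    width = math.sqrt(target_area * aspect)
    height = width / aspect
    snap = lambda v: max(multiple, round(v / multiple) * multiple)
    return snap(width), snap(height)

# A 1920x1080 (16:9) input resized toward a 384*384 area:
print(approx_condition_dims(384 * 384, 1920 / 1080))  # (512, 288)
```

The configurable names also let the same block be wired to other fields, e.g. `QwenImageEditPlusResizeDynamicStep(input_name="control_image", output_name="resized_control_image")`.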


class QwenImageTextEncoderStep(ModularPipelineBlocks):
model_name = "qwenimage"

@@ -511,6 +643,61 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
return components, state


class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep):
model_name = "qwenimage"

@property
def expected_configs(self) -> List[ConfigSpec]:
return [
ConfigSpec(
name="prompt_template_encode",
default="<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
),
ConfigSpec(
name="img_template_encode",
default="Picture {}: <|vision_start|><|image_pad|><|vision_end|>",
),
ConfigSpec(name="prompt_template_encode_start_idx", default=64),
]

@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
block_state = self.get_block_state(state)

self.check_inputs(block_state.prompt, block_state.negative_prompt)

device = components._execution_device

block_state.prompt_embeds, block_state.prompt_embeds_mask = get_qwen_prompt_embeds_edit_plus(
components.text_encoder,
components.processor,
prompt=block_state.prompt,
image=block_state.resized_image,
prompt_template_encode=components.config.prompt_template_encode,
img_template_encode=components.config.img_template_encode,
prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
device=device,
)

if components.requires_unconditional_embeds:
negative_prompt = block_state.negative_prompt or " "
block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = (
get_qwen_prompt_embeds_edit_plus(
components.text_encoder,
components.processor,
prompt=negative_prompt,
image=block_state.resized_image,
prompt_template_encode=components.config.prompt_template_encode,
img_template_encode=components.config.img_template_encode,
prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
device=device,
)
)

self.set_block_state(state, block_state)
return components, state


class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks):
model_name = "qwenimage"

@@ -612,12 +799,7 @@ def expected_components(self) -> List[ComponentSpec]:

@property
def inputs(self) -> List[InputParam]:
return [
InputParam("resized_image"),
InputParam("image"),
InputParam("height"),
InputParam("width"),
]
return [InputParam("resized_image"), InputParam("image"), InputParam("height"), InputParam("width")]

@property
def intermediate_outputs(self) -> List[OutputParam]:
@@ -661,6 +843,47 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
return components, state


class QwenImageEditPlusProcessImagesInputStep(QwenImageProcessImagesInputStep):
model_name = "qwenimage-edit-plus"
vae_image_size = 1024 * 1024

@property
def description(self) -> str:
return "Image Preprocess step for QwenImage Edit Plus. Unlike QwenImage Edit, QwenImage Edit Plus doesn't use the same resized image for further preprocessing."

@property
def inputs(self) -> List[InputParam]:
return [InputParam("vae_image"), InputParam("image"), InputParam("height"), InputParam("width")]

@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
block_state = self.get_block_state(state)

if block_state.vae_image is None and block_state.image is None:
raise ValueError("`vae_image` and `image` cannot be None at the same time")

if block_state.vae_image is None:
image = block_state.image
self.check_inputs(
height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor
)
height = block_state.height or components.default_height
width = block_state.width or components.default_width
block_state.processed_image = components.image_processor.preprocess(
image=image, height=height, width=width
)
else:
width, height = block_state.vae_image[0].size
image = block_state.vae_image

block_state.processed_image = components.image_processor.preprocess(
image=image, height=height, width=width
)

self.set_block_state(state, block_state)
return components, state
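
Taken together with the resize step above, the state now carries two parallel image fields: `resized_image` (384 * 384 area, consumed by `QwenImageEditPlusTextEncoderStep`) and `vae_image` (original size, preferred here for VAE preprocessing). Only when `vae_image` is absent does this step fall back to `image` with the default height/width path.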


class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
model_name = "qwenimage"

@@ -738,7 +961,6 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -
dtype=dtype,
latent_channels=components.num_channels_latents,
)

setattr(block_state, self._image_latents_output_name, image_latents)

self.set_block_state(state, block_state)