
Commit 67403da

Merge branch 'main' into schedulers/unipc-custom-sigmas
2 parents: 7bc5a19 + 03be15e

File tree: 10 files changed, +330 / -18 lines


docs/source/en/api/pipelines/qwenimage.md

Lines changed: 16 additions & 5 deletions
@@ -16,7 +16,12 @@
 
 Qwen-Image from the Qwen team is an image generation foundation model in the Qwen series that achieves significant advances in complex text rendering and precise image editing. Experiments show strong general capabilities in both image generation and editing, with exceptional performance in text rendering, especially for Chinese.
 
-Check out the model card [here](https://huggingface.co/Qwen/Qwen-Image) to learn more.
+Qwen-Image comes in the following variants:
+
+| model type | model id |
+|:----------:|:--------:|
+| Qwen-Image | [`Qwen/Qwen-Image`](https://huggingface.co/Qwen/Qwen-Image) |
+| Qwen-Image-Edit | [`Qwen/Qwen-Image-Edit`](https://huggingface.co/Qwen/Qwen-Image-Edit) |
 
 <Tip>
 
@@ -87,10 +92,6 @@ image.save("qwen_fewsteps.png")
 - all
 - __call__
 
-## QwenImagePipelineOutput
-
-[[autodoc]] pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput
-
 ## QwenImageImg2ImgPipeline
 
 [[autodoc]] QwenImageImg2ImgPipeline
@@ -102,3 +103,13 @@ image.save("qwen_fewsteps.png")
 [[autodoc]] QwenImageInpaintPipeline
 - all
 - __call__
+
+## QwenImageEditPipeline
+
+[[autodoc]] QwenImageEditPipeline
+- all
+- __call__
+
+## QwenImagePipelineOutput
+
+[[autodoc]] pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput
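For reference, a minimal usage sketch of the two variants listed in the new table (model ids are taken from the table; device and dtype choices mirror the pipeline docstring updated elsewhere in this commit):

```python
import torch
from diffusers import QwenImagePipeline, QwenImageEditPipeline

# Text-to-image variant.
pipe = QwenImagePipeline.from_pretrained("Qwen/Qwen-Image", torch_dtype=torch.bfloat16).to("cuda")

# Image-editing variant documented by the new QwenImageEditPipeline section.
edit_pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=torch.bfloat16).to("cuda")
```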

examples/dreambooth/README_qwen.md

Lines changed: 2 additions & 2 deletions
@@ -75,9 +75,9 @@ Now, we can launch training using:
 ```bash
 export MODEL_NAME="Qwen/Qwen-Image"
 export INSTANCE_DIR="dog"
-export OUTPUT_DIR="trained-sana-lora"
+export OUTPUT_DIR="trained-qwenimage-lora"
 
-accelerate launch train_dreambooth_lora_sana.py \
+accelerate launch train_dreambooth_lora_qwenimage.py \
   --pretrained_model_name_or_path=$MODEL_NAME \
   --instance_data_dir=$INSTANCE_DIR \
   --output_dir=$OUTPUT_DIR \
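A minimal sketch of loading the resulting LoRA back into the base pipeline afterwards (the directory name matches `OUTPUT_DIR` above; the prompt is illustrative, not taken from the README):

```python
import torch
from diffusers import QwenImagePipeline

# Load the base model and attach the DreamBooth LoRA written to OUTPUT_DIR.
pipe = QwenImagePipeline.from_pretrained("Qwen/Qwen-Image", torch_dtype=torch.bfloat16).to("cuda")
pipe.load_lora_weights("trained-qwenimage-lora")

# Illustrative prompt; use the instance prompt chosen during training.
image = pipe("a photo of sks dog in a bucket", num_inference_steps=50).images[0]
image.save("qwenimage_lora_sample.png")
```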

src/diffusers/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -489,10 +489,10 @@
             "PixArtAlphaPipeline",
             "PixArtSigmaPAGPipeline",
             "PixArtSigmaPipeline",
+            "QwenImageEditPipeline",
             "QwenImageImg2ImgPipeline",
             "QwenImageInpaintPipeline",
             "QwenImagePipeline",
-            "QwenImageEditPipeline",
             "ReduxImageEncoder",
             "SanaControlNetPipeline",
             "SanaPAGPipeline",

src/diffusers/models/autoencoders/autoencoder_dc.py

Lines changed: 14 additions & 1 deletion
@@ -299,6 +299,7 @@ def __init__(
         act_fn: Union[str, Tuple[str]] = "silu",
         upsample_block_type: str = "pixel_shuffle",
         in_shortcut: bool = True,
+        conv_act_fn: str = "relu",
     ):
         super().__init__()
 
@@ -349,7 +350,7 @@ def __init__(
         channels = block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1]
 
         self.norm_out = RMSNorm(channels, 1e-5, elementwise_affine=True, bias=True)
-        self.conv_act = nn.ReLU()
+        self.conv_act = get_activation(conv_act_fn)
         self.conv_out = None
 
         if layers_per_block[0] > 0:
@@ -414,6 +415,12 @@ class AutoencoderDC(ModelMixin, ConfigMixin, FromOriginalModelMixin):
             The normalization type(s) to use in the decoder.
         decoder_act_fns (`Union[str, Tuple[str]]`, defaults to `"silu"`):
             The activation function(s) to use in the decoder.
+        encoder_out_shortcut (`bool`, defaults to `True`):
+            Whether to use shortcut at the end of the encoder.
+        decoder_in_shortcut (`bool`, defaults to `True`):
+            Whether to use shortcut at the beginning of the decoder.
+        decoder_conv_act_fn (`str`, defaults to `"relu"`):
+            The activation function to use at the end of the decoder.
         scaling_factor (`float`, defaults to `1.0`):
             The multiplicative inverse of the root mean square of the latent features. This is used to scale the latent
             space to have unit variance when training the diffusion model. The latents are scaled with the formula `z =
@@ -441,6 +448,9 @@ def __init__(
         downsample_block_type: str = "pixel_unshuffle",
         decoder_norm_types: Union[str, Tuple[str]] = "rms_norm",
         decoder_act_fns: Union[str, Tuple[str]] = "silu",
+        encoder_out_shortcut: bool = True,
+        decoder_in_shortcut: bool = True,
+        decoder_conv_act_fn: str = "relu",
         scaling_factor: float = 1.0,
     ) -> None:
         super().__init__()
@@ -454,6 +464,7 @@ def __init__(
             layers_per_block=encoder_layers_per_block,
             qkv_multiscales=encoder_qkv_multiscales,
             downsample_block_type=downsample_block_type,
+            out_shortcut=encoder_out_shortcut,
         )
         self.decoder = Decoder(
             in_channels=in_channels,
@@ -466,6 +477,8 @@ def __init__(
             norm_type=decoder_norm_types,
             act_fn=decoder_act_fns,
             upsample_block_type=upsample_block_type,
+            in_shortcut=decoder_in_shortcut,
+            conv_act_fn=decoder_conv_act_fn,
         )
 
         self.spatial_compression_ratio = 2 ** (len(encoder_block_out_channels) - 1)
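The decoder's final activation is now resolved by name through `get_activation` instead of being hard-coded to `nn.ReLU()`; a small standalone sketch of that helper (the default `decoder_conv_act_fn="relu"` keeps the previous behavior):

```python
import torch
from diffusers.models.activations import get_activation

# With "relu", the lookup reproduces the old hard-coded nn.ReLU() module.
act = get_activation("relu")
print(act)                                # ReLU()
print(act(torch.tensor([-1.0, 0.5, 2.0])))  # tensor([0.0000, 0.5000, 2.0000])
```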

src/diffusers/models/transformers/transformer_cogview4.py

Lines changed: 34 additions & 2 deletions
@@ -28,7 +28,7 @@
 from ..embeddings import CogView3CombinedTimestepSizeEmbeddings
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
-from ..normalization import AdaLayerNormContinuous
+from ..normalization import LayerNorm, RMSNorm
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -584,6 +584,38 @@ def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         return (freqs.cos(), freqs.sin())
 
 
+class CogView4AdaLayerNormContinuous(nn.Module):
+    """
+    CogView4-only final AdaLN: LN(x) -> Linear(cond) -> chunk -> affine. Matches Megatron: **no activation** before the
+    Linear on conditioning embedding.
+    """
+
+    def __init__(
+        self,
+        embedding_dim: int,
+        conditioning_embedding_dim: int,
+        elementwise_affine: bool = True,
+        eps: float = 1e-5,
+        bias: bool = True,
+        norm_type: str = "layer_norm",
+    ):
+        super().__init__()
+        self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
+        if norm_type == "layer_norm":
+            self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias)
+        elif norm_type == "rms_norm":
+            self.norm = RMSNorm(embedding_dim, eps, elementwise_affine)
+        else:
+            raise ValueError(f"unknown norm_type {norm_type}")
+
+    def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor:
+        # *** NO SiLU here ***
+        emb = self.linear(conditioning_embedding.to(x.dtype))
+        scale, shift = torch.chunk(emb, 2, dim=1)
+        x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
+        return x
+
+
 class CogView4Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, CacheMixin):
     r"""
     Args:
@@ -666,7 +698,7 @@ def __init__(
         )
 
         # 4. Output projection
-        self.norm_out = AdaLayerNormContinuous(inner_dim, time_embed_dim, elementwise_affine=False)
+        self.norm_out = CogView4AdaLayerNormContinuous(inner_dim, time_embed_dim, elementwise_affine=False)
         self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels, bias=True)
 
         self.gradient_checkpointing = False
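A small shape-level sketch of the new final AdaLN, assuming the class remains importable from its defining module (it is not exported at the package root); the dimensions are arbitrary illustrative values:

```python
import torch
from diffusers.models.transformers.transformer_cogview4 import CogView4AdaLayerNormContinuous

# x: (batch, seq_len, inner_dim), conditioning: (batch, time_embed_dim); sizes are illustrative.
norm_out = CogView4AdaLayerNormContinuous(embedding_dim=64, conditioning_embedding_dim=32, elementwise_affine=False)
x = torch.randn(2, 16, 64)
cond = torch.randn(2, 32)

# Unlike AdaLayerNormContinuous, no SiLU is applied to `cond` before the Linear projection.
out = norm_out(x, cond)
print(out.shape)  # torch.Size([2, 16, 64])
```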

src/diffusers/models/transformers/transformer_qwenimage.py

Lines changed: 1 addition & 0 deletions
@@ -219,6 +219,7 @@ def forward(self, video_fhw, txt_seq_lens, device):
                 video_freq = self.rope_cache[rope_key]
             else:
                 video_freq = self._compute_video_freqs(frame, height, width, idx)
+            video_freq = video_freq.to(device)
             vid_freqs.append(video_freq)
 
             if self.scale_rope:
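The frequencies may be created or cached on a device other than the one `forward` was asked for, so they are now moved explicitly before being collected. A minimal standalone illustration of the pattern (names and shapes are stand-ins, not the actual module):

```python
import torch

rope_cache = {}

def get_video_freqs(key: str, device: torch.device) -> torch.Tensor:
    # Compute once, cache, and always hand out the tensor on the requested device.
    if key not in rope_cache:
        rope_cache[key] = torch.randn(4, 8)  # stand-in for _compute_video_freqs(...)
    return rope_cache[key].to(device)

freqs = get_video_freqs("0_32_32", torch.device("cpu"))
print(freqs.device)  # cpu
```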

src/diffusers/pipelines/qwenimage/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -24,9 +24,9 @@
 else:
     _import_structure["modeling_qwenimage"] = ["ReduxImageEncoder"]
     _import_structure["pipeline_qwenimage"] = ["QwenImagePipeline"]
+    _import_structure["pipeline_qwenimage_edit"] = ["QwenImageEditPipeline"]
     _import_structure["pipeline_qwenimage_img2img"] = ["QwenImageImg2ImgPipeline"]
     _import_structure["pipeline_qwenimage_inpaint"] = ["QwenImageInpaintPipeline"]
-    _import_structure["pipeline_qwenimage_edit"] = ["QwenImageEditPipeline"]
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:

src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py

Lines changed: 13 additions & 6 deletions
@@ -46,15 +46,20 @@
         >>> import torch
         >>> from PIL import Image
         >>> from diffusers import QwenImageEditPipeline
+        >>> from diffusers.utils import load_image
 
         >>> pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=torch.bfloat16)
         >>> pipe.to("cuda")
-        >>> prompt = "Change the cat to a dog"
-        >>> image = Image.open("cat.png")
+        >>> image = load_image(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png"
+        ... ).convert("RGB")
+        >>> prompt = (
+        ...     "Make Pikachu hold a sign that says 'Qwen Edit is awesome', yarn art style, detailed, vibrant colors"
+        ... )
         >>> # Depending on the variant being used, the pipeline call will slightly vary.
         >>> # Refer to the pipeline documentation for more details.
         >>> image = pipe(image, prompt, num_inference_steps=50).images[0]
-        >>> image.save("qwenimageedit.png")
+        >>> image.save("qwenimage_edit.png")
         ```
 """
 PREFERRED_QWENIMAGE_RESOLUTIONS = [
@@ -178,7 +183,7 @@ def calculate_dimensions(target_area, ratio):
 
 class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
     r"""
-    The QwenImage pipeline for text-to-image generation.
+    The Qwen-Image-Edit pipeline for image editing.
 
     Args:
         transformer ([`QwenImageTransformer2DModel`]):
@@ -217,8 +222,8 @@ def __init__(
             transformer=transformer,
             scheduler=scheduler,
         )
-        self.latent_channels = 16
         self.vae_scale_factor = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
+        self.latent_channels = self.vae.config.z_dim if getattr(self, "vae", None) else 16
         # QwenImage latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
         # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
@@ -635,7 +640,9 @@ def __call__(
                 [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
                 returning a tuple, the first element is a list with the generated images.
         """
-        calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, image.width / image.height)
+        image_size = image[0].size if isinstance(image, list) else image.size
+        width, height = image_size
+        calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, width / height)
         height = height or calculated_height
         width = width or calculated_width
 
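A minimal standalone illustration of the new size handling in `__call__`: a single `PIL.Image` and a one-element list now resolve to the same width/height before `calculate_dimensions` runs (the image here is a synthetic placeholder):

```python
from PIL import Image

img = Image.new("RGB", (640, 480))

for candidate in (img, [img]):
    # Mirrors the added lines: take the first image's size when a list is passed.
    image_size = candidate[0].size if isinstance(candidate, list) else candidate.size
    width, height = image_size
    print(width / height)  # ~1.333 for both inputs
```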

tests/models/transformers/test_models_transformer_qwenimage.py

Lines changed: 5 additions & 0 deletions
@@ -15,6 +15,7 @@
 
 import unittest
 
+import pytest
 import torch
 
 from diffusers import QwenImageTransformer2DModel
@@ -99,3 +100,7 @@ def prepare_init_args_and_inputs_for_common(self):
 
     def prepare_dummy_input(self, height, width):
         return QwenImageTransformerTests().prepare_dummy_input(height=height, width=width)
+
+    @pytest.mark.xfail(condition=True, reason="RoPE needs to be revisited.", strict=True)
+    def test_torch_compile_recompilation_and_graph_break(self):
+        super().test_torch_compile_recompilation_and_graph_break()
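For context, a tiny sketch of the marker's semantics outside the diffusers test suite: with `strict=True`, the test is expected to fail, and an unexpected pass is reported as a failure, so the marker has to be removed once the underlying RoPE/compile issue is resolved:

```python
import pytest

@pytest.mark.xfail(condition=True, reason="illustrative placeholder for a known issue", strict=True)
def test_known_issue():
    # Expected to fail while the issue is open; passing would become XPASS(strict) -> failure.
    raise AssertionError("still broken")
```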
