
Commit 4331603

Merge branch 'bria_3_2_pipeline' of https://github.com/galbria/diffusers into bria_3_2_pipeline

2 parents: f702292 + 3dab480

File tree

15 files changed: +373 −28 lines changed

docs/source/en/api/pipelines/qwenimage.md

Lines changed: 22 additions & 5 deletions
@@ -16,7 +16,12 @@
 
 Qwen-Image from the Qwen team is an image generation foundation model in the Qwen series that achieves significant advances in complex text rendering and precise image editing. Experiments show strong general capabilities in both image generation and editing, with exceptional performance in text rendering, especially for Chinese.
 
-Check out the model card [here](https://huggingface.co/Qwen/Qwen-Image) to learn more.
+Qwen-Image comes in the following variants:
+
+| model type | model id |
+|:----------:|:--------:|
+| Qwen-Image | [`Qwen/Qwen-Image`](https://huggingface.co/Qwen/Qwen-Image) |
+| Qwen-Image-Edit | [`Qwen/Qwen-Image-Edit`](https://huggingface.co/Qwen/Qwen-Image-Edit) |
 
 <Tip>
 
@@ -81,16 +86,18 @@ image.save("qwen_fewsteps.png")
 
 </details>
 
+<Tip>
+
+The `guidance_scale` parameter in the pipeline is there to support future guidance-distilled models when they come up. Note that passing `guidance_scale` to the pipeline is currently ineffective. To enable classifier-free guidance, pass `true_cfg_scale` together with a `negative_prompt`; even an empty negative prompt like " " will enable the classifier-free guidance computations.
+
+</Tip>
+
 ## QwenImagePipeline
 
 [[autodoc]] QwenImagePipeline
 - all
 - __call__
 
-## QwenImagePipelineOutput
-
-[[autodoc]] pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput
-
 ## QwenImageImg2ImgPipeline
 
 [[autodoc]] QwenImageImg2ImgPipeline
@@ -102,3 +109,13 @@ image.save("qwen_fewsteps.png")
 [[autodoc]] QwenImageInpaintPipeline
 - all
 - __call__
+
+## QwenImageEditPipeline
+
+[[autodoc]] QwenImageEditPipeline
+- all
+- __call__
+
+## QwenImagePipelineOutput
+
+[[autodoc]] pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput
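For context on the `true_cfg_scale` note above, here is a minimal usage sketch (not part of this commit); the prompt string and the `true_cfg_scale` value are illustrative only.

```python
import torch
from diffusers import QwenImagePipeline

pipe = QwenImagePipeline.from_pretrained("Qwen/Qwen-Image", torch_dtype=torch.bfloat16)
pipe.to("cuda")

# true_cfg_scale + negative_prompt switch on classifier-free guidance;
# guidance_scale alone has no effect, as the tip above explains.
image = pipe(
    prompt="A coffee shop entrance with a chalkboard sign reading 'Qwen Coffee'",
    negative_prompt=" ",      # even an empty negative prompt enables the CFG path
    true_cfg_scale=4.0,
    num_inference_steps=50,
).images[0]
image.save("qwen_cfg.png")
```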

examples/dreambooth/README_qwen.md

Lines changed: 2 additions & 2 deletions
@@ -75,9 +75,9 @@ Now, we can launch training using:
 ```bash
 export MODEL_NAME="Qwen/Qwen-Image"
 export INSTANCE_DIR="dog"
-export OUTPUT_DIR="trained-sana-lora"
+export OUTPUT_DIR="trained-qwenimage-lora"
 
-accelerate launch train_dreambooth_lora_sana.py \
+accelerate launch train_dreambooth_lora_qwenimage.py \
   --pretrained_model_name_or_path=$MODEL_NAME \
   --instance_data_dir=$INSTANCE_DIR \
   --output_dir=$OUTPUT_DIR \

src/diffusers/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -491,10 +491,10 @@
             "PixArtAlphaPipeline",
             "PixArtSigmaPAGPipeline",
             "PixArtSigmaPipeline",
+            "QwenImageEditPipeline",
             "QwenImageImg2ImgPipeline",
             "QwenImageInpaintPipeline",
             "QwenImagePipeline",
-            "QwenImageEditPipeline",
             "ReduxImageEncoder",
             "SanaControlNetPipeline",
             "SanaPAGPipeline",

src/diffusers/models/autoencoders/autoencoder_dc.py

Lines changed: 14 additions & 1 deletion
@@ -299,6 +299,7 @@ def __init__(
         act_fn: Union[str, Tuple[str]] = "silu",
         upsample_block_type: str = "pixel_shuffle",
         in_shortcut: bool = True,
+        conv_act_fn: str = "relu",
     ):
         super().__init__()
 
@@ -349,7 +350,7 @@ def __init__(
         channels = block_out_channels[0] if layers_per_block[0] > 0 else block_out_channels[1]
 
         self.norm_out = RMSNorm(channels, 1e-5, elementwise_affine=True, bias=True)
-        self.conv_act = nn.ReLU()
+        self.conv_act = get_activation(conv_act_fn)
         self.conv_out = None
 
         if layers_per_block[0] > 0:
@@ -414,6 +415,12 @@ class AutoencoderDC(ModelMixin, ConfigMixin, FromOriginalModelMixin):
             The normalization type(s) to use in the decoder.
         decoder_act_fns (`Union[str, Tuple[str]]`, defaults to `"silu"`):
             The activation function(s) to use in the decoder.
+        encoder_out_shortcut (`bool`, defaults to `True`):
+            Whether to use shortcut at the end of the encoder.
+        decoder_in_shortcut (`bool`, defaults to `True`):
+            Whether to use shortcut at the beginning of the decoder.
+        decoder_conv_act_fn (`str`, defaults to `"relu"`):
+            The activation function to use at the end of the decoder.
         scaling_factor (`float`, defaults to `1.0`):
             The multiplicative inverse of the root mean square of the latent features. This is used to scale the latent
             space to have unit variance when training the diffusion model. The latents are scaled with the formula `z =
@@ -441,6 +448,9 @@ def __init__(
         downsample_block_type: str = "pixel_unshuffle",
         decoder_norm_types: Union[str, Tuple[str]] = "rms_norm",
         decoder_act_fns: Union[str, Tuple[str]] = "silu",
+        encoder_out_shortcut: bool = True,
+        decoder_in_shortcut: bool = True,
+        decoder_conv_act_fn: str = "relu",
         scaling_factor: float = 1.0,
     ) -> None:
         super().__init__()
@@ -454,6 +464,7 @@ def __init__(
             layers_per_block=encoder_layers_per_block,
             qkv_multiscales=encoder_qkv_multiscales,
             downsample_block_type=downsample_block_type,
+            out_shortcut=encoder_out_shortcut,
         )
         self.decoder = Decoder(
             in_channels=in_channels,
@@ -466,6 +477,8 @@ def __init__(
             norm_type=decoder_norm_types,
             act_fn=decoder_act_fns,
             upsample_block_type=upsample_block_type,
+            in_shortcut=decoder_in_shortcut,
+            conv_act_fn=decoder_conv_act_fn,
         )
 
         self.spatial_compression_ratio = 2 ** (len(encoder_block_out_channels) - 1)
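A minimal sketch (not from this commit) of how the new `AutoencoderDC` config flags can be exercised; it assumes the remaining constructor arguments keep their defaults.

```python
from diffusers import AutoencoderDC

# The three new flags route through to the encoder/decoder as shown in the diff above.
vae = AutoencoderDC(
    encoder_out_shortcut=True,    # shortcut at the end of the encoder
    decoder_in_shortcut=True,     # shortcut at the beginning of the decoder
    decoder_conv_act_fn="silu",   # replaces the previously hard-coded nn.ReLU() at the decoder output
)

print(vae.config.decoder_conv_act_fn)       # "silu"
print(type(vae.decoder.conv_act).__name__)  # resolved by get_activation, expected SiLU
```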

src/diffusers/models/model_loading_utils.py

Lines changed: 15 additions & 9 deletions
@@ -726,23 +726,29 @@ def _caching_allocator_warmup(
     very large margin.
     """
     factor = 2 if hf_quantizer is None else hf_quantizer.get_cuda_warm_up_factor()
-    # Remove disk and cpu devices, and cast to proper torch.device
+
+    # Keep only accelerator devices
     accelerator_device_map = {
         param: torch.device(device)
         for param, device in expanded_device_map.items()
         if str(device) not in ["cpu", "disk"]
     }
-    total_byte_count = defaultdict(lambda: 0)
+    if not accelerator_device_map:
+        return
+
+    elements_per_device = defaultdict(int)
     for param_name, device in accelerator_device_map.items():
         try:
-            param = model.get_parameter(param_name)
+            p = model.get_parameter(param_name)
         except AttributeError:
-            param = model.get_buffer(param_name)
-        # The dtype of different parameters may be different with composite models or `keep_in_fp32_modules`
-        param_byte_count = param.numel() * param.element_size()
+            try:
+                p = model.get_buffer(param_name)
+            except AttributeError:
+                raise AttributeError(f"Parameter or buffer with name={param_name} not found in model")
         # TODO: account for TP when needed.
-        total_byte_count[device] += param_byte_count
+        elements_per_device[device] += p.numel()
 
     # This will kick off the caching allocator to avoid having to Malloc afterwards
-    for device, byte_count in total_byte_count.items():
-        _ = torch.empty(byte_count // factor, dtype=dtype, device=device, requires_grad=False)
+    for device, elem_count in elements_per_device.items():
+        warmup_elems = max(1, elem_count // factor)
+        _ = torch.empty(warmup_elems, dtype=dtype, device=device, requires_grad=False)
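A small illustration (not from the commit) of why the warmup now counts elements rather than bytes: the warmup tensor is allocated with the target `dtype`, so sizing it in elements keeps the reserved memory consistent regardless of the dtypes of the individual parameters.

```python
import torch

# Bytes reserved by the warmup allocation depend on the warmup dtype, not on the
# parameters' own dtypes, so tracking element counts per device is the robust choice.
elem_count = 1_000_000          # elements gathered for one device
factor = 2                      # as in _caching_allocator_warmup without a quantizer
dtype = torch.bfloat16

warmup = torch.empty(max(1, elem_count // factor), dtype=dtype, requires_grad=False)
print(warmup.numel(), warmup.element_size())  # 500000 elements, 2 bytes each -> ~1 MB reserved
```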

src/diffusers/models/transformers/transformer_cogview4.py

Lines changed: 34 additions & 2 deletions
@@ -28,7 +28,7 @@
 from ..embeddings import CogView3CombinedTimestepSizeEmbeddings
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
-from ..normalization import AdaLayerNormContinuous
+from ..normalization import LayerNorm, RMSNorm
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -584,6 +584,38 @@ def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         return (freqs.cos(), freqs.sin())
 
 
+class CogView4AdaLayerNormContinuous(nn.Module):
+    """
+    CogView4-only final AdaLN: LN(x) -> Linear(cond) -> chunk -> affine. Matches Megatron: **no activation** before the
+    Linear on conditioning embedding.
+    """
+
+    def __init__(
+        self,
+        embedding_dim: int,
+        conditioning_embedding_dim: int,
+        elementwise_affine: bool = True,
+        eps: float = 1e-5,
+        bias: bool = True,
+        norm_type: str = "layer_norm",
+    ):
+        super().__init__()
+        self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
+        if norm_type == "layer_norm":
+            self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias)
+        elif norm_type == "rms_norm":
+            self.norm = RMSNorm(embedding_dim, eps, elementwise_affine)
+        else:
+            raise ValueError(f"unknown norm_type {norm_type}")
+
+    def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor:
+        # *** NO SiLU here ***
+        emb = self.linear(conditioning_embedding.to(x.dtype))
+        scale, shift = torch.chunk(emb, 2, dim=1)
+        x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
+        return x
+
+
 class CogView4Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, CacheMixin):
     r"""
     Args:
@@ -666,7 +698,7 @@ def __init__(
         )
 
         # 4. Output projection
-        self.norm_out = AdaLayerNormContinuous(inner_dim, time_embed_dim, elementwise_affine=False)
+        self.norm_out = CogView4AdaLayerNormContinuous(inner_dim, time_embed_dim, elementwise_affine=False)
         self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels, bias=True)
 
         self.gradient_checkpointing = False
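For reference, a self-contained sketch (not from the commit) of the behavioral difference: the stock `AdaLayerNormContinuous` applies a SiLU to the conditioning embedding before the Linear, while the new CogView4 variant skips it; everything else is the same affine modulation. The helper below is illustrative only.

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
dim, cond_dim = 8, 8
linear = nn.Linear(cond_dim, dim * 2)
norm = nn.LayerNorm(dim, elementwise_affine=False)

x = torch.randn(2, 4, dim)       # (batch, seq, dim)
cond = torch.randn(2, cond_dim)  # (batch, cond_dim)

def ada_ln(x, cond, use_silu):
    # use_silu=True mimics the stock AdaLayerNormContinuous; False mimics the CogView4 variant
    emb = linear(nn.functional.silu(cond) if use_silu else cond)
    scale, shift = emb.chunk(2, dim=1)
    return norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]

print(torch.allclose(ada_ln(x, cond, True), ada_ln(x, cond, False)))  # False: the outputs differ
```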

src/diffusers/models/transformers/transformer_qwenimage.py

Lines changed: 1 addition & 0 deletions
@@ -219,6 +219,7 @@ def forward(self, video_fhw, txt_seq_lens, device):
                 video_freq = self.rope_cache[rope_key]
             else:
                 video_freq = self._compute_video_freqs(frame, height, width, idx)
+            video_freq = video_freq.to(device)
             vid_freqs.append(video_freq)
 
         if self.scale_rope:
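A tiny illustration (not from the commit) of why the added `.to(device)` matters: a cached frequency tensor may live on a different device than the hidden states (for example after CPU offload), and mixing devices in later operations raises an error.

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

freqs = torch.randn(16, 8)                 # e.g. a cached RoPE frequency table created on CPU
hidden = torch.randn(16, 8, device=device)

freqs = freqs.to(device)                   # mirrors the added `video_freq = video_freq.to(device)`
out = hidden * freqs.cos() + hidden * freqs.sin()
print(out.device)
```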

src/diffusers/pipelines/qwenimage/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -24,9 +24,9 @@
 else:
     _import_structure["modeling_qwenimage"] = ["ReduxImageEncoder"]
     _import_structure["pipeline_qwenimage"] = ["QwenImagePipeline"]
+    _import_structure["pipeline_qwenimage_edit"] = ["QwenImageEditPipeline"]
     _import_structure["pipeline_qwenimage_img2img"] = ["QwenImageImg2ImgPipeline"]
     _import_structure["pipeline_qwenimage_inpaint"] = ["QwenImageInpaintPipeline"]
-    _import_structure["pipeline_qwenimage_edit"] = ["QwenImageEditPipeline"]
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:

src/diffusers/pipelines/qwenimage/pipeline_qwenimage.py

Lines changed: 5 additions & 0 deletions
@@ -480,6 +480,11 @@ def __call__(
                 of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                 `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                 the text `prompt`, usually at the expense of lower image quality.
+
+                This parameter in the pipeline is there to support future guidance-distilled models when they come up.
+                Note that passing `guidance_scale` to the pipeline is currently ineffective. To enable classifier-free
+                guidance, pass `true_cfg_scale` together with a `negative_prompt`; even an empty negative prompt like
+                " " will enable the classifier-free guidance computations.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):

src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py

Lines changed: 18 additions & 6 deletions
@@ -46,15 +46,20 @@
         >>> import torch
         >>> from PIL import Image
         >>> from diffusers import QwenImageEditPipeline
+        >>> from diffusers.utils import load_image
 
         >>> pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=torch.bfloat16)
        >>> pipe.to("cuda")
-        >>> prompt = "Change the cat to a dog"
-        >>> image = Image.open("cat.png")
+        >>> image = load_image(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png"
+        ... ).convert("RGB")
+        >>> prompt = (
+        ...     "Make Pikachu hold a sign that says 'Qwen Edit is awesome', yarn art style, detailed, vibrant colors"
+        ... )
         >>> # Depending on the variant being used, the pipeline call will slightly vary.
         >>> # Refer to the pipeline documentation for more details.
         >>> image = pipe(image, prompt, num_inference_steps=50).images[0]
-        >>> image.save("qwenimageedit.png")
+        >>> image.save("qwenimage_edit.png")
         ```
 """
 PREFERRED_QWENIMAGE_RESOLUTIONS = [
@@ -178,7 +183,7 @@ def calculate_dimensions(target_area, ratio):
 
 class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
     r"""
-    The QwenImage pipeline for text-to-image generation.
+    The Qwen-Image-Edit pipeline for image editing.
 
     Args:
         transformer ([`QwenImageTransformer2DModel`]):
@@ -217,8 +222,8 @@ def __init__(
             transformer=transformer,
             scheduler=scheduler,
         )
-        self.latent_channels = 16
         self.vae_scale_factor = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
+        self.latent_channels = self.vae.config.z_dim if getattr(self, "vae", None) else 16
         # QwenImage latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
         # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
@@ -592,6 +597,11 @@ def __call__(
                 of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                 `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                 the text `prompt`, usually at the expense of lower image quality.
+
+                This parameter in the pipeline is there to support future guidance-distilled models when they come up.
+                Note that passing `guidance_scale` to the pipeline is currently ineffective. To enable classifier-free
+                guidance, pass `true_cfg_scale` together with a `negative_prompt`; even an empty negative prompt like
+                " " will enable the classifier-free guidance computations.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -635,7 +645,9 @@
             [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
             returning a tuple, the first element is a list with the generated images.
         """
-        calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, image.width / image.height)
+        image_size = image[0].size if isinstance(image, list) else image.size
+        width, height = image_size
+        calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, width / height)
         height = height or calculated_height
         width = width or calculated_width
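A small sketch (not from the commit) of the size handling above: the pipeline now accepts either a single PIL image or a list of images, with the first image's size driving the computed dimensions. The helper name below is hypothetical.

```python
from PIL import Image

def input_size(image):
    # mirrors: image[0].size if isinstance(image, list) else image.size
    return image[0].size if isinstance(image, list) else image.size

single = Image.new("RGB", (640, 480))
batch = [Image.new("RGB", (512, 768)), Image.new("RGB", (1024, 1024))]
print(input_size(single))  # (640, 480)
print(input_size(batch))   # (512, 768)
```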