Commit 9a38fab

tests + minor refactor for QwenImage (#12057)

* update
* update
* update
* add docs
1 parent cb8e61e commit 9a38fab

9 files changed: +388 −151 lines changed


docs/source/en/_toctree.yml

Lines changed: 6 additions & 0 deletions

@@ -366,6 +366,8 @@
     title: PixArtTransformer2DModel
   - local: api/models/prior_transformer
     title: PriorTransformer
+  - local: api/models/qwenimage_transformer2d
+    title: QwenImageTransformer2DModel
   - local: api/models/sana_transformer2d
     title: SanaTransformer2DModel
   - local: api/models/sd3_transformer2d
@@ -418,6 +420,8 @@
     title: AutoencoderKLMagvit
   - local: api/models/autoencoderkl_mochi
     title: AutoencoderKLMochi
+  - local: api/models/autoencoderkl_qwenimage
+    title: AutoencoderKLQwenImage
   - local: api/models/autoencoder_kl_wan
     title: AutoencoderKLWan
   - local: api/models/consistency_decoder_vae
@@ -554,6 +558,8 @@
     title: PixArt-α
   - local: api/pipelines/pixart_sigma
     title: PixArt-Σ
+  - local: api/pipelines/qwenimage
+    title: QwenImage
   - local: api/pipelines/sana
     title: Sana
   - local: api/pipelines/sana_sprint

docs/source/en/api/models/autoencoderkl_qwenimage.md

Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# AutoencoderKLQwenImage
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import AutoencoderKLQwenImage
+
+vae = AutoencoderKLQwenImage.from_pretrained("Qwen/QwenImage-20B", subfolder="vae")
+```
+
+## AutoencoderKLQwenImage
+
+[[autodoc]] AutoencoderKLQwenImage
+  - decode
+  - encode
+  - all
+
+## AutoencoderKLOutput
+
+[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
+
+## DecoderOutput
+
+[[autodoc]] models.autoencoders.vae.DecoderOutput
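
For orientation, here is a hedged round-trip sketch (not part of the committed docs). The `encode`/`decode` members documented above return `AutoencoderKLOutput` and `DecoderOutput`; the 5D input layout and the dummy sizes are assumptions based on the Wan-style autoencoder this class sits next to in the API reference, so verify them against the class before relying on this.

```python
# Hedged sketch, not from the commit: round-trip a dummy image through the new VAE.
# The (batch, channels, frames, height, width) layout is an assumption.
import torch
from diffusers import AutoencoderKLQwenImage

vae = AutoencoderKLQwenImage.from_pretrained("Qwen/QwenImage-20B", subfolder="vae")
vae.eval()

with torch.no_grad():
    pixels = torch.randn(1, 3, 1, 256, 256)            # dummy image in roughly [-1, 1]
    latents = vae.encode(pixels).latent_dist.sample()  # AutoencoderKLOutput.latent_dist
    recon = vae.decode(latents).sample                 # DecoderOutput.sample

print(latents.shape, recon.shape)
```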

docs/source/en/api/models/qwenimage_transformer2d.md

Lines changed: 28 additions & 0 deletions

@@ -0,0 +1,28 @@
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# QwenImageTransformer2DModel
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import QwenImageTransformer2DModel
+
+transformer = QwenImageTransformer2DModel.from_pretrained("Qwen/QwenImage-20B", subfolder="transformer", torch_dtype=torch.bfloat16)
+```
+
+## QwenImageTransformer2DModel
+
+[[autodoc]] QwenImageTransformer2DModel
+
+## Transformer2DModelOutput
+
+[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
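
Note that the committed snippet references `torch.bfloat16` without importing `torch`; a self-contained version of the same load would look roughly like the following (the `print` of config values is illustrative only, using defaults visible in the constructor diff later in this commit).

```python
import torch
from diffusers import QwenImageTransformer2DModel

# Same load as the doc snippet above, with the missing torch import added.
transformer = QwenImageTransformer2DModel.from_pretrained(
    "Qwen/QwenImage-20B", subfolder="transformer", torch_dtype=torch.bfloat16
)

# Config values registered via @register_to_config (defaults shown in the diff below).
print(transformer.config.num_attention_heads, transformer.config.attention_head_dim)
```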

docs/source/en/api/pipelines/qwenimage.md

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. -->
+
+# QwenImage
+
+<!-- TODO: update this section when model is out -->
+
+<Tip>
+
+Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
+
+</Tip>
+
+## QwenImagePipeline
+
+[[autodoc]] QwenImagePipeline
+  - all
+  - __call__
+
+## QwenImagePipelineOutput
+
+[[autodoc]] pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput
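
A hedged end-to-end sketch, not part of the committed page: it assumes the standard diffusers text-to-image call signature and that the `Qwen/QwenImage-20B` repository used in the model docs above hosts the full pipeline; argument names other than `prompt` are assumptions.

```python
import torch
from diffusers import QwenImagePipeline

pipe = QwenImagePipeline.from_pretrained("Qwen/QwenImage-20B", torch_dtype=torch.bfloat16)
pipe.to("cuda")

# QwenImagePipelineOutput is assumed to expose `.images`, as other pipeline outputs do.
image = pipe(prompt="a cat wearing a tiny wizard hat", num_inference_steps=30).images[0]
image.save("qwenimage_sample.png")
```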

src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py

Lines changed: 4 additions & 36 deletions

@@ -668,6 +668,7 @@ class AutoencoderKLQwenImage(ModelMixin, ConfigMixin, FromOriginalModelMixin):
 
     _supports_gradient_checkpointing = False
 
+    # fmt: off
     @register_to_config
     def __init__(
         self,
@@ -678,43 +679,10 @@ def __init__(
         attn_scales: List[float] = [],
         temperal_downsample: List[bool] = [False, True, True],
         dropout: float = 0.0,
-        latents_mean: List[float] = [
-            -0.7571,
-            -0.7089,
-            -0.9113,
-            0.1075,
-            -0.1745,
-            0.9653,
-            -0.1517,
-            1.5508,
-            0.4134,
-            -0.0715,
-            0.5517,
-            -0.3632,
-            -0.1922,
-            -0.9497,
-            0.2503,
-            -0.2921,
-        ],
-        latents_std: List[float] = [
-            2.8184,
-            1.4541,
-            2.3275,
-            2.6558,
-            1.2196,
-            1.7708,
-            2.6052,
-            2.0743,
-            3.2687,
-            2.1526,
-            2.8652,
-            1.5579,
-            1.6382,
-            1.1253,
-            2.8251,
-            1.9160,
-        ],
+        latents_mean: List[float] = [-0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508, 0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921],
+        latents_std: List[float] = [2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743, 3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160],
     ) -> None:
+        # fmt: on
         super().__init__()
 
         self.z_dim = z_dim
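
The change above is purely cosmetic: the `latents_mean`/`latents_std` defaults are collapsed onto single lines and wrapped in `# fmt: off`/`# fmt: on` so the formatter leaves them alone. As a hedged illustration of how such per-channel statistics are typically consumed in diffusers pipelines (the helpers below are assumptions, not code from this commit):

```python
import torch

# Per-channel statistics copied from the config defaults above (16 latent channels).
latents_mean = torch.tensor([-0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
                             0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921])
latents_std = torch.tensor([2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
                            3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160])

def normalize(latents: torch.Tensor) -> torch.Tensor:
    # Hypothetical helper: broadcast the stats over a (batch, 16, ...) latent tensor
    # so the denoiser sees roughly zero-mean, unit-variance inputs.
    shape = (1, -1) + (1,) * (latents.ndim - 2)
    return (latents - latents_mean.view(shape).to(latents)) / latents_std.view(shape).to(latents)

def denormalize(latents: torch.Tensor) -> torch.Tensor:
    # Inverse transform, applied before handing latents back to the VAE decoder.
    shape = (1, -1) + (1,) * (latents.ndim - 2)
    return latents * latents_std.view(shape).to(latents) + latents_mean.view(shape).to(latents)
```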

src/diffusers/models/transformers/transformer_qwenimage.py

Lines changed: 11 additions & 17 deletions

@@ -140,7 +140,7 @@ def apply_rotary_emb_qwen(
 
 
 class QwenTimestepProjEmbeddings(nn.Module):
-    def __init__(self, embedding_dim, pooled_projection_dim):
+    def __init__(self, embedding_dim):
         super().__init__()
 
         self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000)
@@ -473,8 +473,6 @@ class QwenImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, Fro
         joint_attention_dim (`int`, defaults to `3584`):
             The number of dimensions to use for the joint attention (embedding/channel dimension of
             `encoder_hidden_states`).
-        pooled_projection_dim (`int`, defaults to `768`):
-            The number of dimensions to use for the pooled projection.
         guidance_embeds (`bool`, defaults to `False`):
             Whether to use guidance embeddings for guidance-distilled variant of the model.
         axes_dims_rope (`Tuple[int]`, defaults to `(16, 56, 56)`):
@@ -495,8 +493,7 @@ def __init__(
         attention_head_dim: int = 128,
         num_attention_heads: int = 24,
         joint_attention_dim: int = 3584,
-        pooled_projection_dim: int = 768,
-        guidance_embeds: bool = False,
+        guidance_embeds: bool = False,  # TODO: this should probably be removed
         axes_dims_rope: Tuple[int, int, int] = (16, 56, 56),
     ):
         super().__init__()
@@ -505,9 +502,7 @@ def __init__(
 
         self.pos_embed = QwenEmbedRope(theta=10000, axes_dim=list(axes_dims_rope), scale_rope=True)
 
-        self.time_text_embed = QwenTimestepProjEmbeddings(
-            embedding_dim=self.inner_dim, pooled_projection_dim=pooled_projection_dim
-        )
+        self.time_text_embed = QwenTimestepProjEmbeddings(embedding_dim=self.inner_dim)
 
         self.txt_norm = RMSNorm(joint_attention_dim, eps=1e-6)
 
@@ -538,10 +533,9 @@ def forward(
         timestep: torch.LongTensor = None,
         img_shapes: Optional[List[Tuple[int, int, int]]] = None,
         txt_seq_lens: Optional[List[int]] = None,
-        guidance: torch.Tensor = None,
-        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        guidance: torch.Tensor = None,  # TODO: this should probably be removed
+        attention_kwargs: Optional[Dict[str, Any]] = None,
         return_dict: bool = True,
-        controlnet_blocks_repeat: bool = False,
     ) -> Union[torch.Tensor, Transformer2DModelOutput]:
         """
         The [`QwenTransformer2DModel`] forward method.
@@ -555,7 +549,7 @@ def forward(
                 Mask of the input conditions.
             timestep ( `torch.LongTensor`):
                 Used to indicate denoising step.
-            joint_attention_kwargs (`dict`, *optional*):
+            attention_kwargs (`dict`, *optional*):
                 A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                 `self.processor` in
                 [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
@@ -567,17 +561,17 @@ def forward(
             If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
             `tuple` where the first element is the sample tensor.
         """
-        if joint_attention_kwargs is not None:
-            joint_attention_kwargs = joint_attention_kwargs.copy()
-            lora_scale = joint_attention_kwargs.pop("scale", 1.0)
+        if attention_kwargs is not None:
+            attention_kwargs = attention_kwargs.copy()
+            lora_scale = attention_kwargs.pop("scale", 1.0)
         else:
            lora_scale = 1.0
 
         if USE_PEFT_BACKEND:
             # weight the lora layers by setting `lora_scale` for each PEFT layer
             scale_lora_layers(self, lora_scale)
         else:
-            if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
+            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
                logger.warning(
                    "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
                )
@@ -617,7 +611,7 @@ def forward(
                 encoder_hidden_states_mask=encoder_hidden_states_mask,
                 temb=temb,
                 image_rotary_emb=image_rotary_emb,
-                joint_attention_kwargs=joint_attention_kwargs,
+                joint_attention_kwargs=attention_kwargs,
             )
 
         # Use only the image part (hidden_states) from the dual-stream blocks
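
The forward-pass changes above rename `joint_attention_kwargs` to `attention_kwargs` (still the channel for the PEFT LoRA `scale`) and drop the unused `pooled_projection_dim` and `controlnet_blocks_repeat`. A hedged usage sketch follows; whether `QwenImagePipeline` forwards `attention_kwargs` to the transformer under that exact name is an assumption based on the new signature, and the LoRA checkpoint is hypothetical.

```python
import torch
from diffusers import QwenImagePipeline

pipe = QwenImagePipeline.from_pretrained("Qwen/QwenImage-20B", torch_dtype=torch.bfloat16)
pipe.to("cuda")
# pipe.load_lora_weights("some-user/qwenimage-lora")  # hypothetical LoRA checkpoint

image = pipe(
    prompt="a watercolor fox",
    # Popped inside QwenImageTransformer2DModel.forward and applied via scale_lora_layers
    # when the PEFT backend is active.
    attention_kwargs={"scale": 0.7},
).images[0]
```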
