Skip to content

Commit 04fa6f4

Browse files
authored
Merge branch 'main' into wan22-followup
2 parents 7a66c4c + 8e53cd9 commit 04fa6f4

File tree

18 files changed

+2677
-3
lines changed

18 files changed

+2677
-3
lines changed

docs/source/en/api/pipelines/wan.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,17 @@
2929
You can find all the original Wan2.1 checkpoints under the [Wan-AI](https://huggingface.co/Wan-AI) organization.
3030

3131
The following Wan models are supported in Diffusers:
32+
3233
- [Wan 2.1 T2V 1.3B](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers)
3334
- [Wan 2.1 T2V 14B](https://huggingface.co/Wan-AI/Wan2.1-T2V-14B-Diffusers)
3435
- [Wan 2.1 I2V 14B - 480P](https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-480P-Diffusers)
3536
- [Wan 2.1 I2V 14B - 720P](https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P-Diffusers)
3637
- [Wan 2.1 FLF2V 14B - 720P](https://huggingface.co/Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers)
3738
- [Wan 2.1 VACE 1.3B](https://huggingface.co/Wan-AI/Wan2.1-VACE-1.3B-diffusers)
3839
- [Wan 2.1 VACE 14B](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B-diffusers)
40+
- [Wan 2.2 T2V 14B](https://huggingface.co/Wan-AI/Wan2.2-T2V-A14B-Diffusers)
41+
- [Wan 2.2 I2V 14B](https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers)
42+
- [Wan 2.2 TI2V 5B](https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B-Diffusers)
3943

4044
> [!TIP]
4145
> Click on the Wan2.1 models in the right sidebar for more examples of video generation.
@@ -327,6 +331,8 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip
327331

328332
- Try lower `shift` values (`2.0` to `5.0`) for lower resolution videos and higher `shift` values (`7.0` to `12.0`) for higher resolution videos.
329333

334+
- Wan 2.1 and 2.2 support using [LightX2V LoRAs](https://huggingface.co/Kijai/WanVideo_comfy/tree/main/Lightx2v) to speed up inference. Using them on Wan 2.2 is slightly more involved. Refer to [this code snippet](https://github.com/huggingface/diffusers/pull/12040#issuecomment-3144185272) to learn more.
335+
330336
## WanPipeline
331337

332338
[[autodoc]] WanPipeline

src/diffusers/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@
174174
"AutoencoderKLLTXVideo",
175175
"AutoencoderKLMagvit",
176176
"AutoencoderKLMochi",
177+
"AutoencoderKLQwenImage",
177178
"AutoencoderKLTemporalDecoder",
178179
"AutoencoderKLWan",
179180
"AutoencoderOobleck",
@@ -215,6 +216,7 @@
215216
"OmniGenTransformer2DModel",
216217
"PixArtTransformer2DModel",
217218
"PriorTransformer",
219+
"QwenImageTransformer2DModel",
218220
"SanaControlNetModel",
219221
"SanaTransformer2DModel",
220222
"SD3ControlNetModel",
@@ -486,6 +488,7 @@
486488
"PixArtAlphaPipeline",
487489
"PixArtSigmaPAGPipeline",
488490
"PixArtSigmaPipeline",
491+
"QwenImagePipeline",
489492
"ReduxImageEncoder",
490493
"SanaControlNetPipeline",
491494
"SanaPAGPipeline",
@@ -832,6 +835,7 @@
832835
AutoencoderKLLTXVideo,
833836
AutoencoderKLMagvit,
834837
AutoencoderKLMochi,
838+
AutoencoderKLQwenImage,
835839
AutoencoderKLTemporalDecoder,
836840
AutoencoderKLWan,
837841
AutoencoderOobleck,
@@ -873,6 +877,7 @@
873877
OmniGenTransformer2DModel,
874878
PixArtTransformer2DModel,
875879
PriorTransformer,
880+
QwenImageTransformer2DModel,
876881
SanaControlNetModel,
877882
SanaTransformer2DModel,
878883
SD3ControlNetModel,
@@ -1119,6 +1124,7 @@
11191124
PixArtAlphaPipeline,
11201125
PixArtSigmaPAGPipeline,
11211126
PixArtSigmaPipeline,
1127+
QwenImagePipeline,
11221128
ReduxImageEncoder,
11231129
SanaControlNetPipeline,
11241130
SanaPAGPipeline,

src/diffusers/hooks/_helpers.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ def _register_transformer_blocks_metadata():
153153
)
154154
from ..models.transformers.transformer_ltx import LTXVideoTransformerBlock
155155
from ..models.transformers.transformer_mochi import MochiTransformerBlock
156+
from ..models.transformers.transformer_qwenimage import QwenImageTransformerBlock
156157
from ..models.transformers.transformer_wan import WanTransformerBlock
157158

158159
# BasicTransformerBlock
@@ -255,6 +256,15 @@ def _register_transformer_blocks_metadata():
255256
),
256257
)
257258

259+
# QwenImage
260+
TransformerBlockRegistry.register(
261+
model_class=QwenImageTransformerBlock,
262+
metadata=TransformerBlockMetadata(
263+
return_hidden_states_index=1,
264+
return_encoder_hidden_states_index=0,
265+
),
266+
)
267+
258268

259269
# fmt: off
260270
def _skip_attention___ret___hidden_states(self, *args, **kwargs):

src/diffusers/loaders/lora_conversion_utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1974,6 +1974,10 @@ def _convert_non_diffusers_wan_lora_to_diffusers(state_dict):
19741974
converted_key = f"condition_embedder.image_embedder.{img_ours}.lora_B.weight"
19751975
if original_key in original_state_dict:
19761976
converted_state_dict[converted_key] = original_state_dict.pop(original_key)
1977+
bias_key_theirs = original_key.removesuffix(f".{lora_up_key}.weight") + ".diff_b"
1978+
if bias_key_theirs in original_state_dict:
1979+
bias_key = converted_key.removesuffix(".weight") + ".bias"
1980+
converted_state_dict[bias_key] = original_state_dict.pop(bias_key_theirs)
19771981

19781982
if len(original_state_dict) > 0:
19791983
diff = all(".diff" in k for k in original_state_dict)

src/diffusers/models/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
_import_structure["autoencoders.autoencoder_kl_ltx"] = ["AutoencoderKLLTXVideo"]
3939
_import_structure["autoencoders.autoencoder_kl_magvit"] = ["AutoencoderKLMagvit"]
4040
_import_structure["autoencoders.autoencoder_kl_mochi"] = ["AutoencoderKLMochi"]
41+
_import_structure["autoencoders.autoencoder_kl_qwenimage"] = ["AutoencoderKLQwenImage"]
4142
_import_structure["autoencoders.autoencoder_kl_temporal_decoder"] = ["AutoencoderKLTemporalDecoder"]
4243
_import_structure["autoencoders.autoencoder_kl_wan"] = ["AutoencoderKLWan"]
4344
_import_structure["autoencoders.autoencoder_oobleck"] = ["AutoencoderOobleck"]
@@ -88,6 +89,7 @@
8889
_import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"]
8990
_import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"]
9091
_import_structure["transformers.transformer_omnigen"] = ["OmniGenTransformer2DModel"]
92+
_import_structure["transformers.transformer_qwenimage"] = ["QwenImageTransformer2DModel"]
9193
_import_structure["transformers.transformer_sd3"] = ["SD3Transformer2DModel"]
9294
_import_structure["transformers.transformer_skyreels_v2"] = ["SkyReelsV2Transformer3DModel"]
9395
_import_structure["transformers.transformer_temporal"] = ["TransformerTemporalModel"]
@@ -126,6 +128,7 @@
126128
AutoencoderKLLTXVideo,
127129
AutoencoderKLMagvit,
128130
AutoencoderKLMochi,
131+
AutoencoderKLQwenImage,
129132
AutoencoderKLTemporalDecoder,
130133
AutoencoderKLWan,
131134
AutoencoderOobleck,
@@ -177,6 +180,7 @@
177180
OmniGenTransformer2DModel,
178181
PixArtTransformer2DModel,
179182
PriorTransformer,
183+
QwenImageTransformer2DModel,
180184
SanaTransformer2DModel,
181185
SD3Transformer2DModel,
182186
SkyReelsV2Transformer3DModel,

src/diffusers/models/autoencoders/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from .autoencoder_kl_ltx import AutoencoderKLLTXVideo
99
from .autoencoder_kl_magvit import AutoencoderKLMagvit
1010
from .autoencoder_kl_mochi import AutoencoderKLMochi
11+
from .autoencoder_kl_qwenimage import AutoencoderKLQwenImage
1112
from .autoencoder_kl_temporal_decoder import AutoencoderKLTemporalDecoder
1213
from .autoencoder_kl_wan import AutoencoderKLWan
1314
from .autoencoder_oobleck import AutoencoderOobleck

src/diffusers/models/autoencoders/autoencoder_kl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def __init__(
9090
shift_factor: Optional[float] = None,
9191
latents_mean: Optional[Tuple[float]] = None,
9292
latents_std: Optional[Tuple[float]] = None,
93-
force_upcast: float = True,
93+
force_upcast: bool = True,
9494
use_quant_conv: bool = True,
9595
use_post_quant_conv: bool = True,
9696
mid_block_add_attention: bool = True,

src/diffusers/models/autoencoders/autoencoder_kl_cosmos.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,9 @@ def _arrange(self, hidden_states: torch.Tensor) -> torch.Tensor:
168168
batch_size, num_channels, num_frames, height, width = hidden_states.shape
169169
p = self.patch_size
170170

171-
hidden_states = torch.reshape(batch_size, num_channels, num_frames // p, p, height // p, p, width // p, p)
171+
hidden_states = hidden_states.reshape(
172+
batch_size, num_channels, num_frames // p, p, height // p, p, width // p, p
173+
)
172174
hidden_states = hidden_states.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(1, 4).contiguous()
173175
return hidden_states
174176

0 commit comments

Comments
 (0)