Commit 6d15594

Merge branch 'main' into parallel-shards-loading

2 parents 2fdc091 + 11d22e0

32 files changed (+3839, −130 lines)

docs/source/en/_toctree.yml (6 additions, 0 deletions)

```diff
@@ -366,6 +366,8 @@
       title: PixArtTransformer2DModel
     - local: api/models/prior_transformer
       title: PriorTransformer
+    - local: api/models/qwenimage_transformer2d
+      title: QwenImageTransformer2DModel
     - local: api/models/sana_transformer2d
       title: SanaTransformer2DModel
     - local: api/models/sd3_transformer2d
@@ -418,6 +420,8 @@
       title: AutoencoderKLMagvit
     - local: api/models/autoencoderkl_mochi
       title: AutoencoderKLMochi
+    - local: api/models/autoencoderkl_qwenimage
+      title: AutoencoderKLQwenImage
     - local: api/models/autoencoder_kl_wan
       title: AutoencoderKLWan
     - local: api/models/consistency_decoder_vae
@@ -554,6 +558,8 @@
       title: PixArt-α
     - local: api/pipelines/pixart_sigma
       title: PixArt-Σ
+    - local: api/pipelines/qwenimage
+      title: QwenImage
     - local: api/pipelines/sana
       title: Sana
     - local: api/pipelines/sana_sprint
```
docs/source/en/api/models/autoencoderkl_qwenimage.md (new file, 35 additions; path derived from the `_toctree.yml` entry above)

```diff
@@ -0,0 +1,35 @@
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# AutoencoderKLQwenImage
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import AutoencoderKLQwenImage
+
+vae = AutoencoderKLQwenImage.from_pretrained("Qwen/QwenImage-20B", subfolder="vae")
+```
+
+## AutoencoderKLQwenImage
+
+[[autodoc]] AutoencoderKLQwenImage
+  - decode
+  - encode
+  - all
+
+## AutoencoderKLOutput
+
+[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
+
+## DecoderOutput
+
+[[autodoc]] models.autoencoders.vae.DecoderOutput
```
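The new VAE page documents `encode`/`decode` plus the `AutoencoderKLOutput`/`DecoderOutput` wrappers. As background, a KL autoencoder's `encode` returns a diagonal-Gaussian latent distribution that is sampled via the reparameterization trick. A minimal pure-Python sketch of such a distribution (names are illustrative, not the diffusers implementation):

```python
import math
import random

class DiagonalGaussian:
    """Minimal sketch of the latent distribution a KL VAE's encode() returns.
    Illustrative only; not the diffusers class."""

    def __init__(self, mean, logvar):
        self.mean = mean
        self.logvar = logvar

    def sample(self, rng=random):
        # Reparameterization trick: mean + std * eps, eps ~ N(0, 1)
        return [m + math.exp(0.5 * lv) * rng.gauss(0.0, 1.0)
                for m, lv in zip(self.mean, self.logvar)]

    def kl(self):
        # KL divergence to a standard normal, summed over latent dimensions
        return 0.5 * sum(m * m + math.exp(lv) - 1.0 - lv
                         for m, lv in zip(self.mean, self.logvar))

dist = DiagonalGaussian(mean=[0.0, 0.0], logvar=[0.0, 0.0])
print(dist.kl())  # standard normal against itself -> 0.0
```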
docs/source/en/api/models/qwenimage_transformer2d.md (new file; path derived from the `_toctree.yml` entry above; `import torch` added here since the snippet uses `torch.bfloat16`)

```diff
@@ -0,0 +1,29 @@
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# QwenImageTransformer2DModel
+
+The model can be loaded with the following code snippet.
+
+```python
+import torch
+from diffusers import QwenImageTransformer2DModel
+
+transformer = QwenImageTransformer2DModel.from_pretrained("Qwen/QwenImage-20B", subfolder="transformer", torch_dtype=torch.bfloat16)
+```
+
+## QwenImageTransformer2DModel
+
+[[autodoc]] QwenImageTransformer2DModel
+
+## Transformer2DModelOutput
+
+[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
```
docs/source/en/api/pipelines/qwenimage.md (new file, 33 additions; path derived from the `_toctree.yml` entry above; the second heading read "QwenImagePipeline" but documents the output class, corrected below)

```diff
@@ -0,0 +1,33 @@
+<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. -->
+
+# QwenImage
+
+<!-- TODO: update this section when model is out -->
+
+<Tip>
+
+Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
+
+</Tip>
+
+## QwenImagePipeline
+
+[[autodoc]] QwenImagePipeline
+  - all
+  - __call__
+
+## QwenImagePipelineOutput
+
+[[autodoc]] pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput
```

docs/source/en/api/pipelines/wan.md (6 additions, 0 deletions; "involed" corrected to "involved")

```diff
@@ -29,13 +29,17 @@
 You can find all the original Wan2.1 checkpoints under the [Wan-AI](https://huggingface.co/Wan-AI) organization.

 The following Wan models are supported in Diffusers:
+
 - [Wan 2.1 T2V 1.3B](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers)
 - [Wan 2.1 T2V 14B](https://huggingface.co/Wan-AI/Wan2.1-T2V-14B-Diffusers)
 - [Wan 2.1 I2V 14B - 480P](https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-480P-Diffusers)
 - [Wan 2.1 I2V 14B - 720P](https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P-Diffusers)
 - [Wan 2.1 FLF2V 14B - 720P](https://huggingface.co/Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers)
 - [Wan 2.1 VACE 1.3B](https://huggingface.co/Wan-AI/Wan2.1-VACE-1.3B-diffusers)
 - [Wan 2.1 VACE 14B](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B-diffusers)
+- [Wan 2.2 T2V 14B](https://huggingface.co/Wan-AI/Wan2.2-T2V-A14B-Diffusers)
+- [Wan 2.2 I2V 14B](https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers)
+- [Wan 2.2 TI2V 5B](https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B-Diffusers)

 > [!TIP]
 > Click on the Wan2.1 models in the right sidebar for more examples of video generation.
@@ -327,6 +331,8 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip

 - Try lower `shift` values (`2.0` to `5.0`) for lower resolution videos and higher `shift` values (`7.0` to `12.0`) for higher resolution images.

+- Wan 2.1 and 2.2 support using [LightX2V LoRAs](https://huggingface.co/Kijai/WanVideo_comfy/tree/main/Lightx2v) to speed up inference. Using them on Wan 2.2 is slightly more involved. Refer to [this code snippet](https://github.com/huggingface/diffusers/pull/12040#issuecomment-3144185272) to learn more.
+
 ## WanPipeline

 [[autodoc]] WanPipeline
```

src/diffusers/__init__.py (6 additions, 0 deletions)

```diff
@@ -174,6 +174,7 @@
         "AutoencoderKLLTXVideo",
         "AutoencoderKLMagvit",
         "AutoencoderKLMochi",
+        "AutoencoderKLQwenImage",
         "AutoencoderKLTemporalDecoder",
         "AutoencoderKLWan",
         "AutoencoderOobleck",
@@ -215,6 +216,7 @@
         "OmniGenTransformer2DModel",
         "PixArtTransformer2DModel",
         "PriorTransformer",
+        "QwenImageTransformer2DModel",
         "SanaControlNetModel",
         "SanaTransformer2DModel",
         "SD3ControlNetModel",
@@ -486,6 +488,7 @@
         "PixArtAlphaPipeline",
         "PixArtSigmaPAGPipeline",
         "PixArtSigmaPipeline",
+        "QwenImagePipeline",
         "ReduxImageEncoder",
         "SanaControlNetPipeline",
         "SanaPAGPipeline",
@@ -832,6 +835,7 @@
         AutoencoderKLLTXVideo,
         AutoencoderKLMagvit,
         AutoencoderKLMochi,
+        AutoencoderKLQwenImage,
         AutoencoderKLTemporalDecoder,
         AutoencoderKLWan,
         AutoencoderOobleck,
@@ -873,6 +877,7 @@
         OmniGenTransformer2DModel,
         PixArtTransformer2DModel,
         PriorTransformer,
+        QwenImageTransformer2DModel,
         SanaControlNetModel,
         SanaTransformer2DModel,
         SD3ControlNetModel,
@@ -1119,6 +1124,7 @@
         PixArtAlphaPipeline,
         PixArtSigmaPAGPipeline,
         PixArtSigmaPipeline,
+        QwenImagePipeline,
         ReduxImageEncoder,
         SanaControlNetPipeline,
         SanaPAGPipeline,
```

src/diffusers/hooks/_helpers.py (10 additions, 0 deletions)

```diff
@@ -153,6 +153,7 @@ def _register_transformer_blocks_metadata():
     )
     from ..models.transformers.transformer_ltx import LTXVideoTransformerBlock
     from ..models.transformers.transformer_mochi import MochiTransformerBlock
+    from ..models.transformers.transformer_qwenimage import QwenImageTransformerBlock
     from ..models.transformers.transformer_wan import WanTransformerBlock

     # BasicTransformerBlock
@@ -255,6 +256,15 @@ def _register_transformer_blocks_metadata():
         ),
     )

+    # QwenImage
+    TransformerBlockRegistry.register(
+        model_class=QwenImageTransformerBlock,
+        metadata=TransformerBlockMetadata(
+            return_hidden_states_index=1,
+            return_encoder_hidden_states_index=0,
+        ),
+    )
+

 # fmt: off
 def _skip_attention___ret___hidden_states(self, *args, **kwargs):
```
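The registration above records, per block class, where `hidden_states` and `encoder_hidden_states` sit in the tuple the block's `forward` returns, so generic hooks can unpack any registered block's output. A standalone sketch of that registry pattern (class and field names here are illustrative stand-ins, not the diffusers internals):

```python
from dataclasses import dataclass

@dataclass
class BlockMetadata:
    # Indices of hidden_states / encoder_hidden_states in forward()'s return tuple
    return_hidden_states_index: int
    return_encoder_hidden_states_index: int

class BlockRegistry:
    _registry = {}

    @classmethod
    def register(cls, model_class, metadata):
        cls._registry[model_class] = metadata

    @classmethod
    def get(cls, model_class):
        return cls._registry[model_class]

class FakeQwenImageBlock:  # hypothetical stand-in for QwenImageTransformerBlock
    def forward(self):
        # Returns (encoder_hidden_states, hidden_states), matching indices (1, 0)
        return ("encoder_hidden_states", "hidden_states")

BlockRegistry.register(FakeQwenImageBlock, BlockMetadata(1, 0))

# A generic hook can now unpack any registered block without knowing its class:
meta = BlockRegistry.get(FakeQwenImageBlock)
out = FakeQwenImageBlock().forward()
print(out[meta.return_hidden_states_index])  # -> hidden_states
```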

src/diffusers/loaders/lora_conversion_utils.py (4 additions, 0 deletions)

```diff
@@ -1974,6 +1974,10 @@ def _convert_non_diffusers_wan_lora_to_diffusers(state_dict):
             converted_key = f"condition_embedder.image_embedder.{img_ours}.lora_B.weight"
             if original_key in original_state_dict:
                 converted_state_dict[converted_key] = original_state_dict.pop(original_key)
+            bias_key_theirs = original_key.removesuffix(f".{lora_up_key}.weight") + ".diff_b"
+            if bias_key_theirs in original_state_dict:
+                bias_key = converted_key.removesuffix(".weight") + ".bias"
+                converted_state_dict[bias_key] = original_state_dict.pop(bias_key_theirs)

     if len(original_state_dict) > 0:
         diff = all(".diff" in k for k in original_state_dict)
```
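The four added lines extend the Wan LoRA converter: alongside each converted LoRA-up weight, they look for the source checkpoint's `.diff_b` bias and store it under the converted prefix with a `.bias` suffix. A standalone sketch of that key rewriting (checkpoint key names invented for illustration; `str.removesuffix` needs Python 3.9+):

```python
def convert_with_bias(original_state_dict, original_key, converted_key, lora_up_key):
    """Move a LoRA-up weight to its converted key and, if present, the
    matching ".diff_b" bias to the converted ".bias" key. Sketch only."""
    converted = {}
    if original_key in original_state_dict:
        converted[converted_key] = original_state_dict.pop(original_key)
    # Strip ".{lora_up_key}.weight" to find the source bias key
    bias_key_theirs = original_key.removesuffix(f".{lora_up_key}.weight") + ".diff_b"
    if bias_key_theirs in original_state_dict:
        bias_key = converted_key.removesuffix(".weight") + ".bias"
        converted[bias_key] = original_state_dict.pop(bias_key_theirs)
    return converted

# Hypothetical checkpoint keys, for illustration only
state = {
    "img_emb.proj.lora_up.weight": "W",
    "img_emb.proj.diff_b": "b",
}
out = convert_with_bias(
    state,
    original_key="img_emb.proj.lora_up.weight",
    converted_key="condition_embedder.image_embedder.proj.lora_B.weight",
    lora_up_key="lora_up",
)
# Both entries are consumed: weight under ...lora_B.weight, bias under ...lora_B.bias
```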

src/diffusers/models/__init__.py (4 additions, 0 deletions)

```diff
@@ -38,6 +38,7 @@
     _import_structure["autoencoders.autoencoder_kl_ltx"] = ["AutoencoderKLLTXVideo"]
     _import_structure["autoencoders.autoencoder_kl_magvit"] = ["AutoencoderKLMagvit"]
     _import_structure["autoencoders.autoencoder_kl_mochi"] = ["AutoencoderKLMochi"]
+    _import_structure["autoencoders.autoencoder_kl_qwenimage"] = ["AutoencoderKLQwenImage"]
     _import_structure["autoencoders.autoencoder_kl_temporal_decoder"] = ["AutoencoderKLTemporalDecoder"]
     _import_structure["autoencoders.autoencoder_kl_wan"] = ["AutoencoderKLWan"]
     _import_structure["autoencoders.autoencoder_oobleck"] = ["AutoencoderOobleck"]
@@ -88,6 +89,7 @@
     _import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"]
     _import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"]
     _import_structure["transformers.transformer_omnigen"] = ["OmniGenTransformer2DModel"]
+    _import_structure["transformers.transformer_qwenimage"] = ["QwenImageTransformer2DModel"]
     _import_structure["transformers.transformer_sd3"] = ["SD3Transformer2DModel"]
     _import_structure["transformers.transformer_skyreels_v2"] = ["SkyReelsV2Transformer3DModel"]
     _import_structure["transformers.transformer_temporal"] = ["TransformerTemporalModel"]
@@ -126,6 +128,7 @@
        AutoencoderKLLTXVideo,
        AutoencoderKLMagvit,
        AutoencoderKLMochi,
+       AutoencoderKLQwenImage,
        AutoencoderKLTemporalDecoder,
        AutoencoderKLWan,
        AutoencoderOobleck,
@@ -177,6 +180,7 @@
        OmniGenTransformer2DModel,
        PixArtTransformer2DModel,
        PriorTransformer,
+       QwenImageTransformer2DModel,
        SanaTransformer2DModel,
        SD3Transformer2DModel,
        SkyReelsV2Transformer3DModel,
```

src/diffusers/models/autoencoders/__init__.py (1 addition, 0 deletions)

```diff
@@ -8,6 +8,7 @@
 from .autoencoder_kl_ltx import AutoencoderKLLTXVideo
 from .autoencoder_kl_magvit import AutoencoderKLMagvit
 from .autoencoder_kl_mochi import AutoencoderKLMochi
+from .autoencoder_kl_qwenimage import AutoencoderKLQwenImage
 from .autoencoder_kl_temporal_decoder import AutoencoderKLTemporalDecoder
 from .autoencoder_kl_wan import AutoencoderKLWan
 from .autoencoder_oobleck import AutoencoderOobleck
```

0 commit comments
