huggingface · sayakpaul · Aug 18, 2025 · Aug 17, 2025 · Aug 17, 2025 · Aug 17, 2025
diff --git a/docs/source/en/api/pipelines/qwenimage.md b/docs/source/en/api/pipelines/qwenimage.md
@@ -16,7 +16,12 @@
 
 Qwen-Image from the Qwen team is an image generation foundation model in the Qwen series that achieves significant advances in complex text rendering and precise image editing. Experiments show strong general capabilities in both image generation and editing, with exceptional performance in text rendering, especially for Chinese.
 
-Check out the model card [here](https://huggingface.co/Qwen/Qwen-Image) to learn more.
+Qwen-Image comes in the following variants:
+
+| model type | model id |
+|:----------:|:--------:|
+| Qwen-Image | [`Qwen/Qwen-Image`](https://huggingface.co/Qwen/Qwen-Image) |
+| Qwen-Image-Edit | [`Qwen/Qwen-Image-Edit`](https://huggingface.co/Qwen/Qwen-Image-Edit) |
 
 <Tip>
 
@@ -87,10 +92,6 @@ image.save("qwen_fewsteps.png")
   - all
   - __call__
 
-## QwenImagePipelineOutput
-
-[[autodoc]] pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput
-
 ## QwenImageImg2ImgPipeline
 
 [[autodoc]] QwenImageImg2ImgPipeline
@@ -102,3 +103,13 @@ image.save("qwen_fewsteps.png")
 [[autodoc]] QwenImageInpaintPipeline
   - all
   - __call__
+
+## QwenImageEditPipeline
+
+[[autodoc]] QwenImageEditPipeline
+  - all
+  - __call__
+
+## QwenImagePipelineOutput
+
+[[autodoc]] pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
@@ -489,10 +489,10 @@
             "PixArtAlphaPipeline",
             "PixArtSigmaPAGPipeline",
             "PixArtSigmaPipeline",
+            "QwenImageEditPipeline",
             "QwenImageImg2ImgPipeline",
             "QwenImageInpaintPipeline",
             "QwenImagePipeline",
-            "QwenImageEditPipeline",
             "ReduxImageEncoder",
             "SanaControlNetPipeline",
             "SanaPAGPipeline",

diff --git a/src/diffusers/models/transformers/transformer_qwenimage.py b/src/diffusers/models/transformers/transformer_qwenimage.py
@@ -219,6 +219,7 @@ def forward(self, video_fhw, txt_seq_lens, device):
                 video_freq = self.rope_cache[rope_key]
             else:
                 video_freq = self._compute_video_freqs(frame, height, width, idx)
+            video_freq = video_freq.to(device)
             vid_freqs.append(video_freq)
 
             if self.scale_rope:
@@ -249,7 +250,9 @@ def _compute_video_freqs(self, frame, height, width, idx=0):
             freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1)
 
         freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1)
-        return freqs.clone().contiguous()
+        freqs = freqs.clone().contiguous()
+
+        return freqs
 
 
 class QwenDoubleStreamAttnProcessor2_0:

diff --git a/src/diffusers/pipelines/qwenimage/__init__.py b/src/diffusers/pipelines/qwenimage/__init__.py
@@ -24,9 +24,9 @@
 else:
     _import_structure["modeling_qwenimage"] = ["ReduxImageEncoder"]
     _import_structure["pipeline_qwenimage"] = ["QwenImagePipeline"]
+    _import_structure["pipeline_qwenimage_edit"] = ["QwenImageEditPipeline"]
     _import_structure["pipeline_qwenimage_img2img"] = ["QwenImageImg2ImgPipeline"]
     _import_structure["pipeline_qwenimage_inpaint"] = ["QwenImageInpaintPipeline"]
-    _import_structure["pipeline_qwenimage_edit"] = ["QwenImageEditPipeline"]
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     try:

diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py
@@ -46,15 +46,20 @@
         >>> import torch
         >>> from PIL import Image
         >>> from diffusers import QwenImageEditPipeline
+        >>> from diffusers.utils import load_image
 
         >>> pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=torch.bfloat16)
         >>> pipe.to("cuda")
-        >>> prompt = "Change the cat to a dog"
-        >>> image = Image.open("cat.png")
+        >>> image = load_image(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png"
+        ... ).convert("RGB")
+        >>> prompt = (
+        ...     "Make Pikachu hold a sign that says 'Qwen Edit is awesome', yarn art style, detailed, vibrant colors"
+        ... )
         >>> # Depending on the variant being used, the pipeline call will slightly vary.
         >>> # Refer to the pipeline documentation for more details.
         >>> image = pipe(image, prompt, num_inference_steps=50).images[0]
-        >>> image.save("qwenimageedit.png")
+        >>> image.save("qwenimage_edit.png")
         ```
 """
 PREFERRED_QWENIMAGE_RESOLUTIONS = [
@@ -178,7 +183,7 @@ def calculate_dimensions(target_area, ratio):
 
 class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
     r"""
-    The QwenImage pipeline for text-to-image generation.
+    The Qwen-Image-Edit pipeline for image editing.
 
     Args:
         transformer ([`QwenImageTransformer2DModel`]):
@@ -217,8 +222,8 @@ def __init__(
             transformer=transformer,
             scheduler=scheduler,
         )
-        self.latent_channels = 16
         self.vae_scale_factor = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
+        self.latent_channels = self.vae.config.z_dim if getattr(self, "vae", None) else 16
         # QwenImage latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
         # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
@@ -635,7 +640,9 @@ def __call__(
             [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
             returning a tuple, the first element is a list with the generated images.
         """
-        calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, image.width / image.height)
+        # Use the input image's size only for the aspect ratio; keep user-supplied `height`/`width` intact.
+        image_width, image_height = image[0].size if isinstance(image, list) else image.size
+        calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, image_width / image_height)
         height = height or calculated_height
         width = width or calculated_width
 

diff --git a/tests/models/transformers/test_models_transformer_qwenimage.py b/tests/models/transformers/test_models_transformer_qwenimage.py
@@ -15,6 +15,7 @@
 
 import unittest
 
+import pytest
 import torch
 
 from diffusers import QwenImageTransformer2DModel
@@ -99,3 +100,7 @@ def prepare_init_args_and_inputs_for_common(self):
 
     def prepare_dummy_input(self, height, width):
         return QwenImageTransformerTests().prepare_dummy_input(height=height, width=width)
+
+    @pytest.mark.xfail(condition=True, reason="RoPE needs to be revisited.", strict=True)
+    def test_torch_compile_recompilation_and_graph_break(self):
+        super().test_torch_compile_recompilation_and_graph_break()