
Commit 35744eb ("up")
Parent: 615a420

4 files changed: +33 additions, -24 deletions

src/diffusers/models/transformers/transformer_qwenimage.py
Lines changed: 4 additions & 2 deletions

@@ -219,6 +219,7 @@ def forward(self, video_fhw, txt_seq_lens, device):
                 video_freq = self.rope_cache[rope_key]
             else:
                 video_freq = self._compute_video_freqs(frame, height, width, idx)
+            video_freq = video_freq.to(device)
             vid_freqs.append(video_freq)
 
             if self.scale_rope:

@@ -249,8 +250,9 @@ def _compute_video_freqs(self, frame, height, width, idx=0):
         freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1)
 
         freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1)
-        return freqs.clone().contiguous()
-
+        freqs = freqs.clone().contiguous()
+
+        return freqs
 
 class QwenDoubleStreamAttnProcessor2_0:
     """

src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py
Lines changed: 6 additions & 4 deletions

@@ -183,7 +183,7 @@ def calculate_dimensions(target_area, ratio):
 
 class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
     r"""
-    The QwenImage pipeline for text-to-image generation.
+    The Qwen-Image-Edit pipeline for image editing.
 
     Args:
         transformer ([`QwenImageTransformer2DModel`]):

@@ -222,8 +222,8 @@ def __init__(
             transformer=transformer,
             scheduler=scheduler,
         )
-        self.latent_channels = 16
         self.vae_scale_factor = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
+        self.latent_channels = self.vae.config.z_dim if getattr(self, "vae", None) else 16
         # QwenImage latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
         # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)

@@ -258,7 +258,7 @@ def _get_qwen_prompt_embeds(
         template = self.prompt_template_encode
         drop_idx = self.prompt_template_encode_start_idx
         txt = [template.format(e) for e in prompt]
-
+
         model_inputs = self.processor(
             text=txt,
             images=image,

@@ -640,7 +640,9 @@ def __call__(
                 [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
                 returning a tuple, the first element is a list with the generated images.
         """
-        calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, image.width / image.height)
+        image_size = image[0].size if isinstance(image, list) else image.size
+        width, height = image_size
+        calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, width / height)
         height = height or calculated_height
         width = width or calculated_width
 
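
Two behavioral changes sit in this file: the constructor now derives latent_channels from the VAE config (z_dim) instead of hard-coding 16, and __call__ accepts either a single PIL image or a list of images when computing the aspect ratio passed to calculate_dimensions. A rough standalone illustration of those two guards follows; the SimpleNamespace config and the dummy image are stand-ins for the real AutoencoderKLQwenImage and user input, not pipeline code.

from types import SimpleNamespace

from PIL import Image

# Stand-in for the VAE; only config.z_dim matters for this sketch.
vae = SimpleNamespace(config=SimpleNamespace(z_dim=16))

# Mirrors: self.latent_channels = self.vae.config.z_dim if getattr(self, "vae", None) else 16
latent_channels = vae.config.z_dim if vae is not None else 16

# Mirrors: image_size = image[0].size if isinstance(image, list) else image.size
image = [Image.new("RGB", (768, 512))]  # a list of images is now handled as well
image_size = image[0].size if isinstance(image, list) else image.size
width, height = image_size

print(latent_channels, width / height)  # 16 1.5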

tests/models/transformers/test_models_transformer_qwenimage.py
Lines changed: 1 addition & 1 deletion

@@ -103,4 +103,4 @@ def prepare_dummy_input(self, height, width):
 
     @pytest.mark.xfail(condition=True, reason="RoPE needs to be revisited.", strict=True)
     def test_torch_compile_recompilation_and_graph_break(self):
-        pass
+        super().test_torch_compile_recompilation_and_graph_break()
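
Context for the one-line change: the test is marked xfail(strict=True), so a body of pass would pass unexpectedly (XPASS) and strict mode reports that as a failure; delegating to the parent implementation runs the real torch.compile test, which is currently expected to fail. A tiny pytest example of that strict-xfail behavior, with made-up test names:

import pytest


@pytest.mark.xfail(condition=True, reason="demo", strict=True)
def test_body_is_pass():
    pass  # unexpectedly passes, so strict=True reports this as FAILED (XPASS)


@pytest.mark.xfail(condition=True, reason="demo", strict=True)
def test_body_really_fails():
    assert 1 == 2  # fails as expected, reported as xfailed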

tests/pipelines/qwenimage/test_qwenimage_edit.py
Lines changed: 22 additions & 17 deletions

@@ -13,16 +13,16 @@
 # limitations under the License.
 
 import unittest
-
+import pytest
 import numpy as np
 import torch
 from PIL import Image
-from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer
+from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor
 
 from diffusers import (
     AutoencoderKLQwenImage,
     FlowMatchEulerDiscreteScheduler,
-    QwenImagePipeline,
+    QwenImageEditPipeline,
     QwenImageTransformer2DModel,
 )
 from diffusers.utils.testing_utils import enable_full_determinism, torch_device

@@ -34,12 +34,12 @@
 enable_full_determinism()
 
 
-class QwenImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = QwenImagePipeline
+class QwenImageEditPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = QwenImageEditPipeline
     params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
-    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
-    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
-    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    batch_params = frozenset(["prompt", "image"])
+    image_params = frozenset(["image"])
+    image_latents_params = frozenset(["latents"])
     required_optional_params = frozenset(
         [
             "num_inference_steps",

@@ -56,6 +56,8 @@ class QwenImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     test_group_offloading = True
 
     def get_dummy_components(self):
+        tiny_ckpt_id = "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration"
+
         torch.manual_seed(0)
         transformer = QwenImageTransformer2DModel(
             patch_size=2,

@@ -77,10 +79,8 @@ def get_dummy_components(self):
             dim_mult=[1, 2, 4],
             num_res_blocks=1,
             temperal_downsample=[False, True],
-            # fmt: off
-            latents_mean=[0.0] * 4,
-            latents_std=[1.0] * 4,
-            # fmt: on
+            latents_mean=[0.0] * z_dim,
+            latents_std=[1.0] * z_dim,
         )
 
         torch.manual_seed(0)

@@ -115,14 +115,15 @@ def get_dummy_components(self):
             vision_token_id=151654,
         )
         text_encoder = Qwen2_5_VLForConditionalGeneration(config)
-        tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")
+        tokenizer = Qwen2Tokenizer.from_pretrained(tiny_ckpt_id)
 
         components = {
             "transformer": transformer,
             "vae": vae,
             "scheduler": scheduler,
             "text_encoder": text_encoder,
             "tokenizer": tokenizer,
+            "processor": Qwen2VLProcessor.from_pretrained(tiny_ckpt_id),
         }
         return components
 

@@ -134,7 +135,7 @@ def get_dummy_inputs(self, device, seed=0):
 
         inputs = {
             "prompt": "dance monkey",
-            "image": Image.new("RGB", (16, 16)),
+            "image": Image.new("RGB", (32, 32)),
             "negative_prompt": "bad quality",
             "generator": generator,
             "num_inference_steps": 2,

@@ -160,13 +161,13 @@ def test_inference(self):
         generated_image = image[0]
         self.assertEqual(generated_image.shape, (3, 32, 32))
 
-        # fmt: off
-        expected_slice = torch.tensor([0.56331, 0.63677, 0.6015, 0.56369, 0.58166, 0.55277, 0.57176, 0.63261, 0.41466, 0.35561, 0.56229, 0.48334, 0.49714, 0.52622, 0.40872, 0.50208])
+        expected_slice = torch.tensor(
+            [[0.5637, 0.6341, 0.6001, 0.5620, 0.5794, 0.5498, 0.5757, 0.6389, 0.4174,
+              0.3597, 0.5649, 0.4894, 0.4969, 0.5255, 0.4083, 0.4986]])
         # fmt: on
 
         generated_slice = generated_image.flatten()
         generated_slice = torch.cat([generated_slice[:8], generated_slice[-8:]])
-        print(f"{generated_slice=}")
         self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=1e-3))
 
     def test_inference_batch_single_identical(self):

@@ -236,3 +237,7 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
             expected_diff_max,
             "VAE tiling should not affect the inference results",
         )
+
+    @pytest.mark.xfail(condition=True, reason="Preconfigured embeddings need to be revisited.", strict=True)
+    def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=None, atol=1e-4, rtol=1e-4):
+        super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict, atol, rtol)
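
The reworked test_inference compares only the first and last eight values of the flattened output against expected_slice via torch.allclose with atol=1e-3. A minimal sketch of that slice-comparison idiom on a dummy tensor, independent of the pipeline:

import torch

generated_image = torch.rand(3, 32, 32)  # stand-in for the pipeline output image

generated_slice = generated_image.flatten()
generated_slice = torch.cat([generated_slice[:8], generated_slice[-8:]])  # 16 values

expected_slice = generated_slice.clone()  # the real test hard-codes this tensor
assert generated_slice.shape == (16,)
assert torch.allclose(generated_slice, expected_slice, atol=1e-3)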
