From 23ae97392c98d8a0548260e59c2906d6abf6b454 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sun, 17 Aug 2025 17:03:47 +0530 Subject: [PATCH 1/8] add docs. --- src/diffusers/__init__.py | 2 +- src/diffusers/pipelines/qwenimage/__init__.py | 2 +- .../pipelines/qwenimage/pipeline_qwenimage_edit.py | 11 ++++++++--- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 612219ad43aa..ef645c9e145b 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -489,10 +489,10 @@ "PixArtAlphaPipeline", "PixArtSigmaPAGPipeline", "PixArtSigmaPipeline", + "QwenImageEditPipeline", "QwenImageImg2ImgPipeline", "QwenImageInpaintPipeline", "QwenImagePipeline", - "QwenImageEditPipeline", "ReduxImageEncoder", "SanaControlNetPipeline", "SanaPAGPipeline", diff --git a/src/diffusers/pipelines/qwenimage/__init__.py b/src/diffusers/pipelines/qwenimage/__init__.py index 3d0378511fe0..4b64474dda13 100644 --- a/src/diffusers/pipelines/qwenimage/__init__.py +++ b/src/diffusers/pipelines/qwenimage/__init__.py @@ -24,9 +24,9 @@ else: _import_structure["modeling_qwenimage"] = ["ReduxImageEncoder"] _import_structure["pipeline_qwenimage"] = ["QwenImagePipeline"] + _import_structure["pipeline_qwenimage_edit"] = ["QwenImageEditPipeline"] _import_structure["pipeline_qwenimage_img2img"] = ["QwenImageImg2ImgPipeline"] _import_structure["pipeline_qwenimage_inpaint"] = ["QwenImageInpaintPipeline"] - _import_structure["pipeline_qwenimage_edit"] = ["QwenImageEditPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: try: diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py index 942210c1fdb5..9a7a00d6f057 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py @@ -46,15 +46,20 @@ >>> import torch >>> from PIL import Image >>> from diffusers import QwenImageEditPipeline + >>> from diffusers.utils import load_image >>> pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=torch.bfloat16) >>> pipe.to("cuda") - >>> prompt = "Change the cat to a dog" - >>> image = Image.open("cat.png") + >>> image = load_image( + ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png" + ... ).convert("RGB") + >>> prompt = ( + ... "Make Pikachu hold a sign that says 'Qwen Edit is awesome', yarn art style, detailed, vibrant colors" + ... ) >>> # Depending on the variant being used, the pipeline call will slightly vary. >>> # Refer to the pipeline documentation for more details. >>> image = pipe(image, prompt, num_inference_steps=50).images[0] - >>> image.save("qwenimageedit.png") + >>> image.save("qwenimage_edit.png") ``` """ PREFERRED_QWENIMAGE_RESOLUTIONS = [ From 34dd6cf053b5f9dbb1ab6f5ec92906db4f55a180 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sun, 17 Aug 2025 17:06:08 +0530 Subject: [PATCH 2/8] more docs. 
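
For reviewers: the newly documented `QwenImageEditPipeline` is invoked exactly as in the example docstring added in patch 1/8. A minimal usage sketch is reproduced below for convenience (the exact call arguments can vary slightly per variant; see the pipeline docs):

```py
import torch

from diffusers import QwenImageEditPipeline
from diffusers.utils import load_image

pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=torch.bfloat16)
pipe.to("cuda")

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png"
).convert("RGB")
prompt = "Make Pikachu hold a sign that says 'Qwen Edit is awesome', yarn art style, detailed, vibrant colors"

# Depending on the variant being used, the pipeline call will slightly vary.
image = pipe(image, prompt, num_inference_steps=50).images[0]
image.save("qwenimage_edit.png")
```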
--- docs/source/en/api/pipelines/qwenimage.md | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/docs/source/en/api/pipelines/qwenimage.md b/docs/source/en/api/pipelines/qwenimage.md index 557249f7a35b..9ec2aff9a274 100644 --- a/docs/source/en/api/pipelines/qwenimage.md +++ b/docs/source/en/api/pipelines/qwenimage.md @@ -16,7 +16,12 @@ Qwen-Image from the Qwen team is an image generation foundation model in the Qwen series that achieves significant advances in complex text rendering and precise image editing. Experiments show strong general capabilities in both image generation and editing, with exceptional performance in text rendering, especially for Chinese. -Check out the model card [here](https://huggingface.co/Qwen/Qwen-Image) to learn more. +Qwen-Image comes in the following variants: + +| model type | model id | +|:----------:|:--------:| +| Qwen-Image | [`Qwen/Qwen-Image`](https://huggingface.co/Qwen/Qwen-Image) | +| Qwen-Image-Edit | [`Qwen/Qwen-Image-Edit`](https://huggingface.co/Qwen/Qwen-Image-Edit) | @@ -87,10 +92,6 @@ image.save("qwen_fewsteps.png") - all - __call__ -## QwenImagePipelineOutput - -[[autodoc]] pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput - ## QwenImageImg2ImgPipeline [[autodoc]] QwenImageImg2ImgPipeline @@ -102,3 +103,13 @@ image.save("qwen_fewsteps.png") [[autodoc]] QwenImageInpaintPipeline - all - __call__ + +## QwenImageEditPipeline + +[[autodoc]] QwenImageEditPipeline + - all + - __call__ + +## QwenImagePipelineOutput + +[[autodoc]] pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput \ No newline at end of file From df737cc5a0fb84bc4acaeba011dbff5c5babe61a Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sun, 17 Aug 2025 17:10:06 +0530 Subject: [PATCH 3/8] xfail full compilation for Qwen for now. --- .../models/transformers/test_models_transformer_qwenimage.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/models/transformers/test_models_transformer_qwenimage.py b/tests/models/transformers/test_models_transformer_qwenimage.py index 362697c67527..7b8af6854c9c 100644 --- a/tests/models/transformers/test_models_transformer_qwenimage.py +++ b/tests/models/transformers/test_models_transformer_qwenimage.py @@ -15,6 +15,7 @@ import unittest +import pytest import torch from diffusers import QwenImageTransformer2DModel @@ -99,3 +100,7 @@ def prepare_init_args_and_inputs_for_common(self): def prepare_dummy_input(self, height, width): return QwenImageTransformerTests().prepare_dummy_input(height=height, width=width) + + @pytest.mark.xfail(condition=True, reason="RoPE needs to be revisited.", strict=True) + def test_torch_compile_recompilation_and_graph_break(self): + pass From 615a4201bc3e3a067c68fa9e057d10de0e4ec2ec Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sun, 17 Aug 2025 17:34:13 +0530 Subject: [PATCH 4/8] tests --- .../qwenimage/test_qwenimage_edit.py | 238 ++++++++++++++++++ 1 file changed, 238 insertions(+) create mode 100644 tests/pipelines/qwenimage/test_qwenimage_edit.py diff --git a/tests/pipelines/qwenimage/test_qwenimage_edit.py b/tests/pipelines/qwenimage/test_qwenimage_edit.py new file mode 100644 index 000000000000..5ba1e0469298 --- /dev/null +++ b/tests/pipelines/qwenimage/test_qwenimage_edit.py @@ -0,0 +1,238 @@ +# Copyright 2025 The HuggingFace Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import torch +from PIL import Image +from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer + +from diffusers import ( + AutoencoderKLQwenImage, + FlowMatchEulerDiscreteScheduler, + QwenImagePipeline, + QwenImageTransformer2DModel, +) +from diffusers.utils.testing_utils import enable_full_determinism, torch_device + +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin, to_np + + +enable_full_determinism() + + +class QwenImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = QwenImagePipeline + params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"} + batch_params = TEXT_TO_IMAGE_BATCH_PARAMS + image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS + required_optional_params = frozenset( + [ + "num_inference_steps", + "generator", + "latents", + "return_dict", + "callback_on_step_end", + "callback_on_step_end_tensor_inputs", + ] + ) + supports_dduf = False + test_xformers_attention = False + test_layerwise_casting = True + test_group_offloading = True + + def get_dummy_components(self): + torch.manual_seed(0) + transformer = QwenImageTransformer2DModel( + patch_size=2, + in_channels=16, + out_channels=4, + num_layers=2, + attention_head_dim=16, + num_attention_heads=3, + joint_attention_dim=16, + guidance_embeds=False, + axes_dims_rope=(8, 4, 4), + ) + + torch.manual_seed(0) + z_dim = 4 + vae = AutoencoderKLQwenImage( + base_dim=z_dim * 6, + z_dim=z_dim, + dim_mult=[1, 2, 4], + num_res_blocks=1, + temperal_downsample=[False, True], + # fmt: off + latents_mean=[0.0] * 4, + latents_std=[1.0] * 4, + # fmt: on + ) + + torch.manual_seed(0) + scheduler = FlowMatchEulerDiscreteScheduler() + + torch.manual_seed(0) + config = Qwen2_5_VLConfig( + text_config={ + "hidden_size": 16, + "intermediate_size": 16, + "num_hidden_layers": 2, + "num_attention_heads": 2, + "num_key_value_heads": 2, + "rope_scaling": { + "mrope_section": [1, 1, 2], + "rope_type": "default", + "type": "default", + }, + "rope_theta": 1000000.0, + }, + vision_config={ + "depth": 2, + "hidden_size": 16, + "intermediate_size": 16, + "num_heads": 2, + "out_hidden_size": 16, + }, + hidden_size=16, + vocab_size=152064, + vision_end_token_id=151653, + vision_start_token_id=151652, + vision_token_id=151654, + ) + text_encoder = Qwen2_5_VLForConditionalGeneration(config) + tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration") + + components = { + "transformer": transformer, + "vae": vae, + "scheduler": scheduler, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + } + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + inputs = { + "prompt": "dance monkey", + "image": Image.new("RGB", (16, 16)), + "negative_prompt": "bad quality", + "generator": 
generator, + "num_inference_steps": 2, + "true_cfg_scale": 1.0, + "height": 32, + "width": 32, + "max_sequence_length": 16, + "output_type": "pt", + } + + return inputs + + def test_inference(self): + device = "cpu" + + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + image = pipe(**inputs).images + generated_image = image[0] + self.assertEqual(generated_image.shape, (3, 32, 32)) + + # fmt: off + expected_slice = torch.tensor([0.56331, 0.63677, 0.6015, 0.56369, 0.58166, 0.55277, 0.57176, 0.63261, 0.41466, 0.35561, 0.56229, 0.48334, 0.49714, 0.52622, 0.40872, 0.50208]) + # fmt: on + + generated_slice = generated_image.flatten() + generated_slice = torch.cat([generated_slice[:8], generated_slice[-8:]]) + print(f"{generated_slice=}") + self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=1e-3)) + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical(batch_size=3, expected_max_diff=1e-1) + + def test_attention_slicing_forward_pass( + self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3 + ): + if not self.test_attention_slicing: + return + + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + for component in pipe.components.values(): + if hasattr(component, "set_default_attn_processor"): + component.set_default_attn_processor() + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + generator_device = "cpu" + inputs = self.get_dummy_inputs(generator_device) + output_without_slicing = pipe(**inputs)[0] + + pipe.enable_attention_slicing(slice_size=1) + inputs = self.get_dummy_inputs(generator_device) + output_with_slicing1 = pipe(**inputs)[0] + + pipe.enable_attention_slicing(slice_size=2) + inputs = self.get_dummy_inputs(generator_device) + output_with_slicing2 = pipe(**inputs)[0] + + if test_max_difference: + max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max() + max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max() + self.assertLess( + max(max_diff1, max_diff2), + expected_max_diff, + "Attention slicing should not affect the inference results", + ) + + def test_vae_tiling(self, expected_diff_max: float = 0.2): + generator_device = "cpu" + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe.to("cpu") + pipe.set_progress_bar_config(disable=None) + + # Without tiling + inputs = self.get_dummy_inputs(generator_device) + inputs["height"] = inputs["width"] = 128 + output_without_tiling = pipe(**inputs)[0] + + # With tiling + pipe.vae.enable_tiling( + tile_sample_min_height=96, + tile_sample_min_width=96, + tile_sample_stride_height=64, + tile_sample_stride_width=64, + ) + inputs = self.get_dummy_inputs(generator_device) + inputs["height"] = inputs["width"] = 128 + output_with_tiling = pipe(**inputs)[0] + + self.assertLess( + (to_np(output_without_tiling) - to_np(output_with_tiling)).max(), + expected_diff_max, + "VAE tiling should not affect the inference results", + ) From 35744ebdf0dced58e2a69c74aa9d0f0d643aed9a Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sun, 17 Aug 2025 20:59:00 +0530 Subject: [PATCH 5/8] up --- .../transformers/transformer_qwenimage.py | 6 ++- .../qwenimage/pipeline_qwenimage_edit.py | 10 +++-- .../test_models_transformer_qwenimage.py | 2 +- .../qwenimage/test_qwenimage_edit.py | 39 
+++++++++++-------- 4 files changed, 33 insertions(+), 24 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_qwenimage.py b/src/diffusers/models/transformers/transformer_qwenimage.py index 049e69a4beb4..fe1847418508 100644 --- a/src/diffusers/models/transformers/transformer_qwenimage.py +++ b/src/diffusers/models/transformers/transformer_qwenimage.py @@ -219,6 +219,7 @@ def forward(self, video_fhw, txt_seq_lens, device): video_freq = self.rope_cache[rope_key] else: video_freq = self._compute_video_freqs(frame, height, width, idx) + video_freq = video_freq.to(device) vid_freqs.append(video_freq) if self.scale_rope: @@ -249,8 +250,9 @@ def _compute_video_freqs(self, frame, height, width, idx=0): freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1) freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1) - return freqs.clone().contiguous() - + freqs = freqs.clone().contiguous() + + return freqs class QwenDoubleStreamAttnProcessor2_0: """ diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py index 9a7a00d6f057..5ff78355cb52 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py @@ -183,7 +183,7 @@ def calculate_dimensions(target_area, ratio): class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): r""" - The QwenImage pipeline for text-to-image generation. + The Qwen-Image-Edit pipeline for image editing. Args: transformer ([`QwenImageTransformer2DModel`]): @@ -222,8 +222,8 @@ def __init__( transformer=transformer, scheduler=scheduler, ) - self.latent_channels = 16 self.vae_scale_factor = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8 + self.latent_channels = self.vae.config.z_dim if getattr(self, "vae", None) else 16 # QwenImage latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible # by the patch size. So the vae scale factor is multiplied by the patch size to account for this self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2) @@ -258,7 +258,7 @@ def _get_qwen_prompt_embeds( template = self.prompt_template_encode drop_idx = self.prompt_template_encode_start_idx txt = [template.format(e) for e in prompt] - + model_inputs = self.processor( text=txt, images=image, @@ -640,7 +640,9 @@ def __call__( [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images. 
""" - calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, image.width / image.height) + image_size = image[0].size if isinstance(image, list) else image.size + width, height = image_size + calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, width / height) height = height or calculated_height width = width or calculated_width diff --git a/tests/models/transformers/test_models_transformer_qwenimage.py b/tests/models/transformers/test_models_transformer_qwenimage.py index 7b8af6854c9c..498acb8d73c9 100644 --- a/tests/models/transformers/test_models_transformer_qwenimage.py +++ b/tests/models/transformers/test_models_transformer_qwenimage.py @@ -103,4 +103,4 @@ def prepare_dummy_input(self, height, width): @pytest.mark.xfail(condition=True, reason="RoPE needs to be revisited.", strict=True) def test_torch_compile_recompilation_and_graph_break(self): - pass + super().test_torch_compile_recompilation_and_graph_break() diff --git a/tests/pipelines/qwenimage/test_qwenimage_edit.py b/tests/pipelines/qwenimage/test_qwenimage_edit.py index 5ba1e0469298..57b27f4b44d6 100644 --- a/tests/pipelines/qwenimage/test_qwenimage_edit.py +++ b/tests/pipelines/qwenimage/test_qwenimage_edit.py @@ -13,16 +13,16 @@ # limitations under the License. import unittest - +import pytest import numpy as np import torch from PIL import Image -from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer +from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor from diffusers import ( AutoencoderKLQwenImage, FlowMatchEulerDiscreteScheduler, - QwenImagePipeline, + QwenImageEditPipeline, QwenImageTransformer2DModel, ) from diffusers.utils.testing_utils import enable_full_determinism, torch_device @@ -34,12 +34,12 @@ enable_full_determinism() -class QwenImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = QwenImagePipeline +class QwenImageEditPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = QwenImageEditPipeline params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"} - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = TEXT_TO_IMAGE_IMAGE_PARAMS - image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS + batch_params = frozenset(["prompt", "image"]) + image_params = frozenset(["image"]) + image_latents_params = frozenset(["latents"]) required_optional_params = frozenset( [ "num_inference_steps", @@ -56,6 +56,8 @@ class QwenImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase): test_group_offloading = True def get_dummy_components(self): + tiny_ckpt_id = "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration" + torch.manual_seed(0) transformer = QwenImageTransformer2DModel( patch_size=2, @@ -77,10 +79,8 @@ def get_dummy_components(self): dim_mult=[1, 2, 4], num_res_blocks=1, temperal_downsample=[False, True], - # fmt: off - latents_mean=[0.0] * 4, - latents_std=[1.0] * 4, - # fmt: on + latents_mean=[0.0] * z_dim, + latents_std=[1.0] * z_dim, ) torch.manual_seed(0) @@ -115,7 +115,7 @@ def get_dummy_components(self): vision_token_id=151654, ) text_encoder = Qwen2_5_VLForConditionalGeneration(config) - tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration") + tokenizer = Qwen2Tokenizer.from_pretrained(tiny_ckpt_id) components = { "transformer": transformer, @@ -123,6 +123,7 @@ def get_dummy_components(self): "scheduler": scheduler, "text_encoder": text_encoder, 
"tokenizer": tokenizer, + "processor": Qwen2VLProcessor.from_pretrained(tiny_ckpt_id), } return components @@ -134,7 +135,7 @@ def get_dummy_inputs(self, device, seed=0): inputs = { "prompt": "dance monkey", - "image": Image.new("RGB", (16, 16)), + "image": Image.new("RGB", (32, 32)), "negative_prompt": "bad quality", "generator": generator, "num_inference_steps": 2, @@ -160,13 +161,13 @@ def test_inference(self): generated_image = image[0] self.assertEqual(generated_image.shape, (3, 32, 32)) - # fmt: off - expected_slice = torch.tensor([0.56331, 0.63677, 0.6015, 0.56369, 0.58166, 0.55277, 0.57176, 0.63261, 0.41466, 0.35561, 0.56229, 0.48334, 0.49714, 0.52622, 0.40872, 0.50208]) + expected_slice = torch.tensor( + [[0.5637, 0.6341, 0.6001, 0.5620, 0.5794, 0.5498, 0.5757, 0.6389, 0.4174, + 0.3597, 0.5649, 0.4894, 0.4969, 0.5255, 0.4083, 0.4986]]) # fmt: on generated_slice = generated_image.flatten() generated_slice = torch.cat([generated_slice[:8], generated_slice[-8:]]) - print(f"{generated_slice=}") self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=1e-3)) def test_inference_batch_single_identical(self): @@ -236,3 +237,7 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2): expected_diff_max, "VAE tiling should not affect the inference results", ) + + @pytest.mark.xfail(condition=True, reason="Preconfigured embeddings need to be revisited.", strict=True) + def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=None, atol=1e-4, rtol=1e-4): + super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict, atol, rtol) \ No newline at end of file From 75f25980eb6af88e0b2c4587b96d7b3acea74187 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sun, 17 Aug 2025 21:00:12 +0530 Subject: [PATCH 6/8] up --- .../transformers/transformer_qwenimage.py | 3 +- .../qwenimage/pipeline_qwenimage_edit.py | 2 +- .../qwenimage/test_qwenimage_edit.py | 32 +++++++++++++++---- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_qwenimage.py b/src/diffusers/models/transformers/transformer_qwenimage.py index fe1847418508..d615ea8ed394 100644 --- a/src/diffusers/models/transformers/transformer_qwenimage.py +++ b/src/diffusers/models/transformers/transformer_qwenimage.py @@ -251,9 +251,10 @@ def _compute_video_freqs(self, frame, height, width, idx=0): freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1) freqs = freqs.clone().contiguous() - + return freqs + class QwenDoubleStreamAttnProcessor2_0: """ Attention processor for Qwen double-stream architecture, matching DoubleStreamLayerMegatron logic. This processor diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py index 5ff78355cb52..9f68834e22b0 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py @@ -258,7 +258,7 @@ def _get_qwen_prompt_embeds( template = self.prompt_template_encode drop_idx = self.prompt_template_encode_start_idx txt = [template.format(e) for e in prompt] - + model_inputs = self.processor( text=txt, images=image, diff --git a/tests/pipelines/qwenimage/test_qwenimage_edit.py b/tests/pipelines/qwenimage/test_qwenimage_edit.py index 57b27f4b44d6..2fcdca29434b 100644 --- a/tests/pipelines/qwenimage/test_qwenimage_edit.py +++ b/tests/pipelines/qwenimage/test_qwenimage_edit.py @@ -13,8 +13,9 @@ # limitations under the License. 
import unittest -import pytest + import numpy as np +import pytest import torch from PIL import Image from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor @@ -27,7 +28,7 @@ ) from diffusers.utils.testing_utils import enable_full_determinism, torch_device -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..pipeline_params import TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineTesterMixin, to_np @@ -57,7 +58,7 @@ class QwenImageEditPipelineFastTests(PipelineTesterMixin, unittest.TestCase): def get_dummy_components(self): tiny_ckpt_id = "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration" - + torch.manual_seed(0) transformer = QwenImageTransformer2DModel( patch_size=2, @@ -162,8 +163,27 @@ def test_inference(self): self.assertEqual(generated_image.shape, (3, 32, 32)) expected_slice = torch.tensor( - [[0.5637, 0.6341, 0.6001, 0.5620, 0.5794, 0.5498, 0.5757, 0.6389, 0.4174, - 0.3597, 0.5649, 0.4894, 0.4969, 0.5255, 0.4083, 0.4986]]) + [ + [ + 0.5637, + 0.6341, + 0.6001, + 0.5620, + 0.5794, + 0.5498, + 0.5757, + 0.6389, + 0.4174, + 0.3597, + 0.5649, + 0.4894, + 0.4969, + 0.5255, + 0.4083, + 0.4986, + ] + ] + ) # fmt: on generated_slice = generated_image.flatten() @@ -240,4 +260,4 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2): @pytest.mark.xfail(condition=True, reason="Preconfigured embeddings need to be revisited.", strict=True) def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=None, atol=1e-4, rtol=1e-4): - super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict, atol, rtol) \ No newline at end of file + super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict, atol, rtol) From 10c74969e51df0ed2ae492faae88770cda268257 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sun, 17 Aug 2025 21:03:52 +0530 Subject: [PATCH 7/8] up --- src/diffusers/models/transformers/transformer_qwenimage.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_qwenimage.py b/src/diffusers/models/transformers/transformer_qwenimage.py index d615ea8ed394..3a417c46933d 100644 --- a/src/diffusers/models/transformers/transformer_qwenimage.py +++ b/src/diffusers/models/transformers/transformer_qwenimage.py @@ -250,9 +250,7 @@ def _compute_video_freqs(self, frame, height, width, idx=0): freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1) freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1) - freqs = freqs.clone().contiguous() - - return freqs + return freqs.clone().contiguous() class QwenDoubleStreamAttnProcessor2_0: From 58d47ca2307360150ce560057585838b78fec4f6 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Mon, 18 Aug 2025 07:12:57 +0530 Subject: [PATCH 8/8] reviewer feedback. 
--- .../qwenimage/test_qwenimage_edit.py | 24 ++----------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/tests/pipelines/qwenimage/test_qwenimage_edit.py b/tests/pipelines/qwenimage/test_qwenimage_edit.py index 2fcdca29434b..647c65ada6bf 100644 --- a/tests/pipelines/qwenimage/test_qwenimage_edit.py +++ b/tests/pipelines/qwenimage/test_qwenimage_edit.py @@ -162,28 +162,8 @@ def test_inference(self): generated_image = image[0] self.assertEqual(generated_image.shape, (3, 32, 32)) - expected_slice = torch.tensor( - [ - [ - 0.5637, - 0.6341, - 0.6001, - 0.5620, - 0.5794, - 0.5498, - 0.5757, - 0.6389, - 0.4174, - 0.3597, - 0.5649, - 0.4894, - 0.4969, - 0.5255, - 0.4083, - 0.4986, - ] - ] - ) + # fmt: off + expected_slice = torch.tensor([[0.5637, 0.6341, 0.6001, 0.5620, 0.5794, 0.5498, 0.5757, 0.6389, 0.4174, 0.3597, 0.5649, 0.4894, 0.4969, 0.5255, 0.4083, 0.4986]]) # fmt: on generated_slice = generated_image.flatten()