46 | 46 | >>> import torch |
47 | 47 | >>> from PIL import Image |
48 | 48 | >>> from diffusers import QwenImageEditPipeline |
| 49 | + >>> from diffusers.utils import load_image |
49 | 50 | |
50 | 51 | >>> pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=torch.bfloat16) |
51 | 52 | >>> pipe.to("cuda") |
52 | | - >>> prompt = "Change the cat to a dog" |
53 | | - >>> image = Image.open("cat.png") |
| 53 | + >>> image = load_image( |
| 54 | + ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png" |
| 55 | + ... ).convert("RGB") |
| 56 | + >>> prompt = ( |
| 57 | + ... "Make Pikachu hold a sign that says 'Qwen Edit is awesome', yarn art style, detailed, vibrant colors" |
| 58 | + ... ) |
54 | 59 | >>> # Depending on the variant being used, the pipeline call will slightly vary. |
55 | 60 | >>> # Refer to the pipeline documentation for more details. |
56 | 61 | >>> image = pipe(image, prompt, num_inference_steps=50).images[0] |
57 | | - >>> image.save("qwenimageedit.png") |
| 62 | + >>> image.save("qwenimage_edit.png") |
58 | 63 | ``` |
59 | 64 | """ |
60 | 65 | PREFERRED_QWENIMAGE_RESOLUTIONS = [ |
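The docstring example above now pulls the input image from the documentation-images dataset and moves the whole pipeline onto CUDA. For GPUs where that does not fit, a minimal sketch of the same edit with model CPU offloading (an existing diffusers/accelerate feature, not something added by this diff) might look like this:

```python
import torch
from diffusers import QwenImageEditPipeline
from diffusers.utils import load_image

# Sketch only: same example as the docstring, but using model CPU offloading
# instead of pipe.to("cuda") to reduce peak VRAM usage (requires `accelerate`).
pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png"
).convert("RGB")
prompt = "Make Pikachu hold a sign that says 'Qwen Edit is awesome', yarn art style, detailed, vibrant colors"

image = pipe(image, prompt, num_inference_steps=50).images[0]
image.save("qwenimage_edit.png")
```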
@@ -178,7 +183,7 @@ def calculate_dimensions(target_area, ratio): |
178 | 183 | |
179 | 184 | class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): |
180 | 185 | r""" |
181 | | - The QwenImage pipeline for text-to-image generation. |
| 186 | + The Qwen-Image-Edit pipeline for image editing. |
182 | 187 | |
183 | 188 | Args: |
184 | 189 | transformer ([`QwenImageTransformer2DModel`]): |
@@ -217,8 +222,8 @@ def __init__( |
217 | 222 | transformer=transformer, |
218 | 223 | scheduler=scheduler, |
219 | 224 | ) |
220 | | - self.latent_channels = 16 |
221 | 225 | self.vae_scale_factor = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8 |
| 226 | + self.latent_channels = self.vae.config.z_dim if getattr(self, "vae", None) else 16 |
222 | 227 | # QwenImage latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible |
223 | 228 | # by the patch size. So the vae scale factor is multiplied by the patch size to account for this |
224 | 229 | self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2) |
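The comment above pins down a concrete divisibility requirement: because 2x2 latent patches are packed, the image processor is built with `vae_scale_factor * 2`, so pixel dimensions get snapped to that multiple. A minimal sketch of the arithmetic, assuming a VAE scale factor of 8 (three 2x downsampling stages) purely for illustration:

```python
# Sketch of the divisibility constraint described in the comment above.
# Assumption for illustration: vae_scale_factor == 8, i.e. len(vae.temperal_downsample) == 3.
vae_scale_factor = 2**3
patch_size = 2  # QwenImage latents are packed into 2x2 patches
pixel_multiple = vae_scale_factor * patch_size  # what the image processor rounds to

def snap(dim: int, multiple: int = pixel_multiple) -> int:
    # Illustrative helper: round a pixel dimension down to the nearest usable size.
    return (dim // multiple) * multiple

print(pixel_multiple)  # 16 -> image height/width must be divisible by 16
print(snap(1023))      # 1008
print(snap(1024))      # 1024
```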
@@ -592,6 +597,11 @@ def __call__( |
592 | 597 | of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting |
593 | 598 | `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to |
594 | 599 | the text `prompt`, usually at the expense of lower image quality. |
| 600 | + |
| 601 | + This parameter exists to support future guidance-distilled models; passing `guidance_scale` to this |
| 602 | + pipeline is currently ineffective. To enable classifier-free guidance, pass `true_cfg_scale` together with |
| 603 | + a `negative_prompt` (even an empty negative prompt like " " will enable the classifier-free guidance |
| 604 | + computations). |
595 | 605 | num_images_per_prompt (`int`, *optional*, defaults to 1): |
596 | 606 | The number of images to generate per prompt. |
597 | 607 | generator (`torch.Generator` or `List[torch.Generator]`, *optional*): |
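A minimal sketch of a call that follows the note just added: classifier-free guidance comes from `true_cfg_scale` plus a `negative_prompt`, while `guidance_scale` is left alone. The checkpoint, image URL, and prompt are reused from the docstring example; the `true_cfg_scale` value shown is only an illustrative choice:

```python
import torch
from diffusers import QwenImageEditPipeline
from diffusers.utils import load_image

pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=torch.bfloat16).to("cuda")
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png"
).convert("RGB")

# Sketch only: classifier-free guidance is driven by true_cfg_scale + negative_prompt,
# not by guidance_scale (which the note above says is currently ineffective here).
edited = pipe(
    image,
    "Make Pikachu hold a sign that says 'Qwen Edit is awesome', yarn art style",
    negative_prompt=" ",   # even a blank negative prompt switches the CFG path on
    true_cfg_scale=4.0,    # illustrative value; > 1 enables classifier-free guidance
    num_inference_steps=50,
).images[0]
edited.save("qwenimage_edit_cfg.png")
```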
@@ -635,7 +645,9 @@ def __call__( |
635 | 645 | [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When |
636 | 646 | returning a tuple, the first element is a list with the generated images. |
637 | 647 | """ |
638 | | - calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, image.width / image.height) |
| 648 | + image_size = image[0].size if isinstance(image, list) else image.size |
| 649 | + width, height = image_size |
| 650 | + calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, width / height) |
639 | 651 | height = height or calculated_height |
640 | 652 | width = width or calculated_width |
641 | 653 |
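The replacement above derives the reference size from `image[0]` when a list of images is passed, and from the image itself otherwise. A tiny sketch of that selection in isolation; `resolve_reference_size` is a hypothetical helper named only for this illustration, not part of the pipeline:

```python
from PIL import Image

def resolve_reference_size(image):
    # Hypothetical helper mirroring the diff: list inputs use the first image's size.
    return image[0].size if isinstance(image, list) else image.size

single = Image.new("RGB", (768, 512))
batch = [Image.new("RGB", (640, 480)), Image.new("RGB", (1024, 1024))]

print(resolve_reference_size(single))  # (768, 512)
print(resolve_reference_size(batch))   # (640, 480) -> this aspect ratio is fed to calculate_dimensions
```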