Add ZImageImg2ImgPipeline (#12751)

CalamitousFelicitousness · yiyixuxu · asomoza · web-flow · commit 2246d2c7c4af · 2025-12-07T22:06:23.000-10:00
* Add ZImageImg2ImgPipeline

Updated the pipeline structure to include ZImageImg2ImgPipeline
    alongside ZImagePipeline.
Implemented the ZImageImg2ImgPipeline class for image-to-image
    transformations, including necessary methods for
    encoding prompts, preparing latents, and denoising.
Enhanced the auto_pipeline to map the new ZImageImg2ImgPipeline
    for image generation tasks.
Added unit tests for ZImageImg2ImgPipeline to ensure
    functionality and performance.
Updated dummy objects to include ZImageImg2ImgPipeline for
    testing purposes.

* Address review comments for ZImageImg2ImgPipeline

- Add `# Copied from` annotations to encode_prompt and _encode_prompt
- Add ZImagePipeline to auto_pipeline.py for AutoPipeline support

* Add ZImage pipeline documentation

---------

Co-authored-by: YiYi Xu <yixu310@gmail.com>
Co-authored-by: Álvaro Somoza <asomoza@users.noreply.github.com>
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -651,7 +651,7 @@
       - local: api/pipelines/wuerstchen
         title: Wuerstchen
       - local: api/pipelines/z_image
-        title: Z-Image        
+        title: Z-Image
       title: Image
     - sections:
       - local: api/pipelines/allegro
diff --git a/docs/source/en/api/pipelines/z_image.md b/docs/source/en/api/pipelines/z_image.md
@@ -26,8 +26,41 @@ specific language governing permissions and limitations under the License.
 
 Z-Image-Turbo is a distilled version of Z-Image that matches or exceeds leading competitors with only 8 NFEs (Number of Function Evaluations). It offers sub-second inference latency on enterprise-grade H800 GPUs and fits comfortably within 16G VRAM consumer devices. It excels in photorealistic image generation, bilingual text rendering (English & Chinese), and robust instruction adherence.
 
+## Image-to-image
+
+Use [`ZImageImg2ImgPipeline`] to transform an existing image based on a text prompt.
+
+```python
+import torch
+from diffusers import ZImageImg2ImgPipeline
+from diffusers.utils import load_image
+
+pipe = ZImageImg2ImgPipeline.from_pretrained("Tongyi-MAI/Z-Image-Turbo", torch_dtype=torch.bfloat16)
+pipe.to("cuda")
+
+url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+init_image = load_image(url).resize((1024, 1024))
+
+prompt = "A fantasy landscape with mountains and a river, detailed, vibrant colors"
+image = pipe(
+    prompt,
+    image=init_image,
+    strength=0.6,
+    num_inference_steps=9,
+    guidance_scale=0.0,
+    generator=torch.Generator("cuda").manual_seed(42),
+).images[0]
+image.save("zimage_img2img.png")
+```
+
 ## ZImagePipeline
 
 [[autodoc]] ZImagePipeline
 	- all
-	- __call__
+	- __call__
+
+## ZImageImg2ImgPipeline
+
+[[autodoc]] ZImageImg2ImgPipeline
+	- all
+	- __call__
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
@@ -662,6 +662,7 @@
             "WuerstchenCombinedPipeline",
             "WuerstchenDecoderPipeline",
             "WuerstchenPriorPipeline",
+            "ZImageImg2ImgPipeline",
             "ZImagePipeline",
         ]
     )
@@ -1360,6 +1361,7 @@
             WuerstchenCombinedPipeline,
             WuerstchenDecoderPipeline,
             WuerstchenPriorPipeline,
+            ZImageImg2ImgPipeline,
             ZImagePipeline,
         )
 
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
@@ -404,7 +404,7 @@
         "Kandinsky5T2IPipeline",
         "Kandinsky5I2IPipeline",
     ]
-    _import_structure["z_image"] = ["ZImagePipeline"]
+    _import_structure["z_image"] = ["ZImageImg2ImgPipeline", "ZImagePipeline"]
     _import_structure["skyreels_v2"] = [
         "SkyReelsV2DiffusionForcingPipeline",
         "SkyReelsV2DiffusionForcingImageToVideoPipeline",
@@ -841,7 +841,7 @@
             WuerstchenDecoderPipeline,
             WuerstchenPriorPipeline,
         )
-        from .z_image import ZImagePipeline
+        from .z_image import ZImageImg2ImgPipeline, ZImagePipeline
 
         try:
             if not is_onnx_available():
diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py
@@ -119,6 +119,7 @@
 )
 from .wan import WanImageToVideoPipeline, WanPipeline, WanVideoToVideoPipeline
 from .wuerstchen import WuerstchenCombinedPipeline, WuerstchenDecoderPipeline
+from .z_image import ZImageImg2ImgPipeline, ZImagePipeline
 
 
 AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
@@ -162,6 +163,7 @@
         ("cogview4-control", CogView4ControlPipeline),
         ("qwenimage", QwenImagePipeline),
         ("qwenimage-controlnet", QwenImageControlNetPipeline),
+        ("z-image", ZImagePipeline),
     ]
 )
 
@@ -189,6 +191,7 @@
         ("qwenimage", QwenImageImg2ImgPipeline),
         ("qwenimage-edit", QwenImageEditPipeline),
         ("qwenimage-edit-plus", QwenImageEditPlusPipeline),
+        ("z-image", ZImageImg2ImgPipeline),
     ]
 )
 
diff --git a/src/diffusers/pipelines/z_image/__init__.py b/src/diffusers/pipelines/z_image/__init__.py
@@ -23,6 +23,7 @@
 else:
     _import_structure["pipeline_output"] = ["ZImagePipelineOutput"]
     _import_structure["pipeline_z_image"] = ["ZImagePipeline"]
+    _import_structure["pipeline_z_image_img2img"] = ["ZImageImg2ImgPipeline"]
 
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -35,6 +36,7 @@
     else:
         from .pipeline_output import ZImagePipelineOutput
         from .pipeline_z_image import ZImagePipeline
+        from .pipeline_z_image_img2img import ZImageImg2ImgPipeline
 
 else:
     import sys
diff --git a/src/diffusers/pipelines/z_image/pipeline_z_image_img2img.py b/src/diffusers/pipelines/z_image/pipeline_z_image_img2img.py
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
diff --git a/tests/pipelines/z_image/test_z_image_img2img.py b/tests/pipelines/z_image/test_z_image_img2img.py

Original file line number	Diff line number	Diff line change
`@@ -662,6 +662,7 @@`
`662`	`662`	`"WuerstchenCombinedPipeline",`
`663`	`663`	`"WuerstchenDecoderPipeline",`
`664`	`664`	`"WuerstchenPriorPipeline",`
	`665`	`+ "ZImageImg2ImgPipeline",`
`665`	`666`	`"ZImagePipeline",`
`666`	`667`	`]`
`667`	`668`	`)`
`@@ -1360,6 +1361,7 @@`
`1360`	`1361`	`WuerstchenCombinedPipeline,`
`1361`	`1362`	`WuerstchenDecoderPipeline,`
`1362`	`1363`	`WuerstchenPriorPipeline,`
	`1364`	`+ ZImageImg2ImgPipeline,`
`1363`	`1365`	`ZImagePipeline,`
`1364`	`1366`	`)`
`1365`	`1367`
Original file line number	Diff line number	Diff line change
`@@ -119,6 +119,7 @@`
`119`	`119`	`)`
`120`	`120`	`from .wan import WanImageToVideoPipeline, WanPipeline, WanVideoToVideoPipeline`
`121`	`121`	`from .wuerstchen import WuerstchenCombinedPipeline, WuerstchenDecoderPipeline`
	`122`	`+from .z_image import ZImageImg2ImgPipeline, ZImagePipeline`
`122`	`123`
`123`	`124`
`124`	`125`	`AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(`
`@@ -162,6 +163,7 @@`
`162`	`163`	`("cogview4-control", CogView4ControlPipeline),`
`163`	`164`	`("qwenimage", QwenImagePipeline),`
`164`	`165`	`("qwenimage-controlnet", QwenImageControlNetPipeline),`
	`166`	`+ ("z-image", ZImagePipeline),`
`165`	`167`	`]`
`166`	`168`	`)`
`167`	`169`
`@@ -189,6 +191,7 @@`
`189`	`191`	`("qwenimage", QwenImageImg2ImgPipeline),`
`190`	`192`	`("qwenimage-edit", QwenImageEditPipeline),`
`191`	`193`	`("qwenimage-edit-plus", QwenImageEditPlusPipeline),`
	`194`	`+ ("z-image", ZImageImg2ImgPipeline),`
`192`	`195`	`]`
`193`	`196`	`)`
`194`	`197`