Merge branch 'z-image-dev' into z-image

JerryWu-code · JerryWu-code · commit 2f2d8c3a4e7f · 2025-11-25T06:48:02.000Z
diff --git a/src/diffusers/models/transformers/transformer_z_image.py b/src/diffusers/models/transformers/transformer_z_image.py
@@ -18,7 +18,6 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from einops import rearrange
 from torch.nn.utils.rnn import pad_sequence
 
 from ...configuration_utils import ConfigMixin, register_to_config
@@ -429,9 +428,12 @@ def unpatchify(self, x: List[torch.Tensor], size: List[Tuple], patch_size, f_pat
         for i in range(bsz):
             F, H, W = size[i]
             ori_len = (F // pF) * (H // pH) * (W // pW)
-            x[i] = rearrange(
-                x[i][:ori_len].view(F // pF, H // pH, W // pW, pF, pH, pW, self.out_channels),
-                "f h w pf ph pw c -> c (f pf) (h ph) (w pw)",
+            # Equivalent to einops.rearrange with pattern "f h w pf ph pw c -> c (f pf) (h ph) (w pw)"
+            x[i] = (
+                x[i][:ori_len]
+                .view(F // pF, H // pH, W // pW, pF, pH, pW, self.out_channels)
+                .permute(6, 0, 3, 1, 4, 2, 5)
+                .reshape(self.out_channels, F, H, W)
             )
         return x
 
@@ -497,7 +499,8 @@ def patchify_and_embed(
             F_tokens, H_tokens, W_tokens = F // pF, H // pH, W // pW
 
             image = image.view(C, F_tokens, pF, H_tokens, pH, W_tokens, pW)
-            image = rearrange(image, "c f pf h ph w pw -> (f h w) (pf ph pw c)")
+            # Equivalent to einops.rearrange with pattern "c f pf h ph w pw -> (f h w) (pf ph pw c)"
+            image = image.permute(1, 3, 5, 2, 4, 6, 0).reshape(F_tokens * H_tokens * W_tokens, pF * pH * pW * C)
 
             image_ori_len = len(image)
             image_padding_len = (-image_ori_len) % SEQ_MULTI_OF
diff --git a/src/diffusers/pipelines/z_image/pipeline_z_image.py b/src/diffusers/pipelines/z_image/pipeline_z_image.py
@@ -432,26 +432,35 @@ def __call__(
         elif prompt is not None and isinstance(prompt, list):
             batch_size = len(prompt)
         else:
-            batch_size = prompt_embeds.shape[0]
+            batch_size = len(prompt_embeds)
 
         lora_scale = (
             self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
         )
-        (
-            prompt_embeds,
-            negative_prompt_embeds,
-        ) = self.encode_prompt(
-            prompt=prompt,
-            negative_prompt=negative_prompt,
-            do_classifier_free_guidance=self.do_classifier_free_guidance,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            dtype=dtype,
-            device=device,
-            num_images_per_prompt=num_images_per_prompt,
-            max_sequence_length=max_sequence_length,
-            lora_scale=lora_scale,
-        )
+
+        # Precomputed `prompt_embeds` without a `prompt`: skip `encode_prompt`, only check negative embeds for CFG.
+        if prompt_embeds is not None and prompt is None:
+            if self.do_classifier_free_guidance and negative_prompt_embeds is None:
+                raise ValueError(
+                    "When `prompt_embeds` is provided without `prompt`, "
+                    "`negative_prompt_embeds` must also be provided for classifier-free guidance."
+                )
+        else:
+            (
+                prompt_embeds,
+                negative_prompt_embeds,
+            ) = self.encode_prompt(
+                prompt=prompt,
+                negative_prompt=negative_prompt,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                dtype=dtype,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                max_sequence_length=max_sequence_length,
+                lora_scale=lora_scale,
+            )
 
         # 4. Prepare latent variables
         num_channels_latents = self.transformer.in_channels