
Commit 347dd17

use control format
1 parent: cbfeb0b

2 files changed: +15 -11 lines

scripts/convert_cogview4_to_diffusers_megatron.py

Lines changed: 4 additions & 4 deletions
@@ -162,14 +162,14 @@ def convert_megatron_transformer_checkpoint_to_diffusers(
     Returns:
         dict: The converted state dictionary compatible with Diffusers.
     """
-    ckpt = torch.load(ckpt_path, map_location="cpu")
+    ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
     mega = ckpt["model"]
 
     new_state_dict = {}
 
     # Patch Embedding
     new_state_dict["patch_embed.proj.weight"] = mega["encoder_expand_linear.weight"].reshape(
-        hidden_size, 128 if args.control else 64, 64
+        hidden_size, 128 if args.control else 64
     )
     new_state_dict["patch_embed.proj.bias"] = mega["encoder_expand_linear.bias"]
     new_state_dict["patch_embed.text_proj.weight"] = mega["text_projector.weight"]
@@ -260,7 +260,7 @@ def convert_cogview4_vae_checkpoint_to_diffusers(ckpt_path, vae_config):
     Returns:
         dict: The converted VAE state dictionary compatible with Diffusers.
     """
-    original_state_dict = torch.load(ckpt_path, map_location="cpu")["state_dict"]
+    original_state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=False)["state_dict"]
     return convert_ldm_vae_checkpoint(original_state_dict, vae_config)
 
@@ -294,7 +294,7 @@ def main(args):
     )
     transformer = CogView4Transformer2DModel(
         patch_size=2,
-        in_channels=16,
+        in_channels=32 if args.control else 16,
         num_layers=args.num_layers,
         attention_head_dim=args.attention_head_dim,
         num_attention_heads=args.num_heads,
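The `in_channels=32 if args.control else 16` change matches the reshape fix above: the control variant presumably feeds the transformer a channel-wise concatenation of the noisy latents and the VAE-encoded control image, doubling the input channels from 16 to 32. A sketch under that assumption (tensor names are illustrative):

```python
# Sketch of the control input format, assuming (as the converter's channel
# math suggests) that the control pipeline concatenates VAE-encoded control
# latents with the noisy image latents along the channel axis.
import torch

batch, h, w = 1, 64, 64                         # latent-space size, illustrative
latents = torch.randn(batch, 16, h, w)          # noisy image latents
control_latents = torch.randn(batch, 16, h, w)  # encoded control image

model_input = torch.cat([latents, control_latents], dim=1)
assert model_input.shape[1] == 32               # hence in_channels=32 with --control

# With patch_size=2, each patch token flattens patch_size**2 * in_channels
# values: 2*2*32 = 128 with control, 2*2*16 = 64 without -- matching the
# `128 if args.control else 64` reshape of encoder_expand_linear.weight.
```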

src/diffusers/pipelines/cogview4/pipeline_cogview4_control.py

Lines changed: 11 additions & 7 deletions
@@ -46,15 +46,18 @@
         >>> from diffusers import CogView4Pipeline
 
         >>> pipe = CogView4ControlPipeline.from_pretrained("THUDM/CogView4-6B-Control", torch_dtype=torch.bfloat16)
-        >>> pipe.to("cuda")
-
-        >>> prompt = "A photo of an astronaut riding a horse on mars"
-        >>> image = pipe(prompt).images[0]
-        >>> image.save("output.png")
+        >>> control_image = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        ... )
+        >>> prompt = "A bird in space"
+        >>> image = pipe(
+        ...     prompt, control_image=control_image, height=1024, width=1024, guidance_scale=3.5
+        ... ).images[0]
+        >>> image.save("cogview4-control.png")
         ```
 """
 
-
+# Copied from diffusers.pipelines.cogview4.pipeline_cogview4.calculate_shift
 def calculate_shift(
     image_seq_len,
     base_seq_len: int = 256,
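The new doctest exercises the control path but leans on names from the surrounding doc context; note it still shows `from diffusers import CogView4Pipeline` even though it instantiates `CogView4ControlPipeline`. A self-contained version might look like this (moving the pipeline to GPU, which the doctest no longer shows, is optional but typical):

```python
# Self-contained version of the new docstring example. The doctest assumes
# `torch` and `load_image` are already in scope; here they are imported
# explicitly.
import torch
from diffusers import CogView4ControlPipeline
from diffusers.utils import load_image

pipe = CogView4ControlPipeline.from_pretrained("THUDM/CogView4-6B-Control", torch_dtype=torch.bfloat16)
pipe.to("cuda")

control_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
)
prompt = "A bird in space"
image = pipe(
    prompt, control_image=control_image, height=1024, width=1024, guidance_scale=3.5
).images[0]
image.save("cogview4-control.png")
```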
@@ -175,6 +178,7 @@ def __init__(
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
 
+    # Copied from diffusers.pipelines.cogview4.pipeline_cogview4.CogView4Pipeline._get_glm_embeds
     def _get_glm_embeds(
         self,
         prompt: Union[str, List[str]] = None,
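The two new `# Copied from diffusers.pipelines.cogview4.pipeline_cogview4...` comments are diffusers' copy-consistency markers: the repository's `make fix-copies` tooling uses them to keep duplicated helpers like `calculate_shift` and `_get_glm_embeds` mechanically in sync with their source definitions in the base CogView4 pipeline.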
@@ -341,7 +345,7 @@ def prepare_image(
             # image batch size is the same as prompt batch size
             repeat_by = num_images_per_prompt
 
-        image = image.repeat_interleave(repeat_by, dim=0)
+        image = image.repeat_interleave(repeat_by, dim=0, output_size=image.shape[0] * repeat_by)
 
         image = image.to(device=device, dtype=dtype)

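Passing `output_size` to `repeat_interleave` tells PyTorch the result's length along `dim` up front instead of letting it be inferred; per the PyTorch docs this avoids a stream synchronization when `repeats` is a tensor, and it keeps the output shape statically known, which is friendlier to `torch.compile` and export. Here `repeat_by` is a plain int, so the change is presumably about the latter. A small sketch with dummy shapes:

```python
# output_size pre-declares the result's length along dim so PyTorch does not
# have to infer it from the repeats argument.
import torch

image = torch.randn(2, 3, 64, 64)   # (batch, channels, H, W) dummy control batch
repeat_by = 4                       # e.g. num_images_per_prompt

out = image.repeat_interleave(repeat_by, dim=0, output_size=image.shape[0] * repeat_by)
assert out.shape == (8, 3, 64, 64)
# Equivalent to image.repeat_interleave(repeat_by, dim=0); only the output
# size is supplied up front.
```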