
Commit 692e5cc

update
1 parent b6e10e7 commit 692e5cc

File tree

3 files changed (+25, -39 lines)

examples/cogview4-control/train_control_cogview4.py

Lines changed: 8 additions & 12 deletions
@@ -132,7 +132,7 @@ def log_validation(cogview4_transformer, args, accelerator, weight_dtype, step,
             control_image=validation_image,
             num_inference_steps=50,
             guidance_scale=args.guidance_scale,
-            max_sequence_length=args.max_sequence_length, # For downstream task training usage, training can be performed on a batch basis.
+            max_sequence_length=args.max_sequence_length,  # For downstream task training usage, training can be performed on a batch basis.
             padding_type="max_length",
             generator=generator,
             height=args.resolution,
@@ -660,7 +660,7 @@ def prepare_train_dataset(dataset, accelerator):
         [
             transforms.Resize((args.resolution, args.resolution), interpolation=transforms.InterpolationMode.BILINEAR),
             transforms.ToTensor(),
-            transforms.Lambda(lambda x: x * 2 - 1)
+            transforms.Lambda(lambda x: x * 2 - 1),
         ]
     )
 
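For context on the transform chain this hunk touches (the added trailing comma is style only): ToTensor scales pixels to [0, 1] and the Lambda remaps them to [-1, 1], the input range diffusion VAEs are typically trained on. A minimal standalone sketch of the same normalization:

import numpy as np
from torchvision import transforms

normalize = transforms.Compose(
    [
        transforms.ToTensor(),                   # uint8 HWC in [0, 255] -> float CHW in [0, 1]
        transforms.Lambda(lambda x: x * 2 - 1),  # [0, 1] -> [-1, 1]
    ]
)

img = np.full((64, 64, 3), 255, dtype=np.uint8)  # an all-white image
out = normalize(img)
print(out.min().item(), out.max().item())  # 1.0 1.0 -- white maps to +1.0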
@@ -1074,7 +1074,6 @@ def load_model_hook(models, input_dir):
                 )
 
                 # Add noise according for cogview4
-                # FIXME: The issue of variable-length training has not been resolved, here it is still extended to the longest one.
                 indices = (u * noise_scheduler_copy.config.num_train_timesteps).long()
                 timesteps = noise_scheduler_copy.timesteps[indices].to(device=pixel_latents.device)
                 sigmas = noise_scheduler_copy.sigmas[indices].to(device=pixel_latents.device)
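The lookup this hunk keeps (minus the FIXME) maps a per-sample value u in [0, 1) to a discrete scheduler index. A self-contained sketch with stand-in tables for the scheduler's timesteps and sigmas (the real values come from the flow-matching scheduler's config, and the final noising line shows the typical rectified-flow interpolation, not code from this file):

import torch

num_train_timesteps = 1000
timesteps_table = torch.linspace(1000, 1, num_train_timesteps)  # stand-in for noise_scheduler_copy.timesteps
sigmas_table = torch.linspace(1.0, 1e-3, num_train_timesteps)   # stand-in for noise_scheduler_copy.sigmas

u = torch.rand(4)                           # one draw per sample in the batch
indices = (u * num_train_timesteps).long()  # map [0, 1) -> {0, ..., 999}
timesteps = timesteps_table[indices]
sigmas = sigmas_table[indices].view(-1, 1, 1, 1)  # broadcastable over latent dims
# Typical rectified-flow noising with these sigmas:
# noisy_latents = (1 - sigmas) * latents + sigmas * noise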
@@ -1095,12 +1094,10 @@ def load_model_hook(models, input_dir):
                 text_encoding_pipeline = text_encoding_pipeline.to("cuda")
 
                 with torch.no_grad():
-                    # Since the batch will be padded, max_length should be used for padding.
-                    prompt_embeds,pooled_prompt_embeds,= text_encoding_pipeline.encode_prompt(
-                        captions, "",
-                        max_sequence_length=args.max_sequence_length,
-                        padding_type="max_length"
-                    )
+                    (
+                        prompt_embeds,
+                        pooled_prompt_embeds,
+                    ) = text_encoding_pipeline.encode_prompt(captions, "")
                 original_size = (args.resolution, args.resolution)
                 original_size = torch.tensor([original_size], dtype=prompt_embeds.dtype, device=prompt_embeds.device)
 
@@ -1109,8 +1106,6 @@ def load_model_hook(models, input_dir):
 
                 target_size = target_size.repeat(len(batch["captions"]), 1)
                 original_size = original_size.repeat(len(batch["captions"]), 1)
-
-                # TODO: Should a parameter be set here for passing? This is not present in Flux.
                 crops_coords_top_left = torch.tensor([(0, 0)], dtype=prompt_embeds.dtype, device=prompt_embeds.device)
                 crops_coords_top_left = crops_coords_top_left.repeat(len(batch["captions"]), 1)
 
@@ -1140,7 +1135,8 @@ def load_model_hook(models, input_dir):
 
                 weighting = weighting.view(len(batch["captions"]), 1, 1, 1)
                 loss = torch.mean(
-                    (weighting.float() * (noise_pred_cond.float() - target.float()) ** 2).reshape(target.shape[0], -1), 1
+                    (weighting.float() * (noise_pred_cond.float() - target.float()) ** 2).reshape(target.shape[0], -1),
+                    1,
                 )
                 loss = loss.mean()
                 accelerator.backward(loss)
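The loss itself is unchanged by this reflow: a per-sample weighted MSE over all latent dimensions, then a batch mean. Isolated, with illustrative shapes:

import torch

batch = 2
noise_pred_cond = torch.randn(batch, 4, 8, 8)  # model output (illustrative shape)
target = torch.randn(batch, 4, 8, 8)           # flow-matching target
weighting = torch.rand(batch, 1, 1, 1)         # per-sample loss weight

per_sample = torch.mean(
    (weighting.float() * (noise_pred_cond.float() - target.float()) ** 2).reshape(target.shape[0], -1),
    1,
)  # shape (batch,) -- one loss value per sample
loss = per_sample.mean()  # scalar passed to accelerator.backward(loss)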

src/diffusers/models/transformers/transformer_cogview4.py

Lines changed: 9 additions & 13 deletions
@@ -157,21 +157,17 @@ def __call__(
                 key[:, :, text_seq_length:, :], image_rotary_emb, use_real_unbind_dim=-2
             )
 
-        # 4. Attention
+        # 4. Attention and Attention Mask
         if attention_mask is not None:
-            # construct attention_mask for concated sequence
             text_attention_mask = attention_mask.float().to(query.device)
-            attention_mask = torch.ones((batch_size, text_seq_length + image_seq_length), device=query.device)
-            attention_mask[:, :text_seq_length] = text_attention_mask
-            attention_mask = attention_mask.unsqueeze(2)
-            attention_mask_matrix = attention_mask @ attention_mask.mT
-            attention_mask_matrix = attention_mask_matrix == 1
-            attention_mask_matrix = attention_mask_matrix.unsqueeze(1)
-            attention_mask = attention_mask_matrix
-
-        hidden_states = F.scaled_dot_product_attention(
-            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
-        )
+            actual_text_seq_length = text_attention_mask.size(1)
+            new_attention_mask = torch.zeros((batch_size, text_seq_length + image_seq_length), device=query.device)
+            new_attention_mask[:, :actual_text_seq_length] = text_attention_mask
+            new_attention_mask = new_attention_mask.unsqueeze(2)
+            attention_mask_matrix = new_attention_mask @ new_attention_mask.transpose(1, 2)
+            attention_mask = (attention_mask_matrix > 0).unsqueeze(1).to(query.dtype)
+
+        hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False)
         hidden_states = hidden_states.transpose(1, 2).flatten(2, 3)
         hidden_states = hidden_states.type_as(query)
 
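Both versions build a pairwise mask from a per-token text mask via an outer product: a (B, L, 1) column of validity flags times its (B, 1, L) transpose gives a (B, L, L) matrix that is nonzero exactly where both the query and key positions are valid. A standalone sketch of that idea; unlike the new hunk, it fills image positions with 1 (as the replaced torch.ones construction did, so image tokens stay attended) and keeps the mask boolean, since F.scaled_dot_product_attention treats a floating-point attn_mask as an additive bias rather than a keep/discard mask:

import torch
import torch.nn.functional as F

batch_size, text_seq_length, image_seq_length = 2, 4, 6
total_length = text_seq_length + image_seq_length

# 1 = real token, 0 = padding; the second prompt is shorter.
text_attention_mask = torch.tensor([[1, 1, 1, 1], [1, 1, 0, 0]], dtype=torch.float32)

# Extend to the concatenated text+image sequence; image tokens are always valid.
mask = torch.ones(batch_size, total_length)
mask[:, :text_seq_length] = text_attention_mask

# Outer product: (B, L, 1) @ (B, 1, L) -> (B, L, L).
matrix = mask.unsqueeze(2) @ mask.unsqueeze(1)
attn_mask = (matrix > 0).unsqueeze(1)  # (B, 1, L, L), broadcasts over heads

query = key = value = torch.randn(batch_size, 8, total_length, 16)  # (B, heads, L, head_dim)
out = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask)
# Note: rows for fully padded query positions have every key masked, so their
# outputs are NaN; they correspond to padding and are ignored downstream.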
src/diffusers/pipelines/cogview4/pipeline_cogview4_control.py

Lines changed: 8 additions & 14 deletions
@@ -144,13 +144,11 @@ class CogView4ControlPipeline(DiffusionPipeline):
     Args:
         vae ([`AutoencoderKL`]):
             Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`T5EncoderModel`]):
-            Frozen text-encoder. CogView4 uses
-            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
-            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
-        tokenizer (`T5Tokenizer`):
+        text_encoder ([`GLMModel`]):
+            Frozen text-encoder. CogView4 uses [glm-4-9b-hf](https://huggingface.co/THUDM/glm-4-9b-hf).
+        tokenizer (`PreTrainedTokenizer`):
             Tokenizer of class
-            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+            [PreTrainedTokenizer](https://huggingface.co/docs/transformers/main/en/main_classes/tokenizer#transformers.PreTrainedTokenizer).
         transformer ([`CogView4Transformer2DModel`]):
             A text conditioned `CogView4Transformer2DModel` to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
@@ -182,7 +180,6 @@ def _get_glm_embeds(
         prompt: Union[str, List[str]] = None,
         num_images_per_prompt: int = 1,
         max_sequence_length: int = 1024,
-        padding_type: str = "longest",
         device: Optional[torch.device] = None,
         dtype: Optional[torch.dtype] = None,
     ):
@@ -194,7 +191,7 @@
 
         text_inputs = self.tokenizer(
             prompt,
-            padding=padding_type,
+            padding="longest",  # pad to the longest prompt in the batch, not to max_length
             max_length=max_sequence_length,
             truncation=True,
             add_special_tokens=True,
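The hard-coded padding="longest" pads each batch only to its longest member, whereas "max_length" always pads to max_sequence_length. The difference, sketched with a generic fast tokenizer (the actual pipeline uses the GLM tokenizer; gpt2 here is purely illustrative and is downloaded on first use):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
tok.pad_token = tok.eos_token  # gpt2 has no pad token by default
prompts = ["a cat", "a very detailed description of a cat sitting on a sunny windowsill"]

longest = tok(prompts, padding="longest", return_tensors="pt")
fixed = tok(prompts, padding="max_length", max_length=32, truncation=True, return_tensors="pt")

print(longest["input_ids"].shape)  # (2, length of the longest prompt)
print(fixed["input_ids"].shape)    # (2, 32) regardless of prompt lengths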
@@ -240,7 +237,6 @@ def encode_prompt(
         device: Optional[torch.device] = None,
         dtype: Optional[torch.dtype] = None,
         max_sequence_length: int = 1024,
-        padding_type: str = "longest",
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
@@ -278,7 +274,7 @@
         else:
             batch_size = prompt_embeds.shape[0]
         if prompt_embeds is None:
-            prompt_embeds = self._get_glm_embeds(prompt, num_images_per_prompt, max_sequence_length, padding_type, device, dtype)
+            prompt_embeds = self._get_glm_embeds(prompt, num_images_per_prompt, max_sequence_length, device, dtype)
 
         if do_classifier_free_guidance and negative_prompt_embeds is None:
             negative_prompt = negative_prompt or ""
@@ -297,7 +293,7 @@
             )
 
             negative_prompt_embeds = self._get_glm_embeds(
-                negative_prompt, num_images_per_prompt, max_sequence_length, "longest", device, dtype
+                negative_prompt, num_images_per_prompt, max_sequence_length, device, dtype
             )
 
         return prompt_embeds, negative_prompt_embeds
@@ -451,7 +447,6 @@ def __call__(
         ] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 1024,
-        padding_type: str = "longest",  # For downstream tasks, it can be modified to use max_length for implementation.
     ) -> Union[CogView4PipelineOutput, Tuple]:
         """
         Function invoked when calling the pipeline for generation.
@@ -581,8 +576,7 @@
             prompt_embeds=prompt_embeds,
             negative_prompt_embeds=negative_prompt_embeds,
             max_sequence_length=max_sequence_length,
-            padding_type=padding_type,
-            device=device
+            device=device,
         )
 
         # Prepare latents
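After this change the pipeline's __call__ no longer accepts padding_type; prompts are always padded to the longest in the batch internally. A hypothetical invocation under the new signature (the model id and control image URL are illustrative, and the import assumes the class is exported at the diffusers package root):

import torch
from diffusers import CogView4ControlPipeline
from diffusers.utils import load_image

pipe = CogView4ControlPipeline.from_pretrained("THUDM/CogView4-6B", torch_dtype=torch.bfloat16)
pipe.to("cuda")

control_image = load_image("https://example.com/canny_edges.png")  # placeholder URL
image = pipe(
    prompt="a red bird perched on a snowy branch",
    control_image=control_image,
    num_inference_steps=50,
    guidance_scale=3.5,
    max_sequence_length=1024,
).images[0]
image.save("bird.png")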
