
Commit f608f82

[WIP][cogview4][refactor]: Split condition/uncondition forward pass in CogView4 pipeline
Split the forward pass for conditional and unconditional predictions in the CogView4 pipeline to match the original implementation. The noise prediction is now computed separately for each case before the two are combined for guidance. This is still a work in progress: the generated images do not yet match the expected quality.
1 parent 0ab7260 commit f608f82
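
For context, the guidance step the commit message refers to combines the two branch predictions as `noise_uncond + guidance_scale * (noise_cond - noise_uncond)`. A minimal, self-contained sketch of that combination (illustrative names, not the pipeline's exact code):

```python
import torch

def cfg_combine(noise_pred_cond: torch.Tensor,
                noise_pred_uncond: torch.Tensor,
                guidance_scale: float) -> torch.Tensor:
    # classifier-free guidance: start from the unconditional prediction and
    # push it toward the conditional one, scaled by guidance_scale
    return noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)

# quick shape check with dummy latents
cond = torch.randn(1, 4, 64, 64)
uncond = torch.randn(1, 4, 64, 64)
assert cfg_combine(cond, uncond, guidance_scale=5.0).shape == cond.shape
```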

File tree

2 files changed: +35, -49 lines


src/diffusers/models/transformers/transformer_cogview3plus.py

Lines changed: 34 additions & 31 deletions
```diff
@@ -397,59 +397,62 @@ def forward(
         hidden_states, prompt_embeds, negative_prompt_embeds = self.patch_embed(
             hidden_states, prompt_embeds, negative_prompt_embeds
         )
+        emb = self.time_condition_embed(timestep, original_size, target_size, crop_coords, hidden_states.dtype)
 
-        encoder_hidden_states = torch.cat([prompt_embeds, negative_prompt_embeds], dim=0)
+        encoder_hidden_states_cond = prompt_embeds
+        encoder_hidden_states_uncond = negative_prompt_embeds
+        hidden_states_cond, hidden_states_uncond = hidden_states.chunk(2)
+        emb_cond, emb_uncond = emb.chunk(2)
 
         # prepare image_rotary_emb
         image_rotary_emb = self.get_rope_embedding(
             patch_height, patch_width, target_h=patch_height, target_w=patch_width, device=hidden_states.device
         )
 
-        emb = self.time_condition_embed(timestep, original_size, target_size, crop_coords, hidden_states.dtype)
-
         for index_block, block in enumerate(self.transformer_blocks):
             if torch.is_grad_enabled() and self.gradient_checkpointing:
-
-                def create_custom_forward(module):
-                    def custom_forward(*inputs):
-                        return module(*inputs)
-
-                    return custom_forward
-
-                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
-                hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(block),
-                    hidden_states,
-                    encoder_hidden_states,
-                    emb=emb,
+                ...
+            else:
+                hidden_states_cond, encoder_hidden_states_cond = block(
+                    hidden_states=hidden_states_cond,
+                    encoder_hidden_states=encoder_hidden_states_cond,
+                    emb=emb_cond,  # refactor later
                     image_rotary_emb=image_rotary_emb,
-                    **ckpt_kwargs,
                 )
-            else:
-                hidden_states, encoder_hidden_states = block(
-                    hidden_states=hidden_states,
-                    encoder_hidden_states=encoder_hidden_states,
-                    emb=emb,
+                hidden_states_uncond, encoder_hidden_states_uncond = block(
+                    hidden_states=hidden_states_uncond,
+                    encoder_hidden_states=encoder_hidden_states_uncond,
+                    emb=emb_uncond,  # refactor later
                     image_rotary_emb=image_rotary_emb,
                 )
 
-        hidden_states = self.norm_out(hidden_states, emb)  # corresponds to final_layer_input in Megatron
-        hidden_states = self.proj_out(hidden_states)  # (batch_size, height*width, patch_size*patch_size*out_channels)
+        hidden_states_cond = self.norm_out(hidden_states_cond, emb)  # corresponds to final_layer_input in Megatron
+        hidden_states_uncond = self.norm_out(hidden_states_uncond, emb)  # corresponds to final_layer_input in Megatron
+        hidden_states_cond = self.proj_out(hidden_states_cond)  # (batch_size, height*width, patch_size*patch_size*out_channels)
+        hidden_states_uncond = self.proj_out(hidden_states_uncond)  # (batch_size, height*width, patch_size*patch_size*out_channels)
 
         # unpatchify
         patch_size = self.config.patch_size
         height = height // patch_size
         width = width // patch_size
 
-        hidden_states = hidden_states.reshape(
-            shape=(hidden_states.shape[0], height, width, self.out_channels, patch_size, patch_size)
+        hidden_states_cond = hidden_states_cond.reshape(
+            shape=(hidden_states_cond.shape[0], height, width, self.out_channels, patch_size, patch_size)
+        )
+        hidden_states_cond = torch.einsum("nhwcpq->nchpwq", hidden_states_cond)
+        output_cond = hidden_states_cond.reshape(
+            shape=(hidden_states_cond.shape[0], self.out_channels, height * patch_size, width * patch_size)
+        )
+
+        hidden_states_uncond = hidden_states_uncond.reshape(
+            shape=(hidden_states_uncond.shape[0], height, width, self.out_channels, patch_size, patch_size)
         )
-        hidden_states = torch.einsum("nhwcpq->nchpwq", hidden_states)
-        output = hidden_states.reshape(
-            shape=(hidden_states.shape[0], self.out_channels, height * patch_size, width * patch_size)
+        hidden_states_uncond = torch.einsum("nhwcpq->nchpwq", hidden_states_uncond)
+        output_uncond = hidden_states_uncond.reshape(
+            shape=(hidden_states_uncond.shape[0], self.out_channels, height * patch_size, width * patch_size)
         )
 
         if not return_dict:
-            return (output,)
+            return (output_cond, output_uncond)
 
-        return Transformer2DModelOutput(sample=output)
+        return Transformer2DModelOutput(sample=output_cond), Transformer2DModelOutput(sample=output_uncond)
```
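
Note the gradient-checkpointing branch is left as a bare `...` in this commit. One plausible per-branch shape for it, adapted from the helper the commit deletes (a hedged sketch, not the final implementation; `run_block_checkpointed` is a hypothetical name):

```python
from torch.utils.checkpoint import checkpoint

def create_custom_forward(module):
    # thin wrapper so checkpoint() can re-invoke the block during backward
    def custom_forward(*inputs, **kwargs):
        return module(*inputs, **kwargs)
    return custom_forward

def run_block_checkpointed(block, hidden_states, encoder_hidden_states, emb, image_rotary_emb):
    # non-reentrant checkpointing forwards keyword arguments to the block
    return checkpoint(
        create_custom_forward(block),
        hidden_states,
        encoder_hidden_states,
        emb=emb,
        image_rotary_emb=image_rotary_emb,
        use_reentrant=False,
    )
```

In the split design this would run twice per block, once with the `*_cond` tensors and once with the `*_uncond` tensors.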

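The unpatchify tail runs identically on both branches. A standalone toy with made-up dimensions, to make the reshape, einsum, reshape pattern above concrete:

```python
import torch

batch, out_channels, patch_size = 2, 16, 2
height, width = 4, 4  # measured in patches
tokens = torch.randn(batch, height * width, patch_size * patch_size * out_channels)

# (B, H*W, p*p*C) -> (B, H, W, C, p, p) -> (B, C, H, p, W, p) -> (B, C, H*p, W*p)
x = tokens.reshape(batch, height, width, out_channels, patch_size, patch_size)
x = torch.einsum("nhwcpq->nchpwq", x)
image = x.reshape(batch, out_channels, height * patch_size, width * patch_size)
assert image.shape == (2, 16, 8, 8)
```
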
src/diffusers/pipelines/cogview4/pipeline_cogview4.py

Lines changed: 1 addition & 18 deletions
```diff
@@ -314,23 +314,6 @@ def encode_prompt(
             dtype=dtype,
         )
 
-        # TODO: pad with 0 for now; handle differing sequence lengths properly later (lhy: try padding with the pad token here instead)
-        seq_len_prompt = prompt_embeds.shape[1]
-        seq_len_neg = negative_prompt_embeds.shape[1]
-        if seq_len_neg < seq_len_prompt:
-            # create a new tensor of size [batch_size, seq_len_prompt, hidden_size]
-            batch_size, seq_len, hidden_size = negative_prompt_embeds.shape
-            # the padded tensor
-            padded_negative_prompt = torch.full(
-                (batch_size, seq_len_prompt - seq_len_neg),
-                fill_value=self.tokenizer.pad_token_id,
-                device=negative_prompt_embeds.device,
-            )
-            padded_negative_prompt_embeds = self.text_encoder.model.embed_tokens(
-                padded_negative_prompt.to(self.text_encoder.model.device)
-            )
-            negative_prompt_embeds = torch.cat([padded_negative_prompt_embeds, negative_prompt_embeds], dim=1)
-            assert negative_prompt_embeds.shape == prompt_embeds.shape
         return prompt_embeds, negative_prompt_embeds
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
@@ -680,7 +663,7 @@ def __call__(
 
         # perform guidance
         if do_classifier_free_guidance:
-            noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
+            noise_pred_cond, noise_pred_uncond = noise_pred
             noise_pred_guided = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
 
         ###########################
```
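
The ordering change matters: `.chunk(2)` on a batched prediction yields halves in concatenation order, while the transformer now returns an explicit `(cond, uncond)` tuple. A toy check that both layouts produce the same guided result when unpacked consistently (dummy tensors, simplified from the loop above):

```python
import torch

guidance_scale = 5.0
cond = torch.randn(1, 4, 8, 8)
uncond = torch.randn(1, 4, 8, 8)

# old layout: one batched tensor laid out [uncond, cond], split with chunk
u_old, c_old = torch.cat([uncond, cond], dim=0).chunk(2)

# new layout: separate tensors returned as a (cond, uncond) tuple
c_new, u_new = (cond, uncond)

guided_old = u_old + guidance_scale * (c_old - u_old)
guided_new = u_new + guidance_scale * (c_new - u_new)
assert torch.allclose(guided_old, guided_new)
```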
