
Commit f8945ce

[WIP] Add tensor-reload to align input from transformer block
1 parent 7916140 commit f8945ce

File tree

3 files changed: +51 −7 lines


src/diffusers/models/normalization.py

Lines changed: 9 additions & 0 deletions

@@ -333,9 +333,18 @@ def __init__(

     def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor:
         # convert back to the original dtype in case `conditioning_embedding` is upcast to float32 (needed for HunyuanDiT)
+
+        ####################################
         emb = self.linear(self.silu(conditioning_embedding).to(x.dtype))
+        # emb = self.linear(conditioning_embedding).to(x.dtype)
+        ####################################
+
         scale, shift = torch.chunk(emb, 2, dim=1)
+
+        ############################
         x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
+        # x = x * (1 + scale)[:, None, :] + shift[:, None, :]
+        ############################
         return x
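For reference, a minimal self-contained sketch of the adaptive-norm modulation this hunk instruments (the class name, dimensions, and eps are illustrative assumptions, not the actual diffusers module); the commented-out variant in the diff simply drops the SiLU before the projection:

import torch
import torch.nn as nn

class AdaNormSketch(nn.Module):
    def __init__(self, dim: int, cond_dim: int, eps: float = 1e-5):
        super().__init__()
        self.silu = nn.SiLU()
        self.linear = nn.Linear(cond_dim, 2 * dim)  # projects conditioning to scale and shift
        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)

    def forward(self, x: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        emb = self.linear(self.silu(cond).to(x.dtype))  # active path in the diff
        # emb = self.linear(cond).to(x.dtype)           # commented-out variant: no SiLU
        scale, shift = torch.chunk(emb, 2, dim=1)
        return self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]

x = torch.randn(2, 16, 64)   # (batch, seq, dim)
cond = torch.randn(2, 128)   # (batch, cond_dim)
print(AdaNormSketch(64, 128)(x, cond).shape)  # torch.Size([2, 16, 64])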
src/diffusers/models/transformers/transformer_cogview3plus.py

Lines changed: 36 additions & 6 deletions

@@ -232,7 +232,8 @@ def __init__(
             embedding_dim=self.inner_dim,
             conditioning_embedding_dim=time_embed_dim,
             elementwise_affine=False,
-            eps=1e-6,
+            # eps=1e-6,
+            eps=1e-5,
         )
         self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
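The eps change matters for numerical parity: two otherwise identical LayerNorms with different eps produce slightly different outputs, and the discrepancy compounds across transformer blocks. A quick check (shapes are illustrative; 1e-5 is PyTorch's LayerNorm default, and that it matches the Megatron reference is an assumption of this WIP):

import torch
import torch.nn as nn

x = torch.randn(1, 4096, 2560)
ln_a = nn.LayerNorm(2560, elementwise_affine=False, eps=1e-6)
ln_b = nn.LayerNorm(2560, elementwise_affine=False, eps=1e-5)  # PyTorch default
print((ln_a(x) - ln_b(x)).abs().max())  # small but nonzero, on the order of 1e-5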
@@ -399,8 +400,6 @@ def forward(
         )
         emb = self.time_condition_embed(timestep, original_size, target_size, crop_coords, hidden_states.dtype)

-        encoder_hidden_states_cond = prompt_embeds
-        encoder_hidden_states_uncond = negative_prompt_embeds
         hidden_states_cond, hidden_states_uncond = hidden_states.chunk(2)
         emb_cond, emb_uncond = emb.chunk(2)

@@ -409,6 +408,22 @@ def forward(
             patch_height, patch_width, target_h=patch_height, target_w=patch_width, device=hidden_states.device
         )

+        ######################
+        # prompt_embeds = torch.load("/home/lhy/code/cogview/c_condition_embedding.pt")
+        # negative_prompt_embeds = torch.load("/home/lhy/code/cogview/uc_condition_embedding.pt")
+        prompt_embeds = torch.load("/home/lhy/code/cogview/cp_condition_0_16.pt")[None, ::]
+        negative_prompt_embeds = torch.load("/home/lhy/code/cogview/cp_uncondition_16_32.pt")[None, ::]
+
+        hidden_states_cond = torch.load("/home/lhy/code/cogview/cp_vision_input_0_4096.pt")
+        hidden_states_uncond = torch.load("/home/lhy/code/cogview/cp_vision_input_4096:8192.pt")
+
+        emb_cond = torch.load("/home/lhy/code/cogview/time_embedding_0_1.pt")
+        emb_uncond = torch.load("/home/lhy/code/cogview/time_embedding_1_2.pt")
+        ######################
+
+        encoder_hidden_states_cond = prompt_embeds
+        encoder_hidden_states_uncond = negative_prompt_embeds
+
         for index_block, block in enumerate(self.transformer_blocks):
             if torch.is_grad_enabled() and self.gradient_checkpointing:
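These torch.load calls overwrite the module's computed activations with tensors dumped from the reference implementation, so everything downstream of this point can be diffed in isolation. The dump side is not part of this commit; a hypothetical sketch of the counterpart (only the directory and file names come from the diff above; the helper and its placement are assumptions):

import torch

def dump(tensor: torch.Tensor, name: str) -> None:
    # detach and move to CPU so the file is loadable from any device
    torch.save(tensor.detach().cpu(), f"/home/lhy/code/cogview/{name}.pt")

# e.g. inside the reference (Megatron) forward pass:
# dump(vision_input[0:4096], "cp_vision_input_0_4096")
# dump(condition_embedding[0:16], "cp_condition_0_16")
# dump(time_embedding[0:1], "time_embedding_0_1")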
@@ -418,16 +433,31 @@ def forward(
                     encoder_hidden_states=encoder_hidden_states_cond,
                     emb=emb_cond,  # refactor later
                     image_rotary_emb=image_rotary_emb,
+                    # image_rotary_emb=None,
                 )
+                ###########################
+                # hidden_states_cond, encoder_hidden_states_cond = (
+                #     self.norm_out.norm(hidden_states_cond),
+                #     self.norm_out.norm(encoder_hidden_states_cond),
+                # )
+                ###########################
+
                 hidden_states_uncond, encoder_hidden_states_uncond = block(
                     hidden_states=hidden_states_uncond,
                     encoder_hidden_states=encoder_hidden_states_uncond,
                     emb=emb_uncond,  # refactor later
                     image_rotary_emb=image_rotary_emb,
+                    # image_rotary_emb=None,
                 )
-
-        hidden_states_cond = self.norm_out(hidden_states_cond, emb)  # result corresponds to final_layer_input in Megatron
-        hidden_states_uncond = self.norm_out(hidden_states_uncond, emb)  # result corresponds to final_layer_input in Megatron
+                ###########################
+                # hidden_states_uncond, encoder_hidden_states_uncond = (
+                #     self.norm_out.norm(hidden_states_uncond),
+                #     self.norm_out.norm(encoder_hidden_states_uncond),
+                # )
+                ###########################
+
+        hidden_states_cond = self.norm_out(hidden_states_cond, emb_cond)  # result corresponds to final_layer_input in Megatron
+        hidden_states_uncond = self.norm_out(hidden_states_uncond, emb_uncond)  # result corresponds to final_layer_input in Megatron
         hidden_states_cond = self.proj_out(hidden_states_cond)  # (batch_size, height*width, patch_size*patch_size*out_channels)
         hidden_states_uncond = self.proj_out(hidden_states_uncond)  # (batch_size, height*width, patch_size*patch_size*out_channels)
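The forward pass keeps separate cond/uncond streams instead of one batched pass. Downstream, the two outputs are typically recombined with standard classifier-free guidance; that step is not part of this diff, but a generic sketch of it:

import torch

def cfg_combine(noise_uncond: torch.Tensor, noise_cond: torch.Tensor, guidance_scale: float) -> torch.Tensor:
    # move the unconditional prediction toward the conditional one
    return noise_uncond + guidance_scale * (noise_cond - noise_uncond)

noise_cond = torch.randn(1, 4096, 64)
noise_uncond = torch.randn(1, 4096, 64)
print(cfg_combine(noise_uncond, noise_cond, guidance_scale=5.0).shape)  # torch.Size([1, 4096, 64])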
src/diffusers/pipelines/cogview4/pipeline_cogview4.py

Lines changed: 6 additions & 1 deletion

@@ -216,7 +216,9 @@ def _get_glm_embeds(
                 device=text_input_ids.device,
             )
             text_input_ids = torch.cat([pad_ids, text_input_ids], dim=1)
-        prompt_embeds = self.text_encoder(text_input_ids.to(self.text_encoder.model.device), output_hidden_states=True).hidden_states[-2]
+        prompt_embeds = self.text_encoder(
+            text_input_ids.to(self.text_encoder.model.device), output_hidden_states=True
+        ).hidden_states[-2]
         prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
         _, seq_len, _ = prompt_embeds.shape
         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
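The reflowed call keeps the existing behaviour: GLM prompt embeddings are taken from the second-to-last hidden state rather than the final layer output. The same pattern in generic transformers code (gpt2 is only a stand-in model for illustration):

import torch
from transformers import AutoModel, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModel.from_pretrained("gpt2")
ids = tok("a cat sitting on a windowsill", return_tensors="pt").input_ids
out = model(ids, output_hidden_states=True)
penultimate = out.hidden_states[-2]  # (batch, seq_len, hidden), one layer before the top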
@@ -592,6 +594,7 @@ def __call__(

         # Prepare latents.
         latent_channels = self.transformer.config.in_channels
+        #########################
         latents = self.prepare_latents(
             batch_size * num_images_per_prompt,
             latent_channels,

@@ -602,6 +605,8 @@ def __call__(
             generator,
             latents,
         )
+        latents = torch.ones_like(latents)
+        #########################

         # Prepare additional timestep conditions
         original_size = torch.tensor([original_size], dtype=prompt_embeds.dtype)
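Overwriting the prepared latents with torch.ones_like strips out the random initial noise, so both implementations start the denoising loop from byte-identical inputs. A seeded alternative that keeps the noise distribution while staying reproducible (an assumption, not part of this commit):

import torch

latents = torch.randn(1, 16, 128, 128)  # stand-in for the prepare_latents output
latents = torch.ones_like(latents)      # what the diff does: constant latents

# seeded alternative, deterministic but still Gaussian:
generator = torch.Generator("cpu").manual_seed(0)
latents = torch.randn(1, 16, 128, 128, generator=generator)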
