
Commit cefca0f

right way
1 parent 21439e2 commit cefca0f

File tree

5 files changed: +302 -309 lines changed


scripts/convert_cogview3_to_diffusers.py

Lines changed: 10 additions & 22 deletions
@@ -37,23 +37,11 @@ def reassign_query_key_layernorm_inplace(key: str, state_dict: Dict[str, Any]):
def reassign_adaln_norm_inplace(key: str, state_dict: Dict[str, Any]):
    layer_id, _, weight_or_bias = key.split(".")[-3:]

-    weights_or_biases = state_dict[key].chunk(12, dim=0)
-    norm1_weights_or_biases = torch.cat(weights_or_biases[0:3] + weights_or_biases[6:9])
-    norm2_weights_or_biases = torch.cat(weights_or_biases[3:6] + weights_or_biases[9:12])
-
-    norm1_key = f"transformer_blocks.{layer_id}.norm1.linear.{weight_or_bias}"
-    state_dict[norm1_key] = norm1_weights_or_biases
-
-    norm2_key = f"transformer_blocks.{layer_id}.norm2.linear.{weight_or_bias}"
-    state_dict[norm2_key] = norm2_weights_or_biases
-
-    state_dict.pop(key)
-
-
-def remove_keys_inplace(key: str, state_dict: Dict[str, Any]):
+    weights_or_biases = state_dict[key]
+    norm1_key = f"transformer_blocks.{layer_id}.adaln_modules.1.{weight_or_bias}"
+    state_dict[norm1_key] = weights_or_biases
    state_dict.pop(key)

-
def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
    state_dict = saved_dict
    if "model" in saved_dict.keys():
@@ -73,16 +61,17 @@ def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
    "dense_4h_to_h": "2",
    ".layers": "",
    "dense": "to_out.0",
-    "mixins.patch_embed": "image_patch_embed",
-    "mixins.adaln.adaln_modules": "adaln_module",
-    "time_embed": "time_embed",
-    "label_emb": "label_embed",
-    "mixins.final_layer.adaln": "final_layer.adaln",
+    "mixins.patch_embed": "pos_embed",
+    "time_embed.0": "emb.timestep_embedder.linear_1",
+    "time_embed.2": "emb.timestep_embedder.linear_2",
+    "label_emb.0": "emb.label_embedder",
+    "mixins.final_layer.adaln.1": "norm_out.linear",
    "mixins.final_layer.linear": "proj_out",
}

TRANSFORMER_SPECIAL_KEYS_REMAP = {
    "query_key_value": reassign_query_key_value_inplace,
+    "mixins.adaln.adaln_modules": reassign_adaln_norm_inplace,
}

TOKENIZER_MAX_LENGTH = 224
@@ -135,9 +124,8 @@ def convert_transformer(
    transformer = CogView3PlusTransformer2DModel(
        in_channels=16,
        num_layers=num_layers,
-        num_attention_heads=num_attention_heads,
+        num_attention_heads=num_attention_heads
    ).to(dtype=dtype)
-
    for key in list(original_state_dict.keys()):
        new_key = key[len(PREFIX_KEY):]
        for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items():
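For orientation, the rename dict and the special-keys map above feed a key-rewriting pass over the original state dict. The snippet below is only a minimal sketch of that pattern, assuming the usual structure of diffusers conversion scripts; convert_state_dict is a hypothetical helper, and only the names visible in this diff (PREFIX_KEY, TRANSFORMER_KEYS_RENAME_DICT, TRANSFORMER_SPECIAL_KEYS_REMAP, reassign_adaln_norm_inplace) come from the file itself.

# Illustrative sketch of the remapping pass, not the script's literal code.
def convert_state_dict(original_state_dict):
    # 1) strip the checkpoint prefix and apply plain substring renames
    for key in list(original_state_dict.keys()):
        new_key = key[len(PREFIX_KEY):]
        for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items():
            new_key = new_key.replace(replace_key, rename_key)
        original_state_dict[new_key] = original_state_dict.pop(key)

    # 2) let special handlers (e.g. reassign_adaln_norm_inplace) restructure keys in place
    for key in list(original_state_dict.keys()):
        for special_key, handler_fn_inplace in TRANSFORMER_SPECIAL_KEYS_REMAP.items():
            if special_key in key:
                handler_fn_inplace(key, original_state_dict)
    return original_state_dict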

show_model.py

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
import torch
from diffusers.loaders.single_file_utils import convert_ldm_vae_checkpoint
from diffusers import AutoencoderKL
from huggingface_hub import hf_hub_download
from sgm.models.autoencoder import AutoencodingEngine

# (1) create vae_sat
# AutoencodingEngine initialization arguments:
encoder_config = {'target': 'sgm.modules.diffusionmodules.model.Encoder', 'params': {'attn_type': 'vanilla', 'double_z': True, 'z_channels': 16, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 4, 8, 8], 'num_res_blocks': 3, 'attn_resolutions': [], 'mid_attn': False, 'dropout': 0.0}}
decoder_config = {'target': 'sgm.modules.diffusionmodules.model.Decoder', 'params': {'attn_type': 'vanilla', 'double_z': True, 'z_channels': 16, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 4, 8, 8], 'num_res_blocks': 3, 'attn_resolutions': [], 'mid_attn': False, 'dropout': 0.0}}
loss_config = {'target': 'torch.nn.Identity'}
regularizer_config = {'target': 'sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer'}
optimizer_config = None
lr_g_factor = 1.0
ckpt_path = "/raid/.cache/huggingface/models--ZP2HF--CogView3-SAT/snapshots/ca86ce9ba94f9a7f2dd109e7a59e4c8ad04121be/3plus_ae/imagekl_ch16.pt"
ignore_keys = []
kwargs = {"monitor": "val/rec_loss"}
vae_sat = AutoencodingEngine(
    encoder_config=encoder_config,
    decoder_config=decoder_config,
    loss_config=loss_config,
    regularizer_config=regularizer_config,
    optimizer_config=optimizer_config,
    lr_g_factor=lr_g_factor,
    ckpt_path=ckpt_path,
    ignore_keys=ignore_keys,
    **kwargs)

# (2) create vae (diffusers)
ckpt_path_vae_cogview3 = hf_hub_download(repo_id="ZP2HF/CogView3-SAT", subfolder="3plus_ae", filename="imagekl_ch16.pt")
cogview3_ckpt = torch.load(ckpt_path_vae_cogview3, map_location="cpu")["state_dict"]

in_channels = 3  # Inferred from encoder.conv_in.weight shape
out_channels = 3  # Inferred from decoder.conv_out.weight shape
down_block_types = ("DownEncoderBlock2D",) * 4  # Inferred from the presence of 4 encoder.down blocks
up_block_types = ("UpDecoderBlock2D",) * 4  # Inferred from the presence of 4 decoder.up blocks
block_out_channels = (128, 512, 1024, 1024)  # Inferred from the channel sizes in encoder.down blocks
layers_per_block = 3  # Inferred from the number of blocks in each encoder.down and decoder.up
act_fn = "silu"  # This is the default, cannot be inferred from state_dict
latent_channels = 16  # Inferred from decoder.conv_in.weight shape
norm_num_groups = 32  # This is the default, cannot be inferred from state_dict
sample_size = 1024  # This is the default, cannot be inferred from state_dict
scaling_factor = 0.18215  # This is the default, cannot be inferred from state_dict
force_upcast = True  # This is the default, cannot be inferred from state_dict
use_quant_conv = False  # Inferred from the presence of encoder.conv_out
use_post_quant_conv = False  # Inferred from the presence of decoder.conv_in
mid_block_add_attention = False  # Inferred from the absence of attention layers in mid blocks

vae = AutoencoderKL(
    in_channels=in_channels,
    out_channels=out_channels,
    down_block_types=down_block_types,
    up_block_types=up_block_types,
    block_out_channels=block_out_channels,
    layers_per_block=layers_per_block,
    act_fn=act_fn,
    latent_channels=latent_channels,
    norm_num_groups=norm_num_groups,
    sample_size=sample_size,
    scaling_factor=scaling_factor,
    force_upcast=force_upcast,
    use_quant_conv=use_quant_conv,
    use_post_quant_conv=use_post_quant_conv,
    mid_block_add_attention=mid_block_add_attention,
)

vae.eval()
vae_sat.eval()

converted_vae_state_dict = convert_ldm_vae_checkpoint(cogview3_ckpt, vae.config)
vae.load_state_dict(converted_vae_state_dict, strict=False)

# (3) run forward pass for both models

# [2, 16, 128, 128] -> [2, 3, 1024, 1024]
z = torch.load("z.pt").float().to("cpu")

with torch.no_grad():
    print(" ")
    print("running forward pass for diffusers vae")
    out = vae.decode(z).sample
    print(" ")
    print("running forward pass for sgm vae")
    out_sat = vae_sat.decode(z)

print(f"output shape: {out.shape}")
print(f"expected output shape: {out_sat.shape}")
assert out.shape == out_sat.shape
assert (out - out_sat).abs().max() < 1e-4, f"max diff: {(out - out_sat).abs().max()}"
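Most of the AutoencoderKL arguments above are annotated as "inferred from" the checkpoint. A quick way to double-check those inferences is to inspect the state dict directly; the sketch below is a hypothetical helper, and the key names assume the standard LDM/SGM VAE layout that convert_ldm_vae_checkpoint expects.

import torch
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(repo_id="ZP2HF/CogView3-SAT", subfolder="3plus_ae", filename="imagekl_ch16.pt")
sd = torch.load(ckpt_path, map_location="cpu")["state_dict"]

# encoder.conv_in.weight has shape (ch, in_channels, k, k); decoder.conv_in.weight has
# shape (block_in, latent_channels, k, k), so these two prints confirm in_channels and latent_channels.
print("encoder.conv_in.weight:", tuple(sd["encoder.conv_in.weight"].shape))
print("decoder.conv_in.weight:", tuple(sd["decoder.conv_in.weight"].shape))

# Count down blocks and res blocks per block from the key structure.
down_blocks = sorted({k.split(".")[2] for k in sd if k.startswith("encoder.down.")})
res_blocks = sorted({k.split(".")[4] for k in sd if k.startswith("encoder.down.0.block.")})
print("encoder down blocks:", down_blocks)  # 4 blocks -> ("DownEncoderBlock2D",) * 4
print("res blocks in down.0:", res_blocks)  # 3 blocks -> layers_per_block = 3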

show_model_cogview.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
import torch
from diffusers import CogView3PlusTransformer2DModel

model = CogView3PlusTransformer2DModel.from_pretrained("/share/home/zyx/Models/CogView3Plus_hf/transformer", torch_dtype=torch.bfloat16)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

batch_size = 1
hidden_states = torch.ones((batch_size, 16, 256, 256), device=device, dtype=torch.bfloat16)
timestep = torch.full((batch_size,), 999.0, device=device, dtype=torch.bfloat16)
y = torch.ones((batch_size, 1536), device=device, dtype=torch.bfloat16)

# Simulate a call to the forward method
outputs = model(
    hidden_states=hidden_states,  # hidden_states input
    timestep=timestep,  # timestep input
    y=y,  # label input
    block_controlnet_hidden_states=None,  # can be omitted if not needed
    return_dict=True,  # keep the default
    target_size=[(2048, 2048)],
)

# Print the model output
print("Output shape:", outputs.sample.shape)

src/diffusers/models/embeddings.py

Lines changed: 83 additions & 64 deletions
@@ -714,68 +714,68 @@ def forward(self, ids: torch.Tensor) -> torch.Tensor:
        return freqs_cos, freqs_sin


-class CogView3PlusPosEmbed(nn.Module):
-    def __init__(
-        self,
-        max_height: int = 128,
-        max_width: int = 128,
-        hidden_size: int = 2560,
-        text_length: int = 0,
-        block_size: int = 16,
-    ):
-        super().__init__()
-        self.max_height = max_height
-        self.max_width = max_width
-        self.hidden_size = hidden_size
-        self.text_length = text_length
-        self.block_size = block_size
-
-        # Initialize the positional embedding as a non-trainable parameter
-        self.image_pos_embedding = nn.Parameter(
-            torch.zeros(self.max_height, self.max_width, hidden_size), requires_grad=False
-        )
-        # Reinitialize the positional embedding using a sin-cos function
-        self.reinit()
-
-    def forward(self, target_size: List[int]) -> torch.Tensor:
-        ret = []
-        for h, w in target_size:
-            # Scale height and width according to the block size
-            h, w = h // self.block_size, w // self.block_size
-
-            # Reshape the image positional embedding for the target size
-            image_pos_embed = self.image_pos_embedding[:h, :w].reshape(h * w, -1)
-
-            # Combine the text positional embedding and image positional embedding
-            pos_embed = torch.cat(
-                [
-                    torch.zeros(
-                        (self.text_length, self.hidden_size),
-                        dtype=image_pos_embed.dtype,
-                        device=image_pos_embed.device,
-                    ),
-                    image_pos_embed,
-                ],
-                dim=0,
-            )
-
-            ret.append(pos_embed[None, ...])  # Add a batch dimension
-
-        return torch.cat(ret, dim=0)  # Concatenate along the batch dimension
-
-    def reinit(self):
-        # Initialize the positional embedding using the updated 2D sin-cos function
-        grid_size = (self.max_height, self.max_width)
-        pos_embed_np = get_2d_sincos_pos_embed(
-            embed_dim=self.hidden_size,
-            grid_size=grid_size,
-        )
-
-        # Reshape the positional embedding to the desired shape
-        pos_embed_np = pos_embed_np.reshape(self.max_height, self.max_width, self.hidden_size)
-
-        # Copy the positional embedding data
-        self.image_pos_embedding.data.copy_(torch.from_numpy(pos_embed_np).float())
+# class CogView3PlusPosEmbed(nn.Module):
+#     def __init__(
+#         self,
+#         max_height: int = 128,
+#         max_width: int = 128,
+#         hidden_size: int = 2560,
+#         text_length: int = 0,
+#         block_size: int = 16,
+#     ):
+#         super().__init__()
+#         self.max_height = max_height
+#         self.max_width = max_width
+#         self.hidden_size = hidden_size
+#         self.text_length = text_length
+#         self.block_size = block_size
+#
+#         # Initialize the positional embedding as a non-trainable parameter
+#         self.image_pos_embedding = nn.Parameter(
+#             torch.zeros(self.max_height, self.max_width, hidden_size), requires_grad=False
+#         )
+#         # Reinitialize the positional embedding using a sin-cos function
+#         self.reinit()
+#
+#     def forward(self, target_size: List[int]) -> torch.Tensor:
+#         ret = []
+#         for h, w in target_size:
+#             # Scale height and width according to the block size
+#             h, w = h // self.block_size, w // self.block_size
+#
+#             # Reshape the image positional embedding for the target size
+#             image_pos_embed = self.image_pos_embedding[:h, :w].reshape(h * w, -1)
+#
+#             # Combine the text positional embedding and image positional embedding
+#             pos_embed = torch.cat(
+#                 [
+#                     torch.zeros(
+#                         (self.text_length, self.hidden_size),
+#                         dtype=image_pos_embed.dtype,
+#                         device=image_pos_embed.device,
+#                     ),
+#                     image_pos_embed,
+#                 ],
+#                 dim=0,
+#             )
+#
+#             ret.append(pos_embed[None, ...])  # Add a batch dimension
+#
+#         return torch.cat(ret, dim=0)  # Concatenate along the batch dimension
+#
+#     def reinit(self):
+#         # Initialize the positional embedding using the updated 2D sin-cos function
+#         grid_size = (self.max_height, self.max_width)
+#         pos_embed_np = get_2d_sincos_pos_embed(
+#             embed_dim=self.hidden_size,
+#             grid_size=grid_size,
+#         )
+#
+#         # Reshape the positional embedding to the desired shape
+#         pos_embed_np = pos_embed_np.reshape(self.max_height, self.max_width, self.hidden_size)
+#
+#         # Copy the positional embedding data
+#         self.image_pos_embedding.data.copy_(torch.from_numpy(pos_embed_np).float())


class CogView3PlusImagePatchEmbedding(nn.Module):
@@ -809,8 +809,6 @@ def forward(self, images: torch.Tensor, encoder_outputs: torch.Tensor = None) ->
        images = images.view(b, c, h // p1, p1, w // p2, p2)
        patches_images = images.permute(0, 2, 4, 1, 3, 5).contiguous()
        patches_images = patches_images.view(b, (h // p1) * (w // p2), c * p1 * p2)
-
-        # Project the patches
        image_emb = self.proj(patches_images)

        # If text embeddings are provided, project and concatenate them
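The three reshape lines above are the standard patchify step: a (B, C, H, W) latent becomes a (B, (H/p1)*(W/p2), C*p1*p2) token sequence before the linear projection. A small self-contained check of that reshape pattern, with illustrative sizes rather than the model's real ones:

import torch

# Illustrative sizes only: 16 latent channels, an 8x8 latent, patch size 2.
b, c, h, w = 1, 16, 8, 8
p1 = p2 = 2
images = torch.randn(b, c, h, w)

x = images.view(b, c, h // p1, p1, w // p2, p2)
x = x.permute(0, 2, 4, 1, 3, 5).contiguous()
x = x.view(b, (h // p1) * (w // p2), c * p1 * p2)

print(x.shape)  # torch.Size([1, 16, 64]) -> (batch, num_patches, channels * p1 * p2)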
@@ -1135,6 +1133,27 @@ def forward(self, image_embeds: torch.Tensor):
        return self.norm(x)


+class CogView3CombineTimestepLabelEmbedding(nn.Module):
+    def __init__(self, time_embed_dim, label_embed_dim, in_channels=2560):
+        super().__init__()
+
+        self.time_proj = Timesteps(num_channels=in_channels, flip_sin_to_cos=True, downscale_freq_shift=1)
+        self.timestep_embedder = TimestepEmbedding(in_channels=in_channels, time_embed_dim=time_embed_dim)
+        self.label_embedder = nn.Sequential(
+            nn.Linear(label_embed_dim, time_embed_dim),
+            nn.SiLU(),
+            nn.Linear(time_embed_dim, time_embed_dim),
+        )
+
+    def forward(self, timestep, class_labels, hidden_dtype=None):
+        t_proj = self.time_proj(timestep)
+        t_emb = self.timestep_embedder(t_proj.to(dtype=hidden_dtype))
+        label_emb = self.label_embedder(class_labels)
+        emb = t_emb + label_emb
+
+        return emb
+
+
class CombinedTimestepLabelEmbeddings(nn.Module):
    def __init__(self, num_classes, embedding_dim, class_dropout_prob=0.1):
        super().__init__()
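For reference, a minimal usage sketch of the new CogView3CombineTimestepLabelEmbedding. The dimensions below are illustrative, not the actual CogView3Plus configuration, and the import assumes the class is exposed from diffusers.models.embeddings once this commit lands.

import torch
from diffusers.models.embeddings import CogView3CombineTimestepLabelEmbedding  # assumed import path

# Illustrative dimensions; the real model config may differ.
emb_module = CogView3CombineTimestepLabelEmbedding(time_embed_dim=512, label_embed_dim=1536, in_channels=256)

timestep = torch.full((2,), 999.0)  # one timestep per batch element
y = torch.randn(2, 1536)            # pooled label / text embedding
emb = emb_module(timestep, y, hidden_dtype=torch.float32)

print(emb.shape)  # torch.Size([2, 512]) -> sum of timestep and label embeddings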
