src/diffusers/models/transformers/transformer_cogview3plus.py (21 additions, 16 deletions)
```diff
@@ -140,20 +140,22 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin):
         time_embed_dim (`int`, defaults to `512`):
             Output dimension of timestep embeddings.
         condition_dim (`int`, defaults to `256`):
-            The embedding dimension of the input SDXL-style resolution conditions (original_size, target_size, crop_coords).
+            The embedding dimension of the input SDXL-style resolution conditions (original_size, target_size,
+            crop_coords).
         pooled_projection_dim (`int`, defaults to `1536`):
-            The overall pooled dimension by concatenating SDXL-style resolution conditions. As 3 additional conditions are
-            used (original_size, target_size, crop_coords), and each is a sinusoidal condition of dimension `2 * condition_dim`,
-            we get the pooled projection dimension as `2 * condition_dim * 3 => 1536`. The timestep embeddings will be projected
-            to this dimension as well.
-            TODO(yiyi): Do we need this parameter based on the above explanation?
+            The overall pooled dimension by concatenating SDXL-style resolution conditions. As 3 additional conditions
+            are used (original_size, target_size, crop_coords), and each is a sinusoidal condition of dimension `2 *
+            condition_dim`, we get the pooled projection dimension as `2 * condition_dim * 3 => 1536`. The timestep
+            embeddings will be projected to this dimension as well. TODO(yiyi): Do we need this parameter based on the
+            above explanation?
         pos_embed_max_size (`int`, defaults to `128`):
-            The maximum resolution of the positional embeddings, from which slices of shape `H x W` are taken and added to input
-            patched latents, where `H` and `W` are the latent height and width respectively. A value of 128 means that the maximum
-            supported height and width for image generation is `128 * vae_scale_factor * patch_size => 128 * 8 * 2 => 2048`.
+            The maximum resolution of the positional embeddings, from which slices of shape `H x W` are taken and added
+            to input patched latents, where `H` and `W` are the latent height and width respectively. A value of 128
+            means that the maximum supported height and width for image generation is `128 * vae_scale_factor *
+            patch_size => 128 * 8 * 2 => 2048`.
         sample_size (`int`, defaults to `128`):
-            The base resolution of input latents. If height/width is not provided during generation, this value is used to determine
-            the resolution as `sample_size * vae_scale_factor => 128 * 8 => 1024`
+            The base resolution of input latents. If height/width is not provided during generation, this value is used
+            to determine the resolution as `sample_size * vae_scale_factor => 128 * 8 => 1024`
     """

     _supports_gradient_checkpointing = True
```
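The dimension arithmetic described in the docstring above can be sanity-checked with a short, framework-free sketch. The variable names below mirror the config arguments from the docstring; the script is illustrative only and is not code from the file:

```python
# Sanity-check of the dimension arithmetic stated in the docstring.

condition_dim = 256   # embedding dim per SDXL-style resolution condition
num_conditions = 3    # original_size, target_size, crop_coords

# Each condition is a sinusoidal embedding of dimension 2 * condition_dim,
# so concatenating all three yields the pooled projection dimension.
pooled_projection_dim = 2 * condition_dim * num_conditions
assert pooled_projection_dim == 1536

# Maximum supported generation resolution from pos_embed_max_size.
pos_embed_max_size = 128
vae_scale_factor = 8
patch_size = 2
max_resolution = pos_embed_max_size * vae_scale_factor * patch_size
assert max_resolution == 2048

# Default resolution when height/width are not provided.
sample_size = 128
default_resolution = sample_size * vae_scale_factor
assert default_resolution == 1024
```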
```diff
@@ -336,16 +338,19 @@ def forward(
             hidden_states (`torch.Tensor`):
                 Input `hidden_states` of shape `(batch size, channel, height, width)`.
             encoder_hidden_states (`torch.Tensor`):
-                Conditional embeddings (embeddings computed from the input conditions such as prompts)
-                of shape `(batch_size, sequence_len, text_embed_dim)`
+                Conditional embeddings (embeddings computed from the input conditions such as prompts) of shape
+                `(batch_size, sequence_len, text_embed_dim)`
             timestep (`torch.LongTensor`):
                 Used to indicate denoising step.
             original_size (`torch.Tensor`):
-                CogView3 uses SDXL-like micro-conditioning for original image size as explained in section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+                CogView3 uses SDXL-like micro-conditioning for original image size as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
             target_size (`torch.Tensor`):
-                CogView3 uses SDXL-like micro-conditioning for target image size as explained in section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+                CogView3 uses SDXL-like micro-conditioning for target image size as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
             crop_coords (`torch.Tensor`):
-                CogView3 uses SDXL-like micro-conditioning for crop coordinates as explained in section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+                CogView3 uses SDXL-like micro-conditioning for crop coordinates as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
```
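As a rough illustration of the SDXL-style micro-conditioning these docstrings describe, here is a simplified, framework-free sketch of how three two-valued conditions could each be sinusoidally embedded to `2 * condition_dim` and concatenated into the 1536-dim pooled vector. The function names and the exact frequency schedule are assumptions for illustration; the real implementation lives in diffusers' embedding modules and differs in detail:

```python
import math

def sinusoidal_embedding(value, dim):
    # Standard sin/cos embedding of a single scalar into `dim` values
    # (half sines, half cosines) over a geometric frequency ladder.
    half = dim // 2
    freqs = [math.exp(-math.log(10000.0) * i / half) for i in range(half)]
    return [math.sin(value * f) for f in freqs] + [math.cos(value * f) for f in freqs]

condition_dim = 256  # per the class docstring's default

def embed_condition(pair):
    # Each SDXL-style condition is a pair (e.g. height/width or y/x crop
    # coordinates); embedding each scalar to `condition_dim` values gives a
    # `2 * condition_dim`-dim vector per condition.
    return [x for v in pair for x in sinusoidal_embedding(v, condition_dim)]

original_size, target_size, crop_coords = (1024, 1024), (1024, 1024), (0, 0)
pooled = (
    embed_condition(original_size)
    + embed_condition(target_size)
    + embed_condition(crop_coords)
)
# Three conditions of dimension 2 * condition_dim concatenate to 1536.
assert len(pooled) == 2 * condition_dim * 3 == 1536
```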