CogVideoX 1.5 #9877
Changes from 11 commits
@@ -80,6 +80,8 @@ def replace_up_keys_inplace(key: str, state_dict: Dict[str, Any]):
     "post_attn1_layernorm": "norm2.norm",
     "time_embed.0": "time_embedding.linear_1",
     "time_embed.2": "time_embedding.linear_2",
+    "ofs_embed.0": "ofs_embedding.linear_1",
+    "ofs_embed.2": "ofs_embedding.linear_2",
     "mixins.patch_embed": "patch_embed",
     "mixins.final_layer.norm_final": "norm_out.norm",
     "mixins.final_layer.linear": "proj_out",

@@ -140,6 +142,7 @@ def convert_transformer(
     use_rotary_positional_embeddings: bool,
     i2v: bool,
     dtype: torch.dtype,
+    init_kwargs: Dict[str, Any],
 ):
     PREFIX_KEY = "model.diffusion_model."

@@ -149,7 +152,9 @@ def convert_transformer(
         num_layers=num_layers,
         num_attention_heads=num_attention_heads,
         use_rotary_positional_embeddings=use_rotary_positional_embeddings,
-        use_learned_positional_embeddings=i2v,
+        ofs_embed_dim=512 if (i2v and init_kwargs["patch_size_t"] is not None) else None,  # CogVideoX1.5-5B-I2V
+        use_learned_positional_embeddings=i2v and init_kwargs["patch_size_t"] is None,  # CogVideoX-5B-I2V
+        **init_kwargs,
     ).to(dtype=dtype)

     for key in list(original_state_dict.keys()):

@@ -163,6 +168,7 @@ def convert_transformer(
         if special_key not in key:
             continue
         handler_fn_inplace(key, original_state_dict)

     transformer.load_state_dict(original_state_dict, strict=True)
     return transformer

@@ -187,6 +193,34 @@ def convert_vae(ckpt_path: str, scaling_factor: float, dtype: torch.dtype):
     return vae


+def get_init_kwargs(version: str):
+    if version == "1.0":
+        vae_scale_factor_spatial = 8
+        init_kwargs = {
+            "patch_size": 2,
+            "patch_size_t": None,
+            "patch_bias": True,
+            "sample_height": 480 // vae_scale_factor_spatial,
+            "sample_width": 720 // vae_scale_factor_spatial,
+            "sample_frames": 49,
+        }
+
+    elif version == "1.5":
+        vae_scale_factor_spatial = 8
+        init_kwargs = {
+            "patch_size": 2,
+            "patch_size_t": 2,
+            "patch_bias": False,
+            "sample_height": 768 // vae_scale_factor_spatial,
+            "sample_width": 1360 // vae_scale_factor_spatial,
+            "sample_frames": 81,  # TODO: Need Test with 161 for 10 seconds
Suggested change:
-            "sample_frames": 81,  # TODO: Need Test with 161 for 10 seconds
+            "sample_frames": 81,
This value only determines the default number of frames for sampling, so no modification is needed here (changing it would only affect the config.json of the converted transformer model). Users can still specify 161 frames in the pipeline call and generation will remain compatible without any changes here.
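As a user-side illustration (not part of this PR), here is a minimal sketch of overriding the default at call time; the checkpoint id below is an assumption and should be replaced with the actual converted CogVideoX 1.5 repository:

```python
import torch
from diffusers import CogVideoXPipeline

# Hypothetical repo id for a converted CogVideoX 1.5 checkpoint; substitute the real one.
pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX1.5-5B", torch_dtype=torch.bfloat16)
pipe.to("cuda")

# `sample_frames` in the transformer config only provides the default; passing
# `num_frames` explicitly still lets users generate 161 frames (~10 seconds).
video = pipe(
    prompt="A panda playing a guitar in a bamboo forest",
    num_frames=161,
    num_inference_steps=50,
).frames[0]
```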
@zRzRzRzRzRzRzR This is a bit of a breaking change. The SAT VAE is in fp32, but the diffusers-format VAE is in bf16/fp16. This can lead to poorer quality, so it is best to just keep the VAE in fp32 and let users decide what configuration to use. I will open a PR with the updated VAE weights to the other CogVideoX model weight repositories soon.
cc @yiyixuxu @DN6 The VAE quality doesn't take too much of a hit, but it is best to have the default in FP32 and update all existing checkpoints. Apologies that this slipped through earlier, but I definitely notice very minor differences in quality (at least in training, cc @sayakpaul). The transformer weights don't use variants because there are no FP32 weights, since training is done in BF16.
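To make the intent concrete, a small sketch of requesting the VAE in full precision independently of the rest of the pipeline; the repo id is only an example:

```python
import torch
from diffusers import AutoencoderKLCogVideoX

# Example repo id only; the point is that the VAE can be loaded in fp32
# regardless of the precision used for the transformer or text encoder.
vae = AutoencoderKLCogVideoX.from_pretrained(
    "THUDM/CogVideoX-5b", subfolder="vae", torch_dtype=torch.float32
)
print(next(vae.parameters()).dtype)  # torch.float32
```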
Due to the explanation above, we shouldn't typecast all weights in the pipeline to a single dtype. The VAE is best kept in FP32, the text encoder could be saved in FP32 but also works well at lower precisions, and the transformer is in BF16 (or FP16 for CogVideoX-2B text-to-video).
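For example (a sketch, assuming the standard diffusers component layout and an example repo id), the pipeline can be assembled with a different precision per component:

```python
import torch
from diffusers import AutoencoderKLCogVideoX, CogVideoXPipeline, CogVideoXTransformer3DModel

repo_id = "THUDM/CogVideoX-5b"  # example id; any CogVideoX checkpoint with the same layout

# Per-component precision: VAE in FP32, transformer in BF16 (FP16 for CogVideoX-2B),
# and the remaining components (text encoder, scheduler) loaded with the pipeline default.
vae = AutoencoderKLCogVideoX.from_pretrained(repo_id, subfolder="vae", torch_dtype=torch.float32)
transformer = CogVideoXTransformer3DModel.from_pretrained(
    repo_id, subfolder="transformer", torch_dtype=torch.bfloat16
)
pipe = CogVideoXPipeline.from_pretrained(
    repo_id, vae=vae, transformer=transformer, torch_dtype=torch.bfloat16
)
```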
Understood, this is the right thing to do.