Skip to content

Commit 67be511

Browse files
CaitinZhaozhaoting
andauthored
[cogvideox] add more transformer config (mindspore-lab#925)
* [cogvideox] add more transformer config * add i2v transformer config support * add performance * update readme --------- Co-authored-by: zhaoting <zhaoting23@huawei.com>
1 parent 0fb111c commit 67be511

File tree

7 files changed

+174
-19
lines changed

7 files changed

+174
-19
lines changed

examples/diffusers/cogvideox_factory/README.md

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -356,8 +356,38 @@ NODE_RANK="0"
356356
done
357357
```
358358

359-
要了解不同参数的含义,你可以查看 [args](./scripts/args.py) 文件,或者使用 `--help` 运行训练脚本。
359+
> [!TIP]
360+
> 如果想修改transformer的模型结构,可以设置`--transformer_config`。比如修改成30B的模型,可以设置`--transformer_config=configs/cogvideox1.5_30B.yaml`;
361+
> 当配置了`transformer_config`,可以配置`--transformer_ckpt_path`加载checkpoint权重。
362+
363+
要了解更多参数的含义,你可以查看 [args](./scripts/args.py) 文件,或者使用 `--help` 运行训练脚本。
364+
365+
## 性能数据
366+
367+
### 训练
368+
369+
| model | cards | DP | SP | zero | vae cache | video shape | precision | jit level | s/step | memory usage |
370+
|:-----------------:|:-----:|:--:|:--:|:-----:|:---------:|:-----------:|:---------:|:---------:|:------:|:------------:|
371+
| CogvideoX 1.5 T2V 5B | 8 | 8 | 1 | zero3 | ON | 1x77x768x1360 | bf16 | O1 | 39.23 | 35.6 GB |
372+
| CogvideoX 1.5 T2V 5B | 8 | 4 | 2 | zero3 | ON | 1x77x768x1360 | bf16 | O1 | 20.9 | 19.9 GB |
373+
| CogvideoX 1.5 T2V 5B | 8 | 2 | 4 | zero3 | ON | 1x77x768x1360 | bf16 | O1 | 10.1 | 14.6 GB |
374+
| CogvideoX 1.5 T2V 5B | 8 | 1 | 8 | zero3 | ON | 1x77x768x1360 | bf16 | O1 | 5.16 | 8.2 GB |
375+
| CogvideoX 1.5 T2V 5B | 16 | 2 | 8 | zero3 | ON | 1x77x768x1360 | bf16 | O1 | 5.24 | 6.3 GB |
376+
| CogvideoX 1.5 T2V 5B | 8 | 8 | 1 | zero3 | OFF | 1x77x768x1360 | bf16 | O1 | 49 | 40 GB |
377+
| CogvideoX 1.5 T2V 5B | 8 | 1 | 8 | zero3 | OFF | 1x77x768x1360 | bf16 | O1 | 10.58 | 9.3 GB |
378+
| CogvideoX 1.5 T2V 10B | 8 | 2 | 4 | zero3 | ON | 1x77x768x1360 | bf16 | O1 | 15.2 | 25.6 GB |
379+
| CogvideoX 1.5 T2V 20B | 8 | 2 | 4 | zero3 | ON | 1x77x768x1360 | bf16 | O1 | 20.1 | 35.7 GB |
380+
| CogvideoX 1.5 T2V 30B | 8 | 2 | 4 | zero3 | ON | 1x77x768x1360 | bf16 | O1 | 26.5 | 47.3 GB |
381+
382+
以上数据在Disney数据集,910*上获得。
383+
384+
### 推理
385+
386+
| model | cards | DP | SP | zero | video shape | precision | jit level | s/step | total cost |
387+
|:-----------------:|:-----:|:--:|:--:|:-----:|:-------------:|:---------:|:---------:|:------:|:----------:|
388+
| CogvideoX 1.5 T2V 5B | 8 | 1 | 8 | zero3 | 1x77x768x1360 | bf16 | O1 | 3.21 | ~ 5min |
360389

390+
以上数据在910*上获得。
361391

362392
## 与原仓的差异&功能限制
363393

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
transformer:
2+
"activation_fn": "gelu-approximate"
3+
"attention_bias": True
4+
"attention_head_dim": 96
5+
"dropout": 0.0
6+
"flip_sin_to_cos": True
7+
"freq_shift": 0
8+
"in_channels": 16
9+
"max_text_seq_length": 226
10+
"norm_elementwise_affine": True
11+
"norm_eps": 1e-05
12+
"num_attention_heads": 48
13+
"num_layers": 48
14+
"out_channels": 16
15+
"patch_bias": False
16+
"patch_size": 2
17+
"patch_size_t": 2
18+
"sample_frames": 81
19+
"sample_height": 96
20+
"sample_width": 170
21+
"spatial_interpolation_scale": 1.875
22+
"temporal_compression_ratio": 4
23+
"temporal_interpolation_scale": 1.0
24+
"text_embed_dim": 4096
25+
"time_embed_dim": 512
26+
"timestep_activation_fn": "silu"
27+
"use_learned_positional_embeddings": False
28+
"use_rotary_positional_embeddings": True
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
transformer:
2+
"activation_fn": "gelu-approximate"
3+
"attention_bias": True
4+
"attention_head_dim": 128
5+
"dropout": 0.0
6+
"flip_sin_to_cos": True
7+
"freq_shift": 0
8+
"in_channels": 16
9+
"max_text_seq_length": 226
10+
"norm_elementwise_affine": True
11+
"norm_eps": 1e-05
12+
"num_attention_heads": 48
13+
"num_layers": 48
14+
"out_channels": 16
15+
"patch_bias": False
16+
"patch_size": 2
17+
"patch_size_t": 2
18+
"sample_frames": 81
19+
"sample_height": 96
20+
"sample_width": 170
21+
"spatial_interpolation_scale": 1.875
22+
"temporal_compression_ratio": 4
23+
"temporal_interpolation_scale": 1.0
24+
"text_embed_dim": 4096
25+
"time_embed_dim": 512
26+
"timestep_activation_fn": "silu"
27+
"use_learned_positional_embeddings": False
28+
"use_rotary_positional_embeddings": True
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
transformer:
2+
"activation_fn": "gelu-approximate"
3+
"attention_bias": True
4+
"attention_head_dim": 128
5+
"dropout": 0.0
6+
"flip_sin_to_cos": True
7+
"freq_shift": 0
8+
"in_channels": 16
9+
"max_text_seq_length": 226
10+
"norm_elementwise_affine": True
11+
"norm_eps": 1e-05
12+
"num_attention_heads": 48
13+
"num_layers": 64
14+
"out_channels": 16
15+
"patch_bias": False
16+
"patch_size": 2
17+
"patch_size_t": 2
18+
"sample_frames": 81
19+
"sample_height": 96
20+
"sample_width": 170
21+
"spatial_interpolation_scale": 1.875
22+
"temporal_compression_ratio": 4
23+
"temporal_interpolation_scale": 1.0
24+
"text_embed_dim": 4096
25+
"time_embed_dim": 512
26+
"timestep_activation_fn": "silu"
27+
"use_learned_positional_embeddings": False
28+
"use_rotary_positional_embeddings": True

examples/diffusers/cogvideox_factory/scripts/args.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,18 @@ def _get_model_args(parser: argparse.ArgumentParser) -> None:
1717
required=True,
1818
help="Path to pretrained model or model identifier from huggingface.co/models.",
1919
)
20+
parser.add_argument(
21+
"--transformer_config",
22+
type=str,
23+
default=None,
24+
help="Config of the transformer. If set, the transformer config from pretrained_model_name_or_path is not used.",
25+
)
26+
parser.add_argument(
27+
"--transformer_ckpt_path",
28+
type=str,
29+
default=None,
30+
help="Path to the transformer checkpoint. Only effective when transformer_config is set.",
31+
)
2032
parser.add_argument(
2133
"--revision",
2234
type=str,

examples/diffusers/cogvideox_factory/scripts/cogvideox_image_to_video_sft.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -171,15 +171,29 @@ def main(args):
171171
# CogVideoX-2b weights are stored in float16
172172
# CogVideoX-5b and CogVideoX-5b-I2V weights are stored in bfloat16
173173
# load_dtype = ms.bfloat16 if "5b" in args.pretrained_model_name_or_path.lower() else ms.float16
174-
transformer = CogVideoXTransformer3DModel_SP.from_pretrained(
175-
args.pretrained_model_name_or_path,
176-
subfolder="transformer",
177-
mindspore_dtype=weight_dtype,
178-
revision=args.revision,
179-
variant=args.variant,
180-
max_text_seq_length=args.max_sequence_length,
181-
enable_sequence_parallelism=enable_sequence_parallelism,
182-
)
174+
if args.transformer_config is None:
175+
transformer = CogVideoXTransformer3DModel_SP.from_pretrained(
176+
args.pretrained_model_name_or_path,
177+
subfolder="transformer",
178+
mindspore_dtype=weight_dtype,
179+
revision=args.revision,
180+
variant=args.variant,
181+
max_text_seq_length=args.max_sequence_length,
182+
enable_sequence_parallelism=enable_sequence_parallelism,
183+
)
184+
elif os.path.exists(args.transformer_config):
185+
with open(args.transformer_config) as f:
186+
config = yaml.safe_load(f)["transformer"]
187+
config["max_text_seq_length"] = args.max_sequence_length
188+
config["enable_sequence_parallelism"] = enable_sequence_parallelism
189+
transformer = CogVideoXTransformer3DModel_SP(**config)
190+
logger.info(f"Build transformer model from {args.transformer_config}")
191+
if os.path.exists(args.transformer_ckpt_path):
192+
ms.load_checkpoint(args.transformer_ckpt_path, transformer)
193+
logger.info(f"Load transformer checkpoint from {args.transformer_ckpt_path}")
194+
195+
else:
196+
raise ValueError(f"transformer_config: {args.transformer_config} does not exist!")
183197
transformer.fa_checkpointing = args.fa_gradient_checkpointing
184198

185199
text_encoder, vae = None, None

examples/diffusers/cogvideox_factory/scripts/cogvideox_text_to_video_sft.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -168,15 +168,30 @@ def main(args):
168168
# CogVideoX-2b weights are stored in float16
169169
# CogVideoX-5b and CogVideoX-5b-I2V weights are stored in bfloat16
170170
# load_dtype = ms.bfloat16 if "5b" in args.pretrained_model_name_or_path.lower() else ms.float16
171-
transformer = CogVideoXTransformer3DModel_SP.from_pretrained(
172-
args.pretrained_model_name_or_path,
173-
subfolder="transformer",
174-
mindspore_dtype=weight_dtype,
175-
revision=args.revision,
176-
variant=args.variant,
177-
max_text_seq_length=args.max_sequence_length,
178-
enable_sequence_parallelism=enable_sequence_parallelism,
179-
)
171+
if args.transformer_config is None:
172+
transformer = CogVideoXTransformer3DModel_SP.from_pretrained(
173+
args.pretrained_model_name_or_path,
174+
subfolder="transformer",
175+
mindspore_dtype=weight_dtype,
176+
revision=args.revision,
177+
variant=args.variant,
178+
max_text_seq_length=args.max_sequence_length,
179+
enable_sequence_parallelism=enable_sequence_parallelism,
180+
)
181+
elif os.path.exists(args.transformer_config):
182+
with open(args.transformer_config) as f:
183+
config = yaml.safe_load(f)["transformer"]
184+
config["max_text_seq_length"] = args.max_sequence_length
185+
config["enable_sequence_parallelism"] = enable_sequence_parallelism
186+
transformer = CogVideoXTransformer3DModel_SP(**config)
187+
logger.info(f"Build transformer model from {args.transformer_config}")
188+
if os.path.exists(args.transformer_ckpt_path):
189+
ms.load_checkpoint(args.transformer_ckpt_path, transformer)
190+
logger.info(f"Load transformer checkpoint from {args.transformer_ckpt_path}")
191+
192+
else:
193+
raise ValueError(f"transformer_config: {args.transformer_config} does not exist!")
194+
180195
transformer.fa_checkpointing = args.fa_gradient_checkpointing
181196

182197
text_encoder, vae = None, None

0 commit comments

Comments
 (0)