Skip to content

Commit 0b80dba

Browse files
adaptation for CogVideoX1.5 (#92)
* adaptation for CogVideoX1.5 * add patch_size_t in full finetuning of T2V and lora finetuning of I2V * Update training/args.py Co-authored-by: Sayak Paul <[email protected]> --------- Co-authored-by: Sayak Paul <[email protected]>
1 parent d63a826 commit 0b80dba

File tree

5 files changed

+26
-7
lines changed

5 files changed

+26
-7
lines changed

training/args.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ def _get_dataset_args(parser: argparse.ArgumentParser) -> None:
7878
nargs="+",
7979
type=int,
8080
default=[49],
81+
help="CogVideoX1.5 needs to guarantee that ((num_frames - 1) // self.vae_scale_factor_temporal + 1) % patch_size_t == 0, e.g. 53"
8182
)
8283
parser.add_argument(
8384
"--load_tensors",

training/cogvideox_image_to_video_lora.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -787,6 +787,7 @@ def load_model_hook(models, input_dir):
787787
num_frames=num_frames,
788788
vae_scale_factor_spatial=VAE_SCALE_FACTOR_SPATIAL,
789789
patch_size=model_config.patch_size,
790+
patch_size_t=model_config.patch_size_t if hasattr(model_config, "patch_size_t") else None,
790791
attention_head_dim=model_config.attention_head_dim,
791792
device=accelerator.device,
792793
)

training/cogvideox_text_to_video_lora.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,7 @@ def load_model_hook(models, input_dir):
696696
num_frames=num_frames,
697697
vae_scale_factor_spatial=VAE_SCALE_FACTOR_SPATIAL,
698698
patch_size=model_config.patch_size,
699+
patch_size_t=model_config.patch_size_t if hasattr(model_config, "patch_size_t") else None,
699700
attention_head_dim=model_config.attention_head_dim,
700701
device=accelerator.device,
701702
)

training/cogvideox_text_to_video_sft.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -662,6 +662,7 @@ def load_model_hook(models, input_dir):
662662
num_frames=num_frames,
663663
vae_scale_factor_spatial=VAE_SCALE_FACTOR_SPATIAL,
664664
patch_size=model_config.patch_size,
665+
patch_size_t=model_config.patch_size_t if hasattr(model_config, "patch_size_t") else None,
665666
attention_head_dim=model_config.attention_head_dim,
666667
device=accelerator.device,
667668
)

training/utils.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ def prepare_rotary_positional_embeddings(
198198
num_frames: int,
199199
vae_scale_factor_spatial: int = 8,
200200
patch_size: int = 2,
201+
patch_size_t: int = None,
201202
attention_head_dim: int = 64,
202203
device: Optional[torch.device] = None,
203204
base_height: int = 480,
@@ -207,14 +208,28 @@ def prepare_rotary_positional_embeddings(
207208
grid_width = width // (vae_scale_factor_spatial * patch_size)
208209
base_size_width = base_width // (vae_scale_factor_spatial * patch_size)
209210
base_size_height = base_height // (vae_scale_factor_spatial * patch_size)
211+
if patch_size_t is None:
212+
# CogVideoX 1.0
213+
grid_crops_coords = get_resize_crop_region_for_grid((grid_height, grid_width), base_size_width, base_size_height)
214+
freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
215+
embed_dim=attention_head_dim,
216+
crops_coords=grid_crops_coords,
217+
grid_size=(grid_height, grid_width),
218+
temporal_size=num_frames,
219+
)
220+
else:
221+
# CogVideoX 1.5
222+
base_num_frames = (num_frames + patch_size_t - 1) // patch_size_t
223+
224+
freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
225+
embed_dim=attention_head_dim,
226+
crops_coords=None,
227+
grid_size=(grid_height, grid_width),
228+
temporal_size=base_num_frames,
229+
grid_type="slice",
230+
max_size=(base_size_height, base_size_width),
231+
)
210232

211-
grid_crops_coords = get_resize_crop_region_for_grid((grid_height, grid_width), base_size_width, base_size_height)
212-
freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
213-
embed_dim=attention_head_dim,
214-
crops_coords=grid_crops_coords,
215-
grid_size=(grid_height, grid_width),
216-
temporal_size=num_frames,
217-
)
218233

219234
freqs_cos = freqs_cos.to(device=device)
220235
freqs_sin = freqs_sin.to(device=device)

0 commit comments

Comments
 (0)