Add comments for 10-second generation settings and update inference defaults

zRzRzRzRzRzRzR · zRzRzRzRzRzRzR · commit d8ee01384225 · 2024-11-08T22:31:39.000+08:00
diff --git a/sat/configs/cogvideox1.5_5b.yaml b/sat/configs/cogvideox1.5_5b.yaml
@@ -23,7 +23,7 @@ model:
     params:
       time_embed_dim: 512
       elementwise_affine: True
-      num_frames: 81
+      num_frames: 81  # 81 for 5 seconds, 161 for 10 seconds
       time_compressed_rate: 4
       latent_width: 300
       latent_height: 300
diff --git a/sat/configs/cogvideox1.5_5b_i2v.yaml b/sat/configs/cogvideox1.5_5b_i2v.yaml
@@ -25,11 +25,10 @@ model:
   network_config:
     target: dit_video_concat.DiffusionTransformer
     params:
-#      space_interpolation: 1.875
       ofs_embed_dim: 512
       time_embed_dim: 512
       elementwise_affine: True
-      num_frames: 81
+      num_frames: 81  # 81 for 5 seconds, 161 for 10 seconds
       time_compressed_rate: 4
       latent_width: 300
       latent_height: 300
diff --git a/sat/configs/inference.yaml b/sat/configs/inference.yaml
@@ -1,16 +1,14 @@
 args:
-  image2video: False # True for image2video, False for text2video
+#  image2video: True  # True for image2video, False for text2video
   latent_channels: 16
   mode: inference
   load: "{your CogVideoX SAT folder}/transformer" # This is for Full model without lora adapter
-  # load: "{your lora folder} such as zRzRzRzRzRzRzR/lora-disney-08-20-13-28" # This is for Full model without lora adapter
   batch_size: 1
   input_type: txt
   input_file: configs/test.txt
-  sampling_image_size: [480, 720]
-  sampling_num_frames: 13  # Must be 13, 11 or 9
-  sampling_fps: 8
-#  fp16: True # For CogVideoX-2B
-  bf16: True # For CogVideoX-5B and CoGVideoX-5B-I2V
-  output_dir: outputs/
+  sampling_image_size: [768, 1360]  # Remove this line for image-to-video (I2V)
+  sampling_num_frames: 22  # 22 for 5 seconds, 42 for 10 seconds
+  sampling_fps: 16
+  bf16: True
+  output_dir: outputs
   force_inference: True