The script supports different types of video generation, including text-to-video (t2v), image-to-video (i2v),
and video-to-video (v2v), depending on the input data and the model weights.

- - text-to-video: THUDM/CogVideoX-5b or THUDM/CogVideoX-2b
- - video-to-video: THUDM/CogVideoX-5b or THUDM/CogVideoX-2b
- - image-to-video: THUDM/CogVideoX-5b-I2V
+ - text-to-video: THUDM/CogVideoX-5b, THUDM/CogVideoX-2b or THUDM/CogVideoX1.5-5b
+ - video-to-video: THUDM/CogVideoX-5b, THUDM/CogVideoX-2b or THUDM/CogVideoX1.5-5b
+ - image-to-video: THUDM/CogVideoX-5b-I2V or THUDM/CogVideoX1.5-5b-I2V

Running the Script:
To run the script, use the following command with appropriate arguments:

```bash
- $ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX-5b --generate_type "t2v"
+ $ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v"
```

Additional options are available to specify the model path, guidance scale, number of inference steps, video generation type, and output paths.
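For example, an image-to-video run with the 1.5 weights might look like this (the image path is illustrative):

```bash
$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b-I2V --generate_type "i2v" --image_or_video_path "input.jpg"
```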
import torch
from diffusers import (
    CogVideoXPipeline,
-     CogVideoXDDIMScheduler,
    CogVideoXDPMScheduler,
    CogVideoXImageToVideoPipeline,
    CogVideoXVideoToVideoPipeline,
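The `CogVideoXDDIMScheduler` import goes away because the script now drives everything through the DPM scheduler. The scheduler setup itself is not shown in this hunk; it presumably follows the standard `diffusers` pattern, sketched here:

```python
# Sketch: replace the pipeline's default scheduler with the DPM variant.
# "trailing" timestep spacing is the setting commonly used with CogVideoX;
# the exact arguments in the script may differ.
pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
```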
@@ -37,6 +36,7 @@ def generate_video(
    model_path: str,
    lora_path: str = None,
    lora_rank: int = 128,
+     num_frames: int = 81,
    output_path: str = "./output.mp4",
    image_or_video_path: str = "",
    num_inference_steps: int = 50,
@@ -45,6 +45,7 @@ def generate_video(
    dtype: torch.dtype = torch.bfloat16,
    generate_type: str = Literal["t2v", "i2v", "v2v"],  # i2v: image to video, v2v: video to video
    seed: int = 42,
+     fps: int = 8,
):
    """
    Generates a video based on the given prompt and saves it to the specified path.
@@ -56,11 +57,13 @@ def generate_video(
    - lora_rank (int): The rank of the LoRA weights.
    - output_path (str): The path where the generated video will be saved.
    - num_inference_steps (int): Number of steps for the inference process. More steps can result in better quality.
+     - num_frames (int): Number of frames to generate.
    - guidance_scale (float): The scale for classifier-free guidance. Higher values can lead to better alignment with the prompt.
    - num_videos_per_prompt (int): Number of videos to generate per prompt.
    - dtype (torch.dtype): The data type for computation (default is torch.bfloat16).
    - generate_type (str): The type of video generation (e.g., 't2v', 'i2v', 'v2v').
    - seed (int): The seed for reproducibility.
+     - fps (int): The frames per second for the generated video.
    """

    # 1. Load the pre-trained CogVideoX pipeline with the specified precision (bfloat16).
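The body of this loading step is collapsed in the diff. A minimal sketch of the branch-per-type pattern it implies, assuming the `load_image`/`load_video` helpers from `diffusers.utils` (the actual file may differ in detail):

```python
from diffusers.utils import load_image, load_video

if generate_type == "i2v":
    pipe = CogVideoXImageToVideoPipeline.from_pretrained(model_path, torch_dtype=dtype)
    image = load_image(image_or_video_path)  # conditioning image for i2v
elif generate_type == "t2v":
    pipe = CogVideoXPipeline.from_pretrained(model_path, torch_dtype=dtype)
else:  # "v2v"
    pipe = CogVideoXVideoToVideoPipeline.from_pretrained(model_path, torch_dtype=dtype)
    video = load_video(image_or_video_path)  # source clip for v2v
```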
@@ -109,11 +112,11 @@ def generate_video(
    if generate_type == "i2v":
        video_generate = pipe(
            prompt=prompt,
-             image=image,  # The path of the image to be used as the background of the video
+             image=image,  # The input image; for CogVideoX1.5-5B-I2V the video resolution matches the image, otherwise it is 720 * 480
            num_videos_per_prompt=num_videos_per_prompt,  # Number of videos to generate per prompt
            num_inference_steps=num_inference_steps,  # Number of inference steps
-             num_frames=49,  # Number of frames to generate, changed to 49 for diffusers version `0.30.3` and after.
-             use_dynamic_cfg=True,  # This id used for DPM Sechduler, for DDIM scheduler, it should be False
+             num_frames=num_frames,  # Number of frames to generate
+             use_dynamic_cfg=True,  # This is used for the DPM scheduler; for the DDIM scheduler, it should be False
            guidance_scale=guidance_scale,
            generator=torch.Generator().manual_seed(seed),  # Set the seed for reproducibility
        ).frames[0]
@@ -122,7 +125,7 @@ def generate_video(
            prompt=prompt,
            num_videos_per_prompt=num_videos_per_prompt,
            num_inference_steps=num_inference_steps,
-             num_frames=49,
+             num_frames=num_frames,
            use_dynamic_cfg=True,
            guidance_scale=guidance_scale,
            generator=torch.Generator().manual_seed(seed),
@@ -133,13 +136,12 @@ def generate_video(
            video=video,  # The path of the video to be used as the background of the video
            num_videos_per_prompt=num_videos_per_prompt,
            num_inference_steps=num_inference_steps,
-             # num_frames=49,
+             num_frames=num_frames,
            use_dynamic_cfg=True,
            guidance_scale=guidance_scale,
            generator=torch.Generator().manual_seed(seed),  # Set the seed for reproducibility
        ).frames[0]
-     # 5. Export the generated frames to a video file. fps must be 8 for original video.
-     export_to_video(video_generate, output_path, fps=8)
+     export_to_video(video_generate, output_path, fps=fps)


if __name__ == "__main__":
@@ -152,24 +154,18 @@ def generate_video(
        help="The path of the image to be used as the background of the video",
    )
    parser.add_argument(
-         "--model_path", type=str, default="THUDM/CogVideoX-5b", help="The path of the pre-trained model to be used"
+         "--model_path", type=str, default="THUDM/CogVideoX-5b", help="Path of the pre-trained model to use"
    )
    parser.add_argument("--lora_path", type=str, default=None, help="The path of the LoRA weights to be used")
    parser.add_argument("--lora_rank", type=int, default=128, help="The rank of the LoRA weights")
-     parser.add_argument(
-         "--output_path", type=str, default="./output.mp4", help="The path where the generated video will be saved"
-     )
+     parser.add_argument("--output_path", type=str, default="./output.mp4", help="The path to save the generated video")
    parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance")
-     parser.add_argument(
-         "--num_inference_steps", type=int, default=50, help="Number of steps for the inference process"
-     )
+     parser.add_argument("--num_inference_steps", type=int, default=50, help="Number of inference steps")
+     parser.add_argument("--num_frames", type=int, default=81, help="Number of frames to generate")
+     parser.add_argument("--fps", type=int, default=16, help="Frames per second for the generated video")
    parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Number of videos to generate per prompt")
-     parser.add_argument(
-         "--generate_type", type=str, default="t2v", help="The type of video generation (e.g., 't2v', 'i2v', 'v2v')"
-     )
-     parser.add_argument(
-         "--dtype", type=str, default="bfloat16", help="The data type for computation (e.g., 'float16' or 'bfloat16')"
-     )
+     parser.add_argument("--generate_type", type=str, default="t2v", help="The type of video generation ('t2v', 'i2v', 'v2v')")
+     parser.add_argument("--dtype", type=str, default="bfloat16", help="The data type for computation ('float16' or 'bfloat16')")
    parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility")

    args = parser.parse_args()
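Note the defaults: `--fps` is 16, which lines up with CogVideoX1.5's output rate, while the function signature defaults to 8 fps, the rate of the original CogVideoX models, so 1.5 runs should pass the flag explicitly. For example:

```bash
$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v" --num_frames 81 --fps 16
```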
@@ -180,11 +176,13 @@ def generate_video(
        lora_path=args.lora_path,
        lora_rank=args.lora_rank,
        output_path=args.output_path,
+         num_frames=args.num_frames,
        image_or_video_path=args.image_or_video_path,
        num_inference_steps=args.num_inference_steps,
        guidance_scale=args.guidance_scale,
        num_videos_per_prompt=args.num_videos_per_prompt,
        dtype=dtype,
        generate_type=args.generate_type,
        seed=args.seed,
+         fps=args.fps,
    )
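The conversion from the `--dtype` string to the `torch.dtype` passed as `dtype=dtype` sits between `parse_args()` and this call and is collapsed in the diff; it presumably reduces to a one-liner like:

```python
# Sketch: map the CLI string onto a torch dtype (assumed, not shown in the diff).
dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16
```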