
Commit e26c3c4

Merge branch 'CogVideoX_dev' of github.com:THUDM/CogVideo into CogVideoX_dev
2 parents d9e75ce + 67ba369 commit e26c3c4

File tree: 10 files changed, +382 additions, −2 deletions

README.md

Lines changed: 4 additions & 0 deletions

@@ -22,6 +22,7 @@ Experience the CogVideoX-5B model online at <a href="https://huggingface.co/spac

## Project Updates

- 🔥🔥 **News**: ```2024/9/25```: CogVideoX web demo is available on Replicate. Try the text-to-video model **CogVideoX-5B** here [![Replicate](https://replicate.com/chenxwh/cogvideox-t2v/badge)](https://replicate.com/chenxwh/cogvideox-t2v) and the image-to-video model **CogVideoX-5B-I2V** here [![Replicate](https://replicate.com/chenxwh/cogvideox-i2v/badge)](https://replicate.com/chenxwh/cogvideox-i2v).
- 🔥🔥 **News**: ```2024/9/19```: We have open-sourced the CogVideoX series image-to-video model **CogVideoX-5B-I2V**.
  This model can take an image as a background input and generate a video combined with prompt words, offering greater
  controllability. With this, the CogVideoX series models now support three tasks: text-to-video generation, video

@@ -358,6 +359,9 @@ This folder contains some tools for model conversion / caption generation, etc.
  Adapter.
+ [llm_flux_cogvideox](tools/llm_flux_cogvideox/llm_flux_cogvideox.py): Automatically generate videos using an
  open-source local large language model + Flux + CogVideoX.
+ [parallel_inference_xdit](tools/parallel_inference/parallel_inference_xdit.py):
  Supported by [xDiT](https://github.com/xdit-project/xDiT), parallelize the
  video generation process on multiple GPUs.

## CogVideo(ICLR'23)

README_ja.md

Lines changed: 3 additions & 0 deletions

@@ -329,6 +329,9 @@ pipe.vae.enable_tiling()
  ...tool code for loading it.
+ [llm_flux_cogvideox](tools/llm_flux_cogvideox/llm_flux_cogvideox.py): Automatically generates videos using an
  open-source local large language model + Flux + CogVideoX.
+ [parallel_inference_xdit](tools/parallel_inference/parallel_inference_xdit.py)
  Supported by [xDiT](https://github.com/xdit-project/xDiT), parallelizes the
  video generation process across multiple GPUs.

## CogVideo(ICLR'23)

README_zh.md

Lines changed: 3 additions & 0 deletions

@@ -312,6 +312,9 @@ pipe.vae.enable_tiling()
+ [load_cogvideox_lora](tools/load_cogvideox_lora.py): Tool code for loading a diffusers-version fine-tuned LoRA adapter.
+ [llm_flux_cogvideox](tools/llm_flux_cogvideox/llm_flux_cogvideox.py): Automatically generate videos using an
  open-source local large language model + Flux + CogVideoX.
+ [parallel_inference_xdit](tools/parallel_inference/parallel_inference_xdit.py):
  Parallelizes the video generation process across multiple GPUs,
  supported by [xDiT](https://github.com/xdit-project/xDiT).

## CogVideo(ICLR'23)

inference/cli_demo.py

Lines changed: 1 addition & 1 deletion

@@ -133,7 +133,7 @@ def generate_video(
     video=video,  # The path of the video to be used as the background of the video
     num_videos_per_prompt=num_videos_per_prompt,
     num_inference_steps=num_inference_steps,
-    num_frames=49,
+    # num_frames=49,
     use_dynamic_cfg=True,
     guidance_scale=guidance_scale,
     generator=torch.Generator().manual_seed(seed),  # Set the seed for reproducibility
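For reference, a minimal illustrative sketch (not part of this diff) of calling the diffusers CogVideoXPipeline directly, with the frame count passed explicitly instead of relying on the hard-coded value that this change comments out; the model ID, prompt, and seed below are placeholders.

import torch
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

# Placeholder model ID; any CogVideoX text-to-video checkpoint should work similarly.
pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()

video = pipe(
    prompt="A small dog.",  # placeholder prompt
    num_inference_steps=50,
    guidance_scale=6.0,
    num_frames=49,  # pass explicitly to override; omit it to use the pipeline default
    generator=torch.Generator().manual_seed(42),
).frames[0]
export_to_video(video, "output.mp4", fps=8)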

requirements.txt

Lines changed: 2 additions & 1 deletion

@@ -11,4 +11,5 @@ imageio>=2.35.1
 imageio-ffmpeg>=0.5.1
 openai>=1.45.0
 moviepy>=1.0.3
-pillow==9.5.0
+pillow==9.5.0
+scikit-video
tools/parallel_inference/parallel_inference_xdit.py

Lines changed: 105 additions & 0 deletions

@@ -0,0 +1,105 @@
"""
This is a parallel inference script for CogVideo. The original script
can be found from the xDiT project at

https://github.com/xdit-project/xDiT/blob/main/examples/cogvideox_example.py

By using this code, the inference process is parallelized on multiple GPUs,
and thus sped up.

Usage:
1. pip install xfuser
2. mkdir results
3. run the following command to generate video
torchrun --nproc_per_node=4 parallel_inference_xdit.py \
    --model <cogvideox-model-path> --ulysses_degree 1 --ring_degree 2 \
    --use_cfg_parallel --height 480 --width 720 --num_frames 9 \
    --prompt 'A small dog.'

You can also use the run.sh file in the same folder to automate running this
code for batch generation of videos, by running:

sh ./run.sh

"""

import time
import torch
import torch.distributed
from diffusers import AutoencoderKLTemporalDecoder
from xfuser import xFuserCogVideoXPipeline, xFuserArgs
from xfuser.config import FlexibleArgumentParser
from xfuser.core.distributed import (
    get_world_group,
    get_data_parallel_rank,
    get_data_parallel_world_size,
    get_runtime_state,
    is_dp_last_group,
)
from diffusers.utils import export_to_video


def main():
    parser = FlexibleArgumentParser(description="xFuser Arguments")
    args = xFuserArgs.add_cli_args(parser).parse_args()
    engine_args = xFuserArgs.from_cli_args(args)

    # Check if ulysses_degree is valid
    num_heads = 30
    if engine_args.ulysses_degree > 0 and num_heads % engine_args.ulysses_degree != 0:
        raise ValueError(
            f"ulysses_degree ({engine_args.ulysses_degree}) must be a divisor of the number of heads ({num_heads})"
        )

    engine_config, input_config = engine_args.create_config()
    local_rank = get_world_group().local_rank

    pipe = xFuserCogVideoXPipeline.from_pretrained(
        pretrained_model_name_or_path=engine_config.model_config.model,
        engine_config=engine_config,
        torch_dtype=torch.bfloat16,
    )
    if args.enable_sequential_cpu_offload:
        pipe.enable_model_cpu_offload(gpu_id=local_rank)
        pipe.vae.enable_tiling()
    else:
        device = torch.device(f"cuda:{local_rank}")
        pipe = pipe.to(device)

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()

    output = pipe(
        height=input_config.height,
        width=input_config.width,
        num_frames=input_config.num_frames,
        prompt=input_config.prompt,
        num_inference_steps=input_config.num_inference_steps,
        generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
        guidance_scale=6,
    ).frames[0]

    end_time = time.time()
    elapsed_time = end_time - start_time
    peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")

    parallel_info = (
        f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
        f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
        f"tp{engine_args.tensor_parallel_degree}_"
        f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
    )
    if is_dp_last_group():
        world_size = get_data_parallel_world_size()
        resolution = f"{input_config.width}x{input_config.height}"
        output_filename = f"results/cogvideox_{parallel_info}_{resolution}.mp4"
        export_to_video(output, output_filename, fps=8)
        print(f"output saved to {output_filename}")

    if get_world_group().rank == get_world_group().world_size - 1:
        print(f"epoch time: {elapsed_time:.2f} sec, memory: {peak_memory/1e9} GB")
    get_runtime_state().destory_distributed_env()


if __name__ == "__main__":
    main()

tools/parallel_inference/run.sh

Lines changed: 51 additions & 0 deletions

@@ -0,0 +1,51 @@
set -x

export PYTHONPATH=$PWD:$PYTHONPATH

# Select the model type
# The model is downloaded to a specified location on disk,
# or you can simply use the model's ID on Hugging Face,
# which will then be downloaded to the default cache path on Hugging Face.

export MODEL_TYPE="CogVideoX"
# Configuration for different model types
# script, model_id, inference_step
declare -A MODEL_CONFIGS=(
    ["CogVideoX"]="parallel_inference_xdit.py /cfs/dit/CogVideoX-2b 20"
)

if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then
    IFS=' ' read -r SCRIPT MODEL_ID INFERENCE_STEP <<< "${MODEL_CONFIGS[$MODEL_TYPE]}"
    export SCRIPT MODEL_ID INFERENCE_STEP
else
    echo "Invalid MODEL_TYPE: $MODEL_TYPE"
    exit 1
fi

mkdir -p ./results

# task args
if [ "$MODEL_TYPE" = "CogVideoX" ]; then
    TASK_ARGS="--height 480 --width 720 --num_frames 9"
fi

# CogVideoX asserts sp_degree == ulysses_degree*ring_degree <= 2. Also, do not set the pipefusion degree.
if [ "$MODEL_TYPE" = "CogVideoX" ]; then
    N_GPUS=4
    PARALLEL_ARGS="--ulysses_degree 2 --ring_degree 1"
    CFG_ARGS="--use_cfg_parallel"
fi


torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
    --model $MODEL_ID \
    $PARALLEL_ARGS \
    $TASK_ARGS \
    $PIPEFUSION_ARGS \
    $OUTPUT_ARGS \
    --num_inference_steps $INFERENCE_STEP \
    --warmup_steps 0 \
    --prompt "A small dog." \
    $CFG_ARGS \
    $PARALLLEL_VAE \
    $COMPILE_FLAG
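As an illustrative check of the degrees chosen above (an assumption about how xDiT composes parallelism, not something stated in this commit): sequence parallelism spans ulysses_degree × ring_degree GPUs, and --use_cfg_parallel is assumed to add a factor of 2 for the two classifier-free-guidance branches, so 2 × 1 × 2 = 4 matches N_GPUS=4.

# Hypothetical sanity check of the run.sh settings above; not part of the committed script.
ulysses_degree, ring_degree = 2, 1
cfg_degree = 2  # assumed effect of --use_cfg_parallel
n_gpus = 4

assert ulysses_degree * ring_degree <= 2, "CogVideoX limit noted in run.sh"
assert ulysses_degree * ring_degree * cfg_degree == n_gpus, "world size should match torchrun --nproc_per_node"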

tools/replicate/cog.yaml

Lines changed: 37 additions & 0 deletions

@@ -0,0 +1,37 @@
# Configuration for Cog ⚙️
# Reference: https://cog.run/yaml

build:
  # set to true if your model requires a GPU
  gpu: true

  # a list of ubuntu apt packages to install
  system_packages:
    - "libgl1-mesa-glx"
    - "libglib2.0-0"

  # python version in the form '3.11' or '3.11.4'
  python_version: "3.11"

  # a list of packages in the format <package-name>==<version>
  python_packages:
    - diffusers>=0.30.3
    - accelerate>=0.34.2
    - transformers>=4.44.2
    - numpy==1.26.0
    - torch>=2.4.0
    - torchvision>=0.19.0
    - sentencepiece>=0.2.0
    - SwissArmyTransformer>=0.4.12
    - imageio>=2.35.1
    - imageio-ffmpeg>=0.5.1
    - openai>=1.45.0
    - moviepy>=1.0.3
    - pillow==9.5.0
    - pydantic==1.10.7
  run:
    - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.8.2/pget_linux_x86_64" && chmod +x /usr/local/bin/pget

# predict.py defines how predictions are run on your model
predict: "predict_t2v.py:Predictor"
# predict: "predict_i2v.py:Predictor"

tools/replicate/predict_i2v.py

Lines changed: 89 additions & 0 deletions

@@ -0,0 +1,89 @@
# Prediction interface for Cog ⚙️
# https://cog.run/python

import os
import subprocess
import time
import torch
from diffusers import CogVideoXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image
from cog import BasePredictor, Input, Path


MODEL_CACHE = "model_cache_i2v"
MODEL_URL = (
    f"https://weights.replicate.delivery/default/THUDM/CogVideo/{MODEL_CACHE}.tar"
)
os.environ["HF_DATASETS_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HOME"] = MODEL_CACHE
os.environ["TORCH_HOME"] = MODEL_CACHE
os.environ["HF_DATASETS_CACHE"] = MODEL_CACHE
os.environ["TRANSFORMERS_CACHE"] = MODEL_CACHE
os.environ["HUGGINGFACE_HUB_CACHE"] = MODEL_CACHE


def download_weights(url, dest):
    start = time.time()
    print("downloading url: ", url)
    print("downloading to: ", dest)
    subprocess.check_call(["pget", "-x", url, dest], close_fds=False)
    print("downloading took: ", time.time() - start)


class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""

        if not os.path.exists(MODEL_CACHE):
            download_weights(MODEL_URL, MODEL_CACHE)

        # model_id: THUDM/CogVideoX-5b-I2V
        self.pipe = CogVideoXImageToVideoPipeline.from_pretrained(
            MODEL_CACHE, torch_dtype=torch.bfloat16
        ).to("cuda")

        self.pipe.enable_model_cpu_offload()
        self.pipe.vae.enable_tiling()

    def predict(
        self,
        prompt: str = Input(
            description="Input prompt", default="Starry sky slowly rotating."
        ),
        image: Path = Input(description="Input image"),
        num_inference_steps: int = Input(
            description="Number of denoising steps", ge=1, le=500, default=50
        ),
        guidance_scale: float = Input(
            description="Scale for classifier-free guidance", ge=1, le=20, default=6
        ),
        num_frames: int = Input(
            description="Number of frames for the output video", default=49
        ),
        seed: int = Input(
            description="Random seed. Leave blank to randomize the seed", default=None
        ),
    ) -> Path:
        """Run a single prediction on the model"""

        if seed is None:
            seed = int.from_bytes(os.urandom(2), "big")
        print(f"Using seed: {seed}")

        img = load_image(image=str(image))

        video = self.pipe(
            prompt=prompt,
            image=img,
            num_videos_per_prompt=1,
            num_inference_steps=num_inference_steps,
            num_frames=num_frames,
            guidance_scale=guidance_scale,
            generator=torch.Generator(device="cuda").manual_seed(seed),
        ).frames[0]

        out_path = "/tmp/out.mp4"

        export_to_video(video, out_path, fps=8)
        return Path(out_path)
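A hedged local smoke test of the predictor above, calling setup() and predict() directly rather than through the Cog runtime. This assumes a CUDA GPU, the pget binary installed as in cog.yaml, and network access for the weights tarball; "input.jpg" is a placeholder path.

# Hypothetical direct invocation of the Cog predictor for local testing; not part of the commit.
from predict_i2v import Predictor  # assumes the file above is saved as predict_i2v.py

predictor = Predictor()
predictor.setup()  # downloads and loads the CogVideoX-5B-I2V weights on first run
result = predictor.predict(
    prompt="Starry sky slowly rotating.",
    image="input.jpg",  # placeholder: path to a local background image
    num_inference_steps=50,
    guidance_scale=6,
    num_frames=49,
    seed=42,
)
print(result)  # /tmp/out.mp4

In a Replicate deployment, the Cog runtime calls setup() once and predict() per request according to the `predict:` entry in cog.yaml.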
