Commit 5357e12

Update v1 inference scripts (#467)
Co-authored-by: “BrianChen1129” <[email protected]>
1 parent bdfdf1d commit 5357e12

File tree: 9 files changed, +78 −97 lines changed

.gitignore
docs/source/inference/inference_quick_start.md
scripts/inference/v1_inference_fasthunyuan.sh
scripts/inference/v1_inference_hunyuan.sh
scripts/inference/v1_inference_hunyuan_STA.sh
scripts/inference/v1_inference_stepvideo.sh
scripts/inference/v1_inference_wan.sh
scripts/inference/v1_inference_wan_STA.sh
scripts/inference/v1_inference_wan_i2v.sh

.gitignore

Lines changed: 0 additions & 1 deletion
@@ -27,7 +27,6 @@ env
 **/build/
 **.pyc
 **.txt
-**.json
 
 # Distribution / packaging
 build/

docs/source/inference/inference_quick_start.md

Lines changed: 2 additions & 1 deletion
@@ -57,8 +57,9 @@ Run the script with:
 python example.py
 ```
 
-The generated video will be saved in the current directory under `my_videos/`.
+The generated video will be saved in the current directory under `my_videos/`
 
+More inference example scripts can be found in `scripts/inference/`
 ## Available Models
 
 Please see the [support matrix](#support-matrix) for the list of supported models and their available optimizations.
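
A quick way to try the new entry point is to run one of the scripts this commit updates, e.g. `bash scripts/inference/v1_inference_wan.sh`. For a single-GPU run without a script, the sketch below uses only flags that appear in this commit; the model path, prompt, and output directory are illustrative and should be adjusted to your setup:

```bash
# Minimal single-GPU text-to-video run with the new `fastvideo generate` CLI.
# sp-size and tp-size must both equal the GPU count (1 here).
fastvideo generate \
    --model-path Wan-AI/Wan2.1-T2V-1.3B-Diffusers \
    --sp-size 1 \
    --tp-size 1 \
    --height 480 \
    --width 832 \
    --num-frames 77 \
    --num-inference-steps 50 \
    --fps 16 \
    --guidance-scale 3.0 \
    --prompt "A beautiful woman in a red dress walking down a street" \
    --seed 1024 \
    --output-path outputs_video/
```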

scripts/inference/v1_inference_fasthunyuan.sh

Lines changed: 11 additions & 13 deletions
@@ -6,19 +6,17 @@ export FASTVIDEO_ATTENTION_BACKEND=FLASH_ATTN
 # Note that the tp_size and sp_size should be the same and equal to the number
 # of GPUs. They are used for different parallel groups. sp_size is used for
 # dit model and tp_size is used for encoder models.
-torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \
-    fastvideo/v1/sample/v1_fastvideo_inference.py \
-    --sp_size $num_gpus \
-    --tp_size $num_gpus \
+fastvideo generate \
+    --model-path $MODEL_BASE \
+    --sp-size $num_gpus \
+    --tp-size $num_gpus \
     --height 720 \
     --width 1280 \
-    --num_frames 125 \
-    --num_inference_steps 6 \
-    --guidance_scale 1 \
-    --embedded_cfg_scale 6 \
-    --flow_shift 17 \
-    --prompt_path ./assets/prompt.txt \
+    --num-frames 125 \
+    --num-inference-steps 6 \
+    --guidance-scale 1 \
+    --embedded-cfg-scale 6 \
+    --flow-shift 17 \
+    --prompt "A beautiful woman in a red dress walking down a street" \
     --seed 1024 \
-    --output_path outputs_video/ \
-    --model_path $MODEL_BASE \
-    --vae-sp
+    --output-path outputs_video/
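
Since `--sp-size` and `--tp-size` must both match the GPU count (per the comment these scripts share), a wrapper can derive all three values from one variable. A minimal sketch, assuming `nvidia-smi` is available and `MODEL_BASE` is exported as in the script above:

```bash
#!/bin/bash
# Derive the GPU count once so --sp-size and --tp-size stay in sync with it.
# MODEL_BASE is assumed to be exported beforehand, as in the script above.
num_gpus=$(nvidia-smi -L | wc -l)
fastvideo generate \
    --model-path "$MODEL_BASE" \
    --sp-size "$num_gpus" \
    --tp-size "$num_gpus" \
    --height 720 \
    --width 1280 \
    --num-frames 125 \
    --num-inference-steps 6 \
    --guidance-scale 1 \
    --embedded-cfg-scale 6 \
    --flow-shift 17 \
    --prompt "A beautiful woman in a red dress walking down a street" \
    --seed 1024 \
    --output-path outputs_video/
```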

scripts/inference/v1_inference_hunyuan.sh

Lines changed: 11 additions & 13 deletions
@@ -7,19 +7,17 @@ export FASTVIDEO_ATTENTION_BACKEND=FLASH_ATTN
 # Note that the tp_size and sp_size should be the same and equal to the number
 # of GPUs. They are used for different parallel groups. sp_size is used for
 # dit model and tp_size is used for encoder models.
-torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \
-    fastvideo/v1/sample/v1_fastvideo_inference.py \
-    --sp_size $num_gpus \
-    --tp_size $num_gpus \
+fastvideo generate \
+    --model-path $MODEL_BASE \
+    --sp-size $num_gpus \
+    --tp-size $num_gpus \
     --height 720 \
     --width 1280 \
-    --num_frames 125 \
-    --num_inference_steps 50 \
-    --guidance_scale 1 \
-    --embedded_cfg_scale 6 \
-    --flow_shift 7 \
-    --prompt_path ./assets/prompt.txt \
+    --num-frames 125 \
+    --num-inference-steps 50 \
+    --guidance-scale 1 \
+    --embedded-cfg-scale 6 \
+    --flow-shift 7 \
+    --prompt "A beautiful woman in a red dress walking down a street" \
     --seed 1024 \
-    --output_path outputs_video/ \
-    --model_path $MODEL_BASE \
-    --vae-sp
+    --output-path outputs_video/

scripts/inference/v1_inference_hunyuan_STA.sh

Lines changed: 11 additions & 13 deletions
@@ -8,19 +8,17 @@ export MODEL_BASE=hunyuanvideo-community/HunyuanVideo
 # Note that the tp_size and sp_size should be the same and equal to the number
 # of GPUs. They are used for different parallel groups. sp_size is used for
 # dit model and tp_size is used for encoder models.
-torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \
-    fastvideo/v1/sample/v1_fastvideo_inference.py \
-    --sp_size ${num_gpus} \
-    --tp_size ${num_gpus} \
+fastvideo generate \
+    --model-path $MODEL_BASE \
+    --sp-size ${num_gpus} \
+    --tp-size ${num_gpus} \
     --height 768 \
     --width 1280 \
-    --num_frames 117 \
-    --num_inference_steps 50 \
-    --guidance_scale 1 \
-    --embedded_cfg_scale 6 \
-    --flow_shift 7 \
-    --prompt_path ./assets/prompt.txt \
+    --num-frames 117 \
+    --num-inference-steps 50 \
+    --guidance-scale 1 \
+    --embedded-cfg-scale 6 \
+    --flow-shift 7 \
+    --prompt "A beautiful woman in a red dress walking down a street" \
     --seed 1024 \
-    --output_path outputs_video/ \
-    --model_path $MODEL_BASE \
-    --vae-sp
+    --output-path outputs_video/

scripts/inference/v1_inference_stepvideo.sh

Lines changed: 12 additions & 13 deletions
@@ -5,19 +5,18 @@ export FASTVIDEO_ATTENTION_BACKEND=
 num_gpus=2
 url='127.0.0.1'
 model_dir=data/stepvideo-t2v
-torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \
-    fastvideo/v1/sample/v1_fastvideo_inference.py \
-    --sp_size ${num_gpus} \
-    --tp_size ${num_gpus} \
+fastvideo generate \
+    --model-path $model_dir \
+    --sp-size ${num_gpus} \
+    --tp-size ${num_gpus} \
     --height 256 \
     --width 256 \
-    --num_frames 29 \
-    --num_inference_steps 50 \
-    --embedded_cfg_scale 9.0 \
-    --guidance_scale 9.0 \
-    --prompt_path ./assets/prompt.txt \
+    --num-frames 29 \
+    --num-inference-steps 50 \
+    --embedded-cfg-scale 9.0 \
+    --guidance-scale 9.0 \
+    --prompt "A beautiful woman in a red dress walking down a street" \
     --seed 1024 \
-    --output_path outputs_stepvideo/ \
-    --model_path $model_dir \
-    --flow_shift 13.0 \
-    --vae_precision bf16
+    --output-path outputs_stepvideo/ \
+    --flow-shift 13.0 \
+    --vae-precision bf16

scripts/inference/v1_inference_wan.sh

Lines changed: 10 additions & 14 deletions
@@ -7,21 +7,17 @@ export MODEL_BASE=Wan-AI/Wan2.1-T2V-1.3B-Diffusers
 # Note that the tp_size and sp_size should be the same and equal to the number
 # of GPUs. They are used for different parallel groups. sp_size is used for
 # dit model and tp_size is used for encoder models.
-torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \
-    fastvideo/v1/sample/v1_fastvideo_inference.py \
-    --sp_size $num_gpus \
-    --tp_size $num_gpus \
+fastvideo generate \
+    --model-path $MODEL_BASE \
+    --sp-size $num_gpus \
+    --tp-size $num_gpus \
     --height 480 \
     --width 832 \
-    --num_frames 77 \
-    --num_inference_steps 50 \
+    --num-frames 77 \
+    --num-inference-steps 50 \
     --fps 16 \
-    --guidance_scale 3.0 \
-    --prompt_path ./assets/prompt.txt \
-    --neg_prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \
+    --guidance-scale 3.0 \
+    --prompt "A beautiful woman in a red dress walking down a street" \
+    --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \
     --seed 1024 \
-    --output_path outputs_video/ \
-    --model_path $MODEL_BASE \
-    --vae-sp \
-    --text-encoder-precision "fp32" \
-    --use-cpu-offload
+    --output-path outputs_video/

scripts/inference/v1_inference_wan_STA.sh

Lines changed: 10 additions & 14 deletions
@@ -7,21 +7,17 @@ export MODEL_BASE=Wan-AI/Wan2.1-T2V-14B-Diffusers
 # Note that the tp_size and sp_size should be the same and equal to the number
 # of GPUs. They are used for different parallel groups. sp_size is used for
 # dit model and tp_size is used for encoder models.
-torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \
-    fastvideo/v1/sample/v1_fastvideo_inference.py \
-    --sp_size $num_gpus \
-    --tp_size $num_gpus \
+fastvideo generate \
+    --model-path $MODEL_BASE \
+    --sp-size $num_gpus \
+    --tp-size $num_gpus \
     --height 768 \
     --width 1280 \
-    --num_frames 69 \
-    --num_inference_steps 50 \
+    --num-frames 69 \
+    --num-inference-steps 50 \
     --fps 16 \
-    --guidance_scale 5.0 \
-    --prompt_path ./assets/prompt.txt \
-    --neg_prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \
+    --guidance-scale 5.0 \
+    --prompt "A beautiful woman in a red dress walking down a street" \
+    --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \
     --seed 12345 \
-    --output_path outputs_video/ \
-    --model_path $MODEL_BASE \
-    --vae-sp \
-    --text-encoder-precision "fp32" \
-    --use-cpu-offload
+    --output-path outputs_video/

scripts/inference/v1_inference_wan_i2v.sh

Lines changed: 11 additions & 15 deletions
@@ -7,23 +7,19 @@ export MODEL_BASE=Wan-AI/Wan2.1-I2V-14B-480P-Diffusers
 # Note that the tp_size and sp_size should be the same and equal to the number
 # of GPUs. They are used for different parallel groups. sp_size is used for
 # dit model and tp_size is used for encoder models.
-torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \
-    fastvideo/v1/sample/v1_fastvideo_inference.py \
-    --sp_size $num_gpus \
-    --tp_size $num_gpus \
+fastvideo generate \
+    --model-path $MODEL_BASE \
+    --sp-size $num_gpus \
+    --tp-size $num_gpus \
     --height 480 \
     --width 832 \
-    --num_frames 77 \
-    --num_inference_steps 40 \
+    --num-frames 77 \
+    --num-inference-steps 40 \
     --fps 16 \
-    --flow_shift 3.0 \
-    --guidance_scale 5.0 \
-    --image_path "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg" \
+    --flow-shift 3.0 \
+    --guidance-scale 5.0 \
+    --image-path "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg" \
     --prompt "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot." \
-    --neg_prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \
+    --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \
     --seed 1024 \
-    --output_path outputs_i2v/ \
-    --model_path $MODEL_BASE \
-    --vae-sp \
-    --text-encoder-precision "fp32" \
-    --use-cpu-offload
+    --output-path outputs_i2v/
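
The I2V script passes a remote URL to `--image-path`; a local file is presumably accepted the same way. A minimal single-GPU sketch with a hypothetical local image path:

```bash
# ./assets/astronaut.jpg is a hypothetical local conditioning image;
# the remote URL form used in the script above is the confirmed usage.
fastvideo generate \
    --model-path Wan-AI/Wan2.1-I2V-14B-480P-Diffusers \
    --sp-size 1 \
    --tp-size 1 \
    --height 480 \
    --width 832 \
    --num-frames 77 \
    --num-inference-steps 40 \
    --flow-shift 3.0 \
    --guidance-scale 5.0 \
    --image-path ./assets/astronaut.jpg \
    --prompt "An astronaut hatching from an egg, on the surface of the moon." \
    --seed 1024 \
    --output-path outputs_i2v/
```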
