Commit 5357e12

Update v1 inference scripts (#467)
Co-authored-by: “BrianChen1129” <[email protected]>
1 parent bdfdf1d commit 5357e12

File tree: 9 files changed, +78 −97 lines changed

.gitignore
docs/source/inference/inference_quick_start.md
scripts/inference/v1_inference_fasthunyuan.sh
scripts/inference/v1_inference_hunyuan.sh
scripts/inference/v1_inference_hunyuan_STA.sh
scripts/inference/v1_inference_stepvideo.sh
scripts/inference/v1_inference_wan.sh
scripts/inference/v1_inference_wan_STA.sh
scripts/inference/v1_inference_wan_i2v.sh

.gitignore

Lines changed: 0 additions & 1 deletion
@@ -27,7 +27,6 @@ env
 **/build/
 **.pyc
 **.txt
-**.json
 
 # Distribution / packaging
 build/

docs/source/inference/inference_quick_start.md

Lines changed: 2 additions & 1 deletion
@@ -57,8 +57,9 @@ Run the script with:
 python example.py
 ```
 
-The generated video will be saved in the current directory under `my_videos/`.
+The generated video will be saved in the current directory under `my_videos/`
 
+More inference example scripts can be found in `scripts/inference/`
 ## Available Models
 
 Please see the [support matrix](#support-matrix) for the list of supported models and their available optimizations.
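
A quick way to try the new entry point is to run one of the scripts this commit updates, e.g. `bash scripts/inference/v1_inference_wan.sh`. For a single-GPU run without a script, the sketch below uses only flags that appear in this commit; the model path, prompt, and output directory are illustrative and should be adjusted to your setup:

```bash
# Minimal single-GPU text-to-video run with the new `fastvideo generate` CLI.
# sp-size and tp-size must both equal the GPU count (1 here).
fastvideo generate \
    --model-path Wan-AI/Wan2.1-T2V-1.3B-Diffusers \
    --sp-size 1 \
    --tp-size 1 \
    --height 480 \
    --width 832 \
    --num-frames 77 \
    --num-inference-steps 50 \
    --fps 16 \
    --guidance-scale 3.0 \
    --prompt "A beautiful woman in a red dress walking down a street" \
    --seed 1024 \
    --output-path outputs_video/
```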

scripts/inference/v1_inference_fasthunyuan.sh

Lines changed: 11 additions & 13 deletions
@@ -6,19 +6,17 @@ export FASTVIDEO_ATTENTION_BACKEND=FLASH_ATTN
 # Note that the tp_size and sp_size should be the same and equal to the number
 # of GPUs. They are used for different parallel groups. sp_size is used for
 # dit model and tp_size is used for encoder models.
-torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \
-    fastvideo/v1/sample/v1_fastvideo_inference.py \
-    --sp_size $num_gpus \
-    --tp_size $num_gpus \
+fastvideo generate \
+    --model-path $MODEL_BASE \
+    --sp-size $num_gpus \
+    --tp-size $num_gpus \
     --height 720 \
     --width 1280 \
-    --num_frames 125 \
-    --num_inference_steps 6 \
-    --guidance_scale 1 \
-    --embedded_cfg_scale 6 \
-    --flow_shift 17 \
-    --prompt_path ./assets/prompt.txt \
+    --num-frames 125 \
+    --num-inference-steps 6 \
+    --guidance-scale 1 \
+    --embedded-cfg-scale 6 \
+    --flow-shift 17 \
+    --prompt "A beautiful woman in a red dress walking down a street" \
     --seed 1024 \
-    --output_path outputs_video/ \
-    --model_path $MODEL_BASE \
-    --vae-sp
+    --output-path outputs_video/
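
Since `--sp-size` and `--tp-size` must both match the GPU count (per the comment these scripts share), a wrapper can derive all three values from one variable. A minimal sketch, assuming `nvidia-smi` is available and `MODEL_BASE` is exported as in the script above:

```bash
#!/bin/bash
# Derive the GPU count once so --sp-size and --tp-size stay in sync with it.
# MODEL_BASE is assumed to be exported beforehand, as in the script above.
num_gpus=$(nvidia-smi -L | wc -l)
fastvideo generate \
    --model-path "$MODEL_BASE" \
    --sp-size "$num_gpus" \
    --tp-size "$num_gpus" \
    --height 720 \
    --width 1280 \
    --num-frames 125 \
    --num-inference-steps 6 \
    --guidance-scale 1 \
    --embedded-cfg-scale 6 \
    --flow-shift 17 \
    --prompt "A beautiful woman in a red dress walking down a street" \
    --seed 1024 \
    --output-path outputs_video/
```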

scripts/inference/v1_inference_hunyuan.sh

Lines changed: 11 additions & 13 deletions
@@ -7,19 +7,17 @@ export FASTVIDEO_ATTENTION_BACKEND=FLASH_ATTN
 # Note that the tp_size and sp_size should be the same and equal to the number
 # of GPUs. They are used for different parallel groups. sp_size is used for
 # dit model and tp_size is used for encoder models.
-torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \
-    fastvideo/v1/sample/v1_fastvideo_inference.py \
-    --sp_size $num_gpus \
-    --tp_size $num_gpus \
+fastvideo generate \
+    --model-path $MODEL_BASE \
+    --sp-size $num_gpus \
+    --tp-size $num_gpus \
     --height 720 \
     --width 1280 \
-    --num_frames 125 \
-    --num_inference_steps 50 \
-    --guidance_scale 1 \
-    --embedded_cfg_scale 6 \
-    --flow_shift 7 \
-    --prompt_path ./assets/prompt.txt \
+    --num-frames 125 \
+    --num-inference-steps 50 \
+    --guidance-scale 1 \
+    --embedded-cfg-scale 6 \
+    --flow-shift 7 \
+    --prompt "A beautiful woman in a red dress walking down a street" \
     --seed 1024 \
-    --output_path outputs_video/ \
-    --model_path $MODEL_BASE \
-    --vae-sp
+    --output-path outputs_video/

scripts/inference/v1_inference_hunyuan_STA.sh

Lines changed: 11 additions & 13 deletions
@@ -8,19 +8,17 @@ export MODEL_BASE=hunyuanvideo-community/HunyuanVideo
 # Note that the tp_size and sp_size should be the same and equal to the number
 # of GPUs. They are used for different parallel groups. sp_size is used for
 # dit model and tp_size is used for encoder models.
-torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \
-    fastvideo/v1/sample/v1_fastvideo_inference.py \
-    --sp_size ${num_gpus} \
-    --tp_size ${num_gpus} \
+fastvideo generate \
+    --model-path $MODEL_BASE \
+    --sp-size ${num_gpus} \
+    --tp-size ${num_gpus} \
     --height 768 \
     --width 1280 \
-    --num_frames 117 \
-    --num_inference_steps 50 \
-    --guidance_scale 1 \
-    --embedded_cfg_scale 6 \
-    --flow_shift 7 \
-    --prompt_path ./assets/prompt.txt \
+    --num-frames 117 \
+    --num-inference-steps 50 \
+    --guidance-scale 1 \
+    --embedded-cfg-scale 6 \
+    --flow-shift 7 \
+    --prompt "A beautiful woman in a red dress walking down a street" \
     --seed 1024 \
-    --output_path outputs_video/ \
-    --model_path $MODEL_BASE \
-    --vae-sp
+    --output-path outputs_video/

scripts/inference/v1_inference_stepvideo.sh

Lines changed: 12 additions & 13 deletions
@@ -5,19 +5,18 @@ export FASTVIDEO_ATTENTION_BACKEND=
 num_gpus=2
 url='127.0.0.1'
 model_dir=data/stepvideo-t2v
-torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \
-    fastvideo/v1/sample/v1_fastvideo_inference.py \
-    --sp_size ${num_gpus} \
-    --tp_size ${num_gpus} \
+fastvideo generate \
+    --model-path $model_dir \
+    --sp-size ${num_gpus} \
+    --tp-size ${num_gpus} \
     --height 256 \
     --width 256 \
-    --num_frames 29 \
-    --num_inference_steps 50 \
-    --embedded_cfg_scale 9.0 \
-    --guidance_scale 9.0 \
-    --prompt_path ./assets/prompt.txt \
+    --num-frames 29 \
+    --num-inference-steps 50 \
+    --embedded-cfg-scale 9.0 \
+    --guidance-scale 9.0 \
+    --prompt "A beautiful woman in a red dress walking down a street" \
     --seed 1024 \
-    --output_path outputs_stepvideo/ \
-    --model_path $model_dir \
-    --flow_shift 13.0 \
-    --vae_precision bf16
+    --output-path outputs_stepvideo/ \
+    --flow-shift 13.0 \
+    --vae-precision bf16

scripts/inference/v1_inference_wan.sh

Lines changed: 10 additions & 14 deletions
@@ -7,21 +7,17 @@ export MODEL_BASE=Wan-AI/Wan2.1-T2V-1.3B-Diffusers
 # Note that the tp_size and sp_size should be the same and equal to the number
 # of GPUs. They are used for different parallel groups. sp_size is used for
 # dit model and tp_size is used for encoder models.
-torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \
-    fastvideo/v1/sample/v1_fastvideo_inference.py \
-    --sp_size $num_gpus \
-    --tp_size $num_gpus \
+fastvideo generate \
+    --model-path $MODEL_BASE \
+    --sp-size $num_gpus \
+    --tp-size $num_gpus \
     --height 480 \
     --width 832 \
-    --num_frames 77 \
-    --num_inference_steps 50 \
+    --num-frames 77 \
+    --num-inference-steps 50 \
     --fps 16 \
-    --guidance_scale 3.0 \
-    --prompt_path ./assets/prompt.txt \
-    --neg_prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \
+    --guidance-scale 3.0 \
+    --prompt "A beautiful woman in a red dress walking down a street" \
+    --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \
     --seed 1024 \
-    --output_path outputs_video/ \
-    --model_path $MODEL_BASE \
-    --vae-sp \
-    --text-encoder-precision "fp32" \
-    --use-cpu-offload
+    --output-path outputs_video/

scripts/inference/v1_inference_wan_STA.sh

Lines changed: 10 additions & 14 deletions
@@ -7,21 +7,17 @@ export MODEL_BASE=Wan-AI/Wan2.1-T2V-14B-Diffusers
 # Note that the tp_size and sp_size should be the same and equal to the number
 # of GPUs. They are used for different parallel groups. sp_size is used for
 # dit model and tp_size is used for encoder models.
-torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \
-    fastvideo/v1/sample/v1_fastvideo_inference.py \
-    --sp_size $num_gpus \
-    --tp_size $num_gpus \
+fastvideo generate \
+    --model-path $MODEL_BASE \
+    --sp-size $num_gpus \
+    --tp-size $num_gpus \
     --height 768 \
     --width 1280 \
-    --num_frames 69 \
-    --num_inference_steps 50 \
+    --num-frames 69 \
+    --num-inference-steps 50 \
     --fps 16 \
-    --guidance_scale 5.0 \
-    --prompt_path ./assets/prompt.txt \
-    --neg_prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \
+    --guidance-scale 5.0 \
+    --prompt "A beautiful woman in a red dress walking down a street" \
+    --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \
     --seed 12345 \
-    --output_path outputs_video/ \
-    --model_path $MODEL_BASE \
-    --vae-sp \
-    --text-encoder-precision "fp32" \
-    --use-cpu-offload
+    --output-path outputs_video/

scripts/inference/v1_inference_wan_i2v.sh

Lines changed: 11 additions & 15 deletions
@@ -7,23 +7,19 @@ export MODEL_BASE=Wan-AI/Wan2.1-I2V-14B-480P-Diffusers
 # Note that the tp_size and sp_size should be the same and equal to the number
 # of GPUs. They are used for different parallel groups. sp_size is used for
 # dit model and tp_size is used for encoder models.
-torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \
-    fastvideo/v1/sample/v1_fastvideo_inference.py \
-    --sp_size $num_gpus \
-    --tp_size $num_gpus \
+fastvideo generate \
+    --model-path $MODEL_BASE \
+    --sp-size $num_gpus \
+    --tp-size $num_gpus \
     --height 480 \
     --width 832 \
-    --num_frames 77 \
-    --num_inference_steps 40 \
+    --num-frames 77 \
+    --num-inference-steps 40 \
     --fps 16 \
-    --flow_shift 3.0 \
-    --guidance_scale 5.0 \
-    --image_path "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg" \
+    --flow-shift 3.0 \
+    --guidance-scale 5.0 \
+    --image-path "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg" \
     --prompt "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot." \
-    --neg_prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \
+    --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \
     --seed 1024 \
-    --output_path outputs_i2v/ \
-    --model_path $MODEL_BASE \
-    --vae-sp \
-    --text-encoder-precision "fp32" \
-    --use-cpu-offload
+    --output-path outputs_i2v/
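
The I2V script passes a remote URL to `--image-path`; a local file is presumably accepted the same way. A minimal single-GPU sketch with a hypothetical local image path:

```bash
# ./assets/astronaut.jpg is a hypothetical local conditioning image;
# the remote URL form used in the script above is the confirmed usage.
fastvideo generate \
    --model-path Wan-AI/Wan2.1-I2V-14B-480P-Diffusers \
    --sp-size 1 \
    --tp-size 1 \
    --height 480 \
    --width 832 \
    --num-frames 77 \
    --num-inference-steps 40 \
    --flow-shift 3.0 \
    --guidance-scale 5.0 \
    --image-path ./assets/astronaut.jpg \
    --prompt "An astronaut hatching from an egg, on the surface of the moon." \
    --seed 1024 \
    --output-path outputs_i2v/
```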
