6 changes: 3 additions & 3 deletions docs/source/inference/cli.md
@@ -27,7 +27,7 @@ fastvideo generate --help
### Hardware Configuration

- `--num-gpus {NUM_GPUS}`: Number of GPUs to use
-- `--tp-size {TP_SIZE}`: Tensor parallelism size (Typically should match the number of GPUs)
+- `--tp-size {TP_SIZE}`: Tensor parallelism size (applies only to the text encoder; should not exceed 1 when text-encoder offload is enabled, since layerwise offload + prefetch is faster)
- `--sp-size {SP_SIZE}`: Sequence parallelism size (Typically should match the number of GPUs)

#### Video Configuration
@@ -68,7 +68,7 @@ Example configuration file (config.json):
"output_path": "outputs/",
"num_gpus": 2,
"sp_size": 2,
"tp_size": 2,
"tp_size": 1,
"num_frames": 45,
"height": 720,
"width": 1280,
@@ -102,7 +102,7 @@ prompt: "A beautiful woman in a red dress walking down a street"
output_path: "outputs/"
num_gpus: 2
sp_size: 2
-tp_size: 2
+tp_size: 1
num_frames: 45
height: 720
width: 1280
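
Taken together, the updated docs recommend keeping `--sp-size` equal to the GPU count while pinning `--tp-size` to 1. A minimal sketch of a matching invocation, built as an argument list the way the test runner below does (the model path and resolution are illustrative, not prescribed by this PR):

```python
import subprocess

# Hypothetical 2-GPU run under the new defaults: sequence parallelism
# spans both GPUs, while tensor parallelism stays at 1 so the text
# encoder can rely on layerwise offload + prefetch instead.
num_gpus = 2
cmd = [
    "fastvideo", "generate",
    "--model-path", "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",  # illustrative model
    "--num-gpus", str(num_gpus),
    "--sp-size", str(num_gpus),
    "--tp-size", "1",
    "--height", "480",
    "--width", "832",
    "--num-frames", "45",
]
subprocess.run(cmd, check=True)
```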
@@ -30,7 +30,7 @@ training_args=(
parallel_args=(
--num_gpus $NUM_GPUS
--sp_size 8
---tp_size 8
+--tp_size 1
--hsdp_replicate_dim 1
--hsdp_shard_dim 8
)
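
Across these training configs the pattern is consistent: the HSDP mesh covers every GPU (hsdp_replicate_dim × hsdp_shard_dim = world size), sp_size matches hsdp_shard_dim, and tp_size is now always 1. A small sanity-check sketch of that assumed invariant (an inference from the configs in this PR, not code from the repo):

```python
def check_parallel_layout(world_size: int, sp_size: int, tp_size: int,
                          hsdp_replicate_dim: int, hsdp_shard_dim: int) -> None:
    """Assumed invariants behind the parallel_args blocks in this PR."""
    # HSDP tiles the whole world: replicate groups x shard groups.
    assert hsdp_replicate_dim * hsdp_shard_dim == world_size
    # In these examples sequence parallelism spans one shard group.
    assert sp_size == hsdp_shard_dim
    # Tensor parallelism now applies only to the text encoder; keep it at 1.
    assert tp_size == 1

check_parallel_layout(8, 8, 1, hsdp_replicate_dim=1, hsdp_shard_dim=8)  # block above
check_parallel_layout(8, 4, 1, hsdp_replicate_dim=2, hsdp_shard_dim=4)  # 2x4 block below
```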
@@ -66,7 +66,7 @@ training_args=(
parallel_args=(
--num_gpus $NUM_GPUS
--sp_size $NUM_GPUS
---tp_size $NUM_GPUS
+--tp_size 1
--hsdp_replicate_dim $SLURM_JOB_NUM_NODES
--hsdp_shard_dim $NUM_GPUS
)
@@ -30,7 +30,7 @@ training_args=(
parallel_args=(
--num_gpus $NUM_GPUS
--sp_size $NUM_GPUS
---tp_size $NUM_GPUS
+--tp_size 1
--hsdp_replicate_dim 1
--hsdp_shard_dim $NUM_GPUS
)
@@ -63,7 +63,7 @@ training_args=(
parallel_args=(
--num_gpus $NUM_GPUS
--sp_size 4
---tp_size 4
+--tp_size 1
--hsdp_replicate_dim 2
--hsdp_shard_dim 4
)
7 changes: 1 addition & 6 deletions fastvideo/v1/fastvideo_args.py
@@ -292,7 +292,7 @@ def check_fastvideo_args(self) -> None:
assert self.sp_size != -1, "sp_size must be set for training"

if self.tp_size == -1:
-    self.tp_size = self.num_gpus
+    self.tp_size = 1
if self.sp_size == -1:
self.sp_size = self.num_gpus
if self.hsdp_shard_dim == -1:
@@ -305,11 +305,6 @@ def check_fastvideo_args(self) -> None:
if self.num_gpus < max(self.tp_size, self.sp_size):
self.num_gpus = max(self.tp_size, self.sp_size)

-if self.tp_size != self.sp_size:
-    raise ValueError(
-        f"tp_size ({self.tp_size}) must be equal to sp_size ({self.sp_size})"
-    )
-
if self.enable_torch_compile and self.num_gpus > 1:
logger.warning(
"Currently torch compile does not work with multi-gpu. Setting enable_torch_compile to False"
@@ -105,7 +105,7 @@ def run_training():
"--num_latent_t", "8",
"--num_gpus", NUM_GPUS_PER_NODE_TRAINING,
"--sp_size", NUM_GPUS_PER_NODE_TRAINING,
"--tp_size", NUM_GPUS_PER_NODE_TRAINING,
"--tp_size", 1,
"--hsdp_replicate_dim", "1",
"--hsdp_shard_dim", NUM_GPUS_PER_NODE_TRAINING,
"--num_gpus", NUM_GPUS_PER_NODE_TRAINING,
6 changes: 3 additions & 3 deletions fastvideo/v1/tests/ssim/README.md
@@ -24,7 +24,7 @@ FastHunyuan-diffusers: {
"flow_shift": 17,
"seed": 1024,
"sp_size": 2,
"tp_size": 2,
"tp_size": 1,
"vae_sp": true,
"fps": 24
}
@@ -41,7 +41,7 @@ Wan2.1-T2V-1.3B-Diffusers: {
"flow_shift": 7.0,
"seed": 1024,
"sp_size": 2,
"tp_size": 2,
"tp_size": 1,
"vae_sp": True,
"fps": 24,
"neg_prompt": "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards",
@@ -60,7 +60,7 @@ Wan2.1-I2V-14B-480P-Diffusers: {
"flow_shift": 7.0,
"seed": 1024,
"sp_size": 2,
"tp_size": 2,
"tp_size": 1,
"vae_sp": True,
"fps": 24,
"neg_prompt": "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards",
12 changes: 6 additions & 6 deletions fastvideo/v1/tests/ssim/test_inference_similarity.py
@@ -33,7 +33,7 @@
"flow_shift": 17,
"seed": 1024,
"sp_size": 2,
"tp_size": 2,
"tp_size": 1,
"vae_sp": True,
"fps": 24,
}
@@ -50,7 +50,7 @@
"flow_shift": 7.0,
"seed": 1024,
"sp_size": 2,
"tp_size": 2,
"tp_size": 1,
"vae_sp": True,
"fps": 24,
"neg_prompt": "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards",
@@ -69,7 +69,7 @@
"flow_shift": 7.0,
"seed": 1024,
"sp_size": 2,
"tp_size": 2,
"tp_size": 1,
"vae_sp": True,
"fps": 24,
"neg_prompt": "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards",
@@ -238,7 +238,7 @@ def test_i2v_inference_similarity(prompt, ATTENTION_BACKEND, model_id):
logger.error("Failed to write SSIM results to file")

min_acceptable_ssim = 0.97
-assert mean_ssim >= min_acceptable_ssim, f"SSIM value {mean_ssim} is below threshold {min_acceptable_ssim}"
+assert mean_ssim >= min_acceptable_ssim, f"SSIM value {mean_ssim} is below threshold {min_acceptable_ssim} for {model_id} with backend {ATTENTION_BACKEND}"

@pytest.mark.parametrize("prompt", TEST_PROMPTS)
@pytest.mark.parametrize("ATTENTION_BACKEND", ["FLASH_ATTN", "TORCH_SDPA"])
@@ -337,5 +337,5 @@ def test_inference_similarity(prompt, ATTENTION_BACKEND, model_id):
if not success:
logger.error("Failed to write SSIM results to file")

-min_acceptable_ssim = 0.95
-assert mean_ssim >= min_acceptable_ssim, f"SSIM value {mean_ssim} is below threshold {min_acceptable_ssim}"
+min_acceptable_ssim = 0.93
+assert mean_ssim >= min_acceptable_ssim, f"SSIM value {mean_ssim} is below threshold {min_acceptable_ssim} for {model_id} with backend {ATTENTION_BACKEND}"
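
For context, these assertions gate a mean SSIM computed between a generated video and a stored reference, with the t2v threshold relaxed from 0.95 to 0.93 and the failure message now naming the model and attention backend. A minimal sketch of that kind of per-frame check, assuming uint8 RGB frame arrays and using scikit-image (an illustration, not the test's actual implementation):

```python
import numpy as np
from skimage.metrics import structural_similarity

def mean_video_ssim(generated: np.ndarray, reference: np.ndarray) -> float:
    """Mean per-frame SSIM; inputs have shape (T, H, W, 3), dtype uint8."""
    scores = [
        structural_similarity(a, b, channel_axis=-1)  # per-frame, color-aware
        for a, b in zip(generated, reference)
    ]
    return float(np.mean(scores))

# Mirrors the relaxed gate above: 0.93 for t2v, 0.97 for i2v.
mean_ssim = mean_video_ssim(
    np.zeros((4, 64, 64, 3), np.uint8), np.zeros((4, 64, 64, 3), np.uint8)
)
assert mean_ssim >= 0.93
```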
2 changes: 1 addition & 1 deletion fastvideo/v1/tests/training/Vanilla/test_training_loss.py
@@ -43,7 +43,7 @@ def run_worker():
"--num_latent_t", "4",
"--num_gpus", "4",
"--sp_size", "4",
"--tp_size", "4",
"--tp_size", "1",
"--hsdp_replicate_dim", "1",
"--hsdp_shard_dim", "4",
"--train_sp_batch_size", "1",
2 changes: 1 addition & 1 deletion scripts/finetune/finetune_v1.sh
@@ -20,7 +20,7 @@ torchrun --nnodes 1 --nproc_per_node $NUM_GPUS\
--train_batch_size=4 \
--num_latent_t 20 \
--sp_size 4 \
---tp_size 4 \
+--tp_size 1 \
--hsdp_replicate_dim 1 \
--hsdp_shard_dim 4 \
--num_gpus $NUM_GPUS \
5 changes: 1 addition & 4 deletions scripts/inference/v1_inference_fasthunyuan.sh
@@ -3,13 +3,10 @@
num_gpus=4
export MODEL_BASE=FastVideo/FastHunyuan-Diffusers
export FASTVIDEO_ATTENTION_BACKEND=FLASH_ATTN
-# Note that the tp_size and sp_size should be the same and equal to the number
-# of GPUs. They are used for different parallel groups. sp_size is used for
-# dit model and tp_size is used for encoder models.
fastvideo generate \
--model-path $MODEL_BASE \
--sp-size $num_gpus \
---tp-size $num_gpus \
+--tp-size 1 \
--num-gpus $num_gpus \
--height 720 \
--width 1280 \
5 changes: 1 addition & 4 deletions scripts/inference/v1_inference_hunyuan.sh
@@ -4,13 +4,10 @@ num_gpus=4
export MODEL_BASE=hunyuanvideo-community/HunyuanVideo
export FASTVIDEO_ATTENTION_BACKEND=FLASH_ATTN
# export MODEL_BASE=hunyuanvideo-community/HunyuanVideo
-# Note that the tp_size and sp_size should be the same and equal to the number
-# of GPUs. They are used for different parallel groups. sp_size is used for
-# dit model and tp_size is used for encoder models.
fastvideo generate \
--model-path $MODEL_BASE \
--sp-size $num_gpus \
---tp-size $num_gpus \
+--tp-size 1 \
--num-gpus $num_gpus \
--height 720 \
--width 1280 \
5 changes: 1 addition & 4 deletions scripts/inference/v1_inference_hunyuan_STA.sh
@@ -5,13 +5,10 @@ export FASTVIDEO_ATTENTION_CONFIG=assets/mask_strategy_hunyuan.json
export FASTVIDEO_ATTENTION_BACKEND=SLIDING_TILE_ATTN
export MODEL_BASE=hunyuanvideo-community/HunyuanVideo
# export MODEL_BASE=hunyuanvideo-community/HunyuanVideo
-# Note that the tp_size and sp_size should be the same and equal to the number
-# of GPUs. They are used for different parallel groups. sp_size is used for
-# dit model and tp_size is used for encoder models.
fastvideo generate \
--model-path $MODEL_BASE \
--sp-size ${num_gpus} \
---tp-size ${num_gpus} \
+--tp-size 1 \
--height 768 \
--width 1280 \
--num-frames 117 \
5 changes: 1 addition & 4 deletions scripts/inference/v1_inference_wan.sh
@@ -4,13 +4,10 @@ num_gpus=2
export FASTVIDEO_ATTENTION_BACKEND=
export MODEL_BASE=Wan-AI/Wan2.1-T2V-1.3B-Diffusers
# export MODEL_BASE=hunyuanvideo-community/HunyuanVideo
-# Note that the tp_size and sp_size should be the same and equal to the number
-# of GPUs. They are used for different parallel groups. sp_size is used for
-# dit model and tp_size is used for encoder models.
fastvideo generate \
--model-path $MODEL_BASE \
--sp-size $num_gpus \
---tp-size $num_gpus \
+--tp-size 1 \
--num-gpus $num_gpus \
--height 480 \
--width 832 \
5 changes: 1 addition & 4 deletions scripts/inference/v1_inference_wan_STA.sh
@@ -4,13 +4,10 @@ num_gpus=2
export FASTVIDEO_ATTENTION_CONFIG=assets/mask_strategy_wan.json
export FASTVIDEO_ATTENTION_BACKEND=SLIDING_TILE_ATTN
export MODEL_BASE=Wan-AI/Wan2.1-T2V-14B-Diffusers
-# Note that the tp_size and sp_size should be the same and equal to the number
-# of GPUs. They are used for different parallel groups. sp_size is used for
-# dit model and tp_size is used for encoder models.
fastvideo generate \
--model-path $MODEL_BASE \
--sp-size $num_gpus \
---tp-size $num_gpus \
+--tp-size 1 \
--num-gpus $num_gpus \
--height 768 \
--width 1280 \
7 changes: 2 additions & 5 deletions scripts/inference/v1_inference_wan_VSA.sh
@@ -4,14 +4,11 @@ num_gpus=1
export FASTVIDEO_ATTENTION_BACKEND=VIDEO_SPARSE_ATTN
# change model path to local dir if you want to inference using your checkpoint
export MODEL_BASE=Wan-AI/Wan2.1-T2V-1.3B-Diffusers
-# export MODEL_BASE=hunyuanvideo-community/HunyuanVideo
-# Note that the tp_size and sp_size should be the same and equal to the number
-# of GPUs. They are used for different parallel groups. sp_size is used for
-# dit model and tp_size is used for encoder models.
+# export MODEL_BASE=hunyuanvideo-community/HunyuanVideo
fastvideo generate \
--model-path $MODEL_BASE \
--sp-size $num_gpus \
---tp-size $num_gpus \
+--tp-size 1 \
--num-gpus $num_gpus \
--height 448 \
--width 832 \
3 changes: 0 additions & 3 deletions scripts/inference/v1_inference_wan_i2v.sh
@@ -4,9 +4,6 @@ num_gpus=2
export FASTVIDEO_ATTENTION_BACKEND=
export MODEL_BASE=Wan-AI/Wan2.1-I2V-14B-480P-Diffusers
# export MODEL_BASE=hunyuanvideo-community/HunyuanVideo
-# Note that the tp_size and sp_size should be the same and equal to the number
-# of GPUs. They are used for different parallel groups. sp_size is used for
-# dit model and tp_size is used for encoder models.
fastvideo generate \
--model-path $MODEL_BASE \
--sp-size $num_gpus \