Draft
Commits (60)
dc85bbf
update
a-r-r-o-w Mar 9, 2025
755fee8
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 9, 2025
8812036
update
a-r-r-o-w Mar 10, 2025
e39d255
update
a-r-r-o-w Mar 10, 2025
18bd9ce
update
a-r-r-o-w Mar 10, 2025
3ef07fc
update
a-r-r-o-w Mar 11, 2025
ea07973
update
a-r-r-o-w Mar 11, 2025
9f3d2cb
add valid names to dataset docs
neph1 Mar 11, 2025
2af75b1
update
a-r-r-o-w Mar 12, 2025
28b86c8
update
a-r-r-o-w Mar 13, 2025
84ffbd3
update
a-r-r-o-w Mar 13, 2025
1684ee5
update
a-r-r-o-w Mar 13, 2025
483e891
update
a-r-r-o-w Mar 13, 2025
657fb74
update
a-r-r-o-w Mar 13, 2025
cd859b3
update
a-r-r-o-w Mar 13, 2025
8cea261
update
a-r-r-o-w Mar 13, 2025
90d6d38
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 13, 2025
45bbf22
fix
a-r-r-o-w Mar 14, 2025
825976d
update
a-r-r-o-w Mar 15, 2025
053757d
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 15, 2025
eaafeab
update
a-r-r-o-w Mar 15, 2025
9144f28
Merge branch 'feature/control-lora-trainer' of https://github.com/a-r…
a-r-r-o-w Mar 15, 2025
3745ae5
update
a-r-r-o-w Mar 15, 2025
d7ba5e1
Merge branch 'main' of github.com:neph1/finetrainers
neph1 Mar 15, 2025
7245b5a
update
a-r-r-o-w Mar 16, 2025
c1c600f
update
a-r-r-o-w Mar 16, 2025
e1ef448
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 18, 2025
8587874
update
a-r-r-o-w Mar 19, 2025
495e2b1
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 19, 2025
322d610
Merge branch 'feature/control-lora-trainer' into feature/control-lora…
a-r-r-o-w Mar 19, 2025
2aeca67
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 20, 2025
f256ea7
Merge branch 'feature/control-lora-trainer' into feature/control-lora…
a-r-r-o-w Mar 20, 2025
c9fa316
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 21, 2025
ea09ef7
Merge branch 'feature/control-lora-trainer' into feature/control-lora…
a-r-r-o-w Mar 21, 2025
1722964
Merge branch 'main' of github.com:neph1/finetrainers
neph1 Mar 24, 2025
c80c995
hunyuan control specification
neph1 Mar 27, 2025
232508d
hunyuan control lora
neph1 Mar 29, 2025
2853be5
refactor
neph1 Mar 30, 2025
d014b04
clean up
neph1 Mar 30, 2025
7db51c4
free memory for single gpus
neph1 Mar 30, 2025
f5fb737
init as hunyuan base
neph1 Mar 31, 2025
b2b77a8
move back transformer to device after pass
neph1 Mar 31, 2025
916bd33
add todo about updating patch embedding layer
neph1 Mar 31, 2025
3c53e1e
update x_embed.proj
neph1 Apr 2, 2025
52a6034
output_names == 1 for sft trainer
neph1 Apr 6, 2025
197e2fe
Merge branch 'main' of github.com:neph1/finetrainers
neph1 Apr 11, 2025
022fcfe
Merge branch 'main' of github.com:neph1/finetrainers
neph1 Apr 12, 2025
7d31522
Merge branch 'main' into control-lora-trainer-hunyuan
neph1 Apr 12, 2025
d4002ce
apply latest changes
neph1 Apr 12, 2025
4fcb7c6
remove legacy dataset
neph1 Apr 12, 2025
d894b05
clean up
neph1 Apr 13, 2025
9d43e8a
remove hunyuan_common.py
neph1 Apr 13, 2025
9be0e0c
fix sorting
neph1 Apr 13, 2025
1e12216
fixes
neph1 Apr 13, 2025
681a62f
add training script and remove omni
neph1 Apr 15, 2025
edc50a8
fix quality
neph1 Apr 15, 2025
09f8b7d
optimize imports
neph1 Apr 16, 2025
1d6e74f
fixes
neph1 Apr 16, 2025
12d61f3
move import
neph1 Apr 18, 2025
3706569
reverting unnecessary hunyuan changes
neph1 Apr 27, 2025
11 changes: 11 additions & 0 deletions examples/formats/hunyuan_video/convert_to_original_format.py
@@ -108,6 +108,17 @@ def convert_lora_sd(diffusers_lora_sd):
elif "proj_out" in key:
new_key = key.replace("proj_out", "linear2").replace(single_block_pattern, prefix + "single_blocks")
converted_lora_sd[new_key] = diffusers_lora_sd[key]
elif "x_embedder" in key:
new_key = key.replace("x_embedder", "img_in").replace(double_block_pattern, prefix + "")
if "lora_A" in key:
embed = diffusers_lora_sd[key]
sizes = embed.size()
x_reshaped = embed.view(sizes[0], 16, sizes[2], sizes[3], sizes[4], 2)
x_meaned = x_reshaped.mean(dim=2)
converted_lora_sd[new_key] = x_meaned
else:
converted_lora_sd[new_key] = diffusers_lora_sd[key]
print(new_key, diffusers_lora_sd[key].size())

else:
print(f"unknown or not implemented: {key}")
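To make the shape bookkeeping in the x_embedder branch concrete, here is a standalone trace with illustrative sizes. It assumes lora_A is a Conv3d kernel of shape [rank, in_channels, kT, kH, kW] whose input channels were doubled from 16 to 32 to accept the concatenated control latents; everything here is an assumption for illustration, not the converter's API.

import torch

# Illustrative sizes: a rank-4 LoRA on a (1, 2, 2) patch-embedding kernel.
rank, k_t, k_h, k_w = 4, 1, 2, 2

# Hypothetical lora_A weight of the expanded patch embedding: input channels
# doubled to 32 because control latents are concatenated channel-wise.
embed = torch.randn(rank, 32, k_t, k_h, k_w)

sizes = embed.size()
# Same ops as the conversion above: relabel the elements so a trailing pair
# dimension appears, then average over dim 2 (which is k_t in this view).
x_reshaped = embed.view(sizes[0], 16, sizes[2], sizes[3], sizes[4], 2)
x_meaned = x_reshaped.mean(dim=2)

print(x_reshaped.shape)  # torch.Size([4, 16, 1, 2, 2, 2])
print(x_meaned.shape)    # torch.Size([4, 16, 2, 2, 2])

Note that with these sizes the averaged weight ends up as [rank, 16, kH, kW, 2] rather than [rank, 16, kT, kH, kW]; whether that matches the original-format img_in layout is worth verifying against the target checkpoint.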
175 changes: 175 additions & 0 deletions examples/training/control/hunyuan_video/image_condition/train.sh
@@ -0,0 +1,175 @@
#!/bin/bash

set -e -x

# export TORCH_LOGS="+dynamo,recompiles,graph_breaks"
# export TORCHDYNAMO_VERBOSE=1
export WANDB_MODE="offline"
export NCCL_P2P_DISABLE=1
export NCCL_IB_DISABLE=1
export TORCH_NCCL_ENABLE_MONITORING=0
export FINETRAINERS_LOG_LEVEL="INFO"

# Download the validation dataset
if [ ! -d "examples/training/control/wan/image_condition/validation_dataset" ]; then
  echo "Downloading validation dataset..."
  huggingface-cli download --repo-type dataset finetrainers/OpenVid-1k-split-validation --local-dir examples/training/control/wan/image_condition/validation_dataset
else
  echo "Validation dataset already exists. Skipping download."
fi

# Finetrainers supports multiple backends for distributed training. Select your favourite and benchmark the differences!
# BACKEND="accelerate"
BACKEND="ptd"

# In this setting, I'm using 1 GPU on a 4-GPU node for training
NUM_GPUS=1
CUDA_VISIBLE_DEVICES="3"

# Check the JSON files for the expected JSON format
TRAINING_DATASET_CONFIG="examples/training/control/hunyuan_video/image_condition/training.json"
VALIDATION_DATASET_FILE="examples/training/control/hunyuan_video/image_condition/validation.json"

# Depending on how many GPUs you have available, choose your degree of parallelism and technique!
DDP_1="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 2 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 4 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_8="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 8 --dp_shards 1 --cp_degree 1 --tp_degree 1"
FSDP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 2 --cp_degree 1 --tp_degree 1"
FSDP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 4 --cp_degree 1 --tp_degree 1"
HSDP_2_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 2 --dp_shards 2 --cp_degree 1 --tp_degree 1"

# Parallel arguments
parallel_cmd=(
  $DDP_1
)

# Model arguments
model_cmd=(
  --model_name "hunyuan_video"
  --pretrained_model_name_or_path "hunyuanvideo-community/HunyuanVideo"
  --compile_modules transformer
)

# Control arguments
control_cmd=(
  --control_type none
  --rank 128
  --lora_alpha 128
  --target_modules "blocks.*(to_q|to_k|to_v|to_out.0|ff.net.0.proj|ff.net.2)"
  --frame_conditioning_type index
  --frame_conditioning_index 0
)

# Dataset arguments
dataset_cmd=(
  --dataset_config $TRAINING_DATASET_CONFIG
  --dataset_shuffle_buffer_size 32
)

# Dataloader arguments
dataloader_cmd=(
  --dataloader_num_workers 0
)

# Diffusion arguments
diffusion_cmd=(
  --flow_weighting_scheme "logit_normal"
)

# Training arguments
# We target just the attention projection layers for LoRA training here.
# You can modify this as you please and target any layer (regex is supported).
training_cmd=(
  --training_type control-lora
  --seed 42
  --batch_size 1
  --train_steps 10000
  --gradient_accumulation_steps 1
  --gradient_checkpointing
  --checkpointing_steps 1000
  --checkpointing_limit 2
  # --resume_from_checkpoint 3000
  --enable_slicing
  --enable_tiling
)

# Optimizer arguments
optimizer_cmd=(
  --optimizer "adamw"
  --lr 2e-5
  --lr_scheduler "constant_with_warmup"
  --lr_warmup_steps 1000
  --lr_num_cycles 1
  --beta1 0.9
  --beta2 0.99
  --weight_decay 1e-4
  --epsilon 1e-8
  --max_grad_norm 1.0
)

# Validation arguments
validation_cmd=(
  --validation_dataset_file "$VALIDATION_DATASET_FILE"
  --validation_steps 501
)

# Miscellaneous arguments
miscellaneous_cmd=(
  --tracker_name "finetrainers-hunyuan_video-control"
  --output_dir "/raid/aryan/hunyuan_video-control-image-condition"
  --init_timeout 600
  --nccl_timeout 600
  --report_to "wandb"
)

# Execute the training script
if [ "$BACKEND" == "accelerate" ]; then

ACCELERATE_CONFIG_FILE=""
if [ "$NUM_GPUS" == 1 ]; then
ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_1.yaml"
elif [ "$NUM_GPUS" == 2 ]; then
ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_2.yaml"
elif [ "$NUM_GPUS" == 4 ]; then
ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_4.yaml"
elif [ "$NUM_GPUS" == 8 ]; then
ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_8.yaml"
fi

accelerate launch --config_file "$ACCELERATE_CONFIG_FILE" --gpu_ids $CUDA_VISIBLE_DEVICES train.py \
"${parallel_cmd[@]}" \
"${model_cmd[@]}" \
"${control_cmd[@]}" \
"${dataset_cmd[@]}" \
"${dataloader_cmd[@]}" \
"${diffusion_cmd[@]}" \
"${training_cmd[@]}" \
"${optimizer_cmd[@]}" \
"${validation_cmd[@]}" \
"${miscellaneous_cmd[@]}"

elif [ "$BACKEND" == "ptd" ]; then

export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES

torchrun \
--standalone \
--nnodes=1 \
--nproc_per_node=$NUM_GPUS \
--rdzv_backend c10d \
--rdzv_endpoint="localhost:19242" \
train.py \
"${parallel_cmd[@]}" \
"${model_cmd[@]}" \
"${control_cmd[@]}" \
"${dataset_cmd[@]}" \
"${dataloader_cmd[@]}" \
"${diffusion_cmd[@]}" \
"${training_cmd[@]}" \
"${optimizer_cmd[@]}" \
"${validation_cmd[@]}" \
"${miscellaneous_cmd[@]}"
fi

echo -ne "-------------------- Finished executing script --------------------\n\n"
3 changes: 2 additions & 1 deletion finetrainers/config.py
@@ -5,7 +5,7 @@
from .models.cogvideox import CogVideoXModelSpecification
from .models.cogview4 import CogView4ControlModelSpecification, CogView4ModelSpecification
from .models.flux import FluxModelSpecification
from .models.hunyuan_video import HunyuanVideoModelSpecification
from .models.hunyuan_video import HunyuanVideoControlModelSpecification, HunyuanVideoModelSpecification
from .models.ltx_video import LTXVideoModelSpecification
from .models.wan import WanControlModelSpecification, WanModelSpecification

@@ -49,6 +49,7 @@ class TrainingType(str, Enum):
    ModelType.HUNYUAN_VIDEO: {
        TrainingType.LORA: HunyuanVideoModelSpecification,
        TrainingType.FULL_FINETUNE: HunyuanVideoModelSpecification,
        TrainingType.CONTROL_LORA: HunyuanVideoControlModelSpecification,
    },
    ModelType.LTX_VIDEO: {
        TrainingType.LORA: LTXVideoModelSpecification,
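The registry change above is the piece that makes --model_name hunyuan_video --training_type control-lora resolvable at startup. A rough sketch of how such a lookup behaves, using a hypothetical resolve_specification helper and string stand-ins for the spec classes (finetrainers' actual resolution code may differ):

from enum import Enum


class ModelType(str, Enum):
    HUNYUAN_VIDEO = "hunyuan_video"


class TrainingType(str, Enum):
    LORA = "lora"
    FULL_FINETUNE = "full-finetune"
    CONTROL_LORA = "control-lora"


# String stand-ins for the specification classes registered in finetrainers/config.py.
SUPPORTED_MODEL_CONFIGS = {
    ModelType.HUNYUAN_VIDEO: {
        TrainingType.LORA: "HunyuanVideoModelSpecification",
        TrainingType.FULL_FINETUNE: "HunyuanVideoModelSpecification",
        TrainingType.CONTROL_LORA: "HunyuanVideoControlModelSpecification",
    },
}


def resolve_specification(model_name: str, training_type: str) -> str:
    # Hypothetical helper: map CLI strings to enum members, then index the registry.
    try:
        return SUPPORTED_MODEL_CONFIGS[ModelType(model_name)][TrainingType(training_type)]
    except KeyError as e:
        raise ValueError(f"No specification registered for {model_name} / {training_type}") from e


print(resolve_specification("hunyuan_video", "control-lora"))  # HunyuanVideoControlModelSpecification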
1 change: 1 addition & 0 deletions finetrainers/models/hunyuan_video/__init__.py
@@ -1 +1,2 @@
from .base_specification import HunyuanVideoModelSpecification
from .control_specification import HunyuanVideoControlModelSpecification
33 changes: 24 additions & 9 deletions finetrainers/models/hunyuan_video/base_specification.py
@@ -38,7 +38,7 @@ class HunyuanLatentEncodeProcessor(ProcessorMixin):
    def __init__(self, output_names: List[str]):
        super().__init__()
        self.output_names = output_names
        assert len(self.output_names) == 1
        assert len(self.output_names) == 3

    def forward(
        self,
@@ -58,18 +58,24 @@ def forward(
        video = video.to(device=device, dtype=vae.dtype)
        video = video.permute(0, 2, 1, 3, 4).contiguous()  # [B, F, C, H, W] -> [B, C, F, H, W]

        compute_posterior = False

[Review comment, neph1 (author)]: So far I've only made it work with compute_posterior=False.
        if compute_posterior:
            latents = vae.encode(video).latent_dist.sample(generator=generator)
            latents = latents.to(dtype=dtype)
        else:
            if vae.use_slicing and video.shape[0] > 1:
                encoded_slices = [vae._encode(x_slice) for x_slice in video.split(1)]
                moments = torch.cat(encoded_slices)
            else:
                moments = vae._encode(video)
            # TODO(aryan): refactor in diffusers to have use_slicing attribute
            # if vae.use_slicing and video.shape[0] > 1:
            #     encoded_slices = [vae._encode(x_slice) for x_slice in video.split(1)]
            #     moments = torch.cat(encoded_slices)
            # else:
            #     moments = vae._encode(video)
            moments = vae._encode(video)
            latents = moments.to(dtype=dtype)

        return {self.output_names[0]: latents}
        latents_mean = torch.tensor(vae.latent_channels)
[Review comment, a-r-r-o-w (contributor)]: @neph1 These changes seem incorrect to me and will cause worse generations. The previous implementation, which did not perform this normalization, was correct, I think. Was this modified from Wan? If so, it's incorrect, because they are different models and preprocess latents differently.

        latents_std = 1.0 / torch.tensor(vae.latent_channels)

        return {self.output_names[0]: latents, self.output_names[1]: latents_mean, self.output_names[2]: latents_std}
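For context on the review thread above: torch.tensor(vae.latent_channels) builds a 0-dim tensor from the channel count (an integer), not per-channel statistics, which may be part of what is being flagged. In diffusers, HunyuanVideo's VAE typically exposes a single scalar config.scaling_factor, while Wan-style VAEs carry per-channel latents_mean/latents_std. A minimal sketch of the two conventions, with illustrative names that are not the finetrainers API:

import torch


def normalize_wan_style(latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor) -> torch.Tensor:
    # Per-channel affine normalization over [B, C, F, H, W], as Wan-style models do.
    mean = latents_mean.view(1, -1, 1, 1, 1).to(latents)
    std = latents_std.view(1, -1, 1, 1, 1).to(latents)
    return (latents - mean) / std


def scale_hunyuan_style(latents: torch.Tensor, scaling_factor: float) -> torch.Tensor:
    # Single scalar scale, matching the previous implementation the reviewer prefers.
    return latents * scaling_factor


latents = torch.randn(1, 16, 4, 8, 8)
print(scale_hunyuan_style(latents, 0.476986).shape)  # scaling_factor value is illustrative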


class HunyuanVideoModelSpecification(ModelSpecification):
@@ -115,7 +121,7 @@ def __init__(
            ),
        ]
        if latent_model_processors is None:
            latent_model_processors = [HunyuanLatentEncodeProcessor(["latents"])]
            latent_model_processors = [HunyuanLatentEncodeProcessor(["latents", "latents_mean", "latents_std"])]

        self.condition_model_processors = condition_model_processors
        self.latent_model_processors = latent_model_processors
@@ -305,7 +311,16 @@ def forward(
        if compute_posterior:
            latents = latent_model_conditions.pop("latents")
        else:
            posterior = DiagonalGaussianDistribution(latent_model_conditions.pop("latents"))
            latents = latent_model_conditions.pop("latents")
            latents_mean = latent_model_conditions.pop("latents_mean")
            latents_std = latent_model_conditions.pop("latents_std")

            mu, logvar = torch.chunk(latents, 2, dim=1)
            mu = self._normalize_latents(mu, latents_mean, latents_std)
            logvar = self._normalize_latents(logvar, latents_mean, latents_std)
            latents = torch.cat([mu, logvar], dim=1)

            posterior = DiagonalGaussianDistribution(latents)
            latents = posterior.sample(generator=generator)
            del posterior
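As a side note on the compute_posterior=False path: the VAE's _encode returns raw moments with the mean and log-variance concatenated along the channel dimension, and DiagonalGaussianDistribution reparameterizes them into a latent sample. A minimal stand-in for that flow (a sketch, not the diffusers implementation):

from typing import Optional

import torch


def sample_from_moments(moments: torch.Tensor, generator: Optional[torch.Generator] = None) -> torch.Tensor:
    # moments packs [mean, logvar] along dim=1, so channels = 2 * latent_channels
    mu, logvar = torch.chunk(moments, 2, dim=1)
    logvar = torch.clamp(logvar, -30.0, 20.0)  # stability clamp, as diffusers applies
    std = torch.exp(0.5 * logvar)
    noise = torch.randn(mu.shape, generator=generator, dtype=mu.dtype)
    return mu + std * noise  # reparameterization trick


moments = torch.randn(1, 32, 5, 8, 8)  # [B, 2 * C_latent, F, H, W], illustrative sizes
latents = sample_from_moments(moments)
print(latents.shape)  # torch.Size([1, 16, 5, 8, 8])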