Draft
Commits (changes shown from 51 of 60 commits)
dc85bbf
update
a-r-r-o-w Mar 9, 2025
755fee8
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 9, 2025
8812036
update
a-r-r-o-w Mar 10, 2025
e39d255
update
a-r-r-o-w Mar 10, 2025
18bd9ce
update
a-r-r-o-w Mar 10, 2025
3ef07fc
update
a-r-r-o-w Mar 11, 2025
ea07973
update
a-r-r-o-w Mar 11, 2025
9f3d2cb
add valid names to dataset docs
neph1 Mar 11, 2025
2af75b1
update
a-r-r-o-w Mar 12, 2025
28b86c8
update
a-r-r-o-w Mar 13, 2025
84ffbd3
update
a-r-r-o-w Mar 13, 2025
1684ee5
update
a-r-r-o-w Mar 13, 2025
483e891
update
a-r-r-o-w Mar 13, 2025
657fb74
update
a-r-r-o-w Mar 13, 2025
cd859b3
update
a-r-r-o-w Mar 13, 2025
8cea261
update
a-r-r-o-w Mar 13, 2025
90d6d38
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 13, 2025
45bbf22
fix
a-r-r-o-w Mar 14, 2025
825976d
update
a-r-r-o-w Mar 15, 2025
053757d
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 15, 2025
eaafeab
update
a-r-r-o-w Mar 15, 2025
9144f28
Merge branch 'feature/control-lora-trainer' of https://github.com/a-r…
a-r-r-o-w Mar 15, 2025
3745ae5
update
a-r-r-o-w Mar 15, 2025
d7ba5e1
Merge branch 'main' of github.com:neph1/finetrainers
neph1 Mar 15, 2025
7245b5a
update
a-r-r-o-w Mar 16, 2025
c1c600f
update
a-r-r-o-w Mar 16, 2025
e1ef448
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 18, 2025
8587874
update
a-r-r-o-w Mar 19, 2025
495e2b1
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 19, 2025
322d610
Merge branch 'feature/control-lora-trainer' into feature/control-lora…
a-r-r-o-w Mar 19, 2025
2aeca67
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 20, 2025
f256ea7
Merge branch 'feature/control-lora-trainer' into feature/control-lora…
a-r-r-o-w Mar 20, 2025
c9fa316
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 21, 2025
ea09ef7
Merge branch 'feature/control-lora-trainer' into feature/control-lora…
a-r-r-o-w Mar 21, 2025
1722964
Merge branch 'main' of github.com:neph1/finetrainers
neph1 Mar 24, 2025
c80c995
hunyuan control specification
neph1 Mar 27, 2025
232508d
hunyuan control lora
neph1 Mar 29, 2025
2853be5
refactor
neph1 Mar 30, 2025
d014b04
clean up
neph1 Mar 30, 2025
7db51c4
free memory for single gpus
neph1 Mar 30, 2025
f5fb737
init as hunyuan base
neph1 Mar 31, 2025
b2b77a8
move back transformer to device after pass
neph1 Mar 31, 2025
916bd33
add todo about updating patch embedding layer
neph1 Mar 31, 2025
3c53e1e
update x_embed.proj
neph1 Apr 2, 2025
52a6034
output_names == 1 for sft trainer
neph1 Apr 6, 2025
197e2fe
Merge branch 'main' of github.com:neph1/finetrainers
neph1 Apr 11, 2025
022fcfe
Merge branch 'main' of github.com:neph1/finetrainers
neph1 Apr 12, 2025
7d31522
Merge branch 'main' into control-lora-trainer-hunyuan
neph1 Apr 12, 2025
d4002ce
apply latest changes
neph1 Apr 12, 2025
4fcb7c6
remove legacy dataset
neph1 Apr 12, 2025
d894b05
clean up
neph1 Apr 13, 2025
9d43e8a
remove hunyuan_common.py
neph1 Apr 13, 2025
9be0e0c
fix sorting
neph1 Apr 13, 2025
1e12216
fixes
neph1 Apr 13, 2025
681a62f
add training script and remove omni
neph1 Apr 15, 2025
edc50a8
fix quality
neph1 Apr 15, 2025
09f8b7d
optimize imports
neph1 Apr 16, 2025
1d6e74f
fixes
neph1 Apr 16, 2025
12d61f3
move import
neph1 Apr 18, 2025
3706569
reverting unnecessary hunyuan changes
neph1 Apr 27, 2025
168 changes: 168 additions & 0 deletions examples/training/control/hunyuan_video/omni_edit/train.sh
@@ -0,0 +1,168 @@
#!/bin/bash

set -e -x

# export TORCH_LOGS="+dynamo,recompiles,graph_breaks"
# export TORCHDYNAMO_VERBOSE=1
export WANDB_MODE="offline"
export NCCL_P2P_DISABLE=1
export NCCL_IB_DISABLE=1
export TORCH_NCCL_ENABLE_MONITORING=0
export FINETRAINERS_LOG_LEVEL="INFO"

# Finetrainers supports multiple backends for distributed training. Select your favourite and benchmark the differences!
# BACKEND="accelerate"
BACKEND="ptd"

# In this setting, we're using 8 GPUs on an 8-GPU node for training
NUM_GPUS=8
CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"

# Check the JSON files for the expected JSON format
TRAINING_DATASET_CONFIG="examples/training/control/cogview4/omni_edit/training.json"
VALIDATION_DATASET_FILE="examples/training/control/cogview4/omni_edit/validation.json"

# Depending on how many GPUs you have available, choose your degree of parallelism and technique!
DDP_1="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 2 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 4 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_8="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 8 --dp_shards 1 --cp_degree 1 --tp_degree 1"
FSDP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 2 --cp_degree 1 --tp_degree 1"
FSDP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 4 --cp_degree 1 --tp_degree 1"
HSDP_2_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 2 --dp_shards 2 --cp_degree 1 --tp_degree 1"

# Parallel arguments
parallel_cmd=(
$DDP_1
)

# Model arguments
model_cmd=(
--model_name "hunyuan_video"
--pretrained_model_name_or_path "hunyuanvideo-community/HunyuanVideo"
)

# Control arguments
control_cmd=(
--control_type custom
--rank 32
--lora_alpha 32
--target_modules "transformer_blocks.*(to_q|to_k|to_v|to_out.0)"
)

# Dataset arguments
dataset_cmd=(
--dataset_config $TRAINING_DATASET_CONFIG
--dataset_shuffle_buffer_size 16
--enable_precomputation
--precomputation_items 16
)

# Dataloader arguments
dataloader_cmd=(
--dataloader_num_workers 0
)

# Diffusion arguments
diffusion_cmd=(
--flow_weighting_scheme "logit_normal"
)

# Training arguments
# We target just the attention projection layers for LoRA training here (see --target_modules above).
# You can modify this to target any layer (regex patterns are supported).
training_cmd=(
--training_type control-lora
--seed 42
--batch_size 1
--train_steps 10000
--gradient_accumulation_steps 4
--gradient_checkpointing
--checkpointing_steps 1000
--checkpointing_limit 5
# --resume_from_checkpoint 3000
--enable_slicing
--enable_tiling
)

# Optimizer arguments
optimizer_cmd=(
--optimizer "adamw"
--lr 3e-5
--lr_scheduler "constant_with_warmup"
--lr_warmup_steps 2000
--lr_num_cycles 1
--beta1 0.9
--beta2 0.99
--weight_decay 1e-4
--epsilon 1e-8
--max_grad_norm 1.0
)

# Validation arguments
validation_cmd=(
--validation_dataset_file "$VALIDATION_DATASET_FILE"
--validation_steps 500
)

# Miscellaneous arguments
miscellaneous_cmd=(
--tracker_name "finetrainers"
--output_dir "/fsx/aryan/lora-training/hunyuanvideo"
--init_timeout 600
--nccl_timeout 600
--report_to "none"
)

# Execute the training script
if [ "$BACKEND" == "accelerate" ]; then

ACCELERATE_CONFIG_FILE=""
if [ "$NUM_GPUS" == 1 ]; then
ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_1.yaml"
elif [ "$NUM_GPUS" == 2 ]; then
ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_2.yaml"
elif [ "$NUM_GPUS" == 4 ]; then
ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_4.yaml"
elif [ "$NUM_GPUS" == 8 ]; then
ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_8.yaml"
fi

export WORLD_SIZE=$NUM_GPUS
export RANK=0
export MASTER_ADDR=localhost
export MASTER_PORT=0

accelerate launch --config_file "$ACCELERATE_CONFIG_FILE" --gpu_ids $CUDA_VISIBLE_DEVICES train.py \
"${parallel_cmd[@]}" \
"${model_cmd[@]}" \
"${control_cmd[@]}" \
"${dataset_cmd[@]}" \
"${dataloader_cmd[@]}" \
"${diffusion_cmd[@]}" \
"${training_cmd[@]}" \
"${optimizer_cmd[@]}" \
"${miscellaneous_cmd[@]}"

elif [ "$BACKEND" == "ptd" ]; then

export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
torchrun \
--standalone \
--nnodes=1 \
--nproc_per_node=$NUM_GPUS \
--rdzv_backend c10d \
--rdzv_endpoint="localhost:19242" \
train.py \
"${parallel_cmd[@]}" \
"${model_cmd[@]}" \
"${control_cmd[@]}" \
"${dataset_cmd[@]}" \
"${dataloader_cmd[@]}" \
"${diffusion_cmd[@]}" \
"${training_cmd[@]}" \
"${optimizer_cmd[@]}" \
"${miscellaneous_cmd[@]}"
fi

echo -ne "-------------------- Finished executing script --------------------\n\n"
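As a quick aside on the --target_modules value used above: PEFT typically treats a string value as a regular expression and applies it with re.fullmatch against each module name, so the pattern can be sanity-checked offline. A minimal sketch follows; the module names are illustrative examples of HunyuanVideo attention projections, not an exhaustive list.

import re

pattern = r"transformer_blocks.*(to_q|to_k|to_v|to_out.0)"
candidates = [
    "transformer_blocks.0.attn.to_q",          # matched: double-stream attention query projection
    "transformer_blocks.12.attn.to_out.0",     # matched: attention output projection
    "single_transformer_blocks.0.attn.to_k",   # not matched: different block prefix
    "transformer_blocks.0.ff.net.0.proj",      # not matched: feed-forward, not an attention projection
]
for name in candidates:
    print(name, bool(re.fullmatch(pattern, name)))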
3 changes: 3 additions & 0 deletions finetrainers/config.py
@@ -1,6 +1,8 @@
from enum import Enum
from typing import Type

from finetrainers.models.hunyuan_video.control_specification import HunyuanVideoControlModelSpecification

from .models import ModelSpecification
from .models.cogvideox import CogVideoXModelSpecification
from .models.cogview4 import CogView4ControlModelSpecification, CogView4ModelSpecification
@@ -49,6 +51,7 @@ class TrainingType(str, Enum):
ModelType.HUNYUAN_VIDEO: {
TrainingType.LORA: HunyuanVideoModelSpecification,
TrainingType.FULL_FINETUNE: HunyuanVideoModelSpecification,
TrainingType.CONTROL_LORA: HunyuanVideoControlModelSpecification,
},
ModelType.LTX_VIDEO: {
TrainingType.LORA: LTXVideoModelSpecification,
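For orientation, a rough sketch of how the new entry is resolved at runtime. The registry dict name below is a placeholder (the real name in finetrainers/config.py is not visible in this hunk), and it is assumed that ModelType and TrainingType are importable from finetrainers.config; the specification classes are exported from the package __init__ shown in the next file.

from finetrainers.config import ModelType, TrainingType
from finetrainers.models.hunyuan_video import (
    HunyuanVideoControlModelSpecification,
    HunyuanVideoModelSpecification,
)

# Placeholder name for the mapping this hunk extends.
MODEL_SPEC_REGISTRY = {
    ModelType.HUNYUAN_VIDEO: {
        TrainingType.LORA: HunyuanVideoModelSpecification,
        TrainingType.FULL_FINETUNE: HunyuanVideoModelSpecification,
        TrainingType.CONTROL_LORA: HunyuanVideoControlModelSpecification,
    },
}

# Selecting --model_name hunyuan_video with --training_type control-lora
# resolves to the new control specification.
spec_cls = MODEL_SPEC_REGISTRY[ModelType.HUNYUAN_VIDEO][TrainingType.CONTROL_LORA]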
1 change: 1 addition & 0 deletions finetrainers/models/hunyuan_video/__init__.py
@@ -1 +1,2 @@
from .base_specification import HunyuanVideoModelSpecification
from .control_specification import HunyuanVideoControlModelSpecification
137 changes: 29 additions & 108 deletions finetrainers/models/hunyuan_video/base_specification.py
@@ -13,6 +13,8 @@
from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution
from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer, LlamaModel

from finetrainers.models.hunyuan_video import hunyuan_common

import finetrainers.functional as FF
from finetrainers.data import VideoArtifact
from finetrainers.logging import get_logger
@@ -38,7 +40,7 @@ class HunyuanLatentEncodeProcessor(ProcessorMixin):
def __init__(self, output_names: List[str]):
super().__init__()
self.output_names = output_names
assert len(self.output_names) == 1
assert len(self.output_names) == 3

def forward(
self,
@@ -58,18 +60,24 @@ def forward(
video = video.to(device=device, dtype=vae.dtype)
video = video.permute(0, 2, 1, 3, 4).contiguous() # [B, F, C, H, W] -> [B, C, F, H, W]

compute_posterior = False
Inline review comment (PR author):
So far only made it work with compute_posterior false

if compute_posterior:
latents = vae.encode(video).latent_dist.sample(generator=generator)
latents = latents.to(dtype=dtype)
else:
if vae.use_slicing and video.shape[0] > 1:
encoded_slices = [vae._encode(x_slice) for x_slice in video.split(1)]
moments = torch.cat(encoded_slices)
else:
moments = vae._encode(video)
# TODO(aryan): refactor in diffusers to have use_slicing attribute
# if vae.use_slicing and video.shape[0] > 1:
# encoded_slices = [vae._encode(x_slice) for x_slice in video.split(1)]
# moments = torch.cat(encoded_slices)
# else:
# moments = vae._encode(video)
moments = vae._encode(video)
latents = moments.to(dtype=dtype)

return {self.output_names[0]: latents}
latents_mean = torch.zeros((vae.latent_channels,), requires_grad=False)
latents_std = torch.ones((vae.latent_channels,), requires_grad=False)

return {self.output_names[0]: latents, self.output_names[1]: latents_mean, self.output_names[2]: latents_std}
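To make the deferred-posterior path above easier to follow: with compute_posterior=False the processor stores the raw VAE moments (mean and log-variance concatenated along the channel dimension) plus zero/one normalization placeholders, and the posterior is only constructed and sampled later in forward(). A minimal sketch with illustrative shapes (HunyuanVideo's VAE is assumed to have 16 latent channels), assuming _normalize_latents is the usual (x - mean) / std, which the zero mean and unit std turn into a pass-through:

import torch
from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution

moments = torch.randn(1, 32, 4, 30, 40)   # stand-in for vae._encode(video): [B, 2*C, F', H', W']
latents_mean = torch.zeros((16,))          # identity-normalization placeholders,
latents_std = torch.ones((16,))            # exactly as returned by the processor above

# Later, in forward(), the moments are chunked, (no-op) normalized, and sampled:
mu, logvar = torch.chunk(moments, 2, dim=1)
posterior = DiagonalGaussianDistribution(torch.cat([mu, logvar], dim=1))
latents = posterior.sample()               # [B, C, F', H', W']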


class HunyuanVideoModelSpecification(ModelSpecification):
@@ -115,7 +123,7 @@ def __init__(
),
]
if latent_model_processors is None:
latent_model_processors = [HunyuanLatentEncodeProcessor(["latents"])]
latent_model_processors = [HunyuanLatentEncodeProcessor(["latents", "latents_mean", "latents_std"])]

self.condition_model_processors = condition_model_processors
self.latent_model_processors = latent_model_processors
@@ -124,65 +132,11 @@
def _resolution_dim_keys(self):
return {"latents": (2, 3, 4)}

def load_condition_models(self) -> Dict[str, torch.nn.Module]:
common_kwargs = {"revision": self.revision, "cache_dir": self.cache_dir}

if self.tokenizer_id is not None:
tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_id, **common_kwargs)
else:
tokenizer = AutoTokenizer.from_pretrained(
self.pretrained_model_name_or_path, subfolder="tokenizer", **common_kwargs
)

if self.tokenizer_2_id is not None:
tokenizer_2 = AutoTokenizer.from_pretrained(self.tokenizer_2_id, **common_kwargs)
else:
tokenizer_2 = CLIPTokenizer.from_pretrained(
self.pretrained_model_name_or_path, subfolder="tokenizer_2", **common_kwargs
)
load_condition_models = hunyuan_common.load_condition_models

if self.text_encoder_id is not None:
text_encoder = LlamaModel.from_pretrained(
self.text_encoder_id, torch_dtype=self.text_encoder_dtype, **common_kwargs
)
else:
text_encoder = LlamaModel.from_pretrained(
self.pretrained_model_name_or_path,
subfolder="text_encoder",
torch_dtype=self.text_encoder_dtype,
**common_kwargs,
)
load_latent_models = hunyuan_common.load_latent_models

if self.text_encoder_2_id is not None:
text_encoder_2 = CLIPTextModel.from_pretrained(
self.text_encoder_2_id, torch_dtype=self.text_encoder_2_dtype, **common_kwargs
)
else:
text_encoder_2 = CLIPTextModel.from_pretrained(
self.pretrained_model_name_or_path,
subfolder="text_encoder_2",
torch_dtype=self.text_encoder_2_dtype,
**common_kwargs,
)

return {
"tokenizer": tokenizer,
"tokenizer_2": tokenizer_2,
"text_encoder": text_encoder,
"text_encoder_2": text_encoder_2,
}

def load_latent_models(self) -> Dict[str, torch.nn.Module]:
common_kwargs = {"revision": self.revision, "cache_dir": self.cache_dir}

if self.vae_id is not None:
vae = AutoencoderKLHunyuanVideo.from_pretrained(self.vae_id, torch_dtype=self.vae_dtype, **common_kwargs)
else:
vae = AutoencoderKLHunyuanVideo.from_pretrained(
self.pretrained_model_name_or_path, subfolder="vae", torch_dtype=self.vae_dtype, **common_kwargs
)

return {"vae": vae}
load_pipeline = hunyuan_common.load_pipeline

def load_diffusion_models(self) -> Dict[str, torch.nn.Module]:
common_kwargs = {"revision": self.revision, "cache_dir": self.cache_dir}
@@ -203,46 +157,6 @@ def load_diffusion_models(self) -> Dict[str, torch.nn.Module]:

return {"transformer": transformer, "scheduler": scheduler}

def load_pipeline(
self,
tokenizer: Optional[AutoTokenizer] = None,
tokenizer_2: Optional[CLIPTokenizer] = None,
text_encoder: Optional[LlamaModel] = None,
text_encoder_2: Optional[CLIPTextModel] = None,
transformer: Optional[HunyuanVideoTransformer3DModel] = None,
vae: Optional[AutoencoderKLHunyuanVideo] = None,
scheduler: Optional[FlowMatchEulerDiscreteScheduler] = None,
enable_slicing: bool = False,
enable_tiling: bool = False,
enable_model_cpu_offload: bool = False,
training: bool = False,
**kwargs,
) -> HunyuanVideoPipeline:
components = {
"tokenizer": tokenizer,
"tokenizer_2": tokenizer_2,
"text_encoder": text_encoder,
"text_encoder_2": text_encoder_2,
"transformer": transformer,
"vae": vae,
"scheduler": scheduler,
}
components = get_non_null_items(components)

pipe = HunyuanVideoPipeline.from_pretrained(
self.pretrained_model_name_or_path, **components, revision=self.revision, cache_dir=self.cache_dir
)
pipe.text_encoder.to(self.text_encoder_dtype)
pipe.text_encoder_2.to(self.text_encoder_2_dtype)
pipe.vae.to(self.vae_dtype)

_enable_vae_memory_optimizations(pipe.vae, enable_slicing, enable_tiling)
if not training:
pipe.transformer.to(self.transformer_dtype)
if enable_model_cpu_offload:
pipe.enable_model_cpu_offload()
return pipe

@torch.no_grad()
def prepare_conditions(
self,
@@ -305,14 +219,21 @@ def forward(
if compute_posterior:
latents = latent_model_conditions.pop("latents")
else:
posterior = DiagonalGaussianDistribution(latent_model_conditions.pop("latents"))
latents = latent_model_conditions.pop("latents")
latents_mean = latent_model_conditions.pop("latents_mean")
latents_std = latent_model_conditions.pop("latents_std")

mu, logvar = torch.chunk(latents, 2, dim=1)
mu = self._normalize_latents(mu, latents_mean, latents_std)
logvar = self._normalize_latents(logvar, latents_mean, latents_std)
latents = torch.cat([mu, logvar], dim=1)

posterior = DiagonalGaussianDistribution(latents)
latents = posterior.sample(generator=generator)
del posterior

latents = latents * self.vae_config.scaling_factor
noise = torch.zeros_like(latents).normal_(generator=generator)
noisy_latents = FF.flow_match_xt(latents, noise, sigmas)

timesteps = (sigmas.flatten() * 1000.0).long()
guidance = latents.new_full((latents.size(0),), fill_value=guidance) * 1000.0
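For reference, a compact sketch of the flow-matching noising step in this hunk, assuming FF.flow_match_xt implements the standard rectified-flow interpolation; the function body below is an assumption for illustration, not the finetrainers source.

import torch

def flow_match_xt(x0: torch.Tensor, noise: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
    # x_t = (1 - t) * x_0 + t * noise, with t broadcast over [B, C, F, H, W]
    return (1.0 - t) * x0 + t * noise

latents = torch.randn(1, 16, 4, 30, 40)
noise = torch.randn_like(latents)
sigmas = torch.rand(1, 1, 1, 1, 1)                      # one sigma per batch element
noisy_latents = flow_match_xt(latents, noise, sigmas)
timesteps = (sigmas.flatten() * 1000.0).long()          # mirrors the timestep computation above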
