Draft
Commits (changes shown from 51 of 60 commits)
dc85bbf
update
a-r-r-o-w Mar 9, 2025
755fee8
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 9, 2025
8812036
update
a-r-r-o-w Mar 10, 2025
e39d255
update
a-r-r-o-w Mar 10, 2025
18bd9ce
update
a-r-r-o-w Mar 10, 2025
3ef07fc
update
a-r-r-o-w Mar 11, 2025
ea07973
update
a-r-r-o-w Mar 11, 2025
9f3d2cb
add valid names to dataset docs
neph1 Mar 11, 2025
2af75b1
update
a-r-r-o-w Mar 12, 2025
28b86c8
update
a-r-r-o-w Mar 13, 2025
84ffbd3
update
a-r-r-o-w Mar 13, 2025
1684ee5
update
a-r-r-o-w Mar 13, 2025
483e891
update
a-r-r-o-w Mar 13, 2025
657fb74
update
a-r-r-o-w Mar 13, 2025
cd859b3
update
a-r-r-o-w Mar 13, 2025
8cea261
update
a-r-r-o-w Mar 13, 2025
90d6d38
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 13, 2025
45bbf22
fix
a-r-r-o-w Mar 14, 2025
825976d
update
a-r-r-o-w Mar 15, 2025
053757d
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 15, 2025
eaafeab
update
a-r-r-o-w Mar 15, 2025
9144f28
Merge branch 'feature/control-lora-trainer' of https://github.com/a-r…
a-r-r-o-w Mar 15, 2025
3745ae5
update
a-r-r-o-w Mar 15, 2025
d7ba5e1
Merge branch 'main' of github.com:neph1/finetrainers
neph1 Mar 15, 2025
7245b5a
update
a-r-r-o-w Mar 16, 2025
c1c600f
update
a-r-r-o-w Mar 16, 2025
e1ef448
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 18, 2025
8587874
update
a-r-r-o-w Mar 19, 2025
495e2b1
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 19, 2025
322d610
Merge branch 'feature/control-lora-trainer' into feature/control-lora…
a-r-r-o-w Mar 19, 2025
2aeca67
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 20, 2025
f256ea7
Merge branch 'feature/control-lora-trainer' into feature/control-lora…
a-r-r-o-w Mar 20, 2025
c9fa316
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 21, 2025
ea09ef7
Merge branch 'feature/control-lora-trainer' into feature/control-lora…
a-r-r-o-w Mar 21, 2025
1722964
Merge branch 'main' of github.com:neph1/finetrainers
neph1 Mar 24, 2025
c80c995
hunyuan control specification
neph1 Mar 27, 2025
232508d
hunyuan control lora
neph1 Mar 29, 2025
2853be5
refactor
neph1 Mar 30, 2025
d014b04
clean up
neph1 Mar 30, 2025
7db51c4
free memory for single gpus
neph1 Mar 30, 2025
f5fb737
init as hunyuan base
neph1 Mar 31, 2025
b2b77a8
move back transformer to device after pass
neph1 Mar 31, 2025
916bd33
add todo about updating patch embedding layer
neph1 Mar 31, 2025
3c53e1e
update x_embed.proj
neph1 Apr 2, 2025
52a6034
output_names == 1 for sft trainer
neph1 Apr 6, 2025
197e2fe
Merge branch 'main' of github.com:neph1/finetrainers
neph1 Apr 11, 2025
022fcfe
Merge branch 'main' of github.com:neph1/finetrainers
neph1 Apr 12, 2025
7d31522
Merge branch 'main' into control-lora-trainer-hunyuan
neph1 Apr 12, 2025
d4002ce
apply latest changes
neph1 Apr 12, 2025
4fcb7c6
remove legacy dataset
neph1 Apr 12, 2025
d894b05
clean up
neph1 Apr 13, 2025
9d43e8a
remove hunyuan_common.py
neph1 Apr 13, 2025
9be0e0c
fix sorting
neph1 Apr 13, 2025
1e12216
fixes
neph1 Apr 13, 2025
681a62f
add training script and remove omni
neph1 Apr 15, 2025
edc50a8
fix quality
neph1 Apr 15, 2025
09f8b7d
optimize imports
neph1 Apr 16, 2025
1d6e74f
fixes
neph1 Apr 16, 2025
12d61f3
move import
neph1 Apr 18, 2025
3706569
reverting unnecessary hunyuan changes
neph1 Apr 27, 2025
168 changes: 168 additions & 0 deletions examples/training/control/hunyuan_video/omni_edit/train.sh
@@ -0,0 +1,168 @@
#!/bin/bash

set -e -x

# export TORCH_LOGS="+dynamo,recompiles,graph_breaks"
# export TORCHDYNAMO_VERBOSE=1
export WANDB_MODE="offline"
export NCCL_P2P_DISABLE=1
export NCCL_IB_DISABLE=1
export TORCH_NCCL_ENABLE_MONITORING=0
export FINETRAINERS_LOG_LEVEL="INFO"

# Finetrainers supports multiple backends for distributed training. Select your favourite and benchmark the differences!
# BACKEND="accelerate"
BACKEND="ptd"

# In this setting, we're using 8 GPUs on an 8-GPU node for training
NUM_GPUS=8
CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"

# Check the JSON files for the expected JSON format
TRAINING_DATASET_CONFIG="examples/training/control/cogview4/omni_edit/training.json"
VALIDATION_DATASET_FILE="examples/training/control/cogview4/omni_edit/validation.json"

# Depending on how many GPUs you have available, choose your degree of parallelism and technique!
DDP_1="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 2 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 4 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_8="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 8 --dp_shards 1 --cp_degree 1 --tp_degree 1"
FSDP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 2 --cp_degree 1 --tp_degree 1"
FSDP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 4 --cp_degree 1 --tp_degree 1"
HSDP_2_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 2 --dp_shards 2 --cp_degree 1 --tp_degree 1"

# Parallel arguments
parallel_cmd=(
$DDP_1
)

# Model arguments
model_cmd=(
--model_name "hunyuan_video"
--pretrained_model_name_or_path "hunyuanvideo-community/HunyuanVideo"
)

# Control arguments
control_cmd=(
--control_type custom
--rank 32
--lora_alpha 32
--target_modules "transformer_blocks.*(to_q|to_k|to_v|to_out.0)"
)

# Dataset arguments
dataset_cmd=(
--dataset_config $TRAINING_DATASET_CONFIG
--dataset_shuffle_buffer_size 16
--enable_precomputation
--precomputation_items 16
)

# Dataloader arguments
dataloader_cmd=(
--dataloader_num_workers 0
)

# Diffusion arguments
diffusion_cmd=(
--flow_weighting_scheme "logit_normal"
)

# Training arguments
# We target just the attention projection layers for LoRA training here (see --target_modules above).
# You can modify this to target any layer (regex patterns are supported).
training_cmd=(
--training_type control-lora
--seed 42
--batch_size 1
--train_steps 10000
--gradient_accumulation_steps 4
--gradient_checkpointing
--checkpointing_steps 1000
--checkpointing_limit 5
# --resume_from_checkpoint 3000
--enable_slicing
--enable_tiling
)

# Optimizer arguments
optimizer_cmd=(
--optimizer "adamw"
--lr 3e-5
--lr_scheduler "constant_with_warmup"
--lr_warmup_steps 2000
--lr_num_cycles 1
--beta1 0.9
--beta2 0.99
--weight_decay 1e-4
--epsilon 1e-8
--max_grad_norm 1.0
)

# Validation arguments
validation_cmd=(
--validation_dataset_file "$VALIDATION_DATASET_FILE"
--validation_steps 500
)

# Miscellaneous arguments
miscellaneous_cmd=(
--tracker_name "finetrainers"
--output_dir "/fsx/aryan/lora-training/hunyuanvideo"
--init_timeout 600
--nccl_timeout 600
--report_to "none"
)

# Execute the training script
if [ "$BACKEND" == "accelerate" ]; then

ACCELERATE_CONFIG_FILE=""
if [ "$NUM_GPUS" == 1 ]; then
ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_1.yaml"
elif [ "$NUM_GPUS" == 2 ]; then
ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_2.yaml"
elif [ "$NUM_GPUS" == 4 ]; then
ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_4.yaml"
elif [ "$NUM_GPUS" == 8 ]; then
ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_8.yaml"
fi

export WORLD_SIZE=$NUM_GPUS
export RANK=0
export MASTER_ADDR=localhost
export MASTER_PORT=0

accelerate launch --config_file "$ACCELERATE_CONFIG_FILE" --gpu_ids $CUDA_VISIBLE_DEVICES train.py \
"${parallel_cmd[@]}" \
"${model_cmd[@]}" \
"${control_cmd[@]}" \
"${dataset_cmd[@]}" \
"${dataloader_cmd[@]}" \
"${diffusion_cmd[@]}" \
"${training_cmd[@]}" \
"${optimizer_cmd[@]}" \
"${miscellaneous_cmd[@]}"

elif [ "$BACKEND" == "ptd" ]; then

export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
torchrun \
--standalone \
--nnodes=1 \
--nproc_per_node=$NUM_GPUS \
--rdzv_backend c10d \
--rdzv_endpoint="localhost:19242" \
train.py \
"${parallel_cmd[@]}" \
"${model_cmd[@]}" \
"${control_cmd[@]}" \
"${dataset_cmd[@]}" \
"${dataloader_cmd[@]}" \
"${diffusion_cmd[@]}" \
"${training_cmd[@]}" \
"${optimizer_cmd[@]}" \
"${miscellaneous_cmd[@]}"
fi

echo -ne "-------------------- Finished executing script --------------------\n\n"
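As a quick aside on the --target_modules value used above: PEFT typically treats a string value as a regular expression and applies it with re.fullmatch against each module name, so the pattern can be sanity-checked offline. A minimal sketch follows; the module names are illustrative examples of HunyuanVideo attention projections, not an exhaustive list.

import re

pattern = r"transformer_blocks.*(to_q|to_k|to_v|to_out.0)"
candidates = [
    "transformer_blocks.0.attn.to_q",          # matched: double-stream attention query projection
    "transformer_blocks.12.attn.to_out.0",     # matched: attention output projection
    "single_transformer_blocks.0.attn.to_k",   # not matched: different block prefix
    "transformer_blocks.0.ff.net.0.proj",      # not matched: feed-forward, not an attention projection
]
for name in candidates:
    print(name, bool(re.fullmatch(pattern, name)))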
3 changes: 3 additions & 0 deletions finetrainers/config.py
@@ -1,6 +1,8 @@
from enum import Enum
from typing import Type

from finetrainers.models.hunyuan_video.control_specification import HunyuanVideoControlModelSpecification

from .models import ModelSpecification
from .models.cogvideox import CogVideoXModelSpecification
from .models.cogview4 import CogView4ControlModelSpecification, CogView4ModelSpecification
@@ -49,6 +51,7 @@ class TrainingType(str, Enum):
ModelType.HUNYUAN_VIDEO: {
TrainingType.LORA: HunyuanVideoModelSpecification,
TrainingType.FULL_FINETUNE: HunyuanVideoModelSpecification,
TrainingType.CONTROL_LORA: HunyuanVideoControlModelSpecification,
},
ModelType.LTX_VIDEO: {
TrainingType.LORA: LTXVideoModelSpecification,
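For orientation, a rough sketch of how the new entry is resolved at runtime. The registry dict name below is a placeholder (the real name in finetrainers/config.py is not visible in this hunk), and it is assumed that ModelType and TrainingType are importable from finetrainers.config; the specification classes are exported from the package __init__ shown in the next file.

from finetrainers.config import ModelType, TrainingType
from finetrainers.models.hunyuan_video import (
    HunyuanVideoControlModelSpecification,
    HunyuanVideoModelSpecification,
)

# Placeholder name for the mapping this hunk extends.
MODEL_SPEC_REGISTRY = {
    ModelType.HUNYUAN_VIDEO: {
        TrainingType.LORA: HunyuanVideoModelSpecification,
        TrainingType.FULL_FINETUNE: HunyuanVideoModelSpecification,
        TrainingType.CONTROL_LORA: HunyuanVideoControlModelSpecification,
    },
}

# Selecting --model_name hunyuan_video with --training_type control-lora
# resolves to the new control specification.
spec_cls = MODEL_SPEC_REGISTRY[ModelType.HUNYUAN_VIDEO][TrainingType.CONTROL_LORA]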
1 change: 1 addition & 0 deletions finetrainers/models/hunyuan_video/__init__.py
@@ -1 +1,2 @@
from .base_specification import HunyuanVideoModelSpecification
from .control_specification import HunyuanVideoControlModelSpecification
137 changes: 29 additions & 108 deletions finetrainers/models/hunyuan_video/base_specification.py
@@ -13,6 +13,8 @@
from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution
from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer, LlamaModel

from finetrainers.models.hunyuan_video import hunyuan_common

import finetrainers.functional as FF
from finetrainers.data import VideoArtifact
from finetrainers.logging import get_logger
@@ -38,7 +40,7 @@ class HunyuanLatentEncodeProcessor(ProcessorMixin):
def __init__(self, output_names: List[str]):
super().__init__()
self.output_names = output_names
assert len(self.output_names) == 1
assert len(self.output_names) == 3

def forward(
self,
@@ -58,18 +60,24 @@ def forward(
video = video.to(device=device, dtype=vae.dtype)
video = video.permute(0, 2, 1, 3, 4).contiguous() # [B, F, C, H, W] -> [B, C, F, H, W]

compute_posterior = False
Inline review comment (PR author):
So far only made it work with compute_posterior false

if compute_posterior:
latents = vae.encode(video).latent_dist.sample(generator=generator)
latents = latents.to(dtype=dtype)
else:
if vae.use_slicing and video.shape[0] > 1:
encoded_slices = [vae._encode(x_slice) for x_slice in video.split(1)]
moments = torch.cat(encoded_slices)
else:
moments = vae._encode(video)
# TODO(aryan): refactor in diffusers to have use_slicing attribute
# if vae.use_slicing and video.shape[0] > 1:
# encoded_slices = [vae._encode(x_slice) for x_slice in video.split(1)]
# moments = torch.cat(encoded_slices)
# else:
# moments = vae._encode(video)
moments = vae._encode(video)
latents = moments.to(dtype=dtype)

return {self.output_names[0]: latents}
latents_mean = torch.zeros((vae.latent_channels,), requires_grad=False)
latents_std = torch.ones((vae.latent_channels,), requires_grad=False)

return {self.output_names[0]: latents, self.output_names[1]: latents_mean, self.output_names[2]: latents_std}
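To make the deferred-posterior path above easier to follow: with compute_posterior=False the processor stores the raw VAE moments (mean and log-variance concatenated along the channel dimension) plus zero/one normalization placeholders, and the posterior is only constructed and sampled later in forward(). A minimal sketch with illustrative shapes (HunyuanVideo's VAE is assumed to have 16 latent channels), assuming _normalize_latents is the usual (x - mean) / std, which the zero mean and unit std turn into a pass-through:

import torch
from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution

moments = torch.randn(1, 32, 4, 30, 40)   # stand-in for vae._encode(video): [B, 2*C, F', H', W']
latents_mean = torch.zeros((16,))          # identity-normalization placeholders,
latents_std = torch.ones((16,))            # exactly as returned by the processor above

# Later, in forward(), the moments are chunked, (no-op) normalized, and sampled:
mu, logvar = torch.chunk(moments, 2, dim=1)
posterior = DiagonalGaussianDistribution(torch.cat([mu, logvar], dim=1))
latents = posterior.sample()               # [B, C, F', H', W']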


class HunyuanVideoModelSpecification(ModelSpecification):
@@ -115,7 +123,7 @@ def __init__(
),
]
if latent_model_processors is None:
latent_model_processors = [HunyuanLatentEncodeProcessor(["latents"])]
latent_model_processors = [HunyuanLatentEncodeProcessor(["latents", "latents_mean", "latents_std"])]

self.condition_model_processors = condition_model_processors
self.latent_model_processors = latent_model_processors
@@ -124,65 +132,11 @@
def _resolution_dim_keys(self):
return {"latents": (2, 3, 4)}

def load_condition_models(self) -> Dict[str, torch.nn.Module]:
common_kwargs = {"revision": self.revision, "cache_dir": self.cache_dir}

if self.tokenizer_id is not None:
tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_id, **common_kwargs)
else:
tokenizer = AutoTokenizer.from_pretrained(
self.pretrained_model_name_or_path, subfolder="tokenizer", **common_kwargs
)

if self.tokenizer_2_id is not None:
tokenizer_2 = AutoTokenizer.from_pretrained(self.tokenizer_2_id, **common_kwargs)
else:
tokenizer_2 = CLIPTokenizer.from_pretrained(
self.pretrained_model_name_or_path, subfolder="tokenizer_2", **common_kwargs
)
load_condition_models = hunyuan_common.load_condition_models

if self.text_encoder_id is not None:
text_encoder = LlamaModel.from_pretrained(
self.text_encoder_id, torch_dtype=self.text_encoder_dtype, **common_kwargs
)
else:
text_encoder = LlamaModel.from_pretrained(
self.pretrained_model_name_or_path,
subfolder="text_encoder",
torch_dtype=self.text_encoder_dtype,
**common_kwargs,
)
load_latent_models = hunyuan_common.load_latent_models

if self.text_encoder_2_id is not None:
text_encoder_2 = CLIPTextModel.from_pretrained(
self.text_encoder_2_id, torch_dtype=self.text_encoder_2_dtype, **common_kwargs
)
else:
text_encoder_2 = CLIPTextModel.from_pretrained(
self.pretrained_model_name_or_path,
subfolder="text_encoder_2",
torch_dtype=self.text_encoder_2_dtype,
**common_kwargs,
)

return {
"tokenizer": tokenizer,
"tokenizer_2": tokenizer_2,
"text_encoder": text_encoder,
"text_encoder_2": text_encoder_2,
}

def load_latent_models(self) -> Dict[str, torch.nn.Module]:
common_kwargs = {"revision": self.revision, "cache_dir": self.cache_dir}

if self.vae_id is not None:
vae = AutoencoderKLHunyuanVideo.from_pretrained(self.vae_id, torch_dtype=self.vae_dtype, **common_kwargs)
else:
vae = AutoencoderKLHunyuanVideo.from_pretrained(
self.pretrained_model_name_or_path, subfolder="vae", torch_dtype=self.vae_dtype, **common_kwargs
)

return {"vae": vae}
load_pipeline = hunyuan_common.load_pipeline

def load_diffusion_models(self) -> Dict[str, torch.nn.Module]:
common_kwargs = {"revision": self.revision, "cache_dir": self.cache_dir}
@@ -203,46 +157,6 @@ def load_diffusion_models(self) -> Dict[str, torch.nn.Module]:

return {"transformer": transformer, "scheduler": scheduler}

def load_pipeline(
self,
tokenizer: Optional[AutoTokenizer] = None,
tokenizer_2: Optional[CLIPTokenizer] = None,
text_encoder: Optional[LlamaModel] = None,
text_encoder_2: Optional[CLIPTextModel] = None,
transformer: Optional[HunyuanVideoTransformer3DModel] = None,
vae: Optional[AutoencoderKLHunyuanVideo] = None,
scheduler: Optional[FlowMatchEulerDiscreteScheduler] = None,
enable_slicing: bool = False,
enable_tiling: bool = False,
enable_model_cpu_offload: bool = False,
training: bool = False,
**kwargs,
) -> HunyuanVideoPipeline:
components = {
"tokenizer": tokenizer,
"tokenizer_2": tokenizer_2,
"text_encoder": text_encoder,
"text_encoder_2": text_encoder_2,
"transformer": transformer,
"vae": vae,
"scheduler": scheduler,
}
components = get_non_null_items(components)

pipe = HunyuanVideoPipeline.from_pretrained(
self.pretrained_model_name_or_path, **components, revision=self.revision, cache_dir=self.cache_dir
)
pipe.text_encoder.to(self.text_encoder_dtype)
pipe.text_encoder_2.to(self.text_encoder_2_dtype)
pipe.vae.to(self.vae_dtype)

_enable_vae_memory_optimizations(pipe.vae, enable_slicing, enable_tiling)
if not training:
pipe.transformer.to(self.transformer_dtype)
if enable_model_cpu_offload:
pipe.enable_model_cpu_offload()
return pipe

@torch.no_grad()
def prepare_conditions(
self,
@@ -305,14 +219,21 @@ def forward(
if compute_posterior:
latents = latent_model_conditions.pop("latents")
else:
posterior = DiagonalGaussianDistribution(latent_model_conditions.pop("latents"))
latents = latent_model_conditions.pop("latents")
latents_mean = latent_model_conditions.pop("latents_mean")
latents_std = latent_model_conditions.pop("latents_std")

mu, logvar = torch.chunk(latents, 2, dim=1)
mu = self._normalize_latents(mu, latents_mean, latents_std)
logvar = self._normalize_latents(logvar, latents_mean, latents_std)
latents = torch.cat([mu, logvar], dim=1)

posterior = DiagonalGaussianDistribution(latents)
latents = posterior.sample(generator=generator)
del posterior

latents = latents * self.vae_config.scaling_factor
noise = torch.zeros_like(latents).normal_(generator=generator)
noisy_latents = FF.flow_match_xt(latents, noise, sigmas)

timesteps = (sigmas.flatten() * 1000.0).long()
guidance = latents.new_full((latents.size(0),), fill_value=guidance) * 1000.0
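For reference, a compact sketch of the flow-matching noising step in this hunk, assuming FF.flow_match_xt implements the standard rectified-flow interpolation; the function body below is an assumption for illustration, not the finetrainers source.

import torch

def flow_match_xt(x0: torch.Tensor, noise: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
    # x_t = (1 - t) * x_0 + t * noise, with t broadcast over [B, C, F, H, W]
    return (1.0 - t) * x0 + t * noise

latents = torch.randn(1, 16, 4, 30, 40)
noise = torch.randn_like(latents)
sigmas = torch.rand(1, 1, 1, 1, 1)                      # one sigma per batch element
noisy_latents = flow_match_xt(latents, noise, sigmas)
timesteps = (sigmas.flatten() * 1000.0).long()          # mirrors the timestep computation above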
