Draft
Commits (60)
dc85bbf
update
a-r-r-o-w Mar 9, 2025
755fee8
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 9, 2025
8812036
update
a-r-r-o-w Mar 10, 2025
e39d255
update
a-r-r-o-w Mar 10, 2025
18bd9ce
update
a-r-r-o-w Mar 10, 2025
3ef07fc
update
a-r-r-o-w Mar 11, 2025
ea07973
update
a-r-r-o-w Mar 11, 2025
9f3d2cb
add valid names to dataset docs
neph1 Mar 11, 2025
2af75b1
update
a-r-r-o-w Mar 12, 2025
28b86c8
update
a-r-r-o-w Mar 13, 2025
84ffbd3
update
a-r-r-o-w Mar 13, 2025
1684ee5
update
a-r-r-o-w Mar 13, 2025
483e891
update
a-r-r-o-w Mar 13, 2025
657fb74
update
a-r-r-o-w Mar 13, 2025
cd859b3
update
a-r-r-o-w Mar 13, 2025
8cea261
update
a-r-r-o-w Mar 13, 2025
90d6d38
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 13, 2025
45bbf22
fix
a-r-r-o-w Mar 14, 2025
825976d
update
a-r-r-o-w Mar 15, 2025
053757d
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 15, 2025
eaafeab
update
a-r-r-o-w Mar 15, 2025
9144f28
Merge branch 'feature/control-lora-trainer' of https://github.com/a-r…
a-r-r-o-w Mar 15, 2025
3745ae5
update
a-r-r-o-w Mar 15, 2025
d7ba5e1
Merge branch 'main' of github.com:neph1/finetrainers
neph1 Mar 15, 2025
7245b5a
update
a-r-r-o-w Mar 16, 2025
c1c600f
update
a-r-r-o-w Mar 16, 2025
e1ef448
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 18, 2025
8587874
update
a-r-r-o-w Mar 19, 2025
495e2b1
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 19, 2025
322d610
Merge branch 'feature/control-lora-trainer' into feature/control-lora…
a-r-r-o-w Mar 19, 2025
2aeca67
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 20, 2025
f256ea7
Merge branch 'feature/control-lora-trainer' into feature/control-lora…
a-r-r-o-w Mar 20, 2025
c9fa316
Merge branch 'main' into feature/control-lora-trainer
a-r-r-o-w Mar 21, 2025
ea09ef7
Merge branch 'feature/control-lora-trainer' into feature/control-lora…
a-r-r-o-w Mar 21, 2025
1722964
Merge branch 'main' of github.com:neph1/finetrainers
neph1 Mar 24, 2025
c80c995
hunyuan control specification
neph1 Mar 27, 2025
232508d
hunyuan control lora
neph1 Mar 29, 2025
2853be5
refactor
neph1 Mar 30, 2025
d014b04
clean up
neph1 Mar 30, 2025
7db51c4
free memory for single gpus
neph1 Mar 30, 2025
f5fb737
init as hunyuan base
neph1 Mar 31, 2025
b2b77a8
move back transformer to device after pass
neph1 Mar 31, 2025
916bd33
add todo about updating patch embedding layer
neph1 Mar 31, 2025
3c53e1e
update x_embed.proj
neph1 Apr 2, 2025
52a6034
output_names == 1 for sft trainer
neph1 Apr 6, 2025
197e2fe
Merge branch 'main' of github.com:neph1/finetrainers
neph1 Apr 11, 2025
022fcfe
Merge branch 'main' of github.com:neph1/finetrainers
neph1 Apr 12, 2025
7d31522
Merge branch 'main' into control-lora-trainer-hunyuan
neph1 Apr 12, 2025
d4002ce
apply latest changes
neph1 Apr 12, 2025
4fcb7c6
remove legacy dataset
neph1 Apr 12, 2025
d894b05
clean up
neph1 Apr 13, 2025
9d43e8a
remove hunyuan_common.py
neph1 Apr 13, 2025
9be0e0c
fix sorting
neph1 Apr 13, 2025
1e12216
fixes
neph1 Apr 13, 2025
681a62f
add training script and remove omni
neph1 Apr 15, 2025
edc50a8
fix quality
neph1 Apr 15, 2025
09f8b7d
optimize imports
neph1 Apr 16, 2025
1d6e74f
fixes
neph1 Apr 16, 2025
12d61f3
move import
neph1 Apr 18, 2025
3706569
reverting unnecessary hunyuan changes
neph1 Apr 27, 2025
11 changes: 11 additions & 0 deletions examples/formats/hunyuan_video/convert_to_original_format.py
@@ -108,6 +108,17 @@ def convert_lora_sd(diffusers_lora_sd):
elif "proj_out" in key:
new_key = key.replace("proj_out", "linear2").replace(single_block_pattern, prefix + "single_blocks")
converted_lora_sd[new_key] = diffusers_lora_sd[key]
elif "x_embedder" in key:
new_key = key.replace("x_embedder", "img_in").replace(double_block_pattern, prefix + "")
if "lora_A" in key:
embed = diffusers_lora_sd[key]
sizes = embed.size()
x_reshaped = embed.view(sizes[0], 16, sizes[2], sizes[3], sizes[4], 2)
x_meaned = x_reshaped.mean(dim=2)
converted_lora_sd[new_key] = x_meaned
else:
converted_lora_sd[new_key] = diffusers_lora_sd[key]
print(new_key, diffusers_lora_sd[key].size())

else:
print(f"unknown or not implemented: {key}")
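To make the shape bookkeeping in the x_embedder branch concrete, here is a standalone trace with illustrative sizes. It assumes lora_A is a Conv3d kernel of shape [rank, in_channels, kT, kH, kW] whose input channels were doubled from 16 to 32 to accept the concatenated control latents; everything here is an assumption for illustration, not the converter's API.

import torch

# Illustrative sizes: a rank-4 LoRA on a (1, 2, 2) patch-embedding kernel.
rank, k_t, k_h, k_w = 4, 1, 2, 2

# Hypothetical lora_A weight of the expanded patch embedding: input channels
# doubled to 32 because control latents are concatenated channel-wise.
embed = torch.randn(rank, 32, k_t, k_h, k_w)

sizes = embed.size()
# Same ops as the conversion above: relabel the elements so a trailing pair
# dimension appears, then average over dim 2 (which is k_t in this view).
x_reshaped = embed.view(sizes[0], 16, sizes[2], sizes[3], sizes[4], 2)
x_meaned = x_reshaped.mean(dim=2)

print(x_reshaped.shape)  # torch.Size([4, 16, 1, 2, 2, 2])
print(x_meaned.shape)    # torch.Size([4, 16, 2, 2, 2])

Note that with these sizes the averaged weight ends up as [rank, 16, kH, kW, 2] rather than [rank, 16, kT, kH, kW]; whether that matches the original-format img_in layout is worth verifying against the target checkpoint.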
175 changes: 175 additions & 0 deletions examples/training/control/hunyuan_video/image_condition/train.sh
@@ -0,0 +1,175 @@
#!/bin/bash

set -e -x

# export TORCH_LOGS="+dynamo,recompiles,graph_breaks"
# export TORCHDYNAMO_VERBOSE=1
export WANDB_MODE="offline"
export NCCL_P2P_DISABLE=1
export NCCL_IB_DISABLE=1
export TORCH_NCCL_ENABLE_MONITORING=0
export FINETRAINERS_LOG_LEVEL="INFO"

# Download the validation dataset
if [ ! -d "examples/training/control/wan/image_condition/validation_dataset" ]; then
  echo "Downloading validation dataset..."
  huggingface-cli download --repo-type dataset finetrainers/OpenVid-1k-split-validation --local-dir examples/training/control/wan/image_condition/validation_dataset
else
  echo "Validation dataset already exists. Skipping download."
fi

# Finetrainers supports multiple backends for distributed training. Select your favourite and benchmark the differences!
# BACKEND="accelerate"
BACKEND="ptd"

# In this setting, I'm using 1 GPU on a 4-GPU node for training
NUM_GPUS=1
CUDA_VISIBLE_DEVICES="3"

# Check the JSON files for the expected JSON format
TRAINING_DATASET_CONFIG="examples/training/control/hunyuan_video/image_condition/training.json"
VALIDATION_DATASET_FILE="examples/training/control/hunyuan_video/image_condition/validation.json"

# Depending on how many GPUs you have available, choose your degree of parallelism and technique!
DDP_1="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 2 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 4 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_8="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 8 --dp_shards 1 --cp_degree 1 --tp_degree 1"
FSDP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 2 --cp_degree 1 --tp_degree 1"
FSDP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 4 --cp_degree 1 --tp_degree 1"
HSDP_2_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 2 --dp_shards 2 --cp_degree 1 --tp_degree 1"

# Parallel arguments
parallel_cmd=(
  $DDP_1
)

# Model arguments
model_cmd=(
  --model_name "hunyuan_video"
  --pretrained_model_name_or_path "hunyuanvideo-community/HunyuanVideo"
  --compile_modules transformer
)

# Control arguments
control_cmd=(
  --control_type none
  --rank 128
  --lora_alpha 128
  --target_modules "blocks.*(to_q|to_k|to_v|to_out.0|ff.net.0.proj|ff.net.2)"
  --frame_conditioning_type index
  --frame_conditioning_index 0
)

# Dataset arguments
dataset_cmd=(
  --dataset_config $TRAINING_DATASET_CONFIG
  --dataset_shuffle_buffer_size 32
)

# Dataloader arguments
dataloader_cmd=(
  --dataloader_num_workers 0
)

# Diffusion arguments
diffusion_cmd=(
  --flow_weighting_scheme "logit_normal"
)

# Training arguments
# We target just the attention projection layers for LoRA training here.
# You can modify this as you please and target any layer (regex is supported).
training_cmd=(
  --training_type control-lora
  --seed 42
  --batch_size 1
  --train_steps 10000
  --gradient_accumulation_steps 1
  --gradient_checkpointing
  --checkpointing_steps 1000
  --checkpointing_limit 2
  # --resume_from_checkpoint 3000
  --enable_slicing
  --enable_tiling
)

# Optimizer arguments
optimizer_cmd=(
  --optimizer "adamw"
  --lr 2e-5
  --lr_scheduler "constant_with_warmup"
  --lr_warmup_steps 1000
  --lr_num_cycles 1
  --beta1 0.9
  --beta2 0.99
  --weight_decay 1e-4
  --epsilon 1e-8
  --max_grad_norm 1.0
)

# Validation arguments
validation_cmd=(
  --validation_dataset_file "$VALIDATION_DATASET_FILE"
  --validation_steps 501
)

# Miscellaneous arguments
miscellaneous_cmd=(
  --tracker_name "finetrainers-hunyuan_video-control"
  --output_dir "/raid/aryan/hunyuan_video-control-image-condition"
  --init_timeout 600
  --nccl_timeout 600
  --report_to "wandb"
)

# Execute the training script
if [ "$BACKEND" == "accelerate" ]; then

ACCELERATE_CONFIG_FILE=""
if [ "$NUM_GPUS" == 1 ]; then
ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_1.yaml"
elif [ "$NUM_GPUS" == 2 ]; then
ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_2.yaml"
elif [ "$NUM_GPUS" == 4 ]; then
ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_4.yaml"
elif [ "$NUM_GPUS" == 8 ]; then
ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_8.yaml"
fi

accelerate launch --config_file "$ACCELERATE_CONFIG_FILE" --gpu_ids $CUDA_VISIBLE_DEVICES train.py \
"${parallel_cmd[@]}" \
"${model_cmd[@]}" \
"${control_cmd[@]}" \
"${dataset_cmd[@]}" \
"${dataloader_cmd[@]}" \
"${diffusion_cmd[@]}" \
"${training_cmd[@]}" \
"${optimizer_cmd[@]}" \
"${validation_cmd[@]}" \
"${miscellaneous_cmd[@]}"

elif [ "$BACKEND" == "ptd" ]; then

export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES

torchrun \
--standalone \
--nnodes=1 \
--nproc_per_node=$NUM_GPUS \
--rdzv_backend c10d \
--rdzv_endpoint="localhost:19242" \
train.py \
"${parallel_cmd[@]}" \
"${model_cmd[@]}" \
"${control_cmd[@]}" \
"${dataset_cmd[@]}" \
"${dataloader_cmd[@]}" \
"${diffusion_cmd[@]}" \
"${training_cmd[@]}" \
"${optimizer_cmd[@]}" \
"${validation_cmd[@]}" \
"${miscellaneous_cmd[@]}"
fi

echo -ne "-------------------- Finished executing script --------------------\n\n"
3 changes: 2 additions & 1 deletion finetrainers/config.py
@@ -5,7 +5,7 @@
from .models.cogvideox import CogVideoXModelSpecification
from .models.cogview4 import CogView4ControlModelSpecification, CogView4ModelSpecification
from .models.flux import FluxModelSpecification
from .models.hunyuan_video import HunyuanVideoModelSpecification
from .models.hunyuan_video import HunyuanVideoControlModelSpecification, HunyuanVideoModelSpecification
from .models.ltx_video import LTXVideoModelSpecification
from .models.wan import WanControlModelSpecification, WanModelSpecification

@@ -49,6 +49,7 @@ class TrainingType(str, Enum):
    ModelType.HUNYUAN_VIDEO: {
        TrainingType.LORA: HunyuanVideoModelSpecification,
        TrainingType.FULL_FINETUNE: HunyuanVideoModelSpecification,
        TrainingType.CONTROL_LORA: HunyuanVideoControlModelSpecification,
    },
    ModelType.LTX_VIDEO: {
        TrainingType.LORA: LTXVideoModelSpecification,
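The registry change above is the piece that makes --model_name hunyuan_video --training_type control-lora resolvable at startup. A rough sketch of how such a lookup behaves, using a hypothetical resolve_specification helper and string stand-ins for the spec classes (finetrainers' actual resolution code may differ):

from enum import Enum


class ModelType(str, Enum):
    HUNYUAN_VIDEO = "hunyuan_video"


class TrainingType(str, Enum):
    LORA = "lora"
    FULL_FINETUNE = "full-finetune"
    CONTROL_LORA = "control-lora"


# String stand-ins for the specification classes registered in finetrainers/config.py.
SUPPORTED_MODEL_CONFIGS = {
    ModelType.HUNYUAN_VIDEO: {
        TrainingType.LORA: "HunyuanVideoModelSpecification",
        TrainingType.FULL_FINETUNE: "HunyuanVideoModelSpecification",
        TrainingType.CONTROL_LORA: "HunyuanVideoControlModelSpecification",
    },
}


def resolve_specification(model_name: str, training_type: str) -> str:
    # Hypothetical helper: map CLI strings to enum members, then index the registry.
    try:
        return SUPPORTED_MODEL_CONFIGS[ModelType(model_name)][TrainingType(training_type)]
    except KeyError as e:
        raise ValueError(f"No specification registered for {model_name} / {training_type}") from e


print(resolve_specification("hunyuan_video", "control-lora"))  # HunyuanVideoControlModelSpecification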
1 change: 1 addition & 0 deletions finetrainers/models/hunyuan_video/__init__.py
@@ -1 +1,2 @@
from .base_specification import HunyuanVideoModelSpecification
from .control_specification import HunyuanVideoControlModelSpecification
33 changes: 24 additions & 9 deletions finetrainers/models/hunyuan_video/base_specification.py
@@ -38,7 +38,7 @@ class HunyuanLatentEncodeProcessor(ProcessorMixin):
    def __init__(self, output_names: List[str]):
        super().__init__()
        self.output_names = output_names
        assert len(self.output_names) == 1
        assert len(self.output_names) == 3

    def forward(
        self,
@@ -58,18 +58,24 @@ def forward(
        video = video.to(device=device, dtype=vae.dtype)
        video = video.permute(0, 2, 1, 3, 4).contiguous()  # [B, F, C, H, W] -> [B, C, F, H, W]

        compute_posterior = False

[Review comment, neph1 (author)]: So far I've only made it work with compute_posterior=False.
        if compute_posterior:
            latents = vae.encode(video).latent_dist.sample(generator=generator)
            latents = latents.to(dtype=dtype)
        else:
            if vae.use_slicing and video.shape[0] > 1:
                encoded_slices = [vae._encode(x_slice) for x_slice in video.split(1)]
                moments = torch.cat(encoded_slices)
            else:
                moments = vae._encode(video)
            # TODO(aryan): refactor in diffusers to have use_slicing attribute
            # if vae.use_slicing and video.shape[0] > 1:
            #     encoded_slices = [vae._encode(x_slice) for x_slice in video.split(1)]
            #     moments = torch.cat(encoded_slices)
            # else:
            #     moments = vae._encode(video)
            moments = vae._encode(video)
            latents = moments.to(dtype=dtype)

        return {self.output_names[0]: latents}
        latents_mean = torch.tensor(vae.latent_channels)
[Review comment, a-r-r-o-w (contributor)]: @neph1 These changes seem incorrect to me and will cause worse generations. The previous implementation, which did not perform this normalization, was correct, I think. Was this modified from Wan? If so, it's incorrect, because they are different models and preprocess latents differently.

        latents_std = 1.0 / torch.tensor(vae.latent_channels)

        return {self.output_names[0]: latents, self.output_names[1]: latents_mean, self.output_names[2]: latents_std}
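For context on the review thread above: torch.tensor(vae.latent_channels) builds a 0-dim tensor from the channel count (an integer), not per-channel statistics, which may be part of what is being flagged. In diffusers, HunyuanVideo's VAE typically exposes a single scalar config.scaling_factor, while Wan-style VAEs carry per-channel latents_mean/latents_std. A minimal sketch of the two conventions, with illustrative names that are not the finetrainers API:

import torch


def normalize_wan_style(latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor) -> torch.Tensor:
    # Per-channel affine normalization over [B, C, F, H, W], as Wan-style models do.
    mean = latents_mean.view(1, -1, 1, 1, 1).to(latents)
    std = latents_std.view(1, -1, 1, 1, 1).to(latents)
    return (latents - mean) / std


def scale_hunyuan_style(latents: torch.Tensor, scaling_factor: float) -> torch.Tensor:
    # Single scalar scale, matching the previous implementation the reviewer prefers.
    return latents * scaling_factor


latents = torch.randn(1, 16, 4, 8, 8)
print(scale_hunyuan_style(latents, 0.476986).shape)  # scaling_factor value is illustrative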


class HunyuanVideoModelSpecification(ModelSpecification):
@@ -115,7 +121,7 @@ def __init__(
            ),
        ]
        if latent_model_processors is None:
            latent_model_processors = [HunyuanLatentEncodeProcessor(["latents"])]
            latent_model_processors = [HunyuanLatentEncodeProcessor(["latents", "latents_mean", "latents_std"])]

        self.condition_model_processors = condition_model_processors
        self.latent_model_processors = latent_model_processors
@@ -305,7 +311,16 @@ def forward(
        if compute_posterior:
            latents = latent_model_conditions.pop("latents")
        else:
            posterior = DiagonalGaussianDistribution(latent_model_conditions.pop("latents"))
            latents = latent_model_conditions.pop("latents")
            latents_mean = latent_model_conditions.pop("latents_mean")
            latents_std = latent_model_conditions.pop("latents_std")

            mu, logvar = torch.chunk(latents, 2, dim=1)
            mu = self._normalize_latents(mu, latents_mean, latents_std)
            logvar = self._normalize_latents(logvar, latents_mean, latents_std)
            latents = torch.cat([mu, logvar], dim=1)

            posterior = DiagonalGaussianDistribution(latents)
            latents = posterior.sample(generator=generator)
            del posterior
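As a side note on the compute_posterior=False path: the VAE's _encode returns raw moments with the mean and log-variance concatenated along the channel dimension, and DiagonalGaussianDistribution reparameterizes them into a latent sample. A minimal stand-in for that flow (a sketch, not the diffusers implementation):

from typing import Optional

import torch


def sample_from_moments(moments: torch.Tensor, generator: Optional[torch.Generator] = None) -> torch.Tensor:
    # moments packs [mean, logvar] along dim=1, so channels = 2 * latent_channels
    mu, logvar = torch.chunk(moments, 2, dim=1)
    logvar = torch.clamp(logvar, -30.0, 20.0)  # stability clamp, as diffusers applies
    std = torch.exp(0.5 * logvar)
    noise = torch.randn(mu.shape, generator=generator, dtype=mu.dtype)
    return mu + std * noise  # reparameterization trick


moments = torch.randn(1, 32, 5, 8, 8)  # [B, 2 * C_latent, F, H, W], illustrative sizes
latents = sample_from_moments(moments)
print(latents.shape)  # torch.Size([1, 16, 5, 8, 8])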