 import time
 from collections import deque
 from copy import deepcopy
+from typing import Dict

 import torch
 from tqdm.auto import tqdm
@@ -23,7 +24,8 @@
 logger = init_logger(__name__)


-def get_norm(model_pred, norms, gradient_accumulation_steps):
+def get_norm(model_pred: torch.Tensor, norms: Dict[str, float],
+             gradient_accumulation_steps: int) -> None:
     """Calculate and aggregate model prediction norms."""
     fro_norm = (
         torch.linalg.matrix_norm(model_pred, ord="fro") /  # codespell:ignore
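
As an aside on the annotated helper above, here is a minimal, self-contained sketch of the same norm-accumulation pattern. The `accumulate_fro_norm` name and the `"fro_norm"` key are illustrative, not taken from the repository; the only assumption is that `torch.linalg.matrix_norm(ord="fro")` reduces over the last two dimensions.

# Hedged sketch (illustrative names, not the repository implementation):
# accumulate a gradient-accumulation-scaled Frobenius norm of a prediction.
from typing import Dict

import torch


def accumulate_fro_norm(model_pred: torch.Tensor, norms: Dict[str, float],
                        gradient_accumulation_steps: int) -> None:
    # matrix_norm(ord="fro") reduces over the last two dims; average the rest.
    fro = (torch.linalg.matrix_norm(model_pred, ord="fro") /
           gradient_accumulation_steps)
    norms["fro_norm"] = norms.get("fro_norm", 0.0) + fro.mean().item()


norms: Dict[str, float] = {}
accumulate_fro_norm(torch.randn(2, 16, 16), norms, gradient_accumulation_steps=4)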
@@ -66,7 +68,10 @@ def initialize_validation_pipeline(self, fastvideo_args: FastVideoArgs):
         args_copy.mode = Mode.INFERENCE
         args_copy.vae_config.load_encoder = False
         validation_pipeline = WanValidationPipeline.from_pretrained(
-            fastvideo_args.model_path, args=args_copy)
+            fastvideo_args.model_path,
+            args=None,
+            mode=Mode.INFERENCE,
+            loaded_modules={"transformer": self.get_module("transformer")})

         self.validation_pipeline = validation_pipeline

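The intent of the new `loaded_modules` argument appears to be reusing the already-instantiated student transformer rather than loading a second copy from disk. Below is a minimal sketch of that reuse pattern with a hypothetical `TinyPipeline`; it illustrates the general idea only and is not the FastVideo API.

# Hedged sketch: hand an existing module to a pipeline factory so it is
# not reloaded from a checkpoint. All names here are hypothetical.
from typing import Dict, Optional

import torch.nn as nn


class TinyPipeline:
    def __init__(self, modules: Dict[str, nn.Module]):
        self.modules = modules

    @classmethod
    def from_pretrained(cls, model_path: str,
                        loaded_modules: Optional[Dict[str, nn.Module]] = None
                        ) -> "TinyPipeline":
        modules: Dict[str, nn.Module] = dict(loaded_modules or {})
        if "transformer" not in modules:
            # Placeholder for an actual checkpoint load from model_path.
            modules["transformer"] = nn.Linear(4, 4)
        return cls(modules)


student = nn.Linear(4, 4)
pipe = TinyPipeline.from_pretrained("ignored/path",
                                    loaded_modules={"transformer": student})
assert pipe.modules["transformer"] is student  # no duplicate copy was loaded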
@@ -95,11 +100,7 @@ def distill_one_step(
         pred_decay_weight,
         pred_decay_type,
         hunyuan_teacher_disable_cfg,
-        weighting_scheme,
-        logit_mean,
-        logit_std,
-        mode_scale,
-    ):
+    ) -> tuple[float, float, Dict[str, float]]:
         """Perform one step of distillation training."""
         total_loss = 0.0
         optimizer.zero_grad()
@@ -170,17 +171,16 @@ def distill_one_step(
                 noisy_model_input, model_pred, indices, multiphase)

             # Get teacher model prediction
-            with torch.no_grad():
-                with torch.autocast("cuda", dtype=torch.bfloat16):
-                    with set_forward_context(current_timestep=timesteps,
-                                             attn_metadata=None):
-                        cond_teacher_output = teacher_transformer(
-                            noisy_model_input,
-                            encoder_hidden_states,
-                            timesteps,
-                            encoder_attention_mask,
-                            return_dict=False,
-                        )[0].float()
+            with torch.no_grad(), torch.autocast(
+                    "cuda", dtype=torch.bfloat16), set_forward_context(
+                        current_timestep=timesteps, attn_metadata=None):
+                cond_teacher_output = teacher_transformer(
+                    noisy_model_input,
+                    encoder_hidden_states,
+                    timesteps,
+                    encoder_attention_mask,
+                    return_dict=False,
+                )[0].float()

                 if not_apply_cfg_solver:
                     uncond_teacher_output = cond_teacher_output
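For reference, the flattened `with` above is behaviourally identical to the nested form it replaces: Python enters the managers left to right and exits them in reverse order. A small stdlib-only sketch (the `tag` helper is illustrative, not part of the codebase):

# Hedged sketch: one `with` statement with several context managers is
# equivalent to nesting them.
from contextlib import contextmanager


@contextmanager
def tag(name):
    print("enter", name)
    try:
        yield
    finally:
        print("exit", name)


# Nested form.
with tag("no_grad"):
    with tag("autocast"):
        with tag("forward_context"):
            pass

# Flattened form used by the diff; prints the same enter/exit order.
with tag("no_grad"), tag("autocast"), tag("forward_context"):
    pass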
@@ -313,31 +313,30 @@ def forward(
         uncond_prompt_embed = self.uncond_prompt_embed
         uncond_prompt_mask = self.uncond_prompt_mask

-        # Train!
+        assert self.training_args.sp_size is not None
+        assert self.training_args.gradient_accumulation_steps is not None
         total_batch_size = (self.world_size *
                             self.training_args.gradient_accumulation_steps /
                             self.training_args.sp_size *
                             self.training_args.train_sp_batch_size)
         logger.info("***** Running distillation training *****")
-        logger.info(f"  Resume training from step {init_steps}")
-        logger.info(
-            f"  Instantaneous batch size per device = {self.training_args.train_batch_size}"
-        )
+        logger.info("  Resume training from step %s", init_steps)
+        logger.info("  Instantaneous batch size per device = %s",
+                    self.training_args.train_batch_size)
         logger.info(
-            f"  Total train batch size (w. data & sequence parallel, accumulation) = {total_batch_size}"
-        )
+            "  Total train batch size (w. data & sequence parallel, accumulation) = %s",
+            total_batch_size)
+        logger.info("  Gradient Accumulation steps = %s",
+                    self.training_args.gradient_accumulation_steps)
+        logger.info("  Total optimization steps = %s",
+                    self.training_args.max_train_steps)
         logger.info(
-            f"  Gradient Accumulation steps = {self.training_args.gradient_accumulation_steps}"
-        )
-        logger.info(
-            f"  Total optimization steps = {self.training_args.max_train_steps}"
-        )
-        logger.info(
-            f"  Total training parameters per FSDP shard = {sum(p.numel() for p in self.transformer.parameters() if p.requires_grad) / 1e9}"
-        )
-        logger.info(
-            f"  Master weight dtype: {self.transformer.parameters().__next__().dtype}"
-        )
+            "  Total training parameters per FSDP shard = %s B",
+            sum(p.numel()
+                for p in self.transformer.parameters() if p.requires_grad) /
+            1e9)
+        logger.info("  Master weight dtype: %s",
+                    self.transformer.parameters().__next__().dtype)

         # Potentially load in the weights and states from a previous save
         if self.training_args.resume_from_checkpoint:
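The switch from f-strings to %-style arguments follows the standard `logging` idiom: the format string and its arguments are stored on the record and only rendered if a handler actually emits it. A small sketch with the stdlib logger (FastVideo's `init_logger` is assumed to wrap it):

# Hedged sketch: lazy %-style formatting vs. eager f-string formatting.
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

expensive = list(range(10))

# Eager: the f-string is built even though INFO records are filtered out.
logger.info(f"  Total optimization steps = {len(expensive)}")

# Lazy: the format string and args are stored; rendering happens only if a
# handler actually emits the record.
logger.info("  Total optimization steps = %s", len(expensive))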
@@ -352,13 +351,14 @@ def forward(
         )

         loader_iter = iter(train_dataloader)
-        step_times = deque(maxlen=100)
+        step_times: deque[float] = deque(maxlen=100)

         # Skip steps if resuming
         for i in range(init_steps):
             next(loader_iter)

-        def get_num_phases(multi_phased_distill_schedule, step):
+        def get_num_phases(multi_phased_distill_schedule: str,
+                           step: int) -> int:
             # step-phase,step-phase
             multi_phases = multi_phased_distill_schedule.split(",")
             phase = multi_phases[-1].split("-")[-1]
@@ -400,10 +400,6 @@ def get_num_phases(multi_phased_distill_schedule, step):
                     self.training_args.pred_decay_weight,
                     self.training_args.pred_decay_type,
                     self.training_args.hunyuan_teacher_disable_cfg,
-                    self.training_args.weighting_scheme,
-                    self.training_args.logit_mean,
-                    self.training_args.logit_std,
-                    self.training_args.mode_scale,
                 )

                 step_time = time.perf_counter() - start_time
@@ -462,7 +458,7 @@ def get_num_phases(multi_phased_distill_schedule, step):
                 self.sp_group.barrier()

             if self.training_args.log_validation and step % self.training_args.validation_steps == 0:
-                self.log_validation(self.transformer, self.training_args, step)
+                self._log_validation(self.transformer, self.training_args, step)

         # Final checkpoint
         if self.training_args.use_lora:
@@ -476,7 +472,7 @@ def get_num_phases(multi_phased_distill_schedule, step):
             cleanup_dist_env_and_memory()


-def main(args):
+def main(args) -> None:
     logger.info("Starting distillation pipeline...")

     pipeline = WanDistillationPipeline.from_pretrained(
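
Finally, on the `step_times: deque[float]` annotation earlier in the diff: a `maxlen`-bounded deque is a simple rolling window for per-step timings. A quick sketch of how such a window yields a moving average (the sleep is a stand-in for a training step, not the pipeline's code):

# Hedged sketch: rolling average of recent step durations.
import time
from collections import deque

step_times: deque[float] = deque(maxlen=100)

for _ in range(5):
    start_time = time.perf_counter()
    time.sleep(0.01)  # stand-in for one training step
    step_times.append(time.perf_counter() - start_time)

avg_step_time = sum(step_times) / len(step_times)
print(f"avg step time over last {len(step_times)} steps: {avg_step_time:.4f}s")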