Commit 989a035

Optionally use unmerged weights for inference (#745)
1 parent fa15369

Showing 8 changed files with 66 additions and 4 deletions.

examples/inference/lora/wan_lora_inference_from_ckpt.py

Lines changed: 2 additions & 1 deletion
@@ -14,9 +14,10 @@ def main():
         vae_cpu_offload=True,
         text_encoder_cpu_offload=True,
         pin_cpu_memory=True,  # set to false if low CPU RAM or hit obscure "CUDA error: Invalid argument"
-        lora_path="checkpoints/wan_t2v_finetune_lora/checkpoint-1000/transformer",
+        lora_path="checkpoints/wan_t2v_finetune_lora/checkpoint-160/transformer",
         lora_nickname="crush_smol"
     )
+    generator.unmerge_lora_weights()
     kwargs = {
         "height": 480,
         "width": 832,

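For orientation, here is a hedged sketch of the example script after this change. Only the LoRA arguments and the new unmerge_lora_weights() call are taken from the diff above; the import path, factory call, model id, prompt, and generation call are illustrative assumptions about the surrounding example code, not part of this commit.

from fastvideo import VideoGenerator  # import path assumed for this sketch

def main():
    generator = VideoGenerator.from_pretrained(  # factory call assumed
        "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",      # model id assumed (matches the training script below)
        vae_cpu_offload=True,
        text_encoder_cpu_offload=True,
        pin_cpu_memory=True,  # set to false if low CPU RAM
        lora_path="checkpoints/wan_t2v_finetune_lora/checkpoint-160/transformer",
        lora_nickname="crush_smol",
    )
    # New in this commit: keep the LoRA delta separate and apply it at runtime,
    # so inference matches the validation videos generated during training.
    generator.unmerge_lora_weights()

    kwargs = {
        "height": 480,
        "width": 832,
    }
    generator.generate_video("a toy car being crushed", **kwargs)  # prompt and method name assumed

if __name__ == "__main__":
    main()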
examples/training/finetune/wan_t2v_1.3B/crush_smol/finetune_t2v_lora.sh

Lines changed: 2 additions & 1 deletion
@@ -7,7 +7,7 @@ export WANDB_MODE=online
 MODEL_PATH="Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
 DATA_DIR="data/crush-smol_processed_t2v/combined_parquet_dataset/"
 VALIDATION_DATASET_FILE="$(dirname "$0")/validation.json"
-NUM_GPUS=2
+NUM_GPUS=1
 # export CUDA_VISIBLE_DEVICES=4,5


@@ -76,6 +76,7 @@ miscellaneous_args=(
   --dit_precision "fp32"
   --num_euler_timesteps 50
   --ema_start_step 0
+  --resume_from_checkpoint "checkpoints/wan_t2v_finetune_lora/checkpoint-160"
 )

 torchrun \

fastvideo/entrypoints/video_generator.py

Lines changed: 10 additions & 0 deletions
@@ -351,6 +351,16 @@ def set_lora_adapter(self,
                          lora_path: str | None = None) -> None:
         self.executor.set_lora_adapter(lora_nickname, lora_path)

+    def unmerge_lora_weights(self) -> None:
+        """
+        Use unmerged weights for inference to produce videos that align with
+        validation videos generated during training.
+        """
+        self.executor.unmerge_lora_weights()
+
+    def merge_lora_weights(self) -> None:
+        self.executor.merge_lora_weights()
+
     def shutdown(self):
         """
         Shutdown the video generator.
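
Both new methods simply forward to the executor, so a caller can toggle between the two modes between generations. A small hedged usage sketch follows; the generator construction and the sampling call are assumptions, and only the two method names come from this diff.

# Unmerged: the LoRA delta is computed on the fly each forward pass, which
# reproduces how validation videos were generated during training.
generator.unmerge_lora_weights()
reference_video = generator.generate_video(prompt)  # sampling call assumed

# Merged: the delta is folded back into the base weights, skipping the extra
# low-rank matmul per layer and per step.
generator.merge_lora_weights()
fast_video = generator.generate_video(prompt)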

fastvideo/layers/lora/linear.py

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         lora_B = self.lora_B.to_local()
         lora_A = self.lora_A.to_local()

-        if (self.training_mode or not self.merged) and not self.disable_lora:
+        if not self.merged and not self.disable_lora:
             delta = x @ (
                 self.slice_lora_b_weights(lora_B.to(x, non_blocking=True))
                 @ self.slice_lora_a_weights(lora_A.to(x, non_blocking=True)))
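
With this change the runtime delta depends only on the merged flag: if the weights are not merged (and LoRA is not disabled), forward adds x @ (B @ A) on top of the base projection, regardless of training or inference mode. As a reference for what merging and unmerging mean numerically, here is a minimal standalone sketch on a plain weight matrix; the function names and the scaling convention are illustrative assumptions, not FastVideo's actual implementation.

import torch

def merge(weight, lora_A, lora_B, scale=1.0):
    # Fold the low-rank update into the base weight: W' = W + scale * (B @ A).
    # A plain x @ W'.T then already contains the LoRA contribution.
    return weight + scale * (lora_B @ lora_A)

def unmerge(weight, lora_A, lora_B, scale=1.0):
    # Undo the fold: W = W' - scale * (B @ A); the delta is instead added
    # on the fly in forward(), as in the diff above.
    return weight - scale * (lora_B @ lora_A)

# Illustrative shapes: W is (out, in), A is (r, in), B is (out, r).
W = torch.randn(8, 4)
A = torch.randn(2, 4)
B = torch.randn(8, 2)
x = torch.randn(3, 4)

W_merged = merge(W, A, B)
# Merged and unmerged paths agree up to floating-point accumulation order.
assert torch.allclose(x @ W_merged.T, x @ W.T + x @ (B @ A).T, atol=1e-5)
assert torch.allclose(unmerge(W_merged, A, B), W, atol=1e-5)

The two paths can differ by tiny floating-point amounts, which is presumably why the commit exposes the unmerged path: it reproduces the exact computation used for validation videos during training.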

fastvideo/pipelines/lora_pipeline.py

Lines changed: 8 additions & 0 deletions
@@ -217,3 +217,11 @@ def set_lora_adapter(self,
                 layer.disable_lora = True
         logger.info("Rank %d: LoRA adapter %s applied to %d layers", rank,
                     lora_path, adapted_count)
+
+    def merge_lora_weights(self) -> None:
+        for name, layer in self.lora_layers.items():
+            layer.merge_lora_weights()
+
+    def unmerge_lora_weights(self) -> None:
+        for name, layer in self.lora_layers.items():
+            layer.unmerge_lora_weights()

fastvideo/worker/executor.py

Lines changed: 14 additions & 0 deletions
@@ -57,6 +57,20 @@ def set_lora_adapter(self,
         """
         raise NotImplementedError

+    @abstractmethod
+    def unmerge_lora_weights(self) -> None:
+        """
+        Unmerge the LoRA weights for the workers.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def merge_lora_weights(self) -> None:
+        """
+        Merge the LoRA weights for the workers.
+        """
+        raise NotImplementedError
+
     @abstractmethod
     def collective_rpc(self,
                        method: str | Callable[..., _R],

fastvideo/worker/gpu_worker.py

Lines changed: 17 additions & 1 deletion
@@ -18,7 +18,7 @@
 from fastvideo.distributed.parallel_state import get_local_torch_device
 from fastvideo.fastvideo_args import FastVideoArgs
 from fastvideo.logger import init_logger
-from fastvideo.pipelines import ForwardBatch, build_pipeline
+from fastvideo.pipelines import ForwardBatch, LoRAPipeline, build_pipeline
 from fastvideo.platforms import current_platform
 from fastvideo.utils import (get_exception_traceback,
                              kill_itself_when_parent_died)

@@ -117,6 +117,14 @@ def shutdown(self) -> dict[str, Any]:
                     local_main_process_only=False)
         return {"status": "shutdown_complete"}

+    def unmerge_lora_weights(self) -> None:
+        if isinstance(self.pipeline, LoRAPipeline):
+            self.pipeline.unmerge_lora_weights()
+
+    def merge_lora_weights(self) -> None:
+        if isinstance(self.pipeline, LoRAPipeline):
+            self.pipeline.merge_lora_weights()
+
     def event_loop(self) -> None:
         """Event loop for the worker."""
         logger.info("Worker %d starting event loop...",

@@ -154,6 +162,14 @@ def event_loop(self) -> None:
                 logger.info("Worker %d set LoRA adapter %s with path %s",
                             self.rank, lora_nickname, lora_path)
                 self.pipe.send({"status": "lora_adapter_set"})
+            elif method_name == 'unmerge_lora_weights':
+                self.unmerge_lora_weights()
+                logger.info("Worker %d unmerged LoRA weights", self.rank)
+                self.pipe.send({"status": "lora_adapter_unmerged"})
+            elif method_name == 'merge_lora_weights':
+                self.merge_lora_weights()
+                logger.info("Worker %d merged LoRA weights", self.rank)
+                self.pipe.send({"status": "lora_adapter_merged"})
             else:
                 # Handle other methods dynamically if needed
                 args = recv_rpc.get('args', ())

fastvideo/worker/multiproc_executor.py

Lines changed: 12 additions & 0 deletions
@@ -109,6 +109,18 @@ def set_lora_adapter(self,
                 raise RuntimeError(
                     f"Worker {i} failed to set LoRA adapter to {lora_path}")

+    def unmerge_lora_weights(self) -> None:
+        responses = self.collective_rpc("unmerge_lora_weights", kwargs={})
+        for i, response in enumerate(responses):
+            if response["status"] != "lora_adapter_unmerged":
+                raise RuntimeError(f"Worker {i} failed to unmerge LoRA weights")
+
+    def merge_lora_weights(self) -> None:
+        responses = self.collective_rpc("merge_lora_weights", kwargs={})
+        for i, response in enumerate(responses):
+            if response["status"] != "lora_adapter_merged":
+                raise RuntimeError(f"Worker {i} failed to merge LoRA weights")
+
     def collective_rpc(self,
                        method: str | Callable,
                        timeout: float | None = None,
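
End to end, the new calls follow the same pipe-based RPC pattern as set_lora_adapter: the executor broadcasts the method name to every worker via collective_rpc, each worker's event loop dispatches it (a no-op for non-LoRA pipelines) and replies with a status dict, and the executor raises if any reply does not carry the expected status. A condensed sketch of that round trip follows; the dict key for the method name and the free-function scaffolding are simplified assumptions, not FastVideo's actual class layout.

# Executor side (sketch): fan out and validate every worker's response.
def unmerge_lora_weights(executor) -> None:
    responses = executor.collective_rpc("unmerge_lora_weights", kwargs={})
    for i, response in enumerate(responses):
        if response["status"] != "lora_adapter_unmerged":
            raise RuntimeError(f"Worker {i} failed to unmerge LoRA weights")

# Worker side (sketch): the event loop matches on the method name and answers
# over the pipe; toggling is only applied when the pipeline is a LoRAPipeline.
def handle_rpc(worker, recv_rpc: dict) -> None:
    method_name = recv_rpc.get("method")  # key name assumed
    if method_name == "unmerge_lora_weights":
        worker.unmerge_lora_weights()
        worker.pipe.send({"status": "lora_adapter_unmerged"})
    elif method_name == "merge_lora_weights":
        worker.merge_lora_weights()
        worker.pipe.send({"status": "lora_adapter_merged"})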
