add some script of lora test (#66)

zRzRzRzRzRzRzR · a-r-r-o-w · sayakpaul · web-flow · commit cbbac061125f · 2024-10-30T16:25:14.000+05:30
* multi resolutions support

* full chinese readme

* Update README.md

* Update README.md

Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>

* reformat from pycharm

* dataset.md

* Update README.md

* for merge

* merging

* torch update for use

* add test lora script

* Update test_lora_inference.py

* Update requirements.txt

* Update requirements.txt

---------

Co-authored-by: Aryan <aryan@huggingface.co>
Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
diff --git a/README.md b/README.md
@@ -53,6 +53,8 @@ video = pipe("<my-awesome-prompt>").frames[0]
 export_to_video(video, "output.mp4", fps=8)
 ```
 
+You can also check whether your LoRA loads correctly with [this script](tests/test_lora_inference.py).
+
 **Note:** For Image-to-Video finetuning, you must install diffusers from [this](https://github.com/huggingface/diffusers/pull/9482) branch (which adds lora loading support in CogVideoX image-to-video) until it is merged.
 
 Below we provide additional sections detailing on more options explored in this repository. They all attempt to make fine-tuning for video models as accessible as possible by reducing memory requirements as much as possible.
diff --git a/README_zh.md b/README_zh.md
@@ -51,6 +51,8 @@ video = pipe("<my-awesome-prompt>").frames[0]
 export_to_video(video, "output.mp4", fps=8)
 ```
 
+你也可以在[这里](tests/test_lora_inference.py)来检查你的 LoRA 是否正常挂载。
+
 **注意：** 对于图像到视频的微调，您必须从 [这个分支](https://github.com/huggingface/diffusers/pull/9482) 安装
 diffusers（该分支为 CogVideoX 的图像到视频添加了 LoRA 加载支持）直到它被合并。
 
diff --git a/requirements.txt b/requirements.txt
@@ -8,8 +8,8 @@ peft>=0.12.0
 decord>=0.6.0
 wandb
 pandas
-torch>=2.4.0
-torchvision>=0.19.0
+torch>=2.4.0,<2.5.0
+torchvision>=0.19.0,<0.20.0
 torchao>=0.5.0
 sentencepiece>=0.2.0
 imageio-ffmpeg>=0.5.1
diff --git a/tests/test_lora_inference.py b/tests/test_lora_inference.py
@@ -0,0 +1,43 @@
+"""
+Run this script to check that a LoRA adapter loads and runs correctly:
+
+```shell
+python3 test_lora_inference.py --prompt "A girl is riding a bike." --model_path "THUDM/CogVideoX-5B" --lora_path "path/to/lora" --lora_name "lora_adapter" --output_file "output.mp4" --fps 8
+```
+
+"""
+
+import argparse
+import torch
+from diffusers import CogVideoXPipeline
+from diffusers.utils import export_to_video
+
+
+def generate_video(model_path, prompt, lora_path, lora_name, output_file, fps):
+    pipe = CogVideoXPipeline.from_pretrained(model_path, torch_dtype=torch.bfloat16).to("cuda")
+    pipe.load_lora_weights(lora_path, weight_name="pytorch_lora_weights.safetensors", adapter_name=lora_name)
+    pipe.set_adapters([lora_name], [1.0])
+    pipe.enable_model_cpu_offload()
+    pipe.vae.enable_slicing()
+    pipe.vae.enable_tiling()
+
+    video = pipe(prompt=prompt).frames[0]
+    export_to_video(video, output_file, fps=fps)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate video using CogVideoX and LoRA weights")
+    parser.add_argument("--prompt", type=str, required=True, help="Text prompt for the video generation")
+    parser.add_argument("--model_path", type=str, default="THUDM/CogVideoX-5B", help="Base Model path or HF ID")
+    parser.add_argument("--lora_path", type=str, required=True, help="Path to the LoRA weights")
+    parser.add_argument("--lora_name", type=str, default="lora_adapter", help="Name of the LoRA adapter")
+    parser.add_argument("--output_file", type=str, default="output.mp4", help="Output video file name")
+    parser.add_argument("--fps", type=int, default=8, help="Frames per second for the output video")
+
+    args = parser.parse_args()
+
+    generate_video(args.prompt, args.lora_path, args.lora_name, args.output_file, args.fps)
+
+
+# Run the command-line entry point only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/train_text_to_video_lora.sh b/train_text_to_video_lora.sh
@@ -20,18 +20,22 @@ ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_1.yaml"
 # This example assumes you downloaded an already prepared dataset from HF CLI as follows:
 #   huggingface-cli download --repo-type dataset Wild-Heart/Disney-VideoGeneration-Dataset --local-dir /path/to/my/datasets/disney-dataset
 DATA_ROOT="/path/to/my/datasets/disney-dataset"
+
 CAPTION_COLUMN="prompt.txt"
 VIDEO_COLUMN="videos.txt"
+MODEL_PATH="THUDM/CogVideoX-5b"
 
+# Pass `--load_tensors` to load tensors from disk instead of recomputing the encoder outputs.
 # Launch experiments with different hyperparameters
+
 for learning_rate in "${LEARNING_RATES[@]}"; do
   for lr_schedule in "${LR_SCHEDULES[@]}"; do
     for optimizer in "${OPTIMIZERS[@]}"; do
       for steps in "${MAX_TRAIN_STEPS[@]}"; do
-        output_dir="/path/to/my/models/cogvideox-lora__optimizer_${optimizer}__steps_${steps}__lr-schedule_${lr_schedule}__learning-rate_${learning_rate}/"
+        output_dir="./cogvideox-lora__optimizer_${optimizer}__steps_${steps}__lr-schedule_${lr_schedule}__learning-rate_${learning_rate}/"
 
         cmd="accelerate launch --config_file $ACCELERATE_CONFIG_FILE --gpu_ids $GPU_IDS training/cogvideox_text_to_video_lora.py \
-          --pretrained_model_name_or_path THUDM/CogVideoX-5b \
+          --pretrained_model_name_or_path $MODEL_PATH \
           --data_root $DATA_ROOT \
           --caption_column $CAPTION_COLUMN \
           --video_column $VIDEO_COLUMN \
@@ -62,6 +66,8 @@ for learning_rate in "${LEARNING_RATES[@]}"; do
           --lr_num_cycles 1 \
           --enable_slicing \
           --enable_tiling \
+          --enable_model_cpu_offload \
+          --load_tensors \
           --optimizer $optimizer \
           --beta1 0.9 \
           --beta2 0.95 \