Commit be459a7

Merge pull request #34 from samidarko/pre-commit-hooks
chore: pre-commit hook setup
2 parents: c7d397f + d969873

File tree: 176 files changed (+499 additions, −413 deletions)

.dockerignore

Lines changed: 1 addition & 1 deletion

@@ -7,4 +7,4 @@ hooks
 junit.xml
 coverage.xml
 docker
-docs
+docs

(The removed and re-added `docs` lines look identical because the change is invisible: presumably the trailing newline added by the `end-of-file-fixer` hook introduced in this commit.)

.pre-commit-config.yaml

Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+fail_fast: false
+repos:
+  - repo: local
+    hooks:
+      - id: format-checking
+        name: format checking
+        entry: poetry run format-check
+        pass_filenames: false
+        language: system
+        stages: [pre-commit]
+      # - id: linting
+      #   name: linting
+      #   entry: poetry run lint
+      #   pass_filenames: false
+      #   language: system
+      #   stages: [commit]
+      # - id: type-checking
+      #   name: type checking
+      #   entry: poetry run type-check
+      #   pass_filenames: false
+      #   language: system
+      #   stages: [commit]
+      # - id: unit-tests
+      #   name: unit tests
+      #   entry: poetry run test
+      #   pass_filenames: false
+      #   language: system
+      #   stages: [commit]
+  - repo: https://github.com/commitizen-tools/commitizen
+    rev: v2.28.0
+    hooks:
+      - id: commitizen
+        stages:
+          - commit-msg
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.3.0
+    hooks:
+      - id: check-merge-conflict
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-added-large-files
+      - id: detect-private-key
+      - id: check-case-conflict
+      - id: mixed-line-ending
+      - id: detect-private-key
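The `repo: local` hook above delegates to a Poetry script, so `poetry run format-check` is assumed to resolve to a script declared in the project's `pyproject.toml` (that file is not shown in this diff). Note also that the commented-out hooks still use the legacy `stages: [commit]` spelling, while the active hook uses the newer `stages: [pre-commit]`. As a minimal sketch of exercising individual hooks once they are installed, using the hook ids declared above:

```shell
# Run only the local format-checking hook, by its id, over the whole tree
# rather than just staged files.
poetry run pre-commit run format-checking --all-files

# Dry-run the commit-msg stage (the commitizen hook) against a message file;
# .git/COMMIT_EDITMSG is where git keeps the last message it was given.
poetry run pre-commit run --hook-stage commit-msg --commit-msg-filename .git/COMMIT_EDITMSG
```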

README.md

Lines changed: 50 additions & 30 deletions

Most of the paired -/+ lines below are whitespace-only fixes (trailing whitespace stripped by the new hooks); the substantive change is the new "Contribute" section at the end of the file.
@@ -7,21 +7,21 @@
 ![Version](https://img.shields.io/badge/version-0.1.0-blue) ![visitors](https://visitor-badge.laobi.icu/badge?page_id=VideoVerses.VideoTuna&left_color=green&right_color=red) [![](https://dcbadge.limes.pink/api/server/AammaaR2?style=flat)](https://discord.gg/AammaaR2) <a href='https://github.com/user-attachments/assets/a48d57a3-4d89-482c-8181-e0bce4f750fd'><img src='https://badges.aleen42.com/src/wechat.svg'></a> [![Homepage](https://img.shields.io/badge/Homepage-VideoTuna-orange)](https://videoverses.github.io/videotuna/) [![GitHub](https://img.shields.io/github/stars/VideoVerses/VideoTuna?style=social)](https://github.com/VideoVerses/VideoTuna)
 
 
-🤗🤗🤗 Videotuna is a useful codebase for text-to-video applications.
-🌟 VideoTuna is the first repo that integrates multiple AI video generation models including `text-to-video (T2V)`, `image-to-video (I2V)`, `text-to-image (T2I)`, and `video-to-video (V2V)` generation for model inference and finetuning (to the best of our knowledge).
-🌟 VideoTuna is the first repo that provides comprehensive pipelines in video generation, from fine-tuning to pre-training, continuous training, and post-training (alignment) (to the best of our knowledge).
-🌟 An Emotion Control I2V model will be released soon.
+🤗🤗🤗 Videotuna is a useful codebase for text-to-video applications.
+🌟 VideoTuna is the first repo that integrates multiple AI video generation models including `text-to-video (T2V)`, `image-to-video (I2V)`, `text-to-image (T2I)`, and `video-to-video (V2V)` generation for model inference and finetuning (to the best of our knowledge).
+🌟 VideoTuna is the first repo that provides comprehensive pipelines in video generation, from fine-tuning to pre-training, continuous training, and post-training (alignment) (to the best of our knowledge).
+🌟 An Emotion Control I2V model will be released soon.
 
 
 ## Features
-🌟 **All-in-one framework:** Inference and fine-tune up-to-date video generation models.
-🌟 **Pre-training:** Build your own foundational text-to-video model.
-🌟 **Continuous training:** Keep improving your model with new data.
-🌟 **Domain-specific fine-tuning:** Adapt models to your specific scenario.
-🌟 **Concept-specific fine-tuning:** Teach your models with unique concepts.
-🌟 **Enhanced language understanding:** Improve model comprehension through continuous training.
-🌟 **Post-processing:** Enhance the videos with video-to-video enhancement model.
-🌟 **Post-training/Human preference alignment:** Post-training with RLHF for more attractive results.
+🌟 **All-in-one framework:** Inference and fine-tune up-to-date video generation models.
+🌟 **Pre-training:** Build your own foundational text-to-video model.
+🌟 **Continuous training:** Keep improving your model with new data.
+🌟 **Domain-specific fine-tuning:** Adapt models to your specific scenario.
+🌟 **Concept-specific fine-tuning:** Teach your models with unique concepts.
+🌟 **Enhanced language understanding:** Improve model comprehension through continuous training.
+🌟 **Post-processing:** Enhance the videos with video-to-video enhancement model.
+🌟 **Post-training/Human preference alignment:** Post-training with RLHF for more attractive results.
 
 
 ## 🔆 Updates
@@ -54,16 +54,16 @@
 Video VAE+ can accurately compress and reconstruct the input videos with fine details.
 
 <table class="center">
-
+
 <tr>
 <td style="text-align:center;" width="320">Ground Truth</td>
 <td style="text-align:center;" width="320">Reconstruction</td>
 </tr>
 <tr>
 <td><a href="https://github.com/user-attachments/assets/0efcbf80-0074-4421-810f-79a1f1733ed3"><img src="https://github.com/user-attachments/assets/0efcbf80-0074-4421-810f-79a1f1733ed3" width="320"></a></td>
 <td><a href="https://github.com/user-attachments/assets/4adf29f2-d413-49b1-bccc-48adfd64a4da"><img src="https://github.com/user-attachments/assets/4adf29f2-d413-49b1-bccc-48adfd64a4da" width="320"></a></td>
-</tr>
-
+</tr>
+
 </table>
 
 ### Emotion Control I2V
@@ -210,7 +210,7 @@ VideoTuna/
 ├── data       # data processing scripts and dataset files
 ├── docs       # documentations
 ├── eval       # evaluation scripts
-├── inputs     # input examples for testing
+├── inputs     # input examples for testing
 ├── scripts    # train and inference python scripts
 ├── shsripts   # train and inference shell scripts
 ├── src        # model-related source code
@@ -283,7 +283,7 @@ poetry run pip install "modelscope[cv]" -f https://modelscope.oss-cn-beijing.ali
 
 Hunyuan model uses it to reduce memory usage and speed up inference. If it is not installed, the model will run in normal mode. Install the `flash-attn` via:
 ``` shell
-poetry run install-flash-attn
+poetry run install-flash-attn
 ```
 
 #### (3) If you use MacOS
@@ -339,7 +339,7 @@ docker compose run -it --remove-orphans videotuna bash
 
 ### 2.Prepare checkpoints
 
-Please follow [docs/CHECKPOINTS.md](https://github.com/VideoVerses/VideoTuna/blob/main/docs/CHECKPOINTS.md) to download model checkpoints.
+Please follow [docs/CHECKPOINTS.md](https://github.com/VideoVerses/VideoTuna/blob/main/docs/CHECKPOINTS.md) to download model checkpoints.
 After downloading, the model checkpoints should be placed as [Checkpoint Structure](https://github.com/VideoVerses/VideoTuna/blob/main/docs/CHECKPOINTS.md#checkpoint-orgnization-structure).
 
 ### 3.Inference state-of-the-art T2V/I2V/T2I models
@@ -395,22 +395,22 @@ Before started, we assume you have finished the following two preliminary steps:
 ll checkpoints/stablediffusion/v2-1_512-ema/model.ckpt
 ```
 
-First, run this command to convert the VC2 checkpoint as we make minor modifications on the keys of the state dict of the checkpoint. The converted checkpoint will be automatically save at `checkpoints/videocrafter/t2v_v2_512/model_converted.ckpt`.
+First, run this command to convert the VC2 checkpoint as we make minor modifications on the keys of the state dict of the checkpoint. The converted checkpoint will be automatically save at `checkpoints/videocrafter/t2v_v2_512/model_converted.ckpt`.
 ```
 python tools/convert_checkpoint.py --input_path checkpoints/videocrafter/t2v_v2_512/model.ckpt
 ```
 
-Second, run this command to start training on the single GPU. The training results will be automatically saved at `results/train/${CURRENT_TIME}_${EXPNAME}`
+Second, run this command to start training on the single GPU. The training results will be automatically saved at `results/train/${CURRENT_TIME}_${EXPNAME}`
 ```
 poetry run train-videocrafter-v2
 ```
 
 #### 2. VideoCrafter2 Lora Fine-tuning
 
-We support lora finetuning to make the model to learn new concepts/characters/styles.
-- Example config file: `configs/001_videocrafter2/vc2_t2v_lora.yaml`
-- Training lora based on VideoCrafter2: `bash shscripts/train_videocrafter_lora.sh`
-- Inference the trained models: `bash shscripts/inference_vc2_t2v_320x512_lora.sh`
+We support lora finetuning to make the model to learn new concepts/characters/styles.
+- Example config file: `configs/001_videocrafter2/vc2_t2v_lora.yaml`
+- Training lora based on VideoCrafter2: `bash shscripts/train_videocrafter_lora.sh`
+- Inference the trained models: `bash shscripts/inference_vc2_t2v_320x512_lora.sh`
 
 #### 3. Open-Sora Fine-tuning
 We support open-sora finetuning, you can simply run the following commands:
@@ -432,22 +432,22 @@ If you want to build your own dataset, please organize your data as `inputs/t2i/
 ```
 owndata/
 ├── img1.jpg
-├── img2.jpg
-├── img3.jpg
+├── img2.jpg
+├── img3.jpg
 ├── ...
 ├── prompt1.txt # prompt of img1.jpg
 ├── prompt2.txt # prompt of img2.jpg
 ├── prompt3.txt # prompt of img3.jpg
 ├── ...
-```
+```
 
 <!-- Please check [configs/train/003_vc2_lora_ft/README.md](configs/train/003_vc2_lora_ft/README.md) for details. -->
-<!--
+<!--
 
 (1) Prepare data
 
 
-(2) Finetune
+(2) Finetune
 ```
 bash configs/train/000_videocrafter2ft/run.sh
 ``` -->
@@ -456,12 +456,32 @@ bash configs/train/000_videocrafter2ft/run.sh
 
 
 ### 5. Evaluation
-We support VBench evaluation to evaluate the T2V generation performance.
+We support VBench evaluation to evaluate the T2V generation performance.
 Please check [eval/README.md](docs/evaluation.md) for details.
 
 <!-- ### 6. Alignment
 We support video alignment post-training to align human perference for video diffusion models. Please check [configs/train/004_rlhf_vc2/README.md](configs/train/004_rlhf_vc2/README.md) for details. -->
 
+# Contribute
+
+## Git hooks
+
+Git hooks are handled with [pre-commit](https://pre-commit.com) library.
+
+### Hooks installation
+
+Run the following command to install hooks on `commit`. They will check formatting, linting and types.
+
+```shell
+poetry run pre-commit install
+poetry run pre-commit install --hook-type commit-msg
+```
+
+### Running the hooks without commiting
+
+```shell
+poetry run pre-commit run --all-files
+```
 
 ## Acknowledgement
 We thank the following repos for sharing their awesome models and codes!
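Because the commitizen hook is installed at the `commit-msg` stage, commit messages must follow the Conventional Commits format that commitizen checks by default. A quick illustration with hypothetical messages:

```shell
# Accepted: a recognised type ("chore"), an optional scope, a colon, a subject.
git commit -m "chore: pre-commit hook setup"

# Rejected by the commitizen commit-msg hook: no conventional type prefix.
git commit -m "updated some files"
```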

configs/000_videocrafter/vc1_i2v_512.yaml

Lines changed: 2 additions & 2 deletions

@@ -22,7 +22,7 @@ model:
 
   diffusion_scheduler_config:
     target: videotuna.base.diffusion_schedulers.LDMScheduler
-    params:
+    params:
       timesteps: 1000
       linear_start: 0.00085
       linear_end: 0.012

@@ -87,4 +87,4 @@ model:
   img_cond_stage_config:
     target: videotuna.lvdm.modules.encoders.condition.FrozenOpenCLIPImageEmbedderV2
     params:
-      freeze: true
+      freeze: true

configs/000_videocrafter/vc1_t2v_1024.yaml

Lines changed: 2 additions & 2 deletions

@@ -21,11 +21,11 @@ model:
 
   diffusion_scheduler_config:
     target: videotuna.base.diffusion_schedulers.LDMScheduler
-    params:
+    params:
       timesteps: 1000
       linear_start: 0.00085
       linear_end: 0.012
-
+
   unet_config:
     target: videotuna.lvdm.modules.networks.openaimodel3d.UNetModel
     params:

configs/001_videocrafter2/vc2_t2v_320x512.yaml

Lines changed: 1 addition & 2 deletions

@@ -24,7 +24,7 @@ model:
 
   diffusion_scheduler_config:
     target: videotuna.base.diffusion_schedulers.LDMScheduler
-    params:
+    params:
       timesteps: 1000
       linear_start: 0.00085
       linear_end: 0.012

@@ -137,4 +137,3 @@ lightning:
       save_weights_only: False
       every_n_epochs: 300
       every_n_train_steps: null
-

configs/001_videocrafter2/vc2_t2v_lora.yaml

Lines changed: 2 additions & 3 deletions

@@ -5,7 +5,7 @@ model:
   target: videotuna.base.ddpm3d.LVDMFlow
   params:
     lora_args:
-      # lora_ckpt: "/path/to/lora.ckpt" # no need for the first-time training, only used for resume training.
+      # lora_ckpt: "/path/to/lora.ckpt" # no need for the first-time training, only used for resume training.
       target_modules: ["to_q", "to_k", "to_v"]
       lora_rank: 4
       lora_alpha: 1

@@ -30,7 +30,7 @@ model:
 
   diffusion_scheduler_config:
     target: videotuna.base.diffusion_schedulers.LDMScheduler
-    params:
+    params:
       timesteps: 1000
       linear_start: 0.00085
       linear_end: 0.012

@@ -145,4 +145,3 @@ lightning:
       save_weights_only: False
       # every_n_epochs: 300
       every_n_train_steps: 10
-

configs/002_dynamicrafter/dc_i2v_1024.yaml

Lines changed: 2 additions & 3 deletions

@@ -24,7 +24,7 @@ model:
 
   diffusion_scheduler_config:
     target: videotuna.base.diffusion_schedulers.LDMScheduler
-    params:
+    params:
       timesteps: 1000
       linear_start: 0.00085
       linear_end: 0.012

@@ -96,7 +96,7 @@ model:
     target: videotuna.lvdm.modules.encoders.condition.FrozenOpenCLIPImageEmbedderV2
     params:
       freeze: true
-
+
   image_proj_stage_config:
     target: videotuna.lvdm.modules.encoders.ip_resampler.Resampler
     params:

@@ -169,4 +169,3 @@ lightning:
       filename: "{epoch:06}-{step:09}"
       save_weights_only: True
       every_n_train_steps: 10000
-

configs/003_opensora/opensorav10_256x256.yaml

Lines changed: 5 additions & 5 deletions

@@ -11,7 +11,7 @@ model:
   # cond_stage_trainable: false
   cond_stage_trainable: true
   conditioning_key: crossattn_stdit
-  image_size: # TO CHECK
+  image_size: # TO CHECK
    - 32
    - 32
   channels: 4

@@ -26,7 +26,7 @@ model:
 
   diffusion_scheduler_config:
     target: videotuna.base.iddpm3d.OpenSoraScheduler
-    params:
+    params:
       timesteps: 1000
       linear_start: 0.00085
       linear_end: 0.012

@@ -42,7 +42,7 @@ model:
   input_size:
    - 16
    - 32
-   - 32
+   - 32
   first_stage_config:
     target: videotuna.lvdm.opensoravae.VideoAutoencoderKL
     params:

@@ -53,7 +53,7 @@ model:
     params:
       from_pretrained: "DeepFloyd/t5-v1_1-xxl"
       model_max_length: 120
-      shardformer: False # TODO
+      shardformer: False # TODO
 
 data:
   target: videotuna.data.lightning_data.DataModuleFromConfig

@@ -107,4 +107,4 @@ lightning:
   # target: pytorch_lightning.callbacks.ModelCheckpoint
   # params:
   #   every_n_epochs: 1
-  #   filename: "{epoch:04}-{step:06}"
+  #   filename: "{epoch:04}-{step:06}"
