Skip to content

Commit 366f59d

Browse files
Naomi-Ken-Koremmatanby
authored and committed
Add qwen-2.5-vl as default captioner.
1 parent b66830d commit 366f59d

File tree

3 files changed

+82
-14
lines changed

3 files changed

+82
-14
lines changed

README.md

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,12 +104,28 @@ Use the directory containing your video clips (either from step 1, or your own c
104104

105105
```bash
106106
# Generate captions for all videos in the scenes directory
107+
python scripts/caption_videos.py scenes_output_dir/ \
108+
--output scenes_output_dir/captions.json
109+
```
110+
111+
By default, the script uses the Qwen2.5-VL model for captioning. If you're running into VRAM issues:
112+
113+
1. Try enabling 8-bit quantization to reduce memory usage:
114+
```bash
115+
python scripts/caption_videos.py scenes_output_dir/ \
116+
--output scenes_output_dir/captions.json \
117+
--use-8bit
118+
```
119+
120+
2. If you're still encountering memory issues, switch to the LLaVA-NeXT model, which has lower VRAM requirements:
121+
```bash
107122
python scripts/caption_videos.py scenes_output_dir/ \
108123
--output scenes_output_dir/captions.json \
109-
--captioner-type llava_next_7b
124+
--captioner-type llava_next_7b \
125+
--use-8bit
110126
```
111127

112-
This will create a captions.json file which contains video paths and their captions
128+
This will create a captions.json file which contains video paths and their captions.
113129
This JSON file will be used as input for the data preprocessing step.
114130

115131
#### 3. Dataset Preprocessing (`preprocess_dataset.py`)

scripts/caption_videos.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -388,10 +388,11 @@ def main( # noqa: PLR0913
388388
help="Path to output file for captions. Format determined by file extension.",
389389
),
390390
captioner_type: CaptionerType = typer.Option( # noqa: B008
391-
CaptionerType.LLAVA_NEXT_7B,
391+
CaptionerType.QWEN_25_VL,
392392
"--captioner-type",
393393
"-c",
394-
help="Type of captioner to use",
394+
help="Type of captioner to use. Valid values: 'llava_next_7b', 'qwen_25_vl'",
395+
case_sensitive=False,
395396
),
396397
device: str | None = typer.Option(
397398
None,
@@ -441,8 +442,23 @@ def main( # noqa: PLR0913
441442
) -> None:
442443
"""Auto-caption videos and images using vision-language models.
443444
444-
This tool can process individual video/image files or directories of media files and generate
445-
captions using a vision-language model. The captions can be saved in various formats.
445+
This script supports both LLaVA-NeXT and Qwen2.5-VL models for generating captions.
446+
The paths in the output file will be relative to the output file's directory.
447+
448+
Examples:
449+
    # Caption using LLaVA-NeXT
450+
caption_videos.py video.mp4 -o captions.txt
451+
452+
# Caption using Qwen2.5-VL
453+
caption_videos.py video.mp4 -o captions.txt -c qwen_25_vl
454+
455+
# Caption with custom instruction (especially useful for Qwen)
456+
caption_videos.py video.mp4 -o captions.txt -c qwen_25_vl -i "Describe this video in detail"
457+
458+
Valid captioner types:
459+
qwen_25_vl: Qwen2.5-VL-7B model (default)
460+
        llava_next_7b: LLaVA-NeXT-7B model
461+
446462
"""
447463

448464
# Determine device

src/ltxv_trainer/captioning.py

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,12 @@
66

77
import torch
88
from diffusers import BitsAndBytesConfig
9-
from transformers import AutoModel, AutoProcessor, LlavaNextVideoForConditionalGeneration
9+
from transformers import (
10+
AutoModel,
11+
AutoProcessor,
12+
LlavaNextVideoForConditionalGeneration,
13+
Qwen2_5_VLForConditionalGeneration,
14+
)
1015
import numpy as np
1116

1217
# Should be imported after `torch` to avoid compatibility issues.
import decord
@@ -22,6 +27,7 @@ class CaptionerType(str, Enum):
2227
"""Enum for different types of video captioners."""
2328

2429
LLAVA_NEXT_7B = "llava_next_7b"
30+
QWEN_25_VL = "qwen_25_vl"
2531

2632

2733
def create_captioner(captioner_type: CaptionerType, **kwargs) -> "MediaCaptioningModel":
@@ -36,6 +42,8 @@ def create_captioner(captioner_type: CaptionerType, **kwargs) -> "MediaCaptionin
3642
"""
3743
if captioner_type == CaptionerType.LLAVA_NEXT_7B:
3844
return TransformersVlmCaptioner(model_id="llava-hf/LLaVA-NeXT-Video-7B-hf", **kwargs)
45+
elif captioner_type == CaptionerType.QWEN_25_VL:
46+
return TransformersVlmCaptioner(model_id="Qwen/Qwen2.5-VL-7B-Instruct", **kwargs)
3947
else:
4048
raise ValueError(f"Unsupported captioner type: {captioner_type}")
4149

@@ -103,6 +111,8 @@ def __init__(
103111
Args:
104112
model_id: HuggingFace model ID for LLaVA-NeXT-Video
105113
device: torch.device to use for the model
114+
use_8bit: Whether to use 8-bit quantization
115+
vlm_instruction: Instruction prompt for the model
106116
"""
107117
self.device = torch.device(device or "cuda" if torch.cuda.is_available() else "cpu")
108118
self.vlm_instruction = vlm_instruction
@@ -151,17 +161,40 @@ def caption(
151161
).to(self.device)
152162

153163
# Generate caption
154-
output_tokens = self.model.generate(**inputs, max_new_tokens=200, do_sample=False)
155-
output = self.processor.decode(output_tokens[0], skip_special_tokens=True)
156-
caption_raw = output.split("ASSISTANT: ")[1]
164+
output_tokens = self.model.generate(
165+
**inputs,
166+
max_new_tokens=200,
167+
do_sample=False,
168+
temperature=None,
169+
)
170+
171+
# Trim the generated tokens to exclude the input tokens
172+
output_tokens_trimmed = [
173+
out_ids[len(in_ids) :]
174+
for in_ids, out_ids in zip(
175+
inputs.input_ids,
176+
output_tokens,
177+
strict=False,
178+
)
179+
]
180+
181+
# Decode the generated tokens to text
182+
caption_raw = self.processor.batch_decode(
183+
output_tokens_trimmed,
184+
skip_special_tokens=True,
185+
clean_up_tokenization_spaces=False,
186+
)[0]
157187

158188
# Clean up caption
159189
caption = self._clean_raw_caption(caption_raw) if clean_caption else caption_raw
190+
160191
return caption
161192

162193
def _load_model(self, model_id: str, use_8bit: bool) -> None:
163194
if model_id == "llava-hf/LLaVA-NeXT-Video-7B-hf":
164195
model_cls = LlavaNextVideoForConditionalGeneration
196+
elif model_id == "Qwen/Qwen2.5-VL-7B-Instruct":
197+
model_cls = Qwen2_5_VLForConditionalGeneration
165198
else:
166199
model_cls = AutoModel
167200

@@ -174,7 +207,7 @@ def _load_model(self, model_id: str, use_8bit: bool) -> None:
174207
device_map=self.device.type,
175208
)
176209

177-
self.processor = AutoProcessor.from_pretrained(model_id)
210+
self.processor = AutoProcessor.from_pretrained(model_id, use_fast=True)
178211

179212

180213
def example() -> None:
@@ -184,9 +217,12 @@ def example() -> None:
184217
print(f"Usage: python {sys.argv[0]} <video_path>") # noqa: T201
185218
sys.exit(1)
186219

187-
model = TransformersVlmCaptioner()
188-
caption = model.caption(sys.argv[1])
189-
print(caption) # noqa: T201
220+
# Example using both captioner types
221+
for captioner_type in [CaptionerType.LLAVA_NEXT_7B, CaptionerType.QWEN_25_VL]:
222+
print(f"\nUsing {captioner_type} captioner:") # noqa: T201
223+
model = create_captioner(captioner_type)
224+
caption = model.caption(sys.argv[1])
225+
print(f"CAPTION: {caption}") # noqa: T201
190226

191227

192228
if __name__ == "__main__":

0 commit comments

Comments
 (0)