Commit ecddd5c

add gradio demo

1 parent 0275b50 commit ecddd5c

File tree: 5 files changed, +272 −9 lines changed

skycaptioner_v1/README.md

Lines changed: 19 additions & 3 deletions
@@ -1,15 +1,15 @@
 # SkyCaptioner-V1: A Structural Video Captioning Model

 <p align="center">
-📑 <a href="https://arxiv.org/pdf/2504.13074">Technical Report</a> · 👋 <a href="https://www.skyreels.ai/home?utm_campaign=github_SkyReels_V2" target="_blank">Playground</a> · 💬 <a href="https://discord.gg/PwM6NYtccQ" target="_blank">Discord</a> · 🤗 <a href="https://huggingface.co/Skywork/SkyCaptioner-V1" target="_blank">Hugging Face</a> · 🤖 <a href="https://modelscope.cn/collections/SkyReels-V2-f665650130b144">ModelScope</a></a>
+📑 <a href="https://arxiv.org/pdf/2504.13074">Technical Report</a> · 👋 <a href="https://www.skyreels.ai/home?utm_campaign=github_SkyReels_V2" target="_blank">Playground</a> · 💬 <a href="https://discord.gg/PwM6NYtccQ" target="_blank">Discord</a> · 🤗 <a href="https://huggingface.co/Skywork/SkyCaptioner-V1" target="_blank">Hugging Face</a> · 🤖 <a href="https://modelscope.cn/collections/SkyReels-V2-f665650130b144">ModelScope</a> · 🚀 <a href="https://huggingface.co/spaces/Skywork/SkyCaptioner-V1">Demo</a>
 </p>

 ---

 Welcome to the SkyCaptioner-V1 repository! Here, you'll find the structural video captioning model weights and inference code for our video captioner that labels the video data efficiently and comprehensively.

 ## 🔥🔥🔥 News!!
-
+* May 07, 2025: 🚀 Added a web demo implementation based on Gradio and the [online demo](https://huggingface.co/spaces/Skywork/SkyCaptioner-V1) is now available!
 * Apr 21, 2025: 👋 We release the [vllm](https://github.com/vllm-project/vllm) batch inference code for SkyCaptioner-V1 Model and caption fusion inference code.
 * Apr 21, 2025: 👋 We release the first shot-aware video captioning model [SkyCaptioner-V1 Model](https://huggingface.co/Skywork/SkyCaptioner-V1). For more details, please check our [paper](https://arxiv.org/pdf/2504.13074).

@@ -20,7 +20,7 @@ Welcome to the SkyCaptioner-V1 repository! Here, you'll find the structural vide
 - [x] Checkpoints
 - [x] Batch Inference Code
 - [x] Caption Fusion Method
-- [ ] Web Demo (Gradio)
+- [x] Web Demo (Gradio)

 ## 🌟 Overview

@@ -241,6 +241,22 @@ python scripts/vllm_fusion_caption.py \
 > **Note**:
 > - If you want to get i2v caption, just change the `--task t2v` to `--task i2v` in your Command.

+#### Gradio Web Demo
+Launch the Gradio web demo for SkyCaptioner-V1:
+```shell
+export SkyCaptioner_V1_Model_PATH="/path/to/your_local_model_path"
+python scripts/gradio_struct_caption.py \
+    --skycaptioner_model_path ${SkyCaptioner_V1_Model_PATH}
+```
+
+Launch the Gradio web demo for Caption Fusion:
+```shell
+export LLM_MODEL_PATH="/path/to/your_local_model_path2"
+python scripts/gradio_fusion_caption.py \
+    --fusioncaptioner_model_path ${LLM_MODEL_PATH}
+```
+
+
 ## Acknowledgements

 We would like to thank the contributors of <a href="https://github.com/QwenLM/Qwen2.5-VL">Qwen2.5-VL</a>, <a href="https://github.com/bytedance/tarsier">tarsier2</a> and <a href="https://github.com/vllm-project/vllm">vllm</a> repositories, for their open research and contributions.
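As a convenience pointer (the ports come from the `demo.launch(...)` calls in the two new scripts below, and `webbrowser` is only a hypothetical way to open them), once a demo is running on the launch machine you could browse to it like so:

```python
# Ports taken from demo.launch(...) in the new scripts; adjust if you change them.
import webbrowser

webbrowser.open("http://localhost:7862")  # structural caption demo (gradio_struct_caption.py)
webbrowser.open("http://localhost:7863")  # caption fusion demo (gradio_fusion_caption.py)
```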
skycaptioner_v1/scripts/gradio_fusion_caption.py

Lines changed: 146 additions & 0 deletions (new file)

import json
import argparse

import pandas as pd
import gradio as gr
from vllm import LLM, SamplingParams

from vllm_fusion_caption import StructuralCaptionDataset

parser = argparse.ArgumentParser()
parser.add_argument("--fusioncaptioner_model_path", default=None, type=str)
parser.add_argument("--tensor_parallel_size", type=int, default=2)
args = parser.parse_args()

example_input = """
{
  "subjects": [
    {
      "TYPES": {
        "type": "Human",
        "sub_type": "Woman"
      },
      "appearance": "Long, straight black hair with bangs, wearing a sparkling choker necklace and a dark-colored top or dress with a visible strap over her shoulder.",
      "action": "A woman wearing a sparkling choker necklace and earrings is sitting in a car, looking to her left and speaking. A man, dressed in a suit, is sitting next to her, attentively watching her.",
      "expression": "The individual in the video exhibits a neutral facial expression, characterized by slightly open lips and a gentle, soft-focus gaze. There are no noticeable signs of sadness or distress evident in their demeanor.",
      "position": "Seated in the foreground of the car, facing slightly to the right.",
      "is_main_subject": true
    },
    {
      "TYPES": {
        "type": "Human",
        "sub_type": "Man"
      },
      "appearance": "Short hair, wearing a dark-colored suit with a white shirt.",
      "action": "",
      "expression": "",
      "position": "Seated in the background of the car, facing the woman.",
      "is_main_subject": false
    }
  ],
  "shot_type": "close_up",
  "shot_angle": "eye_level",
  "shot_position": "side_view",
  "camera_motion": "",
  "environment": "Interior of a car with a dark color scheme.",
  "lighting": "Soft and natural lighting, suggesting daytime."
}
"""


class FusionCaptioner:
    def __init__(self, model_path, tensor_parallel_size):
        self.model = LLM(model=model_path,
                         gpu_memory_utilization=0.9,
                         max_model_len=4096,
                         tensor_parallel_size=tensor_parallel_size)
        self.sampling_params = SamplingParams(
            temperature=0.1,
            max_tokens=512,
            stop=['\n\n']
        )
        self.model_path = model_path

    def __call__(self, structural_caption, task='t2v'):
        # Normalize dict or JSON-string input to a compact JSON string.
        if isinstance(structural_caption, dict):
            structural_caption = json.dumps(structural_caption, ensure_ascii=False)
        else:
            structural_caption = json.dumps(json.loads(structural_caption), ensure_ascii=False)
        meta = pd.DataFrame([structural_caption], columns=['structural_caption'])
        print(f'structural_caption: {structural_caption}')
        print(f'task: {task}')
        dataset = StructuralCaptionDataset(meta, self.model_path, task)
        _, fusion_by_llm, text, original_text, camera_movement = dataset[0]
        if not fusion_by_llm:
            # This caption skips LLM fusion; just append the camera movement.
            caption = original_text + " " + camera_movement
            return caption
        try:
            outputs = self.model.generate([text], self.sampling_params, use_tqdm=False)
            result = outputs[0].outputs[0].text
        except Exception as e:
            # Fall back to the unfused caption if generation fails
            # (the original assigned an empty list here, which would break
            # the string concatenation below).
            print(f'fusion generation failed: {e}')
            result = original_text

        llm_caption = result + " " + camera_movement
        return llm_caption


def main():
    fusion_captioner = FusionCaptioner(args.fusioncaptioner_model_path, args.tensor_parallel_size)

    def fusion_caption(structural_caption, task):
        caption = fusion_captioner(structural_caption, task)
        return caption

    with gr.Blocks() as demo:
        gr.Markdown(
            """
            <h1 style="text-align: center; font-size: 2em;">SkyCaptioner</h1>
            """,
            elem_id="header"
        )

        with gr.Row():
            with gr.Column(visible=True):
                with gr.Row():
                    json_input = gr.Code(
                        label="Structural Caption",
                        language="json",
                        lines=25,
                        interactive=True
                    )
                with gr.Row():
                    task_input = gr.Radio(
                        label="Task",
                        choices=["t2v", "i2v"],
                        value="t2v",
                        interactive=True
                    )

            with gr.Column(visible=True):
                text_output = gr.Textbox(
                    label="Fusion Caption",
                    lines=25,
                    interactive=False,
                    autoscroll=True
                )

        gr.Button("Generate").click(
            fn=fusion_caption,
            inputs=[json_input, task_input],
            outputs=text_output
        )
        with gr.Row():
            gr.Examples(
                examples=[
                    [example_input, "t2v"],
                ],
                inputs=[json_input, task_input],
                label="Example Input"
            )

    demo.launch(
        server_name="0.0.0.0",
        server_port=7863,
        share=False
    )


if __name__ == '__main__':
    main()
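For reference, a minimal headless sketch of driving `FusionCaptioner` without the UI; the checkpoint path is a placeholder. One caveat: this script calls `parse_args()` at module level, so importing it is only safe when the host process's argv carries no flags argparse would reject.

```python
# Hypothetical headless use of FusionCaptioner; the model path is a placeholder.
from gradio_fusion_caption import FusionCaptioner, example_input

captioner = FusionCaptioner("/path/to/your_local_model_path2", tensor_parallel_size=2)
# Accepts a dict or a JSON string; task is "t2v" or "i2v".
print(captioner(example_input, task="t2v"))
```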
skycaptioner_v1/scripts/gradio_struct_caption.py

Lines changed: 92 additions & 0 deletions (new file)

import json
import argparse

import pandas as pd
import gradio as gr
from vllm import LLM, SamplingParams

from vllm_struct_caption import VideoTextDataset


class StructCaptioner:
    def __init__(self, model_path, tensor_parallel_size):
        self.model = LLM(model=model_path,
                         gpu_memory_utilization=0.6,
                         max_model_len=31920,
                         tensor_parallel_size=tensor_parallel_size)

        self.model_path = model_path
        self.sampling_params = SamplingParams(temperature=0.05, max_tokens=2048)

    def __call__(self, video_path):
        # Wrap the single uploaded video in a one-row DataFrame for the dataset.
        meta = pd.DataFrame([video_path], columns=['path'])
        dataset = VideoTextDataset(meta, self.model_path)
        item = dataset[0]['input']
        batch_user_inputs = [{
            'prompt': item['prompt'],
            'multi_modal_data': {'video': item['multi_modal_data']['video'][0]},
        }]
        outputs = self.model.generate(batch_user_inputs, self.sampling_params, use_tqdm=False)
        caption = outputs[0].outputs[0].text
        # Re-serialize the model's JSON output with indentation for display.
        caption = json.loads(caption)
        caption = json.dumps(caption, indent=4, ensure_ascii=False)
        return caption


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--skycaptioner_model_path", required=True, type=str)
    parser.add_argument("--tensor_parallel_size", type=int, default=2)
    args = parser.parse_args()

    struct_captioner = StructCaptioner(args.skycaptioner_model_path, args.tensor_parallel_size)

    def generate_caption(video_path):
        caption = struct_captioner(video_path)
        return caption

    with gr.Blocks() as demo:
        gr.Markdown(
            """
            <h1 style="text-align: center; font-size: 2em;">SkyCaptioner</h1>
            """,
            elem_id="header"
        )

        with gr.Row():
            with gr.Column(visible=True, scale=0.5):
                with gr.Row():
                    video_input = gr.Video(
                        label="Upload Video",
                        interactive=True,
                        format="mp4",
                    )

            with gr.Column(visible=True):
                json_output = gr.Code(
                    label="Caption",
                    language="json",
                    lines=25,
                    interactive=False
                )

        gr.Button("Generate").click(
            fn=generate_caption,
            inputs=video_input,
            outputs=json_output
        )

        gr.Examples(
            examples=[
                ["./examples/data/1.mp4"],
                ["./examples/data/2.mp4"],
            ],
            inputs=video_input,
            label="Example Videos"
        )

    demo.launch(
        server_name="0.0.0.0",
        server_port=7862,
        share=False
    )


if __name__ == '__main__':
    main()
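A matching headless sketch for this script (safe to import, since its argparse lives inside `main()`); the model path and clip are placeholders taken from the bundled examples:

```python
# Hypothetical headless use of StructCaptioner; paths are placeholders.
from gradio_struct_caption import StructCaptioner

captioner = StructCaptioner("/path/to/your_local_model_path", tensor_parallel_size=2)
print(captioner("./examples/data/1.mp4"))  # pretty-printed structural-caption JSON
```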

skycaptioner_v1/scripts/vllm_fusion_caption.py

Lines changed: 11 additions & 5 deletions
@@ -64,9 +64,15 @@
 class StructuralCaptionDataset(torch.utils.data.Dataset):
-    def __init__(self, input_csv, model_path):
-        self.meta = pd.read_csv(input_csv)
-        self.task = args.task
+    def __init__(self, input_csv, model_path, task=None):
+        if isinstance(input_csv, pd.DataFrame):
+            self.meta = input_csv
+        else:
+            self.meta = pd.read_csv(input_csv)
+        if task is None:
+            self.task = args.task
+        else:
+            self.task = task
         self.system_prompt = SYSTEM_PROMPT_T2V if self.task == 't2v' else SYSTEM_PROMPT_I2V
         self.tokenizer = AutoTokenizer.from_pretrained(model_path)

@@ -146,8 +152,8 @@ def clean_struct_caption(self, struct_caption, task):
         shot_type = struct_caption.get('shot_type', '').replace('_', ' ')
-        if shot_type not in SHOT_TYPE_LIST:
-            struct_caption['shot_type'] = ''
+        # if shot_type not in SHOT_TYPE_LIST:
+        #     struct_caption['shot_type'] = ''

         new_struct_caption = {
             'num_subjects': len(subjects),
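The widened constructor now takes either a CSV path or an in-memory DataFrame, plus an optional task that overrides the script's global args. A small sketch of the two call patterns, with placeholder file and model paths:

```python
import pandas as pd
from vllm_fusion_caption import StructuralCaptionDataset

# Batch path (unchanged): rows come from a CSV; task falls back to the global args.
ds = StructuralCaptionDataset("captions.csv", "/path/to/your_local_model_path2")

# Demo path (new): one in-memory row, task passed explicitly by the Gradio app.
meta = pd.DataFrame(['{"shot_type": "close_up"}'], columns=["structural_caption"])
ds = StructuralCaptionDataset(meta, "/path/to/your_local_model_path2", task="i2v")
```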

skycaptioner_v1/scripts/vllm_struct_caption.py

Lines changed: 4 additions & 1 deletion
@@ -17,7 +17,10 @@
 class VideoTextDataset(torch.utils.data.Dataset):
     def __init__(self, csv_path, model_path):
-        self.meta = pd.read_csv(csv_path)
+        if isinstance(csv_path, pd.DataFrame):
+            self.meta = csv_path
+        else:
+            self.meta = pd.read_csv(csv_path)
         self._path = 'path'
         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
         self.processor = AutoProcessor.from_pretrained(model_path)
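Same pattern as above: `gradio_struct_caption.py` wraps a single uploaded video in a one-row DataFrame instead of writing a CSV first. A sketch, with placeholder paths:

```python
import pandas as pd
from vllm_struct_caption import VideoTextDataset

meta = pd.DataFrame(["./examples/data/1.mp4"], columns=["path"])
ds = VideoTextDataset(meta, "/path/to/your_local_model_path")  # no CSV round-trip
```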
