feat: add gemini mllm (#250)

timerring · web-flow · commit c3aeece347ba · 2025-04-02T15:20:44.000+08:00
* feat: add gemini mllm
* docs: update docs
diff --git a/README.md b/README.md
@@ -35,7 +35,7 @@
 - **硬件要求极低**：无需GPU，只需最基础的单核CPU搭配最低的运存即可完成录制，弹幕渲染，上传等等全部过程，无最低配置要求，10年前的电脑或服务器依然可以使用！
 - **( :tada: NEW)自动渲染字幕**(如需使用本功能，则需保证有 Nvidia 显卡)：采用 OpenAI 的开源模型 [`whisper`](https://github.com/openai/whisper)，自动识别视频内语音并转换为字幕渲染至视频中。
 - **( :tada: NEW)自动切片上传**：根据弹幕密度计算寻找高能片段并切片，该自动切片工具库已开源 [auto-slice-video](https://github.com/timerring/auto-slice-video)
- ，结合多模态视频理解大模型 [`GLM-4V-PLUS`](https://bigmodel.cn/dev/api/normal-model/glm-4) 自动生成有意思的切片标题及内容，并且自动上传。
+ ，结合多模态视频理解大模型 [`GLM-4V-PLUS`](https://bigmodel.cn/dev/api/normal-model/glm-4) 或者 [`Gemini-2.0-flash`](https://deepmind.google/technologies/gemini/flash/) 自动生成有意思的切片标题及内容，并且自动上传。
 - **( :tada: NEW)持久化登录/下载/上传视频(支持多p投稿)**：[bilitool](https://github.com/timerring/bilitool)已经开源，实现持久化登录，下载视频及弹幕(含多p)/上传视频(可分p投稿)，查询投稿状态，查询详细信息等功能，一键pip安装，可以使用命令行 cli 操作，也可以作为api调用。
 - **( :tada: NEW)自动多平台循环直播推流**：该工具已经开源 [looplive](https://github.com/timerring/looplive) 是一个 7 x 24 小时全自动**循环多平台同时推流**直播工具。
 
@@ -137,21 +137,36 @@ pip install -r requirements.txt
 ./setPath.sh && source ~/.bashrc
 ```
 
-#### 3. 配置 whisper 模型及 GLM-4V-PLUS 模型
+#### 3. 配置 whisper 模型及 MLLM 模型
 
-##### 3.1 whisper 模型
+##### 3.1 whisper 模型(字幕识别)
 项目默认采用 [`small`](https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt) 模型，请点击下载所需文件，并放置在 `src/subtitle/models` 文件夹中。
 
 > [!TIP]
 > 使用该参数模型至少需要保证有显存大于 2.7GB 的 GPU，否则请使用其他参数量的模型。
 > + 更多模型请参考 [whisper 参数模型](https://timerring.github.io/bilive/models.html) 部分。
 > + 更换模型方法请参考 [更换模型方法](https://timerring.github.io/bilive/models.html#更换模型方法) 部分。
 
-##### 3.2 GLM-4V-PLUS 模型
+##### 3.2 MLLM 模型
 
-> 此功能默认关闭，如果需要打开请将 `src/config.py` 文件中的 `AUTO_SLICE` 参数设置为 `True`
+MLLM 模型主要用于自动切片后的切片标题生成，此功能默认关闭，如果需要打开请将 `src/config.py` 文件中的 `AUTO_SLICE` 参数设置为 `True`。其他配置分别有：
+- `SLICE_DURATION` 以秒为单位设置切片时长（不建议超过 60 秒）。
+- `SLICE_NUM` 设置切片数量。
+- `SLICE_OVERLAP` 设置切片重叠时长。切片采用滑动窗口法处理，细节内容请见 [auto-slice-video](https://github.com/timerring/auto-slice-video)。
+- `SLICE_STEP` 设置切片步长。
+- `MIN_VIDEO_SIZE` 设置允许切片的最小视频大小（单位 MB），防止对一些因连线或者网络波动造成的短片段再次切片。
 
-在配置文件 `src/config.py` 中，`SLICE_DURATION` 以秒为单位设置切片时长（不建议超过 1 分钟），在项目的自动切片功能需要使用到智谱的 [`GLM-4V-PLUS`](https://bigmodel.cn/dev/api/normal-model/glm-4) 模型，请自行[注册账号](https://www.bigmodel.cn/invite?icode=shBtZUfNE6FfdMH1R6NybGczbXFgPRGIalpycrEwJ28%3D)并申请 API Key，填写到 `src/config.py` 文件中对应的 `Your_API_KEY` 中。
+##### 3.2.1 GLM-4V-PLUS 模型
+
+> 如需使用 GLM-4V-PLUS 模型，请将 `src/config.py` 文件中的 `MLLM_MODEL` 参数设置为 `zhipu`
+
+项目的自动切片功能需要使用智谱的 [`GLM-4V-PLUS`](https://bigmodel.cn/dev/api/normal-model/glm-4) 模型，请自行[注册账号](https://www.bigmodel.cn/invite?icode=shBtZUfNE6FfdMH1R6NybGczbXFgPRGIalpycrEwJ28%3D)并申请 API Key，填写到 `src/config.py` 文件中对应的 `ZHIPU_API_KEY` 中。
+
+##### 3.2.2 Gemini 模型
+
+> 如需使用 Gemini-2.0-flash 模型，请将 `src/config.py` 文件中的 `MLLM_MODEL` 参数设置为 `gemini`
+
+项目的自动切片功能需要使用 Gemini-2.0-flash 模型，请自行[注册账号](https://aistudio.google.com/app/apikey)并申请 API Key，填写到 `src/config.py` 文件中对应的 `GEMINI_API_KEY` 中。
 
 #### 4. bilitool 登录
 
diff --git a/src/autoslice/mllm_sdk/gemini_sdk.py b/src/autoslice/mllm_sdk/gemini_sdk.py
@@ -0,0 +1,27 @@
+from google import genai
+from google.genai import types 
+from src.log.logger import scan_log
+from src.config import GEMINI_API_KEY
+
+def gemini_generate_title(video_path, artist):
+    """Generate a title for a live-stream slice with Gemini-2.0-flash.
+
+    The whole clip is read into memory and sent inline with the request,
+    so this is only suitable for small files (under 20 MB).
+
+    Args:
+        video_path: str, path to the video slice; it is sent as ``video/mp4``.
+        artist: str, streamer name interpolated into the prompt.
+    Returns:
+        The ``text`` attribute of the Gemini response.
+    """
+    # Build the prompt once so the request and the log line cannot drift apart.
+    prompt = f'视频是{artist}的直播的切片，请根据该视频中的内容及弹幕信息，为这段视频起一个调皮并且吸引眼球的标题，只需要返回一个标题即可，无需返回其他内容'
+
+    client = genai.Client(api_key=GEMINI_API_KEY)
+
+    # Only for videos of size < 20 MB. Read through a context manager so the
+    # file handle is closed as soon as the bytes are in memory.
+    with open(video_path, 'rb') as video_file:
+        video_bytes = video_file.read()
+
+    response = client.models.generate_content(
+        model='models/gemini-2.0-flash',
+        contents=types.Content(
+            parts=[
+                types.Part(text=prompt),
+                types.Part(
+                    inline_data=types.Blob(data=video_bytes, mime_type='video/mp4')
+                )
+            ]
+        )
+    )
+    scan_log.info("使用 Gemini-2.0-flash 生成切片标题")
+    scan_log.info(f"Prompt: {prompt}")
+    scan_log.info(f"生成的切片标题为: {response.text}")
+    return response.text
diff --git a/src/autoslice/mllm_sdk/zhipu_sdk.py b/src/autoslice/mllm_sdk/zhipu_sdk.py
@@ -1,15 +1,15 @@
 # Copyright (c) 2024 bilive.
 
 import base64
-from src.config import Your_API_KEY
+from src.config import ZHIPU_API_KEY
 from zhipuai import ZhipuAI
 from src.log.logger import scan_log
 
 def zhipu_glm_4v_plus_generate_title(video_path, artist):
     with open(video_path, 'rb') as video_file:
         video_base = base64.b64encode(video_file.read()).decode('utf-8')
 
-    client = ZhipuAI(api_key=Your_API_KEY)
+    client = ZhipuAI(api_key=ZHIPU_API_KEY)
     response = client.chat.completions.create(
         model="glm-4v-plus-0111",
         messages=[
@@ -30,6 +30,7 @@ def zhipu_glm_4v_plus_generate_title(video_path, artist):
         }
         ]
     )
+    scan_log.info("使用 Zhipu-glm-4v-plus 生成切片标题")
     scan_log.info(f"Prompt: 视频是{artist}的直播的切片，请根据该视频中的内容及弹幕信息，为这段视频起一个调皮并且吸引眼球的标题，注意标题中如果有“主播”请替换成{artist}")
     scan_log.info(f"生成的切片标题为: {response.choices[0].message.content}")
     return response.choices[0].message.content.replace("《", "").replace("》", "")
diff --git a/src/autoslice/title_generator.py b/src/autoslice/title_generator.py
@@ -0,0 +1,35 @@
+from functools import wraps
+from src.log.logger import scan_log
+from src.config import MLLM_MODEL
+
+def title_generator(model_type):
+    """Decorator factory that routes title generation to an MLLM backend.
+
+    The body of the decorated function is never executed: every call is
+    forwarded to the backend selected by ``model_type``.
+
+    Args:
+        model_type: str, backend to use; "zhipu" and "gemini" are supported.
+    Returns:
+        function: a decorator whose wrapper takes ``(video_path, artist)`` and
+        returns the backend's title, or None (after logging an error) when
+        ``model_type`` is not supported.
+    """
+    def decorator(func):
+        # Keep the decorated function's __name__ and __doc__ on the wrapper.
+        @wraps(func)
+        def wrapper(video_path, artist):
+            # Imports live inside the branches, so only the selected
+            # backend's SDK module is imported.
+            if model_type == "zhipu":
+                from .mllm_sdk.zhipu_sdk import zhipu_glm_4v_plus_generate_title
+                return zhipu_glm_4v_plus_generate_title(video_path, artist)
+            elif model_type == "gemini":
+                from .mllm_sdk.gemini_sdk import gemini_generate_title
+                return gemini_generate_title(video_path, artist)
+            else:
+                scan_log.error(f"Unsupported model type: {model_type}")
+                return None
+        return wrapper
+    return decorator
+
+@title_generator(MLLM_MODEL)
+def generate_title(video_path, artist):
+    """Produce a title for a video slice using the configured MLLM backend.
+
+    This function is only a placeholder: the ``title_generator`` decorator
+    supplies the real implementation, chosen by ``MLLM_MODEL``.
+
+    Args:
+        video_path: str, path to the video file
+        artist: str, artist name
+    Returns:
+        str: generated title, or None when ``MLLM_MODEL`` is not a supported
+        model type
+    """
diff --git a/src/burn/render_video.py b/src/burn/render_video.py
@@ -9,7 +9,7 @@
 from src.burn.render_command import render_command
 from autoslice import slice_video_by_danmaku
 from src.autoslice.inject_metadata import inject_metadata
-from src.autoslice.zhipu_sdk import zhipu_glm_4v_plus_generate_title
+from src.autoslice.title_generator import generate_title
 from src.upload.extract_video_info import get_video_info
 from src.log.logger import scan_log
 from db.conn import insert_upload_queue
@@ -66,9 +66,9 @@ def render_video(video_path):
             slices_path = slice_video_by_danmaku(ass_path, format_video_path, SLICE_DURATION, SLICE_NUM, SLICE_OVERLAP, SLICE_STEP)
             for slice_path in slices_path:
                 try:
-                    glm_title = zhipu_glm_4v_plus_generate_title(slice_path, artist)
+                    slice_title = generate_title(slice_path, artist)
                     slice_video_flv_path = slice_path[:-4] + '.flv'
-                    inject_metadata(slice_path, glm_title, slice_video_flv_path)
+                    inject_metadata(slice_path, slice_title, slice_video_flv_path)
                     os.remove(slice_path)
                     if not insert_upload_queue(slice_video_flv_path):
                         scan_log.error('Cannot insert the video to the upload queue')
diff --git a/src/config.py b/src/config.py
@@ -27,8 +27,12 @@
 SLICE_STEP = 1
 # The minimum video size to be sliced (MB)
 MIN_VIDEO_SIZE = 200
+# The multimodal LLM (MLLM) used to generate slice titles, can be "gemini" or "zhipu"
+MLLM_MODEL = "gemini" # Please make sure you have the right API key for the LLM you choose
 # Apply for your own GLM-4v-Plus API key at https://www.bigmodel.cn/invite?icode=shBtZUfNE6FfdMH1R6NybGczbXFgPRGIalpycrEwJ28%3D
-Your_API_KEY = ""
+ZHIPU_API_KEY = ""
+# Apply for your own Gemini API key at https://aistudio.google.com/app/apikey
+GEMINI_API_KEY = ""
 # ============================ Basic configuration ============================
 SRC_DIR = str(Path(os.path.abspath(__file__)).parent)
 BILIVE_DIR = str(Path(SRC_DIR).parent)