feat: add qwen mllm (#251)

timerring · web-flow · commit fe7232c4e36b · 2025-04-02T19:21:11.000+08:00
* feat: add qwen mllm
* docs: update docs
diff --git a/README.md b/README.md
@@ -15,6 +15,7 @@
   <img src="assets/openai.svg" alt="OpenAI whisper" width="60" height="60" />
   <img src="assets/zhipu-color.svg" alt="Zhipu GLM-4V-PLUS" width="60" height="60" />
   <img src="assets/gemini-brand-color.svg" alt="Google Gemini 1.5 Pro" width="60" height="60" />
+  <img src="assets/qwen-color.svg" alt="Qwen-2.5-72B-Instruct" width="60" height="60" />
 
 </div>
 
@@ -34,8 +35,10 @@
 - **自动渲染弹幕**：自动转换xml为ass弹幕文件，该转换工具库已经开源 [DanmakuConvert](https://github.com/timerring/DanmakuConvert) 并且渲染到视频中形成**有弹幕版视频**并自动上传。
 - **硬件要求极低**：无需GPU，只需最基础的单核CPU搭配最低的运存即可完成录制，弹幕渲染，上传等等全部过程，无最低配置要求，10年前的电脑或服务器依然可以使用！
 - **( :tada: NEW)自动渲染字幕**(如需使用本功能，则需保证有 Nvidia 显卡)：采用 OpenAI 的开源模型 [`whisper`](https://github.com/openai/whisper)，自动识别视频内语音并转换为字幕渲染至视频中。
-- **( :tada: NEW)自动切片上传**：根据弹幕密度计算寻找高能片段并切片，该自动切片工具库已开源 [auto-slice-video](https://github.com/timerring/auto-slice-video)
- ，结合多模态视频理解大模型 [`GLM-4V-PLUS`](https://bigmodel.cn/dev/api/normal-model/glm-4) 或者 [`Gemini-2.0-flash`](https://deepmind.google/technologies/gemini/flash/) 自动生成有意思的切片标题及内容，并且自动上传。
+- **( :tada: NEW)自动切片上传**：根据弹幕密度计算寻找高能片段并切片，该自动切片工具库已开源 [auto-slice-video](https://github.com/timerring/auto-slice-video)，结合多模态视频理解大模型自动生成有意思的切片标题及内容，并且自动上传，目前已经支持的模型有：
+  - `GLM-4V-PLUS`
+  - `Gemini-2.0-flash`
+  - `Qwen-2.5-72B-Instruct`
 - **( :tada: NEW)持久化登录/下载/上传视频(支持多p投稿)**：[bilitool](https://github.com/timerring/bilitool)已经开源，实现持久化登录，下载视频及弹幕(含多p)/上传视频(可分p投稿)，查询投稿状态，查询详细信息等功能，一键pip安装，可以使用命令行 cli 操作，也可以作为api调用。
 - **( :tada: NEW)自动多平台循环直播推流**：该工具已经开源 [looplive](https://github.com/timerring/looplive) 是一个 7 x 24 小时全自动**循环多平台同时推流**直播工具。
 
@@ -70,8 +73,6 @@ graph TD
 
 ## 3. 测试硬件
 + OS: Ubuntu 22.04.4 LTS
-
-  >尽量使用 22.04+ 的版本，更早版本的 ubuntu 自带 gcc 版本无法更新至 biliup-rs 所需版本，若使用较早版本，请参考 [version `GLIBC_2.34‘ not found简单有效解决方法](https://blog.csdn.net/huazhang_001/article/details/128828999)。
 + CPU：2核 Intel(R) Xeon(R) Platinum 85
 + GPU：无
 + 内存：2G
@@ -168,6 +169,12 @@ MLLM 模型主要用于自动切片后的切片标题生成，此功能默认关
 
 在项目的自动切片功能需要使用到 Gemini-2.0-flash 模型，请自行[注册账号](https://aistudio.google.com/app/apikey)并申请 API Key，填写到 `src/config.py` 文件中对应的 `GEMINI_API_KEY` 中。
 
+##### 3.2.3 Qwen 模型
+
+> 如需使用 Qwen-2.5-72B-Instruct 模型，请将 `src/config.py` 文件中的 `MLLM_MODEL` 参数设置为 `qwen`
+
+项目的自动切片功能需要使用 Qwen-2.5-72B-Instruct 模型，请自行[注册账号](https://bailian.console.aliyun.com/?apiKey=1)并申请 API Key，填写到 `src/config.py` 文件中对应的 `QWEN_API_KEY` 中。
+
 #### 4. bilitool 登录
 
 > 由于一般日志打印不出二维码效果（docker 的日志不确定是否能打印，等发布新image时再修改，docker 版本请先参考文档[bilive](https://bilive.timerring.com)，本 README 只针对源码部署），所以这步需要提前在机器上安装 [bilitool](https://github.com/timerring/bilitool):
diff --git a/assets/qwen-color.svg b/assets/qwen-color.svg
@@ -0,0 +1 @@
+<svg height="1em" style="flex:none;line-height:1" viewBox="0 0 24 24" width="1em" xmlns="http://www.w3.org/2000/svg"><title>Qwen</title><defs><linearGradient id="lobe-icons-qwen-fill" x1="0%" x2="100%" y1="0%" y2="0%"><stop offset="0%" stop-color="#00055F" stop-opacity=".84"></stop><stop offset="100%" stop-color="#6F69F7" stop-opacity=".84"></stop></linearGradient></defs><path d="M12.604 1.34c.393.69.784 1.382 1.174 2.075a.18.18 0 00.157.091h5.552c.174 0 .322.11.446.327l1.454 2.57c.19.337.24.478.024.837-.26.43-.513.864-.76 1.3l-.367.658c-.106.196-.223.28-.04.512l2.652 4.637c.172.301.111.494-.043.77-.437.785-.882 1.564-1.335 2.34-.159.272-.352.375-.68.37-.777-.016-1.552-.01-2.327.016a.099.099 0 00-.081.05 575.097 575.097 0 01-2.705 4.74c-.169.293-.38.363-.725.364-.997.003-2.002.004-3.017.002a.537.537 0 01-.465-.271l-1.335-2.323a.09.09 0 00-.083-.049H4.982c-.285.03-.553-.001-.805-.092l-1.603-2.77a.543.543 0 01-.002-.54l1.207-2.12a.198.198 0 000-.197 550.951 550.951 0 01-1.875-3.272l-.79-1.395c-.16-.31-.173-.496.095-.965.465-.813.927-1.625 1.387-2.436.132-.234.304-.334.584-.335a338.3 338.3 0 012.589-.001.124.124 0 00.107-.063l2.806-4.895a.488.488 0 01.422-.246c.524-.001 1.053 0 1.583-.006L11.704 1c.341-.003.724.032.9.34zm-3.432.403a.06.06 0 00-.052.03L6.254 6.788a.157.157 0 01-.135.078H3.253c-.056 0-.07.025-.041.074l5.81 10.156c.025.042.013.062-.034.063l-2.795.015a.218.218 0 00-.2.116l-1.32 2.31c-.044.078-.021.118.068.118l5.716.008c.046 0 .08.02.104.061l1.403 2.454c.046.081.092.082.139 0l5.006-8.76.783-1.382a.055.055 0 01.096 0l1.424 2.53a.122.122 0 00.107.062l2.763-.02a.04.04 0 00.035-.02.041.041 0 000-.04l-2.9-5.086a.108.108 0 010-.113l.293-.507 1.12-1.977c.024-.041.012-.062-.035-.062H9.2c-.059 0-.073-.026-.043-.077l1.434-2.505a.107.107 0 000-.114L9.225 1.774a.06.06 0 00-.053-.031zm6.29 8.02c.046 0 .058.02.034.06l-.832 1.465-2.613 4.585a.056.056 0 01-.05.029.058.058 0 01-.05-.029L8.498 9.841c-.02-.034-.01-.052.028-.054l.216-.012 6.722-.012z" 
fill="url(#lobe-icons-qwen-fill)" fill-rule="nonzero"></path></svg>
diff --git a/src/autoslice/mllm_sdk/qwen_sdk.py b/src/autoslice/mllm_sdk/qwen_sdk.py
@@ -0,0 +1,60 @@
+from src.config import QWEN_API_KEY
+from src.log.logger import scan_log
+from openai import OpenAI
+import os
+import base64
+
+
+def encode_video(video_path):
+    """Return the contents of the file at ``video_path`` as base64 text."""
+    with open(video_path, "rb") as video_file:
+        return base64.b64encode(video_file.read()).decode("utf-8")
+
+
+def qwen_generate_title(video_path, artist):
+    """Generate a title for a live-stream slice with qwen2.5-vl-72b-instruct.
+
+    The video is sent inline as a base64 ``data:`` URL through the
+    OpenAI-compatible chat-completions endpoint configured below.
+
+    Args:
+        video_path: Path of the video slice to describe.
+        artist: Streamer name interpolated into the prompt.
+
+    Returns:
+        The model's reply with leading/trailing double quotes stripped.
+    """
+    # Build the prompt once so the request and the log line cannot drift apart.
+    prompt = f"视频是{artist}的直播切片，请根据该视频中的内容及弹幕信息，为这段视频起一个调皮并且吸引眼球的标题，标题中不要表情符号，可以适当使用网络热词或流行语"
+    client = OpenAI(
+        api_key=QWEN_API_KEY,
+        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
+    )
+
+    base64_video = encode_video(video_path)
+    completion = client.chat.completions.create(
+        model="qwen2.5-vl-72b-instruct",
+        messages=[
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "你是一个视频切片员"}],
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "video_url",
+                        "video_url": {"url": f"data:video/mp4;base64,{base64_video}"},
+                    },
+                    {"type": "text", "text": prompt},
+                ],
+            },
+        ],
+    )
+    # Strip outside the f-string: reusing the f-string's own quote character
+    # inside a replacement field is a SyntaxError before Python 3.12.
+    title = completion.choices[0].message.content.strip('"')
+    scan_log.info("使用 Qwen-2.5-72B-Instruct 生成切片标题")
+    scan_log.info(f"Prompt: {prompt}")
+    scan_log.info(f"生成的切片标题为: {title}")
+    return title
diff --git a/src/autoslice/title_generator.py b/src/autoslice/title_generator.py
@@ -17,6 +17,9 @@ def wrapper(video_path, artist):
             elif model_type == "gemini":
                 from .mllm_sdk.gemini_sdk import gemini_generate_title
                 return gemini_generate_title(video_path, artist)
+            elif model_type == "qwen":
+                from .mllm_sdk.qwen_sdk import qwen_generate_title
+                return qwen_generate_title(video_path, artist)
             else:
                 scan_log.error(f"Unsupported model type: {model_type}")
                 return None
diff --git a/src/config.py b/src/config.py
@@ -27,12 +27,14 @@
 SLICE_STEP = 1
 # The minimum video size to be sliced (MB)
 MIN_VIDEO_SIZE = 200
-# the multi-model LLMs, can be "gemini" or "zhipu"
+# the multimodal LLM to use, can be "gemini", "zhipu" or "qwen"
 MLLM_MODEL = "gemini" # Please make sure you have the right API key for the LLM you choose
 # Apply for your own GLM-4v-Plus API key at https://www.bigmodel.cn/invite?icode=shBtZUfNE6FfdMH1R6NybGczbXFgPRGIalpycrEwJ28%3D
 ZHIPU_API_KEY = ""
 # Apply for your own Gemini API key at https://aistudio.google.com/app/apikey
 GEMINI_API_KEY = ""
+# Apply for your own Qwen API key at https://bailian.console.aliyun.com/?apiKey=1
+QWEN_API_KEY = ""
 # ============================ Basic configuration ============================
 SRC_DIR = str(Path(os.path.abspath(__file__)).parent)
 BILIVE_DIR = str(Path(SRC_DIR).parent)