Skip to content

Commit 5962341

Browse files
committed
feat: Vllm whisper model
1 parent 27aeba4 commit 5962341

File tree

7 files changed

+158
-6
lines changed

7 files changed

+158
-6
lines changed

apps/locales/en_US/LC_MESSAGES/django.po

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8663,4 +8663,7 @@ msgid "resource authorization"
86638663
msgstr ""
86648664

86658665
msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition."
8666+
msgstr ""
8667+
8668+
msgid "If not passed, the default value is 'zh'"
86668669
msgstr ""

apps/locales/zh_CN/LC_MESSAGES/django.po

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8789,4 +8789,7 @@ msgid "resource authorization"
87898789
msgstr "资源授权"
87908790

87918791
msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition."
8792-
msgstr "基于Qwen-Audio的端到端语音识别大模型,支持3分钟以内的音频识别,目前主要支持中英文识别。"
8792+
msgstr "基于Qwen-Audio的端到端语音识别大模型,支持3分钟以内的音频识别,目前主要支持中英文识别。"
8793+
8794+
msgid "If not passed, the default value is 'zh'"
8795+
msgstr "如果未传递,则默认值为'zh'"

apps/locales/zh_Hant/LC_MESSAGES/django.po

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8789,4 +8789,7 @@ msgid "resource authorization"
87898789
msgstr "資源授權"
87908790

87918791
msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition."
8792-
msgstr "基於Qwen-Audio的端到端語音辨識大模型,支持3分鐘以內的音訊識別,現時主要支持中英文識別。"
8792+
msgstr "基於Qwen-Audio的端到端語音辨識大模型,支持3分鐘以內的音訊識別,現時主要支持中英文識別。"
8793+
8794+
msgid "If not passed, the default value is 'zh'"
8795+
msgstr "如果未傳遞,則預設值為'zh'"
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# coding=utf-8
2+
import traceback
3+
from typing import Dict
4+
5+
from django.utils.translation import gettext_lazy as _, gettext
6+
from langchain_core.messages import HumanMessage
7+
8+
from common import forms
9+
from common.exception.app_exception import AppApiException
10+
from common.forms import BaseForm, TooltipLabel
11+
from models_provider.base_model_provider import BaseModelCredential, ValidCode
12+
13+
14+
class VLLMWhisperModelParams(BaseForm):
    # Runtime parameter form for the vLLM-hosted Whisper STT model.
    # 'Language' is read back via params.get('Language') and forwarded as the
    # `language` argument of the transcription request.
    # NOTE(review): the tooltip says "If not passed, the default value is 'zh'"
    # but the field is declared required=True — confirm whether it should be
    # optional (required=False) to match the tooltip.
    Language = forms.TextInputField(
        TooltipLabel(_('Language'),
                     _("If not passed, the default value is 'zh'")),
        required=True,
        default_value='zh',
    )
21+
22+
23+
class VLLMWhisperModelCredential(BaseForm, BaseModelCredential):
    """Credential form for a Whisper model served by a vLLM server.

    Collects the server URL and API key, and validates that the requested
    model type is supported and that the model is actually served.
    """

    api_url = forms.TextInputField('API URL', required=True)
    api_key = forms.PasswordInputField('API Key', required=True)

    def is_valid(self,
                 model_type: str,
                 model_name,
                 model_credential: Dict[str, object],
                 model_params,
                 provider,
                 raise_exception=False):
        """Validate the credential against the vLLM server.

        Raises AppApiException when the model type is unsupported, the API
        URL is unreachable, or the model is not present on the server;
        returns True on success.
        NOTE(review): `raise_exception` is accepted but not honored — all
        failures raise regardless; confirm against sibling credentials.
        """
        model_type_list = provider.get_model_type_list()
        if not any(mt.get('value') == model_type for mt in model_type_list):
            raise AppApiException(ValidCode.valid_error.value,
                                  gettext('{model_type} Model type is not supported').format(model_type=model_type))
        try:
            model_list = provider.get_base_model_list(model_credential.get('api_url'), model_credential.get('api_key'))
        except Exception as e:
            # Chain the original exception so the root cause is preserved.
            raise AppApiException(ValidCode.valid_error.value, gettext('API domain name is invalid')) from e
        exist = provider.get_model_info_by_name(model_list, model_name)
        if len(exist) == 0:
            raise AppApiException(ValidCode.valid_error.value,
                                  gettext('The model does not exist, please download the model first'))
        # Instantiating the model surfaces credential/parameter errors early.
        model = provider.get_model(model_type, model_name, model_credential, **model_params)
        return True

    def encryption_dict(self, model_info: Dict[str, object]):
        """Return `model_info` with the API key encrypted for persistence."""
        return {**model_info, 'api_key': super().encryption(model_info.get('api_key', ''))}

    def build_model(self, model_info: Dict[str, object]):
        """Populate this credential from `model_info`.

        Requires 'api_key' and 'model' keys; raises AppApiException(500)
        if either is missing.
        """
        for key in ['api_key', 'model']:
            if key not in model_info:
                raise AppApiException(500, gettext('{key} is required').format(key=key))
        self.api_key = model_info.get('api_key')
        return self

    def get_model_params_setting_form(self, model_name):
        """Return the runtime parameter form shown for this model."""
        return VLLMWhisperModelParams()
Binary file not shown.
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import base64
2+
import os
3+
import traceback
4+
from typing import Dict
5+
6+
from openai import OpenAI
7+
8+
from common.utils.logger import maxkb_logger
9+
from models_provider.base_model_provider import MaxKBBaseModel
10+
from models_provider.impl.base_stt import BaseSpeechToText
11+
12+
13+
14+
class VllmWhisperSpeechToText(MaxKBBaseModel, BaseSpeechToText):
    """Speech-to-text model backed by a Whisper model served through
    vLLM's OpenAI-compatible transcription endpoint (`{api_url}/v1`)."""

    api_key: str
    api_url: str
    model: str
    params: dict

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.api_key = kwargs.get('api_key')
        self.model = kwargs.get('model')
        # Guard against a missing/None params dict so .get() below is safe.
        self.params = kwargs.get('params') or {}
        self.api_url = kwargs.get('api_url')

    @staticmethod
    def is_cache_model():
        # Instances are cheap to build; no caching needed.
        return False

    @staticmethod
    def new_instance(model_type, model_name, model_credential: Dict[str, object], **model_kwargs):
        """Factory used by the provider registry to build an instance from
        stored credentials plus user-selected runtime parameters."""
        return VllmWhisperSpeechToText(
            model=model_name,
            api_key=model_credential.get('api_key'),
            api_url=model_credential.get('api_url'),
            params=model_kwargs,
            **model_kwargs
        )

    def check_auth(self):
        """Validate credentials by transcribing a bundled sample clip.

        Relies on speech_to_text() raising when the server rejects the
        request, so transcription errors must propagate.
        """
        cwd = os.path.dirname(os.path.abspath(__file__))
        with open(f'{cwd}/iat_mp3_16k.mp3', 'rb') as audio_file:
            self.speech_to_text(audio_file)

    def speech_to_text(self, audio_file):
        """Transcribe `audio_file` and return the recognized text.

        Raises the underlying exception on API failure. (Previously errors
        were logged and swallowed, implicitly returning None — which made
        check_auth() succeed even with invalid credentials.)
        """
        base_url = f"{self.api_url}/v1"
        try:
            client = OpenAI(
                api_key=self.api_key,
                base_url=base_url
            )
            result = client.audio.transcriptions.create(
                file=audio_file,
                model=self.model,
                # 'Language' matches the VLLMWhisperModelParams field name.
                language=self.params.get('Language'),
                response_format="json"
            )
            return result.text
        except Exception as err:
            maxkb_logger.error(f"Error: {str(err)}: {traceback.format_exc()}")
            raise

apps/models_provider/impl/vllm_model_provider/vllm_model_provider.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,20 +10,27 @@
1010
from models_provider.impl.vllm_model_provider.credential.embedding import VllmEmbeddingCredential
1111
from models_provider.impl.vllm_model_provider.credential.image import VllmImageModelCredential
1212
from models_provider.impl.vllm_model_provider.credential.llm import VLLMModelCredential
13+
from models_provider.impl.vllm_model_provider.credential.whisper_stt import VLLMWhisperModelCredential
1314
from models_provider.impl.vllm_model_provider.model.embedding import VllmEmbeddingModel
1415
from models_provider.impl.vllm_model_provider.model.image import VllmImage
1516
from models_provider.impl.vllm_model_provider.model.llm import VllmChatModel
1617
from maxkb.conf import PROJECT_DIR
1718
from django.utils.translation import gettext as _
1819

20+
from models_provider.impl.vllm_model_provider.model.whisper_sst import VllmWhisperSpeechToText
21+
1922
v_llm_model_credential = VLLMModelCredential()
2023
image_model_credential = VllmImageModelCredential()
2124
embedding_model_credential = VllmEmbeddingCredential()
25+
whisper_model_credential = VLLMWhisperModelCredential()
2226

2327
model_info_list = [
24-
ModelInfo('facebook/opt-125m', _('Facebook’s 125M parameter model'), ModelTypeConst.LLM, v_llm_model_credential, VllmChatModel),
25-
ModelInfo('BAAI/Aquila-7B', _('BAAI’s 7B parameter model'), ModelTypeConst.LLM, v_llm_model_credential, VllmChatModel),
26-
ModelInfo('BAAI/AquilaChat-7B', _('BAAI’s 13B parameter mode'), ModelTypeConst.LLM, v_llm_model_credential, VllmChatModel),
28+
ModelInfo('facebook/opt-125m', _('Facebook’s 125M parameter model'), ModelTypeConst.LLM, v_llm_model_credential,
29+
VllmChatModel),
30+
ModelInfo('BAAI/Aquila-7B', _('BAAI’s 7B parameter model'), ModelTypeConst.LLM, v_llm_model_credential,
31+
VllmChatModel),
32+
ModelInfo('BAAI/AquilaChat-7B', _('BAAI’s 13B parameter mode'), ModelTypeConst.LLM, v_llm_model_credential,
33+
VllmChatModel),
2734

2835
]
2936

@@ -32,7 +39,15 @@
3239
]
3340

3441
embedding_model_info_list = [
35-
ModelInfo('HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5', '', ModelTypeConst.EMBEDDING, embedding_model_credential, VllmEmbeddingModel),
42+
ModelInfo('HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5', '', ModelTypeConst.EMBEDDING,
43+
embedding_model_credential, VllmEmbeddingModel),
44+
]
45+
46+
whisper_model_info_list = [
47+
ModelInfo('whisper-tiny', '', ModelTypeConst.STT, whisper_model_credential, VllmWhisperSpeechToText),
48+
ModelInfo('whisper-large-v3-turbo', '', ModelTypeConst.STT, whisper_model_credential, VllmWhisperSpeechToText),
49+
ModelInfo('whisper-small', '', ModelTypeConst.STT, whisper_model_credential, VllmWhisperSpeechToText),
50+
ModelInfo('whisper-large-v3', '', ModelTypeConst.STT, whisper_model_credential, VllmWhisperSpeechToText),
3651
]
3752

3853
model_info_manage = (
@@ -45,6 +60,8 @@
4560
.append_default_model_info(image_model_info_list[0])
4661
.append_model_info_list(embedding_model_info_list)
4762
.append_default_model_info(embedding_model_info_list[0])
63+
.append_model_info_list(whisper_model_info_list)
64+
.append_default_model_info(whisper_model_info_list[0])
4865
.build()
4966
)
5067

0 commit comments

Comments
 (0)