Skip to content

Commit b32b063

Browse files
committed
feat: Add omni-modal speech recognition support for Qwen2.5-omni-7b
1 parent a85c36f commit b32b063

File tree

4 files changed

+15
-11
lines changed

4 files changed

+15
-11
lines changed

apps/models_provider/impl/aliyun_bai_lian_model_provider/aliyun_bai_lian_model_provider.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
AliyunBaiLianEmbeddingCredential
1616
from models_provider.impl.aliyun_bai_lian_model_provider.credential.image import QwenVLModelCredential
1717
from models_provider.impl.aliyun_bai_lian_model_provider.credential.llm import BaiLianLLMModelCredential
18-
from models_provider.impl.aliyun_bai_lian_model_provider.credential.omi_stt import AliyunBaiLianOmiSTTModelCredential
18+
from models_provider.impl.aliyun_bai_lian_model_provider.credential.omni_stt import AliyunBaiLianOmiSTTModelCredential
1919
from models_provider.impl.aliyun_bai_lian_model_provider.credential.reranker import \
2020
AliyunBaiLianRerankerCredential
2121
from models_provider.impl.aliyun_bai_lian_model_provider.credential.stt import AliyunBaiLianSTTModelCredential
@@ -24,7 +24,7 @@
2424
from models_provider.impl.aliyun_bai_lian_model_provider.model.embedding import AliyunBaiLianEmbedding
2525
from models_provider.impl.aliyun_bai_lian_model_provider.model.image import QwenVLChatModel
2626
from models_provider.impl.aliyun_bai_lian_model_provider.model.llm import BaiLianChatModel
27-
from models_provider.impl.aliyun_bai_lian_model_provider.model.omi_stt import AliyunBaiLianOmiSpeechToText
27+
from models_provider.impl.aliyun_bai_lian_model_provider.model.omni_stt import AliyunBaiLianOmiSpeechToText
2828
from models_provider.impl.aliyun_bai_lian_model_provider.model.reranker import AliyunBaiLianReranker
2929
from models_provider.impl.aliyun_bai_lian_model_provider.model.stt import AliyunBaiLianSpeechToText
3030
from models_provider.impl.aliyun_bai_lian_model_provider.model.tti import QwenTextToImageModel
@@ -80,6 +80,9 @@
8080
ModelInfo('qwen-omni-turbo',
8181
_('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'),
8282
ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText),
83+
ModelInfo('qwen2.5-omni-7b',
84+
_('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'),
85+
ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText),
8386
]
8487

8588
module_info_vl_list = [

apps/models_provider/impl/aliyun_bai_lian_model_provider/credential/omi_stt.py renamed to apps/models_provider/impl/aliyun_bai_lian_model_provider/credential/omni_stt.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ class AliyunBaiLianOmiSTTModelParams(BaseForm):
1717

1818

1919
class AliyunBaiLianOmiSTTModelCredential(BaseForm, BaseModelCredential):
20-
api_key = PasswordInputField("API key", required=True)
20+
api_url = forms.TextInputField(_('API URL'), required=True)
21+
api_key = forms.PasswordInputField(_('API Key'), required=True)
2122

2223
def is_valid(self,
2324
model_type: str,

apps/models_provider/impl/aliyun_bai_lian_model_provider/model/omi_stt.py renamed to apps/models_provider/impl/aliyun_bai_lian_model_provider/model/omni_stt.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
1414
api_key: str
15+
api_url: str
1516
model: str
1617
params: dict
1718

@@ -20,6 +21,7 @@ def __init__(self, **kwargs):
2021
self.api_key = kwargs.get('api_key')
2122
self.model = kwargs.get('model')
2223
self.params = kwargs.get('params')
24+
self.api_url = kwargs.get('api_url')
2325

2426
@staticmethod
2527
def is_cache_model():
@@ -30,6 +32,7 @@ def new_instance(model_type, model_name, model_credential: Dict[str, object], **
3032
return AliyunBaiLianOmiSpeechToText(
3133
model=model_name,
3234
api_key=model_credential.get('api_key'),
35+
api_url=model_credential.get('api_url') ,
3336
params= model_kwargs,
3437
**model_kwargs
3538
)
@@ -47,13 +50,13 @@ def speech_to_text(self, audio_file):
4750
client = OpenAI(
4851
# 若没有配置环境变量,请用阿里云百炼API Key将下行替换为:api_key="sk-xxx",
4952
api_key=self.api_key,
50-
base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
53+
base_url=self.api_url,
5154
)
5255

5356
base64_audio = base64.b64encode(audio_file.read()).decode("utf-8")
5457

5558
completion = client.chat.completions.create(
56-
model="qwen-omni-turbo-0119",
59+
model=self.model,
5760
messages=[
5861
{
5962
"role": "user",
@@ -71,16 +74,15 @@ def speech_to_text(self, audio_file):
7174
],
7275
# 设置输出数据的模态,当前支持两种:["text","audio"]、["text"]
7376
modalities=["text"],
74-
audio={"voice": "Cherry", "format": "mp3"},
7577
# stream 必须设置为 True,否则会报错
7678
stream=True,
7779
stream_options={"include_usage": True},
7880
)
7981
result = []
8082
for chunk in completion:
81-
if chunk.choices and hasattr(chunk.choices[0].delta, 'audio'):
82-
transcript = chunk.choices[0].delta.audio.get('transcript')
83-
result.append(transcript)
83+
if chunk.choices and hasattr(chunk.choices[0].delta, 'content'):
84+
content = chunk.choices[0].delta.content
85+
result.append(content)
8486
return "".join(result)
8587

8688
except Exception as err:

apps/models_provider/impl/aliyun_bai_lian_model_provider/model/stt.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,6 @@ def new_instance(model_type, model_name, model_credential: Dict[str, object], **
3030
optional_params['max_tokens'] = model_kwargs['max_tokens']
3131
if 'temperature' in model_kwargs and model_kwargs['temperature'] is not None:
3232
optional_params['temperature'] = model_kwargs['temperature']
33-
if model_name == 'qwen-omni-turbo':
34-
optional_params['streaming'] = True
3533
return AliyunBaiLianSpeechToText(
3634
model=model_name,
3735
api_key=model_credential.get('api_key'),

0 commit comments

Comments
 (0)