-
Notifications
You must be signed in to change notification settings - Fork 2.6k
feat: Interface with Qwen Omni speech to text model #3865
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,72 @@ | ||
| # coding=utf-8 | ||
| import traceback | ||
| from typing import Dict, Any | ||
|
|
||
| from common import forms | ||
| from common.exception.app_exception import AppApiException | ||
| from common.forms import BaseForm, PasswordInputField, TooltipLabel | ||
| from models_provider.base_model_provider import BaseModelCredential, ValidCode | ||
| from django.utils.translation import gettext as _ | ||
|
|
||
class AliyunBaiLianOmiSTTModelParams(BaseForm):
    """Parameter form for the Aliyun BaiLian Omni speech-to-text model.

    Declares a single required text field, ``CueWord``, pre-filled with a
    default Chinese prompt.
    """

    CueWord = forms.TextInputField(
        TooltipLabel(
            _('CueWord'),
            _('If not passed, the default value is What is this audio saying? Only answer the audio content'),
        ),
        required=True,
        default_value='这段音频在说什么,只回答音频的内容',
    )
|
|
||
|
|
||
class AliyunBaiLianOmiSTTModelCredential(BaseForm, BaseModelCredential):
    """Credential form and validator for the Aliyun BaiLian Omni STT model."""

    api_key = PasswordInputField("API key", required=True)

    def is_valid(self,
                 model_type: str,
                 model_name: str,
                 model_credential: Dict[str, Any],
                 model_params: Dict[str, Any],
                 provider,
                 raise_exception: bool = False
                 ) -> bool:
        """Check that the credential can be used to build a model.

        Args:
            model_type: Model type value; must match one of the entries
                returned by ``provider.get_model_type_list()``.
            model_name: Name forwarded to ``provider.get_model``.
            model_credential: Credential mapping; must contain ``api_key``.
            model_params: Accepted for interface compatibility; not inspected
                by this method.
            provider: Object exposing ``get_model_type_list`` and
                ``get_model``.
            raise_exception: When true, missing keys and model construction
                failures raise instead of returning ``False``.

        Returns:
            ``True`` when every check passes, ``False`` on a failure that was
            not raised.

        Raises:
            AppApiException: Always for an unsupported model type; for other
                failures only when ``raise_exception`` is true, or when
                ``provider.get_model`` itself raised an ``AppApiException``.
        """
        # An unsupported model type is reported regardless of raise_exception.
        for type_entry in provider.get_model_type_list():
            if type_entry.get('value') == model_type:
                break
        else:
            raise AppApiException(
                ValidCode.valid_error.value,
                _('{model_type} Model type is not supported').format(model_type=model_type)
            )

        # Only the first missing key is reported.
        missing_key = next(
            (name for name in ['api_key'] if name not in model_credential),
            None,
        )
        if missing_key is not None:
            if not raise_exception:
                return False
            raise AppApiException(
                ValidCode.valid_error.value,
                _('{key} is required').format(key=missing_key)
            )

        try:
            provider.get_model(model_type, model_name, model_credential)
        except AppApiException:
            # Application-level errors are passed through untouched.
            traceback.print_exc()
            raise
        except Exception as err:
            traceback.print_exc()
            if not raise_exception:
                return False
            raise AppApiException(
                ValidCode.valid_error.value,
                _('Verification failed, please check whether the parameters are correct: {error}').format(error=str(err))
            )
        return True

    def encryption_dict(self, model: Dict[str, object]) -> Dict[str, object]:
        """Return a copy of ``model`` whose ``api_key`` has been passed
        through the inherited ``encryption`` helper.

        A missing ``api_key`` is treated as an empty string.
        """
        protected = dict(model)
        protected['api_key'] = super().encryption(model.get('api_key', ''))
        return protected

    def get_model_params_setting_form(self, model_name):
        """Return a new ``AliyunBaiLianOmiSTTModelParams`` form.

        ``model_name`` is not used to select the form.
        """
        return AliyunBaiLianOmiSTTModelParams()
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The provided code has several issues that need to be addressed before it can be used:
Here's a revised version of the code with these improvements:
# coding=utf-8
import traceback
from typing import Dict, Any
from common.forms import BaseForm, PasswordInputField, TooltipLabel
from models_provider.base_model_provider import BaseModelCredential, ValidCode
from django.utils.translation import gettext_lazy as _
class AliyunBaiLianOmiSTTModelParams(BaseForm):
cue_word = forms.TextInputField(
tooltip_label=_("CueWord"),
help_text="If not passed, the default value is " \
"'What is this audio saying?' only reply the audio content.",
required=True,
default_value=_('这段音频在说什么,只回答音频的内容'),
)
class AliyunBaiLianOmiSTTModelCredential(BaseForm, BaseModelCredential):
api_key = PasswordInputField(label='API key', required=True)
def is_valid(
self,
model_type: str,
model_name: str,
model_credential: Dict[str, Any],
model_params: Dict[str, Any],
provider,
raise_exception: bool = False
) -> bool:
model_type_list = provider.get_model_type_list()
if not any(mt['value'] == model_type for mt in model_type_list):
raise AppApiException(
ValidCode.valid_error.value,
_(f'{model_type} Model type is not supported')
)
required_keys = ['api_key']
missing_keys = [key for key in required_keys if key not in model_credential]
if missing_keys:
if raise_exception:
raise AppApiException(
ValidCode.valid_error.value,
_(f'missing keys: {missing_keys}').format(keys=', '.join(missing_keys))
)
return False
try:
model = provider.get_model(model_type, model_name, model_credential)
except Exception as e:
traceback.print_exc()
if isinstance(e, AppApiException):
raise e
else:
error_message = _('Verification failed.') if raise_exception else (
f'Verification failed, please check whether the '
f'parameters are correct: {str(e)}'
)
raise AppApiException(ValidCode.valid_error.value, error_message)
return True
def encrypt_dict(self, model: Dict[str, Any]) -> Dict[str, Any]:
encrypted_api_key = super().encrypt_data(model.get('api_key', '').strip())
return {
**model,
'api_key': encrypted_api_key
}
def get_model_params_setting_form(self, model_name) -> BaseForm:
return AliyunBaiLianOmiSTTModelParams()
Key Changes:
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,87 @@ | ||
| import base64 | ||
| import os | ||
| import traceback | ||
| from typing import Dict | ||
|
|
||
| from openai import OpenAI | ||
|
|
||
| from common.utils.logger import maxkb_logger | ||
| from models_provider.base_model_provider import MaxKBBaseModel | ||
| from models_provider.impl.base_stt import BaseSpeechToText | ||
|
|
||
|
|
||
class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
    """Speech-to-text adapter for Qwen-Omni models.

    Audio is base64-encoded and sent, together with a text prompt, to
    DashScope's OpenAI-compatible chat-completions endpoint; the streamed
    reply is concatenated into the transcript.
    """

    api_key: str
    model: str
    params: dict

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.api_key = kwargs.get('api_key')
        self.model = kwargs.get('model')
        self.params = kwargs.get('params')

    @staticmethod
    def is_cache_model():
        """Report that instances of this model are not cached."""
        return False

    @staticmethod
    def new_instance(model_type, model_name, model_credential: Dict[str, object], **model_kwargs):
        """Build an instance from a model name, a credential mapping and
        extra keyword arguments.

        ``model_kwargs`` is passed both as ``params`` and as individual
        keyword arguments. Keys that would collide with the explicit
        ``model`` / ``api_key`` / ``params`` arguments are left out of the
        keyword expansion so they cannot raise a duplicate-argument
        ``TypeError``; they remain available through ``params``.
        """
        reserved = ('model', 'api_key', 'params')
        extra_kwargs = {
            key: value for key, value in model_kwargs.items()
            if key not in reserved
        }
        return AliyunBaiLianOmiSpeechToText(
            model=model_name,
            api_key=model_credential.get('api_key'),
            params=model_kwargs,
            **extra_kwargs
        )

    def check_auth(self):
        """Transcribe the bundled sample clip to verify the credential.

        Unlike ``speech_to_text`` this does not swallow errors: any failure
        raised by the API call propagates, so an invalid key is actually
        detected.
        """
        cwd = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(cwd, 'iat_mp3_16k.mp3'), 'rb') as audio_file:
            self._transcribe(audio_file)

    def speech_to_text(self, audio_file):
        """Transcribe ``audio_file`` (a binary file-like object).

        Returns:
            The transcript as a string, or ``None`` when the request failed;
            failures are logged rather than raised.
        """
        try:
            return self._transcribe(audio_file)
        except Exception as err:
            maxkb_logger.error(f":Error: {str(err)}: {traceback.format_exc()}")
            return None

    def _transcribe(self, audio_file):
        """Send the audio to the model and return the concatenated text.

        Raises whatever the OpenAI client raises; callers decide whether to
        propagate or log.
        """
        # Same prompt the parameter form advertises as its default.
        default_prompt = '这段音频在说什么,只回答音频的内容'
        prompt = (self.params or {}).get('CueWord') or default_prompt

        client = OpenAI(
            api_key=self.api_key,
            base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
        )

        base64_audio = base64.b64encode(audio_file.read()).decode("utf-8")

        completion = client.chat.completions.create(
            # Honour the configured model; fall back to the previous
            # hard-coded name when none was supplied.
            model=self.model or "qwen-omni-turbo-0119",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_audio",
                            "input_audio": {
                                "data": f"data:;base64,{base64_audio}",
                                "format": "mp3",
                            },
                        },
                        {"type": "text", "text": prompt},
                    ],
                },
            ],
            # Output modalities; currently two are supported:
            # ["text", "audio"] and ["text"].
            modalities=["text"],
            audio={"voice": "Cherry", "format": "mp3"},
            # stream must be True, otherwise the API reports an error.
            stream=True,
            stream_options={"include_usage": True},
        )

        pieces = []
        for chunk in completion:
            if not chunk.choices:
                # e.g. the trailing usage-only chunk.
                continue
            delta = chunk.choices[0].delta
            audio = getattr(delta, 'audio', None)
            if isinstance(audio, dict):
                text = audio.get('transcript')
            else:
                text = getattr(audio, 'transcript', None)
            if not text:
                # NOTE(review): with text-only output the service is expected
                # to deliver text in delta.content - confirm against the
                # DashScope Qwen-Omni documentation.
                text = getattr(delta, 'content', None)
            if text:
                pieces.append(text)
        return "".join(pieces)
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The code provided appears to be an implementation of a speech-to-text (STT) service using the Qwen Omni Turbo model from Alibaba Cloud DashScope. However, there are several areas that need improvement:
Here's a revised version of your code with some minor adjustments:
import base64
import os
from typing import Dict
from openai import OpenAI
from common.utils.logger import maxkb_logger
def load_audio(file_path):
"""Load and encode an audio file to base64."""
with open(file_path, 'rb') as audio_file:
return base64.b64encode(audio_file.read()).decode("utf-8")
class AliyunBaiLianOmiSpeechToText(OpenAI): # Removed unnecessary inheritance
api_key: str
model: str
params: dict
def __init__(self, **kwargs):
super().__init__()
self.api_key = kwargs.get('api_key')
self.model = kwargs.get('model')
self.params = kwargs.get('params')
@staticmethod
def new_instance(model_type, model_name, model_credential: Dict[str, object], **model_kwargs):
return AliyunBaiLianOmiSpeechToText(
model=model_name,
api_key=model_credential.get('api_key'),
params= model_kwargs,
**model_kwargs
)
def check_auth(self):
try:
base64_audio = load_audio(f'iat_mp3_16k.mp3')
response = self.completion.create(
engine="qwen-omni-turbo",
messages=[
{
"role": "user",
"content": [
{
"type": "input_audio",
"input_audio": {
"data": f"data:;base64,{base64_audio}",
"format": "mp3",
}
},
{"type": "text", "text": self.params.get('CueWord')}
]
}
],
modalities=["text"],
audio={"voice": "Cherry", "format": "mp3"},
stream=True,
stream_options={"include_usage": True}
)
result = []
for choice in response.choices:
if hasattr(choice.delta, 'audio'):
transcript = choice.delta.audio.get('transcript')
result.append(transcript)
return "".join(result)
except Exception as err:
maxkb_logger.info(f"{err}")
# Example usage
if __name__ == "__main__":
api_key = os.getenv('OPENAI_API_KEY')
credential = {'api_key': api_key}
stt_service = AliyunBaiLianOmiSpeechToText.new_instance(
"aliyun-bai-lian-omi-speech-to-text",
"<your-model-name>",
credential
)
result = stt_service.check_auth()
print(result)
Key changes include:
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are no significant irregularities or potential issues with the provided code snippet. The changes seem to add new credentials and models from the Aliyun BaiLian platform, which is appropriate given that the original list includes related services like TTI (Text To Image) and STT (Speech To Text). Here are some general suggestions for optimization:
Consistency: Ensure that all added classes follow the same pattern to maintain consistency within the project.
Error Handling: Consider adding error handling logic around the instantiation of credentials and models to manage exceptions gracefully.
Documentation: Although not shown here, it would be beneficial to document each class thoroughly, explaining its purpose and usage.
Performance Optimization: Depending on the application's requirements, consider optimizing memory usage or processing speed if necessary.
Overall, the additions look well-integrated into the existing structure, enhancing functionality without introducing major bugs or performance bottlenecks.