Merged
9 changes: 9 additions & 0 deletions apps/locales/en_US/LC_MESSAGES/django.po
@@ -8648,4 +8648,13 @@ msgid "Multiple dialects, supporting 23 dialects"
msgstr ""

msgid "This interface is used to recognize short audio files within 60 seconds. Supports Mandarin Chinese, English, Cantonese, Japanese, Vietnamese, Malay, Indonesian, Filipino, Thai, Portuguese, Turkish, Arabic, Hindi, French, German, and 23 Chinese dialects."
msgstr ""

msgid "CueWord"
msgstr ""

msgid "If not passed, the default value is What is this audio saying? Only answer the audio content"
msgstr ""

msgid "The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text."
msgstr ""
11 changes: 10 additions & 1 deletion apps/locales/zh_CN/LC_MESSAGES/django.po
@@ -8774,4 +8774,13 @@ msgid "Multiple dialects, supporting 23 dialects"
msgstr "多种方言,支持 23 种方言"

msgid "This interface is used to recognize short audio files within 60 seconds. Supports Mandarin Chinese, English, Cantonese, Japanese, Vietnamese, Malay, Indonesian, Filipino, Thai, Portuguese, Turkish, Arabic, Hindi, French, German, and 23 Chinese dialects."
msgstr "本接口用于识别 60 秒之内的短音频文件。支持中文普通话、英语、粤语、日语、越南语、马来语、印度尼西亚语、菲律宾语、泰语、葡萄牙语、土耳其语、阿拉伯语、印地语、法语、德语及 23 种汉语方言。"
msgstr "本接口用于识别 60 秒之内的短音频文件。支持中文普通话、英语、粤语、日语、越南语、马来语、印度尼西亚语、菲律宾语、泰语、葡萄牙语、土耳其语、阿拉伯语、印地语、法语、德语及 23 种汉语方言。"

msgid "CueWord"
msgstr "提示词"

msgid "If not passed, the default value is What is this audio saying? Only answer the audio content"
msgstr "如果未传递,默认值为 这段音频在说什么,只回答音频的内容"

msgid "The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text."
msgstr "Qwen-Omni 系列模型支持输入多种模态的数据,包括视频、音频、图片、文本,并输出音频与文本"
11 changes: 10 additions & 1 deletion apps/locales/zh_Hant/LC_MESSAGES/django.po
@@ -8774,4 +8774,13 @@ msgid "Multiple dialects, supporting 23 dialects"
msgstr "多種方言,支持 23 種方言"

msgid "This interface is used to recognize short audio files within 60 seconds. Supports Mandarin Chinese, English, Cantonese, Japanese, Vietnamese, Malay, Indonesian, Filipino, Thai, Portuguese, Turkish, Arabic, Hindi, French, German, and 23 Chinese dialects."
msgstr "本介面用於識別 60 秒之內的短音頻文件。支援中文普通話、英語、粵語、日語、越南語、馬來語、印度尼西亞語、菲律賓語、泰語、葡萄牙語、土耳其語、阿拉伯語、印地語、法語、德語及 23 種漢語方言。"
msgstr "本介面用於識別 60 秒之內的短音頻文件。支援中文普通話、英語、粵語、日語、越南語、馬來語、印度尼西亞語、菲律賓語、泰語、葡萄牙語、土耳其語、阿拉伯語、印地語、法語、德語及 23 種漢語方言。"

msgid "CueWord"
msgstr "提示詞"

msgid "If not passed, the default value is What is this audio saying? Only answer the audio content"
msgstr "如果未傳遞,預設值為這段音訊在說什麼,只回答音訊的內容"

msgid "The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text."
msgstr "Qwen-Omni系列模型支持輸入多種模態的數據,包括視頻、音訊、圖片、文字,並輸出音訊與文字"
@@ -15,6 +15,7 @@
    AliyunBaiLianEmbeddingCredential
from models_provider.impl.aliyun_bai_lian_model_provider.credential.image import QwenVLModelCredential
from models_provider.impl.aliyun_bai_lian_model_provider.credential.llm import BaiLianLLMModelCredential
from models_provider.impl.aliyun_bai_lian_model_provider.credential.omi_stt import AliyunBaiLianOmiSTTModelCredential
from models_provider.impl.aliyun_bai_lian_model_provider.credential.reranker import \
    AliyunBaiLianRerankerCredential
from models_provider.impl.aliyun_bai_lian_model_provider.credential.stt import AliyunBaiLianSTTModelCredential
@@ -23,6 +24,7 @@
from models_provider.impl.aliyun_bai_lian_model_provider.model.embedding import AliyunBaiLianEmbedding
from models_provider.impl.aliyun_bai_lian_model_provider.model.image import QwenVLChatModel
from models_provider.impl.aliyun_bai_lian_model_provider.model.llm import BaiLianChatModel
from models_provider.impl.aliyun_bai_lian_model_provider.model.omi_stt import AliyunBaiLianOmiSpeechToText
from models_provider.impl.aliyun_bai_lian_model_provider.model.reranker import AliyunBaiLianReranker
from models_provider.impl.aliyun_bai_lian_model_provider.model.stt import AliyunBaiLianSpeechToText
from models_provider.impl.aliyun_bai_lian_model_provider.model.tti import QwenTextToImageModel
@@ -33,6 +35,7 @@
aliyun_bai_lian_model_credential = AliyunBaiLianRerankerCredential()
aliyun_bai_lian_tts_model_credential = AliyunBaiLianTTSModelCredential()
aliyun_bai_lian_stt_model_credential = AliyunBaiLianSTTModelCredential()
aliyun_bai_lian_omi_stt_model_credential = AliyunBaiLianOmiSTTModelCredential()
aliyun_bai_lian_embedding_model_credential = AliyunBaiLianEmbeddingCredential()
aliyun_bai_lian_llm_model_credential = BaiLianLLMModelCredential()
qwenvl_model_credential = QwenVLModelCredential()
@@ -73,7 +76,10 @@
    ModelInfo('qwen-plus', '', ModelTypeConst.LLM, aliyun_bai_lian_llm_model_credential,
              BaiLianChatModel),
    ModelInfo('qwen-max', '', ModelTypeConst.LLM, aliyun_bai_lian_llm_model_credential,
              BaiLianChatModel),
    ModelInfo('qwen-omni-turbo',
              _('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'),
              ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText),
]

module_info_vl_list = [
Contributor Author:

There are no significant irregularities or potential issues with the provided code snippet. The changes add a new credential and model for the Aliyun Bai Lian platform, which fits naturally alongside the existing entries for related services such as TTI (Text to Image) and STT (Speech to Text). Here are some general suggestions for optimization:

  1. Consistency: Ensure that all added classes follow the same pattern to maintain consistency within the project.

  2. Error Handling: Consider adding error handling logic around the instantiation of credentials and models to manage exceptions gracefully (a minimal sketch follows this comment).

  3. Documentation: Although not shown here, it would be beneficial to document each class thoroughly, explaining its purpose and usage.

  4. Performance Optimization: Depending on the application's requirements, consider optimizing memory usage or processing speed if necessary.

Overall, the additions look well-integrated into the existing structure, enhancing functionality without introducing major bugs or performance bottlenecks.
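
As a minimal sketch of suggestion 2 above — assuming the module layout shown in this diff; safe_instantiate is a hypothetical helper, not part of this PR:

import traceback

from models_provider.impl.aliyun_bai_lian_model_provider.credential.omi_stt import \
    AliyunBaiLianOmiSTTModelCredential


def safe_instantiate(factory, fallback=None):
    # Guard instantiation so one misconfigured credential class does not
    # break the whole provider module at import time.
    try:
        return factory()
    except Exception:
        traceback.print_exc()
        return fallback


aliyun_bai_lian_omi_stt_model_credential = safe_instantiate(AliyunBaiLianOmiSTTModelCredential)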

@@ -0,0 +1,72 @@
# coding=utf-8
import traceback
from typing import Dict, Any

from common import forms
from common.exception.app_exception import AppApiException
from common.forms import BaseForm, PasswordInputField, TooltipLabel
from models_provider.base_model_provider import BaseModelCredential, ValidCode
from django.utils.translation import gettext as _


class AliyunBaiLianOmiSTTModelParams(BaseForm):
    CueWord = forms.TextInputField(
        TooltipLabel(_('CueWord'),
                     _('If not passed, the default value is What is this audio saying? Only answer the audio content')),
        required=True,
        default_value='这段音频在说什么,只回答音频的内容',
    )


class AliyunBaiLianOmiSTTModelCredential(BaseForm, BaseModelCredential):
    api_key = PasswordInputField("API key", required=True)

    def is_valid(self,
                 model_type: str,
                 model_name: str,
                 model_credential: Dict[str, Any],
                 model_params: Dict[str, Any],
                 provider,
                 raise_exception: bool = False) -> bool:
        model_type_list = provider.get_model_type_list()
        if not any(mt.get('value') == model_type for mt in model_type_list):
            raise AppApiException(
                ValidCode.valid_error.value,
                _('{model_type} Model type is not supported').format(model_type=model_type)
            )

        required_keys = ['api_key']
        for key in required_keys:
            if key not in model_credential:
                if raise_exception:
                    raise AppApiException(
                        ValidCode.valid_error.value,
                        _('{key} is required').format(key=key)
                    )
                return False

        try:
            model = provider.get_model(model_type, model_name, model_credential)
        except Exception as e:
            traceback.print_exc()
            if isinstance(e, AppApiException):
                raise e
            if raise_exception:
                raise AppApiException(
                    ValidCode.valid_error.value,
                    _('Verification failed, please check whether the parameters are correct: {error}').format(error=str(e))
                )
            return False
        return True

    def encryption_dict(self, model: Dict[str, object]) -> Dict[str, object]:
        return {
            **model,
            'api_key': super().encryption(model.get('api_key', ''))
        }

    def get_model_params_setting_form(self, model_name):
        return AliyunBaiLianOmiSTTModelParams()
Contributor Author:

The provided code works, but several points should be addressed before merging:

  1. Formatting: blank lines and spacing are inconsistent (for example, the extra blank lines around encryption_dict and get_model_params_setting_form), which hurts readability.

  2. Naming consistency: the form field CueWord does not follow PEP 8 snake_case; note, however, that the model implementation reads params.get('CueWord'), so any rename must be applied in both files.

  3. Unused parameter: is_valid accepts model_params but never uses it; if the base-class interface requires it, that should at least be documented.

  4. Missing-key reporting: the required-keys loop reports only the first missing key; collecting them all gives a clearer error message.

  5. Error handling: catch AppApiException separately so it propagates unchanged, instead of relying on an isinstance check inside a generic except Exception.

  6. Translation strings: keep gettext strings as plain .format templates, consistent with the rest of the codebase; f-strings inside _() break message extraction.

Here's a revised version of the code with these improvements:

# coding=utf-8
import traceback
from typing import Dict, Any

from common import forms
from common.exception.app_exception import AppApiException
from common.forms import BaseForm, PasswordInputField, TooltipLabel
from models_provider.base_model_provider import BaseModelCredential, ValidCode
from django.utils.translation import gettext_lazy as _


class AliyunBaiLianOmiSTTModelParams(BaseForm):
    # The model implementation reads this value as params.get('CueWord'),
    # so the field name must stay in sync with that key.
    CueWord = forms.TextInputField(
        TooltipLabel(_('CueWord'),
                     _('If not passed, the default value is What is this audio saying? Only answer the audio content')),
        required=True,
        default_value='这段音频在说什么,只回答音频的内容',
    )


class AliyunBaiLianOmiSTTModelCredential(BaseForm, BaseModelCredential):
    api_key = PasswordInputField('API key', required=True)

    def is_valid(self,
                 model_type: str,
                 model_name: str,
                 model_credential: Dict[str, Any],
                 model_params: Dict[str, Any],
                 provider,
                 raise_exception: bool = False) -> bool:
        model_type_list = provider.get_model_type_list()
        if not any(mt.get('value') == model_type for mt in model_type_list):
            raise AppApiException(
                ValidCode.valid_error.value,
                _('{model_type} Model type is not supported').format(model_type=model_type)
            )

        # Collect every missing credential key in one pass for a clearer error.
        required_keys = ['api_key']
        missing_keys = [key for key in required_keys if key not in model_credential]
        if missing_keys:
            if raise_exception:
                raise AppApiException(
                    ValidCode.valid_error.value,
                    _('{key} is required').format(key=', '.join(missing_keys))
                )
            return False

        try:
            provider.get_model(model_type, model_name, model_credential)
        except AppApiException:
            # Propagate validation errors from the provider unchanged.
            raise
        except Exception as e:
            traceback.print_exc()
            if raise_exception:
                raise AppApiException(
                    ValidCode.valid_error.value,
                    _('Verification failed, please check whether the parameters are correct: {error}').format(
                        error=str(e))
                )
            return False
        return True

    def encryption_dict(self, model: Dict[str, object]) -> Dict[str, object]:
        return {
            **model,
            'api_key': super().encryption(model.get('api_key', ''))
        }

    def get_model_params_setting_form(self, model_name):
        return AliyunBaiLianOmiSTTModelParams()

Key changes:

  • Collected all missing credential keys in one pass and reported them together.
  • Caught AppApiException separately so it propagates unchanged, and honored raise_exception on the generic failure path.
  • Kept gettext strings as plain .format templates instead of f-strings.
  • Kept the CueWord field name (the model reads it by that exact key) and documented the coupling in a comment.
  • Normalized blank lines and spacing.
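
For reference, a rough sketch of how the CueWord value reaches the model at call time, inferred from this diff (the form values appear to be passed through as model_kwargs and stored as params):

from models_provider.impl.aliyun_bai_lian_model_provider.model.omi_stt import \
    AliyunBaiLianOmiSpeechToText

# Hypothetical wiring based on new_instance in this PR: parameters collected
# by AliyunBaiLianOmiSTTModelParams arrive as model_kwargs and are kept on
# the instance as `params`.
model = AliyunBaiLianOmiSpeechToText.new_instance(
    'STT',
    'qwen-omni-turbo',
    {'api_key': 'sk-xxx'},
    CueWord='这段音频在说什么,只回答音频的内容',
)
assert model.params.get('CueWord') == '这段音频在说什么,只回答音频的内容'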

@@ -0,0 +1,87 @@
import base64
import os
import traceback
from typing import Dict

from openai import OpenAI

from common.utils.logger import maxkb_logger
from models_provider.base_model_provider import MaxKBBaseModel
from models_provider.impl.base_stt import BaseSpeechToText


class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
    api_key: str
    model: str
    params: dict

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.api_key = kwargs.get('api_key')
        self.model = kwargs.get('model')
        self.params = kwargs.get('params')

    @staticmethod
    def is_cache_model():
        return False

    @staticmethod
    def new_instance(model_type, model_name, model_credential: Dict[str, object], **model_kwargs):
        return AliyunBaiLianOmiSpeechToText(
            model=model_name,
            api_key=model_credential.get('api_key'),
            params=model_kwargs,
            **model_kwargs
        )

    def check_auth(self):
        cwd = os.path.dirname(os.path.abspath(__file__))
        with open(f'{cwd}/iat_mp3_16k.mp3', 'rb') as audio_file:
            self.speech_to_text(audio_file)

    def speech_to_text(self, audio_file):
        try:
            client = OpenAI(
                # If no environment variable is configured, replace the line below
                # with your Alibaba Cloud Bai Lian API key: api_key="sk-xxx"
                api_key=self.api_key,
                base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
            )

            base64_audio = base64.b64encode(audio_file.read()).decode("utf-8")

            completion = client.chat.completions.create(
                model="qwen-omni-turbo-0119",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "input_audio",
                                "input_audio": {
                                    "data": f"data:;base64,{base64_audio}",
                                    "format": "mp3",
                                },
                            },
                            {"type": "text", "text": self.params.get('CueWord')},
                        ],
                    },
                ],
                # Output modalities; currently ["text", "audio"] and ["text"] are supported
                modalities=["text"],
                audio={"voice": "Cherry", "format": "mp3"},
                # stream must be set to True, otherwise the API returns an error
                stream=True,
                stream_options={"include_usage": True},
            )
            result = []
            for chunk in completion:
                if chunk.choices and hasattr(chunk.choices[0].delta, 'audio'):
                    transcript = chunk.choices[0].delta.audio.get('transcript')
                    result.append(transcript)
            return "".join(result)

        except Exception as err:
            maxkb_logger.error(f"Error: {str(err)}: {traceback.format_exc()}")
Contributor Author:

The code implements a speech-to-text (STT) service on top of the Qwen Omni Turbo model, called through Alibaba Cloud DashScope's OpenAI-compatible endpoint. However, there are several areas that need improvement:

  1. Hardcoded model name: speech_to_text always requests "qwen-omni-turbo-0119" and ignores self.model, so the model name chosen at registration has no effect. It should pass model=self.model.

  2. Swallowed exceptions: speech_to_text logs errors and then implicitly returns None, so callers cannot distinguish a failed call from an empty transcript. Re-raise after logging.

  3. Unused audio settings: with modalities=["text"], the audio={"voice": "Cherry", "format": "mp3"} argument configures output that is never requested and can likely be dropped.

  4. Resource management: check_auth opens and reads the bundled MP3 on every call. That is acceptable for a one-off auth check, but worth keeping in mind if it is ever called repeatedly.

  5. Testing: ensure comprehensive testing of the methods involving network interactions and file handling to catch edge cases and ensure robustness.

Here's a revised version of the code with these adjustments:

import base64
import os
import traceback
from typing import Dict

from openai import OpenAI

from common.utils.logger import maxkb_logger
from models_provider.base_model_provider import MaxKBBaseModel
from models_provider.impl.base_stt import BaseSpeechToText


class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
    api_key: str
    model: str
    params: dict

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.api_key = kwargs.get('api_key')
        self.model = kwargs.get('model')
        self.params = kwargs.get('params')

    @staticmethod
    def is_cache_model():
        return False

    @staticmethod
    def new_instance(model_type, model_name, model_credential: Dict[str, object], **model_kwargs):
        return AliyunBaiLianOmiSpeechToText(
            model=model_name,
            api_key=model_credential.get('api_key'),
            params=model_kwargs,
            **model_kwargs
        )

    def check_auth(self):
        # Return the transcript so the auth check can actually be verified.
        cwd = os.path.dirname(os.path.abspath(__file__))
        with open(f'{cwd}/iat_mp3_16k.mp3', 'rb') as audio_file:
            return self.speech_to_text(audio_file)

    def speech_to_text(self, audio_file):
        client = OpenAI(
            api_key=self.api_key,
            base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
        )
        base64_audio = base64.b64encode(audio_file.read()).decode("utf-8")
        try:
            completion = client.chat.completions.create(
                # Use the configured model name instead of a hardcoded ID.
                model=self.model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "input_audio",
                                "input_audio": {
                                    "data": f"data:;base64,{base64_audio}",
                                    "format": "mp3",
                                },
                            },
                            {"type": "text", "text": self.params.get('CueWord')},
                        ],
                    },
                ],
                modalities=["text"],
                # stream must be True, otherwise the API returns an error
                stream=True,
                stream_options={"include_usage": True},
            )
            result = []
            for chunk in completion:
                if chunk.choices and hasattr(chunk.choices[0].delta, 'audio'):
                    result.append(chunk.choices[0].delta.audio.get('transcript'))
            return "".join(result)
        except Exception as err:
            maxkb_logger.error(f"Error: {err}: {traceback.format_exc()}")
            # Re-raise so callers can distinguish failure from an empty transcript.
            raise

Key changes include:

  • Used self.model instead of the hardcoded "qwen-omni-turbo-0119".
  • Logged with full traceback, then re-raised so failures surface to callers.
  • Dropped the unused audio voice/format argument for text-only output.
  • Returned the transcript from check_auth so the auth check can be verified.
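
A quick usage sketch against the revised class (the file path and environment variable name are placeholders):

import os

if __name__ == "__main__":
    credential = {'api_key': os.getenv('DASHSCOPE_API_KEY')}
    stt = AliyunBaiLianOmiSpeechToText.new_instance(
        'STT', 'qwen-omni-turbo', credential,
        CueWord='这段音频在说什么,只回答音频的内容',
    )
    with open('sample.mp3', 'rb') as audio_file:
        print(stt.speech_to_text(audio_file))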

2 changes: 2 additions & 0 deletions apps/models_provider/impl/tencent_model_provider/model/stt.py
@@ -1,6 +1,7 @@
import base64
import json
import os
import traceback
from typing import Dict

from tencentcloud.asr.v20190614 import asr_client, models
@@ -9,6 +10,7 @@
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile

from common.utils.logger import maxkb_logger
from models_provider.base_model_provider import MaxKBBaseModel
from models_provider.impl.base_stt import BaseSpeechToText
