Skip to content

Commit aa135a3

Browse files
authored
Add TTS to OpenAI_API_Compatible (langgenius#11071)
1 parent 044e7b6 commit aa135a3

File tree

6 files changed

+169
-7
lines changed

6 files changed

+169
-7
lines changed

api/core/model_runtime/model_providers/azure_openai/tts/tts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
1616
"""
17-
Model class for OpenAI Speech to text model.
17+
Model class for OpenAI text2speech model.
1818
"""
1919

2020
def _invoke(

api/core/model_runtime/model_providers/gitee_ai/tts/tts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
class GiteeAIText2SpeechModel(_CommonGiteeAI, TTSModel):
1212
"""
13-
Model class for OpenAI Speech to text model.
13+
Model class for OpenAI text2speech model.
1414
"""
1515

1616
def _invoke(

api/core/model_runtime/model_providers/openai/tts/tts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
1313
"""
14-
Model class for OpenAI Speech to text model.
14+
Model class for OpenAI text2speech model.
1515
"""
1616

1717
def _invoke(

api/core/model_runtime/model_providers/openai_api_compatible/openai_api_compatible.yaml

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ supported_model_types:
99
- text-embedding
1010
- speech2text
1111
- rerank
12+
- tts
1213
configurate_methods:
1314
- customizable-model
1415
model_credential_schema:
@@ -67,7 +68,7 @@ model_credential_schema:
6768
- variable: __model_type
6869
value: llm
6970
type: text-input
70-
default: '4096'
71+
default: "4096"
7172
placeholder:
7273
zh_Hans: 在此输入您的模型上下文长度
7374
en_US: Enter your Model context size
@@ -80,7 +81,7 @@ model_credential_schema:
8081
- variable: __model_type
8182
value: text-embedding
8283
type: text-input
83-
default: '4096'
84+
default: "4096"
8485
placeholder:
8586
zh_Hans: 在此输入您的模型上下文长度
8687
en_US: Enter your Model context size
@@ -93,7 +94,7 @@ model_credential_schema:
9394
- variable: __model_type
9495
value: rerank
9596
type: text-input
96-
default: '4096'
97+
default: "4096"
9798
placeholder:
9899
zh_Hans: 在此输入您的模型上下文长度
99100
en_US: Enter your Model context size
@@ -104,7 +105,7 @@ model_credential_schema:
104105
show_on:
105106
- variable: __model_type
106107
value: llm
107-
default: '4096'
108+
default: "4096"
108109
type: text-input
109110
- variable: function_calling_type
110111
show_on:
@@ -174,3 +175,19 @@ model_credential_schema:
174175
value: llm
175176
default: '\n\n'
176177
type: text-input
178+
- variable: voices
179+
show_on:
180+
- variable: __model_type
181+
value: tts
182+
label:
183+
en_US: Available Voices (comma-separated)
184+
zh_Hans: 可用声音(用英文逗号分隔)
185+
type: text-input
186+
required: false
187+
default: "alloy"
188+
placeholder:
189+
en_US: "alloy,echo,fable,onyx,nova,shimmer"
190+
zh_Hans: "alloy,echo,fable,onyx,nova,shimmer"
191+
help:
192+
en_US: "List voice names separated by commas. First voice will be used as default."
193+
zh_Hans: "用英文逗号分隔的声音列表。第一个声音将作为默认值。"

api/core/model_runtime/model_providers/openai_api_compatible/tts/__init__.py

Whitespace-only changes.
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
from collections.abc import Iterable
2+
from typing import Optional
3+
from urllib.parse import urljoin
4+
5+
import requests
6+
7+
from core.model_runtime.entities.common_entities import I18nObject
8+
from core.model_runtime.entities.model_entities import AIModelEntity, FetchFrom, ModelPropertyKey, ModelType
9+
from core.model_runtime.errors.invoke import InvokeBadRequestError
10+
from core.model_runtime.errors.validate import CredentialsValidateFailedError
11+
from core.model_runtime.model_providers.__base.tts_model import TTSModel
12+
from core.model_runtime.model_providers.openai_api_compatible._common import _CommonOaiApiCompat
13+
14+
15+
class OAICompatText2SpeechModel(_CommonOaiApiCompat, TTSModel):
    """
    Model class for OpenAI-compatible text2speech model.

    Sends text to the ``audio/speech`` route of a user-configured endpoint and
    streams the returned audio back to the caller.
    """

    def _invoke(
        self,
        model: str,
        tenant_id: str,
        credentials: dict,
        content_text: str,
        voice: str,
        user: Optional[str] = None,
    ) -> Iterable[bytes]:
        """
        Invoke TTS model

        :param model: model name
        :param tenant_id: user tenant id
        :param credentials: model credentials; ``endpoint_url`` is required,
            ``api_key`` is optional
        :param content_text: text content to be converted to speech
        :param voice: model voice/speaker
        :param user: unique user id (not sent to the endpoint)
        :return: audio data as bytes iterator
        :raises InvokeBadRequestError: if ``endpoint_url`` is missing or the
            endpoint answers with a non-200 status
        """
        # Fail with a clear message instead of an AttributeError on None.
        endpoint_url = credentials.get("endpoint_url")
        if not endpoint_url:
            raise InvokeBadRequestError("endpoint_url is required in credentials")

        # Set up headers with authentication if provided
        headers = {}
        if api_key := credentials.get("api_key"):
            headers["Authorization"] = f"Bearer {api_key}"

        # urljoin replaces the last path segment unless the base ends with "/".
        if not endpoint_url.endswith("/"):
            endpoint_url += "/"
        endpoint_url = urljoin(endpoint_url, "audio/speech")

        # Get audio format from model properties
        audio_format = self._get_model_audio_type(model, credentials)

        # Split text into chunks if needed based on word limit
        word_limit = self._get_model_word_limit(model, credentials)
        sentences = self._split_text_into_sentences(content_text, word_limit)

        for sentence in sentences:
            payload = {"model": model, "input": sentence, "voice": voice, "response_format": audio_format}

            # (connect, read) timeout so an unresponsive server cannot block forever.
            # The context manager releases the connection even when the consumer
            # stops iterating early or an error is raised mid-stream.
            with requests.post(
                endpoint_url, headers=headers, json=payload, stream=True, timeout=(10, 300)
            ) as response:
                if response.status_code != 200:
                    raise InvokeBadRequestError(response.text)

                # Stream the audio data, skipping keep-alive (empty) chunks
                for chunk in response.iter_content(chunk_size=4096):
                    if chunk:
                        yield chunk

    def validate_credentials(self, model: str, credentials: dict) -> None:
        """
        Validate model credentials

        Synthesizes a short test phrase with the default voice and checks that
        at least one chunk of audio comes back.

        :param model: model name
        :param credentials: model credentials
        :return:
        :raises CredentialsValidateFailedError: if the test request fails or
            returns no audio data
        """
        try:
            # Get default voice for validation
            voice = self._get_model_default_voice(model, credentials)

            audio_stream = self._invoke(
                model=model, tenant_id="validate", credentials=credentials, content_text="Test.", voice=voice
            )
            try:
                # Only the first chunk is needed to prove the endpoint works.
                first_chunk = next(iter(audio_stream), None)
            finally:
                # _invoke is a generator; closing it releases the HTTP response
                # it keeps open while suspended.
                close = getattr(audio_stream, "close", None)
                if callable(close):
                    close()
        except Exception as ex:
            raise CredentialsValidateFailedError(str(ex)) from ex

        if first_chunk is None:
            raise CredentialsValidateFailedError("The endpoint returned no audio data.")

    def get_customizable_model_schema(self, model: str, credentials: dict) -> Optional[AIModelEntity]:
        """
        Get customizable model schema

        Builds the schema from credentials: ``voices`` is a comma-separated list
        of voice names (the first one becomes the default voice), ``audio_type``
        defaults to ``mp3`` and ``word_limit`` defaults to 4096.

        :param model: model name
        :param credentials: model credentials
        :return: model schema for the TTS model
        """
        # A stored None must not crash .split(); treat it like an empty list.
        raw_voices = credentials.get("voices", "alloy") or ""

        # Use en-US for all voices; blank entries (e.g. "a,,b") are dropped.
        voices = [
            {"name": voice, "mode": voice, "language": "en-US"}
            for voice in (part.strip() for part in raw_voices.split(","))
            if voice
        ]

        # If no voices provided or all voices were empty strings, use 'alloy' as default
        if not voices:
            voices = [{"name": "Alloy", "mode": "alloy", "language": "en-US"}]

        return AIModelEntity(
            model=model,
            label=I18nObject(en_US=model),
            fetch_from=FetchFrom.CUSTOMIZABLE_MODEL,
            model_type=ModelType.TTS,
            model_properties={
                # `or` also covers None / empty values, not only missing keys.
                ModelPropertyKey.AUDIO_TYPE: credentials.get("audio_type") or "mp3",
                ModelPropertyKey.WORD_LIMIT: int(credentials.get("word_limit") or 4096),
                ModelPropertyKey.DEFAULT_VOICE: voices[0]["mode"],
                ModelPropertyKey.VOICES: voices,
            },
        )

    def get_tts_model_voices(self, model: str, credentials: dict, language: Optional[str] = None) -> list:
        """
        Override base get_tts_model_voices to handle customizable voices

        :param model: model name
        :param credentials: model credentials
        :param language: accepted for interface compatibility; not used for filtering
        :return: list of ``{"name": ..., "value": ...}`` dicts, one per voice
        :raises ValueError: if the schema carries no voices
        """
        model_schema = self.get_customizable_model_schema(model, credentials)

        if not model_schema or ModelPropertyKey.VOICES not in model_schema.model_properties:
            raise ValueError("this model does not support voice")

        voices = model_schema.model_properties[ModelPropertyKey.VOICES]

        # Always return all voices regardless of language
        return [{"name": d["name"], "value": d["mode"]} for d in voices]

0 commit comments

Comments
 (0)