Skip to content

Commit 2329cb1

Browse files
authored
feat(model/qwen-tts): add param language_type (#59)
1 parent b365901 commit 2329cb1

File tree

3 files changed

+37
-11
lines changed

3 files changed

+37
-11
lines changed

dashscope/aigc/multimodal_conversation.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ def call(
2828
api_key: str = None,
2929
workspace: str = None,
3030
text: str = None,
31+
voice: str = None,
32+
language_type: str = None,
3133
**kwargs
3234
) -> Union[MultiModalConversationResponse, Generator[
3335
MultiModalConversationResponse, None, None]]:
@@ -57,6 +59,9 @@ def call(
5759
[1]: https://help.aliyun.com/zh/dashscope/developer-reference/api-key-settings. # noqa E501
5860
workspace (str): The dashscope workspace id.
5961
text (str): The text to generate.
62+
voice (str): The voice name for qwen-tts, such as 'Cherry', 'Ethan', 'Sunny', or 'Dylan';
63+
see the full voice list at: https://help.aliyun.com/zh/model-studio/qwen-tts.
64+
language_type (str): The language of the synthesized speech; defaults to 'auto'. Applies to [qwen3-tts].
6065
**kwargs:
6166
stream(bool, `optional`): Enable server-sent events
6267
(ref: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events) # noqa E501
@@ -70,8 +75,6 @@ def call(
7075
tokens with top_p probability mass. So 0.1 means only
7176
the tokens comprising the top 10% probability mass are
7277
considered[qwen-turbo,bailian-v1].
73-
voice(string, `optional`): The voice name of qwen tts, include 'Cherry'/'Ethan'/'Sunny'/'Dylan' and so on,
74-
you can get the total voice list : https://help.aliyun.com/zh/model-studio/qwen-tts.
7578
top_k(float, `optional`):
7679
7780
@@ -99,6 +102,10 @@ def call(
99102

100103
if text is not None and text:
101104
input.update({'text': text})
105+
if voice is not None and voice:
106+
input.update({'voice': voice})
107+
if language_type is not None and language_type:
108+
input.update({'language_type': language_type})
102109
if msg_copy is not None:
103110
input.update({'messages': msg_copy})
104111
response = super().call(model=model,
@@ -160,6 +167,8 @@ async def call(
160167
api_key: str = None,
161168
workspace: str = None,
162169
text: str = None,
170+
voice: str = None,
171+
language_type: str = None,
163172
**kwargs
164173
) -> Union[MultiModalConversationResponse, Generator[
165174
MultiModalConversationResponse, None, None]]:
@@ -189,6 +198,9 @@ async def call(
189198
[1]: https://help.aliyun.com/zh/dashscope/developer-reference/api-key-settings. # noqa E501
190199
workspace (str): The dashscope workspace id.
191200
text (str): The text to generate.
201+
voice (str): The voice name for qwen-tts, such as 'Cherry', 'Ethan', 'Sunny', or 'Dylan';
202+
see the full voice list at: https://help.aliyun.com/zh/model-studio/qwen-tts.
203+
language_type (str): The language of the synthesized speech; defaults to 'auto'. Applies to [qwen3-tts].
192204
**kwargs:
193205
stream(bool, `optional`): Enable server-sent events
194206
(ref: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events) # noqa E501
@@ -202,8 +214,6 @@ async def call(
202214
tokens with top_p probability mass. So 0.1 means only
203215
the tokens comprising the top 10% probability mass are
204216
considered[qwen-turbo,bailian-v1].
205-
voice(string, `optional`): The voice name of qwen tts, include 'Cherry'/'Ethan'/'Sunny'/'Dylan' and so on,
206-
you can get the total voice list : https://help.aliyun.com/zh/model-studio/qwen-tts.
207217
top_k(float, `optional`):
208218
209219
Raises:
@@ -230,6 +240,10 @@ async def call(
230240

231241
if text is not None and text:
232242
input.update({'text': text})
243+
if voice is not None and voice:
244+
input.update({'voice': voice})
245+
if language_type is not None and language_type:
246+
input.update({'language_type': language_type})
233247
if msg_copy is not None:
234248
input.update({'messages': msg_copy})
235249
response = await super().call(model=model,

dashscope/audio/qwen_tts_realtime/qwen_tts_realtime.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ def update_session(self,
158158
response_format: AudioFormat = AudioFormat.
159159
PCM_24000HZ_MONO_16BIT,
160160
mode: str = 'server_commit',
161+
language_type: str = None,
161162
**kwargs) -> None:
162163
'''
163164
update session configuration, should be used before create response
@@ -170,13 +171,17 @@ def update_session(self,
170171
output audio format
171172
mode: str
172173
response mode, server_commit or commit
174+
language_type: str
175+
language of the synthesized audio; defaults to 'auto'
173176
'''
174177
self.config = {
175178
'voice': voice,
176179
'mode': mode,
177180
'response_format': response_format.format,
178181
'sample_rate': response_format.sample_rate,
179182
}
183+
if language_type is not None:
184+
self.config['language_type'] = language_type
180185
self.config.update(kwargs)
181186
self.__send_str(
182187
json.dumps({

samples/test_qwen_tts.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,26 @@
2020

2121
response = dashscope.MultiModalConversation.call(
2222
api_key=os.getenv('DASHSCOPE_API_KEY'),
23-
model="qwen-tts",
23+
model="qwen3-tts-flash",
2424
text="Today is a wonderful day to build something people love!",
2525
voice="Cherry",
26-
stream=use_stream
26+
stream=use_stream,
27+
language_type="zh"
2728
)
2829
if use_stream:
2930
# print the audio data in stream mode
3031
for chunk in response:
32+
if chunk.output is None:
33+
print(f"error: {chunk}")
34+
break
3135
audio = chunk.output.audio
32-
print("base64 audio data is: {}", chunk.output.audio.data)
36+
print(f"base64 audio data is: {chunk.output.audio.data}")
3337
if chunk.output.finish_reason == "stop":
34-
print("finish at: {} ", chunk.output.audio.expires_at)
38+
print(f"finish at: {chunk.output.audio.expires_at}")
3539
else:
36-
# print the audio url in non-stream mode
37-
print("synthesized audio url is: {}", response.output.audio.url)
38-
print("finish at: {} ", response.output.audio.expires_at)
40+
if response.output is None:
41+
print(f"error: {response}")
42+
else:
43+
# print the audio url in non-stream mode
44+
print(f"synthesized audio url is: {response.output.audio.url}")
45+
print(f"finish at: {response.output.audio.expires_at}")

0 commit comments

Comments (0)