Skip to content

Commit 2329cb1

Browse files
authored
feat(model/qwen-tts): add param language_type (#59)
1 parent b365901 commit 2329cb1

File tree

3 files changed

+37
-11
lines changed

3 files changed

+37
-11
lines changed

dashscope/aigc/multimodal_conversation.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ def call(
2828
api_key: str = None,
2929
workspace: str = None,
3030
text: str = None,
31+
voice: str = None,
32+
language_type: str = None,
3133
**kwargs
3234
) -> Union[MultiModalConversationResponse, Generator[
3335
MultiModalConversationResponse, None, None]]:
@@ -57,6 +59,9 @@ def call(
5759
[1]: https://help.aliyun.com/zh/dashscope/developer-reference/api-key-settings. # noqa E501
5860
workspace (str): The dashscope workspace id.
5961
text (str): The text to generate.
62+
voice (str): The voice name for qwen-tts, such as 'Cherry', 'Ethan', 'Sunny', or 'Dylan';
63+
see the full voice list at: https://help.aliyun.com/zh/model-studio/qwen-tts.
64+
language_type (str): The language of the synthesized speech; defaults to 'auto'. Applies to [qwen3-tts].
6065
**kwargs:
6166
stream(bool, `optional`): Enable server-sent events
6267
(ref: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events) # noqa E501
@@ -70,8 +75,6 @@ def call(
7075
tokens with top_p probability mass. So 0.1 means only
7176
the tokens comprising the top 10% probability mass are
7277
considered[qwen-turbo,bailian-v1].
73-
voice(string, `optional`): The voice name of qwen tts, include 'Cherry'/'Ethan'/'Sunny'/'Dylan' and so on,
74-
you can get the total voice list : https://help.aliyun.com/zh/model-studio/qwen-tts.
7578
top_k(float, `optional`):
7679
7780
@@ -99,6 +102,10 @@ def call(
99102

100103
if text is not None and text:
101104
input.update({'text': text})
105+
if voice is not None and voice:
106+
input.update({'voice': voice})
107+
if language_type is not None and language_type:
108+
input.update({'language_type': language_type})
102109
if msg_copy is not None:
103110
input.update({'messages': msg_copy})
104111
response = super().call(model=model,
@@ -160,6 +167,8 @@ async def call(
160167
api_key: str = None,
161168
workspace: str = None,
162169
text: str = None,
170+
voice: str = None,
171+
language_type: str = None,
163172
**kwargs
164173
) -> Union[MultiModalConversationResponse, Generator[
165174
MultiModalConversationResponse, None, None]]:
@@ -189,6 +198,9 @@ async def call(
189198
[1]: https://help.aliyun.com/zh/dashscope/developer-reference/api-key-settings. # noqa E501
190199
workspace (str): The dashscope workspace id.
191200
text (str): The text to generate.
201+
voice (str): The voice name for qwen-tts, such as 'Cherry', 'Ethan', 'Sunny', or 'Dylan';
202+
see the full voice list at: https://help.aliyun.com/zh/model-studio/qwen-tts.
203+
language_type (str): The language of the synthesized speech; defaults to 'auto'. Applies to [qwen3-tts].
192204
**kwargs:
193205
stream(bool, `optional`): Enable server-sent events
194206
(ref: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events) # noqa E501
@@ -202,8 +214,6 @@ async def call(
202214
tokens with top_p probability mass. So 0.1 means only
203215
the tokens comprising the top 10% probability mass are
204216
considered[qwen-turbo,bailian-v1].
205-
voice(string, `optional`): The voice name of qwen tts, include 'Cherry'/'Ethan'/'Sunny'/'Dylan' and so on,
206-
you can get the total voice list : https://help.aliyun.com/zh/model-studio/qwen-tts.
207217
top_k(float, `optional`):
208218
209219
Raises:
@@ -230,6 +240,10 @@ async def call(
230240

231241
if text is not None and text:
232242
input.update({'text': text})
243+
if voice is not None and voice:
244+
input.update({'voice': voice})
245+
if language_type is not None and language_type:
246+
input.update({'language_type': language_type})
233247
if msg_copy is not None:
234248
input.update({'messages': msg_copy})
235249
response = await super().call(model=model,

dashscope/audio/qwen_tts_realtime/qwen_tts_realtime.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ def update_session(self,
158158
response_format: AudioFormat = AudioFormat.
159159
PCM_24000HZ_MONO_16BIT,
160160
mode: str = 'server_commit',
161+
language_type: str = None,
161162
**kwargs) -> None:
162163
'''
163164
update session configuration, should be used before create response
@@ -170,13 +171,17 @@ def update_session(self,
170171
output audio format
171172
mode: str
172173
response mode, server_commit or commit
174+
language_type: str
175+
language of the synthesized audio; defaults to 'auto'
173176
'''
174177
self.config = {
175178
'voice': voice,
176179
'mode': mode,
177180
'response_format': response_format.format,
178181
'sample_rate': response_format.sample_rate,
179182
}
183+
if language_type is not None:
184+
self.config['language_type'] = language_type
180185
self.config.update(kwargs)
181186
self.__send_str(
182187
json.dumps({

samples/test_qwen_tts.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,26 @@
2020

2121
response = dashscope.MultiModalConversation.call(
2222
api_key=os.getenv('DASHSCOPE_API_KEY'),
23-
model="qwen-tts",
23+
model="qwen3-tts-flash",
2424
text="Today is a wonderful day to build something people love!",
2525
voice="Cherry",
26-
stream=use_stream
26+
stream=use_stream,
27+
language_type="zh"
2728
)
2829
if use_stream:
2930
# print the audio data in stream mode
3031
for chunk in response:
32+
if chunk.output is None:
33+
print(f"error: {chunk}")
34+
break
3135
audio = chunk.output.audio
32-
print("base64 audio data is: {}", chunk.output.audio.data)
36+
print(f"base64 audio data is: {chunk.output.audio.data}")
3337
if chunk.output.finish_reason == "stop":
34-
print("finish at: {} ", chunk.output.audio.expires_at)
38+
print(f"finish at: {chunk.output.audio.expires_at}")
3539
else:
36-
# print the audio url in non-stream mode
37-
print("synthesized audio url is: {}", response.output.audio.url)
38-
print("finish at: {} ", response.output.audio.expires_at)
40+
if response.output is None:
41+
print(f"error: {response}")
42+
else:
43+
# print the audio url in non-stream mode
44+
print(f"synthesized audio url is: {response.output.audio.url}")
45+
print(f"finish at: {response.output.audio.expires_at}")

0 commit comments

Comments (0)