Skip to content

Commit 6cf0d98

Browse files
DashScope: added support for qwen3-tts (#608)
* feat: added support for qwen3-tts * fix: issues in qwen3.5 tool call tests. The dashscope gateway was updated yesterday, implementing stronger validation of input parameters for tool calls, which caused existing test cases in qwen3.5 to fail. The cause was not identified initially. To avoid impacting qwen3-tts testing, qwen3.5 tool call tests were temporarily suspended. A fix has now been made, restoring tool call tests for qwen3.5.
1 parent d76e384 commit 6cf0d98

File tree

14 files changed

+439
-20
lines changed

14 files changed

+439
-20
lines changed

models/langchain4j-community-dashscope/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
</licenses>
2222

2323
<properties>
24-
<dashscope.version>2.22.12</dashscope.version>
24+
<dashscope.version>2.22.13</dashscope.version>
2525
</properties>
2626

2727
<dependencies>

models/langchain4j-community-dashscope/src/main/java/dev/langchain4j/community/model/dashscope/QwenChatModel.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ protected QwenChatModel(
108108
.enableSearch(getOrDefault(enableSearch, qwenParameters.enableSearch()))
109109
.searchOptions(qwenParameters.searchOptions())
110110
.asrOptions(qwenParameters.asrOptions())
111+
.ttsOptions(qwenParameters.ttsOptions())
111112
.translationOptions(qwenParameters.translationOptions())
112113
.vlHighResolutionImages(qwenParameters.vlHighResolutionImages())
113114
.isMultimodalModel(getOrDefault(isMultimodalModel, qwenParameters.isMultimodalModel()))

models/langchain4j-community-dashscope/src/main/java/dev/langchain4j/community/model/dashscope/QwenChatRequestParameters.java

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,11 @@ public class QwenChatRequestParameters extends DefaultChatRequestParameters {
3333
* See <a href="https://modelstudio.console.alibabacloud.com/ap-southeast-1/?tab=api#/api/?type=model&url=2986952">Qwen-ASR API reference</a> for more details.
3434
*/
3535
private final AsrOptions asrOptions;
36+
/**
37+
* Parameters for text-to-speech (TTS).
38+
* See <a href="https://www.alibabacloud.com/help/en/model-studio/qwen-tts">Speech synthesis - Qwen</a> for more details.
39+
*/
40+
private final TtsOptions ttsOptions;
3641
/**
3742
* The translation parameters you need to configure when you use the translation
3843
* models.
@@ -122,6 +127,7 @@ protected QwenChatRequestParameters(Builder builder) {
122127
this.enableSearch = builder.enableSearch;
123128
this.searchOptions = builder.searchOptions;
124129
this.asrOptions = builder.asrOptions;
130+
this.ttsOptions = builder.ttsOptions;
125131
this.translationOptions = builder.translationOptions;
126132
this.vlHighResolutionImages = builder.vlHighResolutionImages;
127133
this.isMultimodalModel = builder.isMultimodalModel;
@@ -155,6 +161,10 @@ public AsrOptions asrOptions() {
155161
return asrOptions;
156162
}
157163

164+
public TtsOptions ttsOptions() {
165+
return ttsOptions;
166+
}
167+
158168
public TranslationOptions translationOptions() {
159169
return translationOptions;
160170
}
@@ -236,6 +246,7 @@ public boolean equals(Object o) {
236246
&& Objects.equals(enableSearch, that.enableSearch)
237247
&& Objects.equals(searchOptions, that.searchOptions)
238248
&& Objects.equals(asrOptions, that.asrOptions)
249+
&& Objects.equals(ttsOptions, that.ttsOptions)
239250
&& Objects.equals(translationOptions, that.translationOptions)
240251
&& Objects.equals(vlHighResolutionImages, that.vlHighResolutionImages)
241252
&& Objects.equals(isMultimodalModel, that.isMultimodalModel)
@@ -261,6 +272,7 @@ public int hashCode() {
261272
enableSearch,
262273
searchOptions,
263274
asrOptions,
275+
ttsOptions,
264276
translationOptions,
265277
vlHighResolutionImages,
266278
isMultimodalModel,
@@ -295,7 +307,8 @@ public String toString() {
295307
+ seed + ", enableSearch="
296308
+ enableSearch + ", searchOptions="
297309
+ searchOptions + ", asrOptions="
298-
+ asrOptions + ", translationOptions="
310+
+ asrOptions + ", ttsOptions="
311+
+ ttsOptions + ", translationOptions="
299312
+ translationOptions + ", vlHighResolutionImages="
300313
+ vlHighResolutionImages + ", isMultimodalModel="
301314
+ isMultimodalModel + ", supportIncrementalOutput="
@@ -318,6 +331,7 @@ public static class Builder extends DefaultChatRequestParameters.Builder<Builder
318331
private Boolean enableSearch;
319332
private SearchOptions searchOptions;
320333
private AsrOptions asrOptions;
334+
private TtsOptions ttsOptions;
321335
private TranslationOptions translationOptions;
322336
private Boolean vlHighResolutionImages;
323337
private Boolean isMultimodalModel;
@@ -342,6 +356,7 @@ public Builder overrideWith(ChatRequestParameters parameters) {
342356
enableSearch(getOrDefault(qwenParameters.enableSearch(), enableSearch));
343357
searchOptions(getOrDefault(qwenParameters.searchOptions(), searchOptions));
344358
asrOptions(getOrDefault(qwenParameters.asrOptions(), asrOptions));
359+
ttsOptions(getOrDefault(qwenParameters.ttsOptions(), ttsOptions));
345360
translationOptions(getOrDefault(qwenParameters.translationOptions(), translationOptions));
346361
vlHighResolutionImages(getOrDefault(qwenParameters.vlHighResolutionImages(), vlHighResolutionImages));
347362
enableThinking(getOrDefault(qwenParameters.enableThinking(), enableThinking));
@@ -382,6 +397,11 @@ public Builder asrOptions(AsrOptions asrOptions) {
382397
return this;
383398
}
384399

400+
public Builder ttsOptions(TtsOptions ttsOptions) {
401+
this.ttsOptions = ttsOptions;
402+
return this;
403+
}
404+
385405
public Builder translationOptions(TranslationOptions translationOptions) {
386406
this.translationOptions = translationOptions;
387407
return this;
@@ -656,4 +676,55 @@ public AsrOptions build() {
656676
}
657677
}
658678
}
679+
680+
/**
681+
* Text-to-speech (TTS) parameters.
682+
*
683+
* @param voice (Required) The voice to use.
684+
* See <a href="https://www.alibabacloud.com/help/en/model-studio/qwen-tts#bac280ddf5a1u">Supported system voices</a> for details.
685+
* @param languageType Specify the language of the synthesized audio. The default value is Auto.
686+
* Auto: Use when text language is uncertain or contains multiple languages. The model automatically matches pronunciation for different language segments, but accuracy is not guaranteed.
687+
* Specify language: Use when the text is in a single language. Specifying the exact language significantly improves synthesis quality and usually outperforms Auto. Supported values include the following (for now): Chinese, English, German, Italian, Portuguese, Spanish, Japanese, Korean, French, Russian
688+
* @param instructions Provide instructions to guide speech synthesis. Only supported by instruction models.
689+
* @param optimizeInstructions Optimize instructions to improve speech naturalness and expressiveness. Defaults to false.
690+
* Behavior: When true, the system semantically enhances and rewrites instructions to generate internal instructions better suited for speech synthesis.
691+
* Scenarios: Enable for high-quality, fine-grained speech expression.
692+
* Dependency: Requires instructions parameter. Has no effect if the instructions parameter is empty.
693+
*/
694+
public record TtsOptions(String voice, String languageType, String instructions, Boolean optimizeInstructions) {
695+
public static Builder builder() {
696+
return new Builder();
697+
}
698+
699+
public static class Builder {
700+
private String voice;
701+
private String languageType;
702+
private String instructions;
703+
private Boolean optimizeInstructions;
704+
705+
public Builder voice(String voice) {
706+
this.voice = voice;
707+
return this;
708+
}
709+
710+
public Builder languageType(String languageType) {
711+
this.languageType = languageType;
712+
return this;
713+
}
714+
715+
public Builder instructions(String instructions) {
716+
this.instructions = instructions;
717+
return this;
718+
}
719+
720+
public Builder optimizeInstructions(Boolean optimizeInstructions) {
721+
this.optimizeInstructions = optimizeInstructions;
722+
return this;
723+
}
724+
725+
public TtsOptions build() {
726+
return new TtsOptions(voice, languageType, instructions, optimizeInstructions);
727+
}
728+
}
729+
}
659730
}

models/langchain4j-community-dashscope/src/main/java/dev/langchain4j/community/model/dashscope/QwenHelper.java

Lines changed: 86 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import static dev.langchain4j.data.message.ChatMessageType.USER;
99
import static dev.langchain4j.internal.JsonSchemaElementUtils.toMap;
1010
import static dev.langchain4j.internal.Utils.getOrDefault;
11+
import static dev.langchain4j.internal.Utils.isNotNullOrEmpty;
1112
import static dev.langchain4j.internal.Utils.isNullOrBlank;
1213
import static dev.langchain4j.internal.Utils.isNullOrEmpty;
1314
import static dev.langchain4j.model.chat.request.ToolChoice.REQUIRED;
@@ -24,6 +25,7 @@
2425
import com.alibaba.dashscope.aigc.generation.GenerationResult;
2526
import com.alibaba.dashscope.aigc.generation.SearchInfo;
2627
import com.alibaba.dashscope.aigc.generation.TranslationOptions;
28+
import com.alibaba.dashscope.aigc.multimodalconversation.AudioParameters;
2729
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationOutput;
2830
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
2931
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
@@ -85,6 +87,8 @@
8587
class QwenHelper {
8688

8789
private static final Logger log = LoggerFactory.getLogger(QwenHelper.class);
90+
public static final String GENERATED_AUDIOS_KEY =
91+
"generated_audios"; // key for storing generated audios in AiMessage attributes
8892

8993
static List<Message> toQwenMessages(List<ChatMessage> messages, Boolean enableSanitizeMessages) {
9094
List<ChatMessage> inputMessages =
@@ -168,7 +172,10 @@ static List<Map<String, Object>> toMultiModalContents(ChatMessage message) {
168172
.contents().stream()
169173
.map(QwenHelper::toMultiModalContent)
170174
.collect(toList());
171-
case AI -> Collections.singletonList(Collections.singletonMap("text", ((AiMessage) message).text()));
175+
case AI ->
176+
isNullOrBlank(((AiMessage) message).text())
177+
? Collections.emptyList()
178+
: Collections.singletonList(Collections.singletonMap("text", ((AiMessage) message).text()));
172179
case SYSTEM ->
173180
Collections.singletonList(Collections.singletonMap("text", ((SystemMessage) message).text()));
174181
case TOOL_EXECUTION_RESULT ->
@@ -335,6 +342,24 @@ static List<Image> imagesFrom(MultiModalConversationResult result) {
335342
.collect(toList());
336343
}
337344

345+
static List<Audio> audiosFrom(MultiModalConversationResult result) {
346+
if (result.getOutput().getAudio() != null) {
347+
if (result.getOutput().getAudio().getUrl() != null) {
348+
return Collections.singletonList(Audio.builder()
349+
.url(result.getOutput().getAudio().getUrl())
350+
.mimeType("audio/wav")
351+
.build());
352+
} else if (result.getOutput().getAudio().getData() != null) {
353+
// The base64-encoded audio would be returned in the streaming mode.
354+
return Collections.singletonList(Audio.builder()
355+
.base64Data(result.getOutput().getAudio().getData())
356+
.mimeType("audio/pcm")
357+
.build());
358+
}
359+
}
360+
return Collections.emptyList();
361+
}
362+
338363
static TokenUsage tokenUsageFrom(GenerationResult result) {
339364
return Optional.of(result)
340365
.map(GenerationResult::getUsage)
@@ -366,11 +391,16 @@ static FinishReason finishReasonFrom(GenerationResult result) {
366391
}
367392

368393
static FinishReason finishReasonFrom(MultiModalConversationResult result) {
369-
MultiModalConversationOutput.Choice choice =
370-
result.getOutput().getChoices().get(0);
371-
// Upon observation, when tool_calls occur, the returned finish_reason may be null or "stop", not "tool_calls".
372-
String finishReason =
373-
isNullOrEmpty(choice.getMessage().getToolCalls()) ? choice.getFinishReason() : "tool_calls";
394+
String finishReason;
395+
if (isNullOrEmpty(result.getOutput().getChoices())) {
396+
finishReason = result.getOutput().getFinishReason();
397+
} else {
398+
MultiModalConversationOutput.Choice choice =
399+
result.getOutput().getChoices().get(0);
400+
// Upon observation, when tool_calls occur, the returned finish_reason may be null or "stop", not
401+
// "tool_calls".
402+
finishReason = isNullOrEmpty(choice.getMessage().getToolCalls()) ? choice.getFinishReason() : "tool_calls";
403+
}
374404

375405
return finishReason == null
376406
? null
@@ -411,7 +441,8 @@ static boolean isMultimodalModelName(String modelName) {
411441
|| modelName.contains("-omni-")
412442
|| modelName.contains("-image-")
413443
|| modelName.startsWith("qwen3.5-")
414-
|| modelName.contains("-asr-");
444+
|| modelName.contains("-asr-")
445+
|| modelName.contains("-tts-");
415446
}
416447

417448
static boolean isSupportingIncrementalOutputModelName(String modelName) {
@@ -582,10 +613,18 @@ static AiMessage aiMessageFrom(MultiModalConversationResult result) {
582613
String text = answerFrom(result);
583614
String reasoningContentFrom = reasoningContentFrom(result);
584615
List<Image> images = imagesFrom(result);
616+
List<Audio> audios = audiosFrom(result);
617+
Map<String, Object> attributes = new HashMap<>(2);
618+
if (isNotNullOrEmpty(images)) {
619+
attributes.put(GENERATED_IMAGES_KEY, images);
620+
}
621+
if (isNotNullOrEmpty(audios)) {
622+
attributes.put(GENERATED_AUDIOS_KEY, audios);
623+
}
585624
AiMessage.Builder aiMessageBuilder = AiMessage.builder()
586625
.text(text)
587626
.thinking(isNullOrBlank(reasoningContentFrom) ? null : reasoningContentFrom)
588-
.attributes(isNullOrEmpty(images) ? Map.of() : Map.of(GENERATED_IMAGES_KEY, images));
627+
.attributes(attributes);
589628
if (isFunctionToolCalls(result)) {
590629
aiMessageBuilder = aiMessageBuilder.toolExecutionRequests(toolExecutionRequestsFrom(result));
591630
if (text.isBlank()) {
@@ -845,6 +884,10 @@ static void validateGenerationParameters(QwenChatRequestParameters parameters) {
845884
if (parameters.asrOptions() != null) {
846885
throw new UnsupportedFeatureException("asrOptions is not supported by " + parameters.modelName());
847886
}
887+
888+
if (parameters.ttsOptions() != null) {
889+
throw new UnsupportedFeatureException("ttsOptions is not supported by " + parameters.modelName());
890+
}
848891
}
849892

850893
static void validateMultimodalConversationParameters(QwenChatRequestParameters parameters) {
@@ -977,6 +1020,24 @@ static MultiModalConversationParam toMultiModalConversationParam(
9771020
builder.parameter("asr_options", asrOptions);
9781021
}
9791022

1023+
if (parameters.ttsOptions() != null) {
1024+
builder.text(toQwenTtsText(chatRequest.messages()));
1025+
builder.voice(toQwenTtsVoice(parameters.ttsOptions().voice()));
1026+
if (parameters.ttsOptions().languageType() != null) {
1027+
builder.languageType(parameters.ttsOptions().languageType());
1028+
}
1029+
if (parameters.ttsOptions().instructions() != null) {
1030+
// no java field is provided yet
1031+
builder.parameter("instructions", parameters.ttsOptions().instructions());
1032+
}
1033+
if (parameters.ttsOptions().optimizeInstructions() != null) {
1034+
// no java field is provided yet
1035+
builder.parameter(
1036+
"optimize_instructions", parameters.ttsOptions().optimizeInstructions());
1037+
}
1038+
builder.parameter("enable_omni_output_audio_url", true);
1039+
}
1040+
9801041
if (parameters.custom() != null) {
9811042
// no java field is provided yet
9821043
builder.parameter("custom", parameters.custom());
@@ -989,6 +1050,23 @@ static MultiModalConversationParam toMultiModalConversationParam(
9891050
return builder.build();
9901051
}
9911052

1053+
static String toQwenTtsText(List<ChatMessage> messages) {
1054+
try {
1055+
return ((UserMessage) messages.get(messages.size() - 1)).singleText();
1056+
} catch (Exception e) {
1057+
throw new IllegalArgumentException("No valid text found", e);
1058+
}
1059+
}
1060+
1061+
static AudioParameters.Voice toQwenTtsVoice(String voice) {
1062+
for (AudioParameters.Voice qwenVoice : AudioParameters.Voice.values()) {
1063+
if (qwenVoice.getValue().equalsIgnoreCase(voice)) {
1064+
return qwenVoice;
1065+
}
1066+
}
1067+
throw new IllegalArgumentException("Invalid voice: " + voice);
1068+
}
1069+
9921070
static com.alibaba.dashscope.common.ResponseFormat toQwenResponseFormat(
9931071
ResponseFormat responseFormat, Boolean jsonSchemaStrict) {
9941072
if (responseFormat == null) {

models/langchain4j-community-dashscope/src/main/java/dev/langchain4j/community/model/dashscope/QwenModelName.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@ public class QwenModelName {
7979
"qwen-audio-turbo-latest"; // Qwen audio understanding model, latest version
8080

8181
public static final String QWEN3_ASR_FLASH = "qwen3-asr-flash"; // Qwen3 ASR model (flash)
82+
public static final String QWEN3_TTS_FLASH = "qwen3-tts-flash"; // Qwen3 TTS model (flash)
83+
public static final String QWEN3_TTS_INSTRUCT_FLASH =
84+
"qwen3-tts-instruct-flash"; // Qwen3 TTS instruction model (flash)
8285
public static final String QWEN_MT_TURBO = "qwen-mt-turbo"; // Qwen turbo model for translation (deprecated)
8386
public static final String QWEN_MT_PLUS = "qwen-mt-plus"; // Qwen plus model for translation
8487
public static final String QWEN_MT_FLASH = "qwen-mt-flash"; // Qwen flash model (2b) for translation

models/langchain4j-community-dashscope/src/main/java/dev/langchain4j/community/model/dashscope/QwenStreamingChatModel.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ public QwenStreamingChatModel(
123123
.enableSearch(getOrDefault(enableSearch, qwenParameters.enableSearch()))
124124
.searchOptions(qwenParameters.searchOptions())
125125
.asrOptions(qwenParameters.asrOptions())
126+
.ttsOptions(qwenParameters.ttsOptions())
126127
.translationOptions(qwenParameters.translationOptions())
127128
.vlHighResolutionImages(qwenParameters.vlHighResolutionImages())
128129
.isMultimodalModel(getOrDefault(isMultimodalModel, qwenParameters.isMultimodalModel()))

models/langchain4j-community-dashscope/src/main/java/dev/langchain4j/community/model/dashscope/QwenStreamingResponseBuilder.java

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,10 @@ public QwenPartialResponse append(MultiModalConversationResult partialResponse)
138138
String generatedContent = answerFrom(accumulatedMultiModalConversationResult);
139139
partialContent = partialContent.substring(generatedContent.length());
140140
}
141+
} else if (partialResponse.getOutput().getAudio() != null
142+
&& partialResponse.getOutput().getAudio().getData() != null) {
143+
// The tts models will incrementally return base64-encoded PCM data.
144+
partialContent = partialResponse.getOutput().getAudio().getData();
141145
}
142146
if (hasReasoningContent(partialResponse)) {
143147
partialReasoningContent = reasoningContentFrom(partialResponse);
@@ -657,13 +661,6 @@ private static String merge(String previous, String current) {
657661
}
658662

659663
private static Long merge(Long previous, Long current) {
660-
if (previous == null) {
661-
return current;
662-
}
663-
if (current == null) {
664-
return previous;
665-
}
666-
String resultStr = merge(String.valueOf(previous), String.valueOf(current));
667-
return Long.parseLong(resultStr);
664+
return current == null ? previous : current;
668665
}
669666
}

0 commit comments

Comments
 (0)