Skip to content

Commit 5fbb9f3

Browse files
feat: added support for qwen3-asr to replace the deprecated qwen-audio (#603)
1 parent 7820a17 commit 5fbb9f3

File tree

13 files changed

+300
-20
lines changed

13 files changed

+300
-20
lines changed

models/langchain4j-community-dashscope/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
</licenses>
2222

2323
<properties>
24-
<dashscope.version>2.22.10</dashscope.version>
24+
<dashscope.version>2.22.12</dashscope.version>
2525
</properties>
2626

2727
<dependencies>

models/langchain4j-community-dashscope/src/main/java/dev/langchain4j/community/model/dashscope/QwenChatModel.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ protected QwenChatModel(
107107
.seed(getOrDefault(seed, qwenParameters.seed()))
108108
.enableSearch(getOrDefault(enableSearch, qwenParameters.enableSearch()))
109109
.searchOptions(qwenParameters.searchOptions())
110+
.asrOptions(qwenParameters.asrOptions())
110111
.translationOptions(qwenParameters.translationOptions())
111112
.vlHighResolutionImages(qwenParameters.vlHighResolutionImages())
112113
.isMultimodalModel(getOrDefault(isMultimodalModel, qwenParameters.isMultimodalModel()))

models/langchain4j-community-dashscope/src/main/java/dev/langchain4j/community/model/dashscope/QwenChatRequestParameters.java

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@ public class QwenChatRequestParameters extends DefaultChatRequestParameters {
2828
* The strategy for network search. Only takes effect when enableSearch is true.
2929
*/
3030
private final SearchOptions searchOptions;
31+
/**
32+
* Parameters for automatic speech recognition (ASR).
33+
* See <a href="https://modelstudio.console.alibabacloud.com/ap-southeast-1/?tab=api#/api/?type=model&url=2986952">Qwen-ASR API reference</a> for more details.
34+
*/
35+
private final AsrOptions asrOptions;
3136
/**
3237
* The translation parameters you need to configure when you use the translation
3338
* models.
@@ -116,6 +121,7 @@ protected QwenChatRequestParameters(Builder builder) {
116121
this.seed = builder.seed;
117122
this.enableSearch = builder.enableSearch;
118123
this.searchOptions = builder.searchOptions;
124+
this.asrOptions = builder.asrOptions;
119125
this.translationOptions = builder.translationOptions;
120126
this.vlHighResolutionImages = builder.vlHighResolutionImages;
121127
this.isMultimodalModel = builder.isMultimodalModel;
@@ -145,6 +151,10 @@ public SearchOptions searchOptions() {
145151
return searchOptions;
146152
}
147153

154+
public AsrOptions asrOptions() {
155+
return asrOptions;
156+
}
157+
148158
public TranslationOptions translationOptions() {
149159
return translationOptions;
150160
}
@@ -225,6 +235,7 @@ public boolean equals(Object o) {
225235
return Objects.equals(seed, that.seed)
226236
&& Objects.equals(enableSearch, that.enableSearch)
227237
&& Objects.equals(searchOptions, that.searchOptions)
238+
&& Objects.equals(asrOptions, that.asrOptions)
228239
&& Objects.equals(translationOptions, that.translationOptions)
229240
&& Objects.equals(vlHighResolutionImages, that.vlHighResolutionImages)
230241
&& Objects.equals(isMultimodalModel, that.isMultimodalModel)
@@ -249,6 +260,7 @@ public int hashCode() {
249260
seed,
250261
enableSearch,
251262
searchOptions,
263+
asrOptions,
252264
translationOptions,
253265
vlHighResolutionImages,
254266
isMultimodalModel,
@@ -282,7 +294,8 @@ public String toString() {
282294
+ responseFormat() + ", seed="
283295
+ seed + ", enableSearch="
284296
+ enableSearch + ", searchOptions="
285-
+ searchOptions + ", translationOptions="
297+
+ searchOptions + ", asrOptions="
298+
+ asrOptions + ", translationOptions="
286299
+ translationOptions + ", vlHighResolutionImages="
287300
+ vlHighResolutionImages + ", isMultimodalModel="
288301
+ isMultimodalModel + ", supportIncrementalOutput="
@@ -304,6 +317,7 @@ public static class Builder extends DefaultChatRequestParameters.Builder<Builder
304317
private Integer seed;
305318
private Boolean enableSearch;
306319
private SearchOptions searchOptions;
320+
private AsrOptions asrOptions;
307321
private TranslationOptions translationOptions;
308322
private Boolean vlHighResolutionImages;
309323
private Boolean isMultimodalModel;
@@ -327,6 +341,7 @@ public Builder overrideWith(ChatRequestParameters parameters) {
327341
seed(getOrDefault(qwenParameters.seed(), seed));
328342
enableSearch(getOrDefault(qwenParameters.enableSearch(), enableSearch));
329343
searchOptions(getOrDefault(qwenParameters.searchOptions(), searchOptions));
344+
asrOptions(getOrDefault(qwenParameters.asrOptions(), asrOptions));
330345
translationOptions(getOrDefault(qwenParameters.translationOptions(), translationOptions));
331346
vlHighResolutionImages(getOrDefault(qwenParameters.vlHighResolutionImages(), vlHighResolutionImages));
332347
enableThinking(getOrDefault(qwenParameters.enableThinking(), enableThinking));
@@ -362,6 +377,11 @@ public Builder searchOptions(SearchOptions searchOptions) {
362377
return this;
363378
}
364379

380+
public Builder asrOptions(AsrOptions asrOptions) {
381+
this.asrOptions = asrOptions;
382+
return this;
383+
}
384+
365385
public Builder translationOptions(TranslationOptions translationOptions) {
366386
this.translationOptions = translationOptions;
367387
return this;
@@ -603,4 +623,37 @@ public TranslationOptionTerm build() {
603623
}
604624
}
605625
}
626+
627+
/**
628+
* Automatic speech recognition (ASR) parameters.
629+
*
630+
* @param language Audio language hint.
631+
* See <a href="https://www.alibabacloud.com/help/en/model-studio/qwen-asr-api-reference?h2-5234e940#h2-5234e940">Supported languages</a> for more details.
632+
* @param enableItn Enable Inverse Text Normalization (ITN). Chinese and English only.
633+
* Defaults to false.
634+
*/
635+
public record AsrOptions(String language, Boolean enableItn) {
636+
public static Builder builder() {
637+
return new Builder();
638+
}
639+
640+
public static class Builder {
641+
private String language;
642+
private Boolean enableItn;
643+
644+
public Builder language(String language) {
645+
this.language = language;
646+
return this;
647+
}
648+
649+
public Builder enableItn(Boolean enableItn) {
650+
this.enableItn = enableItn;
651+
return this;
652+
}
653+
654+
public AsrOptions build() {
655+
return new AsrOptions(language, enableItn);
656+
}
657+
}
658+
}
606659
}

models/langchain4j-community-dashscope/src/main/java/dev/langchain4j/community/model/dashscope/QwenHelper.java

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171
import java.util.ArrayList;
7272
import java.util.Collection;
7373
import java.util.Collections;
74+
import java.util.HashMap;
7475
import java.util.LinkedList;
7576
import java.util.List;
7677
import java.util.Map;
@@ -409,7 +410,8 @@ static boolean isMultimodalModelName(String modelName) {
409410
|| modelName.contains("-audio-")
410411
|| modelName.contains("-omni-")
411412
|| modelName.contains("-image-")
412-
|| modelName.startsWith("qwen3.5-");
413+
|| modelName.startsWith("qwen3.5-")
414+
|| modelName.contains("-asr-");
413415
}
414416

415417
static boolean isSupportingIncrementalOutputModelName(String modelName) {
@@ -839,6 +841,10 @@ static void validateGenerationParameters(QwenChatRequestParameters parameters) {
839841
if (parameters.negativePrompt() != null) {
840842
throw new UnsupportedFeatureException("negativePrompt is not supported by " + parameters.modelName());
841843
}
844+
845+
if (parameters.asrOptions() != null) {
846+
throw new UnsupportedFeatureException("asrOptions is not supported by " + parameters.modelName());
847+
}
842848
}
843849

844850
static void validateMultimodalConversationParameters(QwenChatRequestParameters parameters) {
@@ -959,6 +965,18 @@ static MultiModalConversationParam toMultiModalConversationParam(
959965
builder.parameter("enable_code_interpreter", parameters.enableCodeInterpreter());
960966
}
961967

968+
if (parameters.asrOptions() != null) {
969+
// The SDK builder provides no dedicated Java field for asr_options yet, so it is passed as a generic parameter
970+
Map<String, Object> asrOptions = new HashMap<>(2);
971+
if (parameters.asrOptions().language() != null) {
972+
asrOptions.put("language", parameters.asrOptions().language());
973+
}
974+
if (parameters.asrOptions().enableItn() != null) {
975+
asrOptions.put("enable_itn", parameters.asrOptions().enableItn());
976+
}
977+
builder.parameter("asr_options", asrOptions);
978+
}
979+
962980
if (parameters.custom() != null) {
963981
// no java field is provided yet
964982
builder.parameter("custom", parameters.custom());

models/langchain4j-community-dashscope/src/main/java/dev/langchain4j/community/model/dashscope/QwenModelName.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,9 +70,15 @@ public class QwenModelName {
7070
"qwen-vl-max-latest"; // Qwen multi-modal model, offers optimal performance, stable version
7171
public static final String QWEN3_VL_PLUS = "qwen3-vl-plus"; // Qwen3 multi-modal model v3 (plus)
7272
public static final String QWEN3_VL_FLASH = "qwen3-vl-flash"; // Qwen3 multi-modal model v3 (flash)
73+
74+
@Deprecated
7375
public static final String QWEN_AUDIO_TURBO = "qwen-audio-turbo"; // Qwen audio understanding model, stable version (deprecated; use QWEN3_ASR_FLASH)
76+
77+
@Deprecated
7478
public static final String QWEN_AUDIO_TURBO_LATEST =
7579
"qwen-audio-turbo-latest"; // Qwen audio understanding model, latest version (deprecated; use QWEN3_ASR_FLASH)
80+
81+
public static final String QWEN3_ASR_FLASH = "qwen3-asr-flash"; // Qwen3 ASR model (flash)
7682
public static final String QWEN_MT_TURBO = "qwen-mt-turbo"; // Qwen turbo model for translation (deprecated)
7783
public static final String QWEN_MT_PLUS = "qwen-mt-plus"; // Qwen plus model for translation
7884
public static final String QWEN_MT_FLASH = "qwen-mt-flash"; // Qwen flash model (2b) for translation

models/langchain4j-community-dashscope/src/main/java/dev/langchain4j/community/model/dashscope/QwenStreamingChatModel.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ public QwenStreamingChatModel(
122122
.seed(getOrDefault(seed, qwenParameters.seed()))
123123
.enableSearch(getOrDefault(enableSearch, qwenParameters.enableSearch()))
124124
.searchOptions(qwenParameters.searchOptions())
125+
.asrOptions(qwenParameters.asrOptions())
125126
.translationOptions(qwenParameters.translationOptions())
126127
.vlHighResolutionImages(qwenParameters.vlHighResolutionImages())
127128
.isMultimodalModel(getOrDefault(isMultimodalModel, qwenParameters.isMultimodalModel()))

models/langchain4j-community-dashscope/src/test/java/dev/langchain4j/community/model/dashscope/QwenChatModelIT.java

Lines changed: 71 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import static dev.langchain4j.community.model.dashscope.QwenModelName.QWEN_MAX;
44
import static dev.langchain4j.community.model.dashscope.QwenTestHelper.apiKey;
55
import static dev.langchain4j.community.model.dashscope.QwenTestHelper.functionCallChatModelNameProvider;
6+
import static dev.langchain4j.community.model.dashscope.QwenTestHelper.getBase64DataFromResource;
7+
import static dev.langchain4j.community.model.dashscope.QwenTestHelper.multimodalAudioData;
68
import static dev.langchain4j.community.model.dashscope.QwenTestHelper.multimodalChatMessagesWithAudioData;
79
import static dev.langchain4j.community.model.dashscope.QwenTestHelper.multimodalChatMessagesWithAudioUrl;
810
import static dev.langchain4j.community.model.dashscope.QwenTestHelper.multimodalChatMessagesWithImageData;
@@ -26,7 +28,9 @@
2628

2729
import dev.langchain4j.agent.tool.ToolExecutionRequest;
2830
import dev.langchain4j.agent.tool.ToolSpecification;
31+
import dev.langchain4j.data.audio.Audio;
2932
import dev.langchain4j.data.message.AiMessage;
33+
import dev.langchain4j.data.message.AudioContent;
3034
import dev.langchain4j.data.message.ChatMessage;
3135
import dev.langchain4j.data.message.SystemMessage;
3236
import dev.langchain4j.data.message.TextContent;
@@ -296,9 +300,8 @@ void should_send_multimodal_image_data_and_receive_response(String modelName) {
296300
assertThat(response.aiMessage().text()).containsIgnoringCase("parrot");
297301
}
298302

299-
@Disabled("only served in China")
300303
@ParameterizedTest
301-
@MethodSource("dev.langchain4j.community.model.dashscope.QwenTestHelper#audioChatModelNameProvider")
304+
@MethodSource("dev.langchain4j.community.model.dashscope.QwenTestHelper#asrChatModelNameProvider")
302305
void should_send_multimodal_audio_url_and_receive_response(String modelName) {
303306
ChatModel model =
304307
QwenChatModel.builder().apiKey(apiKey()).modelName(modelName).build();
@@ -308,9 +311,8 @@ void should_send_multimodal_audio_url_and_receive_response(String modelName) {
308311
assertThat(response.aiMessage().text()).containsIgnoringCase("阿里云");
309312
}
310313

311-
@Disabled("only served in China")
312314
@ParameterizedTest
313-
@MethodSource("dev.langchain4j.community.model.dashscope.QwenTestHelper#audioChatModelNameProvider")
315+
@MethodSource("dev.langchain4j.community.model.dashscope.QwenTestHelper#asrChatModelNameProvider")
314316
void should_send_multimodal_audio_data_and_receive_response(String modelName) {
315317
ChatModel model =
316318
QwenChatModel.builder().apiKey(apiKey()).modelName(modelName).build();
@@ -320,6 +322,71 @@ void should_send_multimodal_audio_data_and_receive_response(String modelName) {
320322
assertThat(response.aiMessage().text()).containsIgnoringCase("阿里云");
321323
}
322324

325+
@ParameterizedTest
326+
@MethodSource("dev.langchain4j.community.model.dashscope.QwenTestHelper#asrChatModelNameProvider")
327+
void should_respect_language_parameter_and_receive_response(String modelName) {
328+
ChatModel model =
329+
QwenChatModel.builder().apiKey(apiKey()).modelName(modelName).build();
330+
331+
// Use only the language hint; omit the system prompt.
332+
Audio audio = Audio.builder()
333+
.base64Data(multimodalAudioData())
334+
.mimeType("audio/mp3")
335+
.build();
336+
AudioContent audioContent = AudioContent.from(audio);
337+
List<ChatMessage> messages = Collections.singletonList(UserMessage.from(audioContent));
338+
QwenChatRequestParameters parameters = QwenChatRequestParameters.builder()
339+
.asrOptions(QwenChatRequestParameters.AsrOptions.builder()
340+
.language("zh")
341+
.build())
342+
.build();
343+
ChatRequest chatRequest =
344+
ChatRequest.builder().messages(messages).parameters(parameters).build();
345+
ChatResponse response = model.chat(chatRequest);
346+
347+
assertThat(response.aiMessage().text()).containsIgnoringCase("阿里云");
348+
}
349+
350+
@ParameterizedTest
351+
@MethodSource("dev.langchain4j.community.model.dashscope.QwenTestHelper#asrChatModelNameProvider")
352+
void should_respect_enable_itn_parameter_and_receive_response(String modelName) {
353+
ChatModel model =
354+
QwenChatModel.builder().apiKey(apiKey()).modelName(modelName).build();
355+
356+
Audio audio = Audio.builder()
357+
.base64Data(getBase64DataFromResource("/5dollars.mp3"))
358+
.mimeType("audio/mp3")
359+
.build();
360+
AudioContent audioContent = AudioContent.from(audio);
361+
List<ChatMessage> messages = Collections.singletonList(UserMessage.from(audioContent));
362+
363+
// ITN is not enabled
364+
QwenChatRequestParameters parameters = QwenChatRequestParameters.builder()
365+
.asrOptions(QwenChatRequestParameters.AsrOptions.builder()
366+
.language("en")
367+
.enableItn(false)
368+
.build())
369+
.build();
370+
ChatRequest chatRequest =
371+
ChatRequest.builder().messages(messages).parameters(parameters).build();
372+
ChatResponse response = model.chat(chatRequest);
373+
374+
assertThat(response.aiMessage().text()).containsAnyOf("5 dollars", "five dollars");
375+
376+
// ITN is enabled
377+
parameters = QwenChatRequestParameters.builder()
378+
.asrOptions(QwenChatRequestParameters.AsrOptions.builder()
379+
.language("en")
380+
.enableItn(true)
381+
.build())
382+
.build();
383+
chatRequest =
384+
ChatRequest.builder().messages(messages).parameters(parameters).build();
385+
response = model.chat(chatRequest);
386+
387+
assertThat(response.aiMessage().text()).contains("$5");
388+
}
389+
323390
@ParameterizedTest
324391
@MethodSource("dev.langchain4j.community.model.dashscope.QwenTestHelper#multimodalChatModelNameProvider")
325392
void should_send_multimodal_video_url_and_receive_response(String modelName) {

0 commit comments

Comments
 (0)