Skip to content

Commit 90e2f45

Browse files
songguocolakevinlin09
authored and committed
feat(model/qwen3-livetranslate&asr-realtime): add input params
1 parent 0c4c10e commit 90e2f45

File tree

7 files changed

+353
-5
lines changed

7 files changed

+353
-5
lines changed

samples/Qwen3AsrRealtimeUsage.java

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
import com.alibaba.dashscope.audio.omni.*;
2+
import com.alibaba.dashscope.exception.NoApiKeyException;
3+
import com.google.gson.JsonObject;
4+
import org.slf4j.Logger;
5+
import org.slf4j.LoggerFactory;
6+
7+
import javax.sound.sampled.LineUnavailableException;
8+
import java.io.File;
9+
import java.io.FileInputStream;
10+
import java.util.Base64;
11+
import java.util.Collections;
12+
import java.util.concurrent.atomic.AtomicReference;
13+
14+
15+
public class Qwen3AsrRealtimeUsage {
16+
private static final Logger log = LoggerFactory.getLogger(Qwen3AsrRealtimeUsage.class);
17+
private static final int AUDIO_CHUNK_SIZE = 1024; // Audio chunk size in bytes
18+
private static final int SLEEP_INTERVAL_MS = 30; // Sleep interval in milliseconds
19+
20+
public static void main(String[] args) throws InterruptedException, LineUnavailableException {
21+
22+
OmniRealtimeParam param = OmniRealtimeParam.builder()
23+
.model("qwen3-asr-flash-realtime")
24+
.apikey(System.getenv("DASHSCOPE_API_KEY"))
25+
.build();
26+
27+
OmniRealtimeConversation conversation = null;
28+
final AtomicReference<OmniRealtimeConversation> conversationRef = new AtomicReference<>(null);
29+
conversation = new OmniRealtimeConversation(param, new OmniRealtimeCallback() {
30+
@Override
31+
public void onOpen() {
32+
System.out.println("connection opened");
33+
}
34+
@Override
35+
public void onEvent(JsonObject message) {
36+
String type = message.get("type").getAsString();
37+
switch(type) {
38+
case "session.created":
39+
System.out.println("start session: " + message.get("session").getAsJsonObject().get("id").getAsString());
40+
break;
41+
case "conversation.item.input_audio_transcription.completed":
42+
System.out.println("question: " + message.get("transcript").getAsString());
43+
break;
44+
case "response.audio_transcript.delta":
45+
System.out.println("got llm response delta: " + message.get("delta").getAsString());
46+
break;
47+
case "input_audio_buffer.speech_started":
48+
System.out.println("======VAD Speech Start======");
49+
break;
50+
case "input_audio_buffer.speech_stopped":
51+
System.out.println("======VAD Speech Stop======");
52+
break;
53+
case "response.done":
54+
System.out.println("======RESPONSE DONE======");
55+
if (conversationRef.get() != null) {
56+
System.out.println("[Metric] response: " + conversationRef.get().getResponseId() +
57+
", first text delay: " + conversationRef.get().getFirstTextDelay() +
58+
" ms, first audio delay: " + conversationRef.get().getFirstAudioDelay() + " ms");
59+
}
60+
break;
61+
default:
62+
break;
63+
}
64+
}
65+
@Override
66+
public void onClose(int code, String reason) {
67+
System.out.println("connection closed code: " + code + ", reason: " + reason);
68+
}
69+
});
70+
conversationRef.set(conversation);
71+
try {
72+
conversation.connect();
73+
} catch (NoApiKeyException e) {
74+
throw new RuntimeException(e);
75+
}
76+
77+
78+
OmniRealtimeTranscriptionParam transcriptionParam = new OmniRealtimeTranscriptionParam();
79+
transcriptionParam.setLanguage("zh");
80+
transcriptionParam.setInputAudioFormat("pcm");
81+
transcriptionParam.setInputSampleRate(16000);
82+
transcriptionParam.setCorpusText("这是一段脱口秀表演");
83+
84+
OmniRealtimeConfig config = OmniRealtimeConfig.builder()
85+
.modalities(Collections.singletonList(OmniRealtimeModality.TEXT))
86+
.transcriptionConfig(transcriptionParam)
87+
.build();
88+
conversation.updateSession(config);
89+
90+
91+
String filePath = "./path/to/your/audio/16k-16bit-mono-file.pcm";
92+
File audioFile = new File(filePath);
93+
94+
if (!audioFile.exists()) {
95+
log.error("Audio file not found: {}", filePath);
96+
return;
97+
}
98+
99+
try (FileInputStream audioInputStream = new FileInputStream(audioFile)) {
100+
byte[] audioBuffer = new byte[AUDIO_CHUNK_SIZE];
101+
int bytesRead;
102+
int totalBytesRead = 0;
103+
104+
log.info("Starting to send audio data from: {}", filePath);
105+
106+
// Read and send audio data in chunks
107+
while ((bytesRead = audioInputStream.read(audioBuffer)) != -1) {
108+
totalBytesRead += bytesRead;
109+
String audioB64 = Base64.getEncoder().encodeToString(audioBuffer);
110+
// Send audio chunk to conversation
111+
conversation.appendAudio(audioB64);
112+
113+
// Add small delay to simulate real-time audio streaming
114+
Thread.sleep(SLEEP_INTERVAL_MS);
115+
}
116+
117+
log.info("Finished sending audio data. Total bytes sent: {}", totalBytesRead);
118+
119+
} catch (Exception e) {
120+
log.error("Error sending audio from file: {}", filePath, e);
121+
}
122+
123+
conversation.commit();
124+
conversation.createResponse(null, null);
125+
conversation.close(1000, "bye");
126+
System.exit(0);
127+
}
128+
}
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
import com.alibaba.dashscope.audio.omni.*;
2+
import com.alibaba.dashscope.exception.NoApiKeyException;
3+
import com.google.gson.JsonObject;
4+
import org.slf4j.Logger;
5+
import org.slf4j.LoggerFactory;
6+
7+
import javax.sound.sampled.LineUnavailableException;
8+
import java.io.File;
9+
import java.io.FileInputStream;
10+
import java.util.Arrays;
11+
import java.util.Base64;
12+
import java.util.concurrent.atomic.AtomicReference;
13+
14+
15+
public class Qwen3LiveTranslateUsage {
16+
private static final Logger log = LoggerFactory.getLogger(Qwen3LiveTranslateUsage.class);
17+
private static final int AUDIO_CHUNK_SIZE = 1024; // Audio chunk size in bytes
18+
private static final int SLEEP_INTERVAL_MS = 30; // Sleep interval in milliseconds
19+
20+
public static void main(String[] args) throws InterruptedException, LineUnavailableException {
21+
22+
OmniRealtimeParam param = OmniRealtimeParam.builder()
23+
.model("qwen3-livetranslate-flash-realtime")
24+
.apikey(System.getenv("DASHSCOPE_API_KEY"))
25+
.build();
26+
27+
OmniRealtimeConversation conversation = null;
28+
final AtomicReference<OmniRealtimeConversation> conversationRef = new AtomicReference<>(null);
29+
conversation = new OmniRealtimeConversation(param, new OmniRealtimeCallback() {
30+
@Override
31+
public void onOpen() {
32+
System.out.println("connection opened");
33+
}
34+
@Override
35+
public void onEvent(JsonObject message) {
36+
String type = message.get("type").getAsString();
37+
switch(type) {
38+
case "session.created":
39+
System.out.println("start session: " + message.get("session").getAsJsonObject().get("id").getAsString());
40+
break;
41+
case "conversation.item.input_audio_transcription.completed":
42+
System.out.println("question: " + message.get("transcript").getAsString());
43+
break;
44+
case "response.audio_transcript.delta":
45+
System.out.println("got llm response delta: " + message.get("delta").getAsString());
46+
break;
47+
case "response.audio.delta":
48+
String recvAudioB64 = message.get("delta").getAsString();
49+
// audioPlayer.write(recvAudioB64); // 音频播放,可以自行实现
50+
break;
51+
case "input_audio_buffer.speech_started":
52+
System.out.println("======VAD Speech Start======");
53+
break;
54+
case "response.done":
55+
System.out.println("======RESPONSE DONE======");
56+
if (conversationRef.get() != null) {
57+
System.out.println("[Metric] response: " + conversationRef.get().getResponseId() +
58+
", first text delay: " + conversationRef.get().getFirstTextDelay() +
59+
" ms, first audio delay: " + conversationRef.get().getFirstAudioDelay() + " ms");
60+
}
61+
break;
62+
default:
63+
break;
64+
}
65+
}
66+
@Override
67+
public void onClose(int code, String reason) {
68+
System.out.println("connection closed code: " + code + ", reason: " + reason);
69+
}
70+
});
71+
conversationRef.set(conversation);
72+
try {
73+
conversation.connect();
74+
} catch (NoApiKeyException e) {
75+
throw new RuntimeException(e);
76+
}
77+
78+
OmniRealtimeConfig config = OmniRealtimeConfig.builder()
79+
.modalities(Arrays.asList(OmniRealtimeModality.AUDIO, OmniRealtimeModality.TEXT))
80+
.voice("Cherry")
81+
.outputAudioFormat(OmniRealtimeAudioFormat.PCM_24000HZ_MONO_16BIT)
82+
.inputAudioFormat(OmniRealtimeAudioFormat.PCM_16000HZ_MONO_16BIT)
83+
.translationConfig(OmniRealtimeTranslationParam.builder().language("en").build())
84+
.build();
85+
conversation.updateSession(config);
86+
87+
88+
String filePath = "./path/to/your/audio/16k-16bit-mono-file.pcm";
89+
File audioFile = new File(filePath);
90+
91+
if (!audioFile.exists()) {
92+
log.error("Audio file not found: {}", filePath);
93+
return;
94+
}
95+
96+
try (FileInputStream audioInputStream = new FileInputStream(audioFile)) {
97+
byte[] audioBuffer = new byte[AUDIO_CHUNK_SIZE];
98+
int bytesRead;
99+
int totalBytesRead = 0;
100+
101+
log.info("Starting to send audio data from: {}", filePath);
102+
103+
// Read and send audio data in chunks
104+
while ((bytesRead = audioInputStream.read(audioBuffer)) != -1) {
105+
totalBytesRead += bytesRead;
106+
String audioB64 = Base64.getEncoder().encodeToString(audioBuffer);
107+
// Send audio chunk to conversation
108+
conversation.appendAudio(audioB64);
109+
110+
// Add small delay to simulate real-time audio streaming
111+
Thread.sleep(SLEEP_INTERVAL_MS);
112+
}
113+
114+
log.info("Finished sending audio data. Total bytes sent: {}", totalBytesRead);
115+
116+
} catch (Exception e) {
117+
log.error("Error sending audio from file: {}", filePath, e);
118+
}
119+
120+
conversation.commit();
121+
conversation.createResponse(null, null);
122+
conversation.close(1000, "bye");
123+
System.exit(0);
124+
}
125+
}

src/main/java/com/alibaba/dashscope/audio/omni/OmniRealtimeConfig.java

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ public class OmniRealtimeConfig {
1919
/** omni output modalities to be used in session */
2020
@NonNull List<OmniRealtimeModality> modalities;
2121

22-
/** voice to be used in session */
23-
@NonNull String voice;
22+
/** voice to be used in session; not needed for qwen-asr-realtime */
23+
@Builder.Default String voice = null;
2424

2525
/** input audio format */
2626
@Builder.Default
@@ -50,6 +50,12 @@ public class OmniRealtimeConfig {
5050
@Builder.Default Map<String, Object> turnDetectionParam = null;
5151
/** The extra parameters. */
5252
@Builder.Default Map<String, Object> parameters = null;
53+
/** translation configuration */
54+
@Builder.Default
55+
OmniRealtimeTranslationParam translationConfig = null;
56+
/** transcription configuration */
57+
@Builder.Default
58+
OmniRealtimeTranscriptionParam transcriptionConfig = null;
5359

5460
public JsonObject getConfig() {
5561
Map<String, Object> config = new HashMap<>();
@@ -82,6 +88,31 @@ public JsonObject getConfig() {
8288
} else {
8389
config.put(OmniRealtimeConstants.TURN_DETECTION, null);
8490
}
91+
// Add translation configuration to the config
92+
if (translationConfig != null) {
93+
Map<String, Object> translationConfig = new HashMap<>();
94+
translationConfig.put(OmniRealtimeConstants.LANGUAGE, this.translationConfig.getLanguage());
95+
config.put(OmniRealtimeConstants.TRANSLATION, translationConfig);
96+
} else {
97+
config.put(OmniRealtimeConstants.TRANSLATION, null);
98+
}
99+
// Add transcription configuration for qwen-asr-realtime
100+
if (transcriptionConfig != null) {
101+
Map<String, Object> transcriptionConfig = new HashMap<>();
102+
if (this.transcriptionConfig.getInputSampleRate() != null) {
103+
config.put(OmniRealtimeConstants.SAMPLE_RATE, this.transcriptionConfig.getInputSampleRate());
104+
}
105+
if (this.transcriptionConfig.getInputAudioFormat() != null) {
106+
config.put(OmniRealtimeConstants.INPUT_AUDIO_FORMAT, this.transcriptionConfig.getInputAudioFormat());
107+
}
108+
if (this.transcriptionConfig.getLanguage() != null) {
109+
transcriptionConfig.put(OmniRealtimeConstants.LANGUAGE, this.transcriptionConfig.getLanguage());
110+
}
111+
if (this.transcriptionConfig.getCorpus() != null) {
112+
transcriptionConfig.put(OmniRealtimeConstants.INPUT_AUDIO_TRANSCRIPTION_CORPUS, this.transcriptionConfig.getCorpus());
113+
}
114+
config.put(OmniRealtimeConstants.INPUT_AUDIO_TRANSCRIPTION, transcriptionConfig);
115+
}
85116
if (parameters != null) {
86117
for (Map.Entry<String, Object> entry : parameters.entrySet()) {
87118
config.put(entry.getKey(), entry.getValue());
@@ -93,4 +124,4 @@ public JsonObject getConfig() {
93124
JsonObject jsonObject = gson.toJsonTree(config).getAsJsonObject();
94125
return jsonObject;
95126
}
96-
}
127+
}

src/main/java/com/alibaba/dashscope/audio/omni/OmniRealtimeConstants.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,18 @@ public class OmniRealtimeConstants {
1010
public static final String OUTPUT_AUDIO_FORMAT = "output_audio_format";
1111
public static final String INPUT_AUDIO_TRANSCRIPTION = "input_audio_transcription";
1212
public static final String INPUT_AUDIO_TRANSCRIPTION_MODEL = "model";
13+
public static final String INPUT_AUDIO_TRANSCRIPTION_CORPUS = "corpus";
1314
public static final String TURN_DETECTION = "turn_detection";
1415
public static final String TURN_DETECTION_TYPE = "type";
1516
public static final String TURN_DETECTION_THRESHOLD = "threshold";
1617
public static final String PREFIX_PADDING_MS = "prefix_padding_ms";
1718
public static final String SILENCE_DURATION_MS = "silence_duration_ms";
1819

20+
// Translation constants
21+
public static final String TRANSLATION = "translation";
22+
public static final String LANGUAGE = "language";
23+
public static final String SAMPLE_RATE = "sample_rate";
24+
1925
public static final String PROTOCOL_EVENT_ID = "event_id";
2026
public static final String PROTOCOL_TYPE = "type";
2127
public static final String PROTOCOL_SESSION = "session";
@@ -34,4 +40,4 @@ public class OmniRealtimeConstants {
3440
"response.audio_transcript.delta";
3541
public static final String PROTOCOL_RESPONSE_TYPE_AUDIO_DELTA = "response.audio.delta";
3642
public static final String PROTOCOL_RESPONSE_TYPE_RESPONSE_DONE = "response.done";
37-
}
43+
}

src/main/java/com/alibaba/dashscope/audio/omni/OmniRealtimeConversation.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ public void appendAudio(String audioBase64) {
107107
OmniRealtimeConstants.PROTOCOL_TYPE,
108108
OmniRealtimeConstants.PROTOCOL_EVENT_TYPE_APPEND_AUDIO);
109109
append_request.put(OmniRealtimeConstants.PROTOCOL_AUDIO, audioBase64);
110-
log.debug("append audio with eid: " + event_id + ", length: " + audioBase64.length());
110+
log.info("append audio with eid: {}, length: {}", event_id, audioBase64.length());
111111
GsonBuilder builder = new GsonBuilder();
112112
builder.serializeNulls();
113113
Gson gson = builder.create();
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
package com.alibaba.dashscope.audio.omni;
2+
3+
import lombok.Data;
4+
5+
import java.util.HashMap;
6+
import java.util.Map;
7+
8+
/**
 * Input transcription parameters for the omni realtime API (used by qwen-asr-realtime).
 *
 * <p>Carries the input audio description (sample rate, format, language) and an optional
 * biasing corpus. Getters/setters for the fields are generated by Lombok's {@code @Data};
 * {@link #setCorpusText(String)} is defined manually so Lombok does not generate it.
 *
 * <p>NOTE(review): calling the Lombok-generated {@code setCorpus(Map)} directly does not
 * update {@code corpusText} — presumably callers are expected to use
 * {@code setCorpusText} only; confirm intended usage.
 *
 * @author songsong.shao
 */
@Data
public class OmniRealtimeTranscriptionParam {
    /** Input audio sample rate in Hz (e.g. 16000); null means unset. */
    private Integer inputSampleRate = null;
    /** Input audio format (e.g. "pcm"); null means unset. */
    private String inputAudioFormat = null;
    /** Input audio language code (e.g. "zh"); null means unset. */
    private String language = null;

    /** Corpus payload for qwen-asr-realtime; populated lazily by {@link #setCorpusText(String)}. */
    private Map<String, Object> corpus = null;

    /** Text content for the corpus; kept in sync with the "text" entry of {@link #corpus}. */
    private String corpusText;

    /**
     * Set text in corpus to improve model recognition accuracy.
     * Lazily creates the corpus map and stores the text under the "text" key.
     */
    public void setCorpusText(String text) {
        if (corpus == null) {
            corpus = new HashMap<>();
        }
        this.corpusText = text;
        corpus.put("text", text);
    }

    /**
     * Default constructor
     */
    public OmniRealtimeTranscriptionParam() {
    }

    /** Convenience constructor setting the input audio format and sample rate only. */
    public OmniRealtimeTranscriptionParam(String audioFormat, int sampleRate) {
        this.inputAudioFormat = audioFormat;
        this.inputSampleRate = sampleRate;
    }
}

0 commit comments

Comments
 (0)