Skip to content

Commit 4c29d4b

Browse files
songguocola authored and kevinlin09 committed
[Add] Multimodal Dialog API
1 parent 696688d commit 4c29d4b

File tree

9 files changed

+1441
-1
lines changed

9 files changed

+1441
-1
lines changed

samples/MultiModalDialogUsage.java

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
import com.alibaba.dashscope.multimodal.MultiModalDialog;
2+
import com.alibaba.dashscope.multimodal.State;
3+
import com.alibaba.dashscope.multimodal.MultiModalDialogCallback;
4+
import com.alibaba.dashscope.multimodal.MultiModalRequestParam;
5+
import com.alibaba.dashscope.utils.JsonUtils;
6+
import com.google.gson.Gson;
7+
import com.google.gson.JsonArray;
8+
import com.google.gson.JsonElement;
9+
import com.google.gson.JsonObject;
10+
import lombok.extern.slf4j.Slf4j;
11+
12+
import java.io.File;
13+
import java.io.FileInputStream;
14+
import java.io.IOException;
15+
import java.nio.ByteBuffer;
16+
import java.util.ArrayList;
17+
import java.util.Base64;
18+
import java.util.List;
19+
import static java.lang.Thread.sleep;
20+
/**
21+
* @author songsong.shao
22+
* @date 2025/4/28
23+
*/
24+
@Slf4j
25+
class MultiModalDialogUsage {
26+
static State.DialogState currentState;
27+
static MultiModalDialog conversation;
28+
static int enterListeningTimes = 0;
29+
static boolean vqaUseUrl = true;
30+
private final String workSpaceId = "";
31+
private final String appId = "";
32+
private final String modelName = "multimodal-dialog";
33+
34+
void testMultimodalVQA() {
35+
/*
36+
step1. 发送”看看前面有什么东西“,onRespondingContent 返回visual_qa 指令
37+
step2. 发送图片列表
38+
step3. 返回图片的对话结果
39+
*/
40+
System.out.println("############ Start Test VQA ############");
41+
vqaUseUrl = true;
42+
MultiModalRequestParam params =
43+
MultiModalRequestParam.builder()
44+
.customInput(
45+
MultiModalRequestParam.CustomInput.builder()
46+
.workspaceId(workSpaceId)
47+
.appId(appId)
48+
.build())
49+
.upStream(
50+
MultiModalRequestParam.UpStream.builder()
51+
.mode("push2talk")
52+
.audioFormat("pcm")
53+
.build())
54+
.downStream(
55+
MultiModalRequestParam.DownStream.builder()
56+
.voice("longxiaochun_v2")
57+
.sampleRate(48000)
58+
.build())
59+
.clientInfo(
60+
MultiModalRequestParam.ClientInfo.builder()
61+
.userId("1234")
62+
.device(MultiModalRequestParam.ClientInfo.Device.builder().uuid("device_1234").build())
63+
.build())
64+
.model(modelName)
65+
.apiKey("your-api-key")
66+
.build();
67+
log.debug("params: {}", JsonUtils.toJson(params));
68+
conversation = new MultiModalDialog(params, getCallback());
69+
conversation.start();
70+
while (currentState != State.DialogState.LISTENING) {
71+
try {
72+
sleep(100);
73+
} catch (InterruptedException e) {
74+
throw new RuntimeException(e);
75+
}
76+
}
77+
// 模拟语音请求
78+
conversation.requestToRespond("prompt","拍照看看前面有什么东西",null);
79+
80+
// 增加交互流程等待
81+
while (enterListeningTimes < 3) {
82+
try {
83+
sleep(2000);
84+
} catch (InterruptedException e) {
85+
throw new RuntimeException(e);
86+
}
87+
}
88+
conversation.stop();
89+
try {
90+
sleep(1000);
91+
} catch (InterruptedException e) {
92+
throw new RuntimeException(e);
93+
}
94+
System.out.println("############ End Test VQA ############");
95+
}
96+
97+
98+
99+
public static void main(String[] args) {
100+
MultiModalDialogUsage multiModalDialogUsage = new MultiModalDialogUsage();
101+
multiModalDialogUsage.testMultimodalVQA();
102+
}
103+
104+
public static MultiModalDialogCallback getCallback() {
105+
return new MultiModalDialogCallbackImpl();
106+
}
107+
public static class MultiModalDialogCallbackImpl extends MultiModalDialogCallback {
108+
@Override
109+
public void onConnected() {}
110+
@Override
111+
public void onStarted(String dialogId) {
112+
log.info("onStarted: {}", dialogId);
113+
}
114+
@Override
115+
public void onStopped(String dialogId) {
116+
log.info("onStopped: {}", dialogId);
117+
}
118+
@Override
119+
public void onSpeechStarted(String dialogId) {
120+
log.info("onSpeechStarted: {}", dialogId);
121+
}
122+
@Override
123+
public void onSpeechEnded(String dialogId) {
124+
log.info("onSpeechEnded: {}", dialogId);
125+
}
126+
@Override
127+
public void onError(String dialogId, String errorCode, String errorMsg) {
128+
log.error("onError: {}, {}, {}", dialogId, errorCode, errorMsg);
129+
enterListeningTimes++ ; //force quit dialog test
130+
}
131+
@Override
132+
public void onStateChanged(State.DialogState state) {
133+
log.info("onStateChanged: {}", state);
134+
currentState = state;
135+
if (currentState == State.DialogState.LISTENING) {
136+
enterListeningTimes++;
137+
log.info("enterListeningTimes: {}", enterListeningTimes);
138+
}
139+
}
140+
@Override
141+
public void onSpeechAudioData(ByteBuffer audioData) {
142+
//write audio data to file
143+
//or redirect to audio player
144+
}
145+
@Override
146+
public void onRespondingStarted(String dialogId) {
147+
log.info("onRespondingStarted: {}", dialogId);
148+
conversation.localRespondingStarted();
149+
}
150+
151+
@Override
152+
public void onRespondingEnded(String dialogId, JsonObject content) {
153+
log.info("onRespondingEnded: {}", dialogId);
154+
conversation.localRespondingEnded();
155+
}
156+
157+
158+
@Override
159+
public void onRespondingContent(String dialogId, JsonObject content) {
160+
log.info("onRespondingContent: {}, {}", dialogId, content);
161+
if (content.has("extra_info")) {
162+
JsonObject extraInfo = content.getAsJsonObject("extra_info");
163+
if (extraInfo.has("commands")) {
164+
String commandsStr = extraInfo.get("commands").getAsString();
165+
log.info("commandsStr: {}", commandsStr);
166+
//"[{\"name\":\"visual_qa\",\"params\":[{\"name\":\"shot\",\"value\":\"拍照看看\",\"normValue\":\"True\"}]}]"
167+
JsonArray commands = new Gson().fromJson(commandsStr, JsonArray.class);
168+
for (JsonElement command : commands) {
169+
JsonObject commandObj = command.getAsJsonObject();
170+
if (commandObj.has("name")) {
171+
String commandStr = commandObj.get("name").getAsString();
172+
if (commandStr.equals("visual_qa")) {
173+
log.info("拍照了!!!!");
174+
MultiModalRequestParam.UpdateParams updateParams = MultiModalRequestParam.UpdateParams.builder()
175+
.images(getMockOSSImage())
176+
.build();
177+
conversation.requestToRespond("prompt","",updateParams);
178+
}
179+
}
180+
}
181+
}
182+
}
183+
}
184+
@Override
185+
public void onSpeechContent(String dialogId, JsonObject content) {
186+
log.info("onSpeechContent: {}, {}", dialogId, content);
187+
}
188+
@Override
189+
public void onRequestAccepted(String dialogId) {
190+
log.info("onRequestAccepted: {}", dialogId);
191+
}
192+
@Override
193+
public void onClosed() {
194+
log.info("onClosed");
195+
enterListeningTimes++ ;
196+
}
197+
}
198+
public static List<Object> getMockOSSImage() {
199+
JsonObject imageObject = new JsonObject();
200+
JsonObject extraObject = new JsonObject();
201+
List<Object> images = new ArrayList<>();
202+
try{
203+
if (vqaUseUrl){
204+
imageObject.addProperty("type", "url");
205+
imageObject.addProperty("value", "https://help-static-aliyun-doc.aliyuncs.com/assets/img/zh-CN/7043267371/p909896.png");
206+
imageObject.add("extra", extraObject);
207+
}else {
208+
imageObject.addProperty("type", "base64");
209+
imageObject.addProperty("value", getLocalImageBase64());
210+
}
211+
images.add(imageObject);
212+
}catch (Exception e){
213+
e.printStackTrace();
214+
}
215+
return images;
216+
}
217+
public static String getLocalImageBase64() {
218+
// 图片文件路径
219+
String imagePath = "./**/your-demo.jpg";
220+
try {
221+
FileInputStream fileInputStream = new FileInputStream(new File(imagePath));
222+
byte[] bytes = new byte[fileInputStream.available()];
223+
fileInputStream.read(bytes);
224+
fileInputStream.close();
225+
return Base64.getEncoder().encodeToString(bytes);
226+
} catch (IOException e) {
227+
e.printStackTrace();
228+
}
229+
return null;
230+
}
231+
}

src/main/java/com/alibaba/dashscope/base/FullDuplexParamBase.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,15 @@ public String getModel() {
6969
*/
7070
public abstract Flowable<Object> getStreamingData();
7171

72+
/**
73+
* Custom payload.inputs
74+
*
75+
* @return The key/value parameters
76+
*/
77+
public Map<String, Object> getInputs() {
78+
return null;
79+
}
80+
7281
public void setModel(String model) {
7382
this.model = model;
7483
parameters.put("model", model);

0 commit comments

Comments
 (0)