import com.alibaba.dashscope.multimodal.MultiModalDialog;
import com.alibaba.dashscope.multimodal.MultiModalDialogCallback;
import com.alibaba.dashscope.multimodal.MultiModalRequestParam;
import com.alibaba.dashscope.multimodal.State;
import com.alibaba.dashscope.utils.JsonUtils;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import lombok.extern.slf4j.Slf4j;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.InvalidPathException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Base64;
import java.util.List;

import static java.lang.Thread.sleep;
20+ /**
21+ * @author songsong.shao
22+ * @date 2025/4/28
23+ */
24+ @ Slf4j
25+ class MultiModalDialogUsage {
26+ static State .DialogState currentState ;
27+ static MultiModalDialog conversation ;
28+ static int enterListeningTimes = 0 ;
29+ static boolean vqaUseUrl = true ;
30+ private final String workSpaceId = "" ;
31+ private final String appId = "" ;
32+ private final String modelName = "multimodal-dialog" ;
33+
34+ void testMultimodalVQA () {
35+ /*
36+ step1. 发送”看看前面有什么东西“,onRespondingContent 返回visual_qa 指令
37+ step2. 发送图片列表
38+ step3. 返回图片的对话结果
39+ */
40+ System .out .println ("############ Start Test VQA ############" );
41+ vqaUseUrl = true ;
42+ MultiModalRequestParam params =
43+ MultiModalRequestParam .builder ()
44+ .customInput (
45+ MultiModalRequestParam .CustomInput .builder ()
46+ .workspaceId (workSpaceId )
47+ .appId (appId )
48+ .build ())
49+ .upStream (
50+ MultiModalRequestParam .UpStream .builder ()
51+ .mode ("push2talk" )
52+ .audioFormat ("pcm" )
53+ .build ())
54+ .downStream (
55+ MultiModalRequestParam .DownStream .builder ()
56+ .voice ("longxiaochun_v2" )
57+ .sampleRate (48000 )
58+ .build ())
59+ .clientInfo (
60+ MultiModalRequestParam .ClientInfo .builder ()
61+ .userId ("1234" )
62+ .device (MultiModalRequestParam .ClientInfo .Device .builder ().uuid ("device_1234" ).build ())
63+ .build ())
64+ .model (modelName )
65+ .apiKey ("your-api-key" )
66+ .build ();
67+ log .debug ("params: {}" , JsonUtils .toJson (params ));
68+ conversation = new MultiModalDialog (params , getCallback ());
69+ conversation .start ();
70+ while (currentState != State .DialogState .LISTENING ) {
71+ try {
72+ sleep (100 );
73+ } catch (InterruptedException e ) {
74+ throw new RuntimeException (e );
75+ }
76+ }
77+ // 模拟语音请求
78+ conversation .requestToRespond ("prompt" ,"拍照看看前面有什么东西" ,null );
79+
80+ // 增加交互流程等待
81+ while (enterListeningTimes < 3 ) {
82+ try {
83+ sleep (2000 );
84+ } catch (InterruptedException e ) {
85+ throw new RuntimeException (e );
86+ }
87+ }
88+ conversation .stop ();
89+ try {
90+ sleep (1000 );
91+ } catch (InterruptedException e ) {
92+ throw new RuntimeException (e );
93+ }
94+ System .out .println ("############ End Test VQA ############" );
95+ }
96+
97+
98+
99+ public static void main (String [] args ) {
100+ MultiModalDialogUsage multiModalDialogUsage = new MultiModalDialogUsage ();
101+ multiModalDialogUsage .testMultimodalVQA ();
102+ }
103+
104+ public static MultiModalDialogCallback getCallback () {
105+ return new MultiModalDialogCallbackImpl ();
106+ }
107+ public static class MultiModalDialogCallbackImpl extends MultiModalDialogCallback {
108+ @ Override
109+ public void onConnected () {}
110+ @ Override
111+ public void onStarted (String dialogId ) {
112+ log .info ("onStarted: {}" , dialogId );
113+ }
114+ @ Override
115+ public void onStopped (String dialogId ) {
116+ log .info ("onStopped: {}" , dialogId );
117+ }
118+ @ Override
119+ public void onSpeechStarted (String dialogId ) {
120+ log .info ("onSpeechStarted: {}" , dialogId );
121+ }
122+ @ Override
123+ public void onSpeechEnded (String dialogId ) {
124+ log .info ("onSpeechEnded: {}" , dialogId );
125+ }
126+ @ Override
127+ public void onError (String dialogId , String errorCode , String errorMsg ) {
128+ log .error ("onError: {}, {}, {}" , dialogId , errorCode , errorMsg );
129+ enterListeningTimes ++ ; //force quit dialog test
130+ }
131+ @ Override
132+ public void onStateChanged (State .DialogState state ) {
133+ log .info ("onStateChanged: {}" , state );
134+ currentState = state ;
135+ if (currentState == State .DialogState .LISTENING ) {
136+ enterListeningTimes ++;
137+ log .info ("enterListeningTimes: {}" , enterListeningTimes );
138+ }
139+ }
140+ @ Override
141+ public void onSpeechAudioData (ByteBuffer audioData ) {
142+ //write audio data to file
143+ //or redirect to audio player
144+ }
145+ @ Override
146+ public void onRespondingStarted (String dialogId ) {
147+ log .info ("onRespondingStarted: {}" , dialogId );
148+ conversation .localRespondingStarted ();
149+ }
150+
151+ @ Override
152+ public void onRespondingEnded (String dialogId , JsonObject content ) {
153+ log .info ("onRespondingEnded: {}" , dialogId );
154+ conversation .localRespondingEnded ();
155+ }
156+
157+
158+ @ Override
159+ public void onRespondingContent (String dialogId , JsonObject content ) {
160+ log .info ("onRespondingContent: {}, {}" , dialogId , content );
161+ if (content .has ("extra_info" )) {
162+ JsonObject extraInfo = content .getAsJsonObject ("extra_info" );
163+ if (extraInfo .has ("commands" )) {
164+ String commandsStr = extraInfo .get ("commands" ).getAsString ();
165+ log .info ("commandsStr: {}" , commandsStr );
166+ //"[{\"name\":\"visual_qa\",\"params\":[{\"name\":\"shot\",\"value\":\"拍照看看\",\"normValue\":\"True\"}]}]"
167+ JsonArray commands = new Gson ().fromJson (commandsStr , JsonArray .class );
168+ for (JsonElement command : commands ) {
169+ JsonObject commandObj = command .getAsJsonObject ();
170+ if (commandObj .has ("name" )) {
171+ String commandStr = commandObj .get ("name" ).getAsString ();
172+ if (commandStr .equals ("visual_qa" )) {
173+ log .info ("拍照了!!!!" );
174+ MultiModalRequestParam .UpdateParams updateParams = MultiModalRequestParam .UpdateParams .builder ()
175+ .images (getMockOSSImage ())
176+ .build ();
177+ conversation .requestToRespond ("prompt" ,"" ,updateParams );
178+ }
179+ }
180+ }
181+ }
182+ }
183+ }
184+ @ Override
185+ public void onSpeechContent (String dialogId , JsonObject content ) {
186+ log .info ("onSpeechContent: {}, {}" , dialogId , content );
187+ }
188+ @ Override
189+ public void onRequestAccepted (String dialogId ) {
190+ log .info ("onRequestAccepted: {}" , dialogId );
191+ }
192+ @ Override
193+ public void onClosed () {
194+ log .info ("onClosed" );
195+ enterListeningTimes ++ ;
196+ }
197+ }
198+ public static List <Object > getMockOSSImage () {
199+ JsonObject imageObject = new JsonObject ();
200+ JsonObject extraObject = new JsonObject ();
201+ List <Object > images = new ArrayList <>();
202+ try {
203+ if (vqaUseUrl ){
204+ imageObject .addProperty ("type" , "url" );
205+ imageObject .addProperty ("value" , "https://help-static-aliyun-doc.aliyuncs.com/assets/img/zh-CN/7043267371/p909896.png" );
206+ imageObject .add ("extra" , extraObject );
207+ }else {
208+ imageObject .addProperty ("type" , "base64" );
209+ imageObject .addProperty ("value" , getLocalImageBase64 ());
210+ }
211+ images .add (imageObject );
212+ }catch (Exception e ){
213+ e .printStackTrace ();
214+ }
215+ return images ;
216+ }
217+ public static String getLocalImageBase64 () {
218+ // 图片文件路径
219+ String imagePath = "./**/your-demo.jpg" ;
220+ try {
221+ FileInputStream fileInputStream = new FileInputStream (new File (imagePath ));
222+ byte [] bytes = new byte [fileInputStream .available ()];
223+ fileInputStream .read (bytes );
224+ fileInputStream .close ();
225+ return Base64 .getEncoder ().encodeToString (bytes );
226+ } catch (IOException e ) {
227+ e .printStackTrace ();
228+ }
229+ return null ;
230+ }
231+ }
0 commit comments