@@ -31,6 +31,7 @@ import com.google.ai.edge.gallery.data.MAX_IMAGE_COUNT
 import com.google.ai.edge.gallery.data.Model
 import com.google.ai.edge.gallery.data.Task
 import com.google.mediapipe.framework.image.BitmapImageBuilder
+import com.google.mediapipe.tasks.genai.llminference.AudioModelOptions
 import com.google.mediapipe.tasks.genai.llminference.GraphOptions
 import com.google.mediapipe.tasks.genai.llminference.LlmInference
 import com.google.mediapipe.tasks.genai.llminference.LlmInferenceSession
@@ -73,6 +74,9 @@ object LlmChatModelHelper {
         .setMaxTokens(maxTokens)
         .setPreferredBackend(preferredBackend)
         .setMaxNumImages(if (shouldEnableImage) MAX_IMAGE_COUNT else 0)
+    if (shouldEnableAudio) {
+      optionsBuilder.setAudioModelOptions(AudioModelOptions.builder().build())
+    }
     val options = optionsBuilder.build()

7882 // Create an instance of the LLM Inference task and session.
@@ -89,6 +93,7 @@ object LlmChatModelHelper {
           .setGraphOptions(
             GraphOptions.builder()
               .setEnableVisionModality(shouldEnableImage)
+              .setEnableAudioModality(shouldEnableAudio)
               .build()
           )
           .build(),
@@ -127,6 +132,7 @@ object LlmChatModelHelper {
           .setGraphOptions(
             GraphOptions.builder()
               .setEnableVisionModality(shouldEnableImage)
+              .setEnableAudioModality(shouldEnableAudio)
               .build()
           )
           .build(),
@@ -194,8 +200,7 @@ object LlmChatModelHelper {
       session.addImage(BitmapImageBuilder(image).build())
     }
     for (audioClip in audioClips) {
-      // Uncomment when audio is supported.
-      // session.addAudio(audioClip)
+      session.addAudio(audioClip)
     }
     val unused = session.generateResponseAsync(resultListener)
   }
0 commit comments