Skip to content

Commit b67eba7

Browse files
authored
VertexAI: add audio input feature (#1653)
1 parent f202e72 commit b67eba7

File tree

11 files changed

+459
-1
lines changed

11 files changed

+459
-1
lines changed

vertexai/app/src/main/AndroidManifest.xml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
1717
xmlns:tools="http://schemas.android.com/tools">
1818

19+
<uses-permission android:name="android.permission.RECORD_AUDIO"/>
20+
1921
<application
2022
android:allowBackup="true"
2123
android:dataExtractionRules="@xml/data_extraction_rules"

vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/GenerativeAiViewModelFactory.kt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import androidx.lifecycle.ViewModel
2020
import androidx.lifecycle.ViewModelProvider
2121
import androidx.lifecycle.viewmodel.CreationExtras
2222
import com.google.firebase.Firebase
23+
import com.google.firebase.quickstart.vertexai.feature.audio.AudioViewModel
2324
import com.google.firebase.quickstart.vertexai.feature.chat.ChatViewModel
2425
import com.google.firebase.quickstart.vertexai.feature.functioncalling.FunctionsChatViewModel
2526
import com.google.firebase.quickstart.vertexai.feature.multimodal.PhotoReasoningViewModel
@@ -96,6 +97,15 @@ val GenerativeViewModelFactory = object : ViewModelProvider.Factory {
9697
FunctionsChatViewModel(generativeModel)
9798
}
9899

100+
isAssignableFrom(AudioViewModel::class.java) -> {
101+
// Initialize a GenerativeModel with the `gemini-1.5-pro-001` AI model for audio input
102+
val generativeModel = Firebase.vertexAI.generativeModel(
103+
modelName = "gemini-1.5-pro-001",
104+
generationConfig = config
105+
)
106+
AudioViewModel(generativeModel)
107+
}
108+
99109
else ->
100110
throw IllegalArgumentException("Unknown ViewModel class: ${viewModelClass.name}")
101111
}

vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/MainActivity.kt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import androidx.compose.ui.Modifier
2626
import androidx.navigation.compose.NavHost
2727
import androidx.navigation.compose.composable
2828
import androidx.navigation.compose.rememberNavController
29+
import com.google.firebase.quickstart.vertexai.feature.audio.AudioRoute
2930
import com.google.firebase.quickstart.vertexai.feature.chat.ChatRoute
3031
import com.google.firebase.quickstart.vertexai.feature.functioncalling.FunctionsChatRoute
3132
import com.google.firebase.quickstart.vertexai.feature.multimodal.PhotoReasoningRoute
@@ -64,6 +65,9 @@ class MainActivity : ComponentActivity() {
6465
composable("functions_chat") {
6566
FunctionsChatRoute()
6667
}
68+
composable("audio") {
69+
AudioRoute()
70+
}
6771
}
6872
}
6973
}

vertexai/app/src/main/kotlin/com/google/firebase/quickstart/vertexai/MenuScreen.kt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ fun MenuScreen(
4646
MenuItem("summarize", R.string.menu_summarize_title, R.string.menu_summarize_description),
4747
MenuItem("photo_reasoning", R.string.menu_reason_title, R.string.menu_reason_description),
4848
MenuItem("chat", R.string.menu_chat_title, R.string.menu_chat_description),
49-
MenuItem("functions_chat", R.string.menu_functions_title, R.string.menu_functions_description)
49+
MenuItem("functions_chat", R.string.menu_functions_title, R.string.menu_functions_description),
50+
MenuItem("audio", R.string.menu_audio_title, R.string.menu_audio_description)
5051
)
5152

5253
LazyColumn(
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/*
2+
* Copyright 2024 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.google.firebase.quickstart.vertexai.feature.audio
18+
19+
import android.content.Context
20+
import android.media.MediaRecorder
21+
import android.os.Build
22+
import java.io.File
23+
24+
/**
 * Records microphone audio into a temporary `.m4a` file (MPEG-4 container, AAC encoder)
 * inside the app cache directory and hands the result back as raw bytes.
 *
 * One recording session is tracked at a time: [startRecording] opens it and
 * [stopRecording] closes it. The class does no synchronization of its own.
 */
class AudioRecorder {
    private var recorder: MediaRecorder? = null
    private var outputFilePath: String? = null

    /**
     * Starts a new recording session.
     *
     * Any session that was started but never stopped is discarded first so that its
     * recorder and temp file are not leaked. If configuring or starting the recorder
     * fails, the recorder is released, the temp file is removed, and the original
     * exception is rethrown.
     *
     * @param context used to locate the cache directory and, on API 31+, to construct
     * the [MediaRecorder].
     */
    fun startRecording(context: Context) {
        discardCurrentSession()

        val outputFile = File.createTempFile(
            "recording_${System.currentTimeMillis()}", ".m4a", context.cacheDir
        )
        val newRecorder = if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) {
            MediaRecorder(context)
        } else {
            @Suppress("DEPRECATION")
            MediaRecorder()
        }

        var started = false
        try {
            newRecorder.apply {
                setAudioSource(MediaRecorder.AudioSource.MIC)
                setOutputFormat(MediaRecorder.OutputFormat.MPEG_4)
                setAudioEncoder(MediaRecorder.AudioEncoder.AAC)
                setOutputFile(outputFile.absolutePath)
                prepare()
                start()
            }
            started = true
        } finally {
            // Only clean up on failure; on success the session state below takes ownership.
            if (!started) {
                newRecorder.release()
                outputFile.delete()
            }
        }

        recorder = newRecorder
        outputFilePath = outputFile.absolutePath
    }

    /**
     * Stops the current session and returns the recorded audio.
     *
     * The recorder is always released and the temp file always deleted, even when
     * stopping or reading fails. The session state is cleared up front, so a second
     * call without a new [startRecording] fails with [IllegalStateException] rather
     * than reading a file that no longer exists.
     *
     * @return the bytes of the recorded `.m4a` file.
     * @throws IllegalStateException if no recording session is active.
     */
    fun stopRecording(): ByteArray {
        val activeRecorder = recorder
        val audioFile = outputFilePath?.let { File(it) }
        recorder = null
        outputFilePath = null

        try {
            try {
                activeRecorder?.stop()
            } finally {
                // Release native resources even if stop() throws.
                activeRecorder?.release()
            }
            checkNotNull(audioFile) { "Output file path not set" }
            return audioFile.readBytes()
        } finally {
            audioFile?.delete()
        }
    }

    /** Releases the recorder and deletes the temp file of a session that was never stopped. */
    private fun discardCurrentSession() {
        recorder?.release()
        recorder = null
        outputFilePath?.let { File(it).delete() }
        outputFilePath = null
    }
}
Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
/*
2+
* Copyright 2024 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.google.firebase.quickstart.vertexai.feature.audio
18+
19+
import android.Manifest
20+
import androidx.activity.compose.rememberLauncherForActivityResult
21+
import androidx.activity.result.contract.ActivityResultContracts
22+
import androidx.compose.foundation.layout.Box
23+
import androidx.compose.foundation.layout.Column
24+
import androidx.compose.foundation.layout.Row
25+
import androidx.compose.foundation.layout.fillMaxWidth
26+
import androidx.compose.foundation.layout.padding
27+
import androidx.compose.foundation.layout.requiredSize
28+
import androidx.compose.foundation.rememberScrollState
29+
import androidx.compose.foundation.verticalScroll
30+
import androidx.compose.material.icons.Icons
31+
import androidx.compose.material.icons.filled.Delete
32+
import androidx.compose.material.icons.outlined.Person
33+
import androidx.compose.material3.Card
34+
import androidx.compose.material3.CardDefaults
35+
import androidx.compose.material3.CircularProgressIndicator
36+
import androidx.compose.material3.Icon
37+
import androidx.compose.material3.IconButton
38+
import androidx.compose.material3.MaterialTheme
39+
import androidx.compose.material3.OutlinedTextField
40+
import androidx.compose.material3.Text
41+
import androidx.compose.material3.TextButton
42+
import androidx.compose.runtime.Composable
43+
import androidx.compose.runtime.collectAsState
44+
import androidx.compose.runtime.getValue
45+
import androidx.compose.runtime.mutableStateOf
46+
import androidx.compose.runtime.rememberCoroutineScope
47+
import androidx.compose.runtime.saveable.rememberSaveable
48+
import androidx.compose.runtime.setValue
49+
import androidx.compose.ui.Alignment
50+
import androidx.compose.ui.Modifier
51+
import androidx.compose.ui.draw.drawBehind
52+
import androidx.compose.ui.graphics.Color
53+
import androidx.compose.ui.graphics.vector.ImageVector
54+
import androidx.compose.ui.platform.LocalContext
55+
import androidx.compose.ui.res.stringResource
56+
import androidx.compose.ui.res.vectorResource
57+
import androidx.compose.ui.tooling.preview.Preview
58+
import androidx.compose.ui.unit.dp
59+
import androidx.core.content.ContextCompat
60+
import androidx.core.content.PermissionChecker.PERMISSION_GRANTED
61+
import androidx.lifecycle.viewmodel.compose.viewModel
62+
import com.google.firebase.quickstart.vertexai.GenerativeViewModelFactory
63+
import com.google.firebase.quickstart.vertexai.R
64+
import kotlinx.coroutines.launch
65+
66+
/**
 * Navigation entry point for the audio feature.
 *
 * Collects the UI state exposed by [viewModel] and renders [AudioScreen] with the
 * view model's recorder. When the screen submits a question and a clip, the request
 * is forwarded to the view model from a coroutine launched in a composition-scoped
 * coroutine scope.
 *
 * @param viewModel the backing view model; by default obtained through
 * [GenerativeViewModelFactory].
 */
@Composable
internal fun AudioRoute(
    viewModel: AudioViewModel = viewModel(factory = GenerativeViewModelFactory)
) {
    val screenState by viewModel.uiState.collectAsState()
    val scope = rememberCoroutineScope()

    AudioScreen(
        audioRecorder = viewModel.audioRecorder,
        uiState = screenState,
        onReasonClicked = { question, clip ->
            scope.launch { viewModel.reason(question, clip) }
        },
    )
}
81+
82+
/**
 * Screen that lets the user record an audio clip, type a question about it, and
 * submit both.
 *
 * Until the `RECORD_AUDIO` permission is granted only a "grant" button is shown.
 * Afterwards a single icon button cycles through three actions depending on state:
 * start recording (no clip yet), stop recording (while recording), and delete the
 * clip (a clip is held). Below the input row the current [uiState] is rendered as a
 * progress indicator, a result card, or an error card.
 *
 * @param audioRecorder recorder used to capture the clip.
 * @param uiState state to render below the input row.
 * @param onReasonClicked invoked with the typed question and the recorded bytes when
 * the "go" button is pressed while a clip is held; not invoked when there is no clip.
 */
@Composable
fun AudioScreen(
    audioRecorder: AudioRecorder = AudioRecorder(),
    uiState: AudioUiState = AudioUiState.Loading,
    onReasonClicked: (String, ByteArray) -> Unit = { _, _ -> },
) {
    val context = LocalContext.current

    var userQuestion by rememberSaveable { mutableStateOf("") }
    var recordGranted by rememberSaveable {
        mutableStateOf(
            ContextCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO) == PERMISSION_GRANTED
        )
    }
    var isRecording by rememberSaveable { mutableStateOf(false) }
    // Deliberately NOT rememberSaveable: a recording can be hundreds of kilobytes or more,
    // and writing it into the saved-instance-state Bundle risks TransactionTooLargeException.
    // The trade-off is that the clip is dropped when this composable's state is recreated.
    var audioData by androidx.compose.runtime.remember { mutableStateOf<ByteArray?>(null) }

    val launcher = rememberLauncherForActivityResult(ActivityResultContracts.RequestPermission()) { isGranted ->
        recordGranted = isGranted
    }

    Column(
        modifier = Modifier
            .padding(all = 16.dp)
            .verticalScroll(rememberScrollState())
    ) {
        Card(modifier = Modifier.fillMaxWidth()) {
            Row(modifier = Modifier.padding(vertical = 16.dp)) {
                if (!recordGranted) {
                    Box(
                        modifier = Modifier.fillMaxWidth(), contentAlignment = Alignment.Center
                    ) {
                        TextButton(onClick = { launcher.launch(Manifest.permission.RECORD_AUDIO) }) {
                            Text(stringResource(R.string.grant_record))
                        }
                    }
                } else {
                    IconButton(
                        onClick = {
                            when {
                                isRecording -> {
                                    // MediaRecorder.stop() throws when no valid audio was captured
                                    // (e.g. stop pressed right after start); treat that as "no clip"
                                    // instead of crashing the click handler.
                                    audioData = try {
                                        audioRecorder.stopRecording()
                                    } catch (e: Exception) {
                                        android.util.Log.w("AudioScreen", "Failed to stop recording", e)
                                        null
                                    }
                                    isRecording = false
                                }

                                audioData == null -> {
                                    // Only enter the recording state if the recorder really started.
                                    isRecording = try {
                                        audioRecorder.startRecording(context)
                                        true
                                    } catch (e: Exception) {
                                        android.util.Log.w("AudioScreen", "Failed to start recording", e)
                                        false
                                    }
                                }

                                else -> {
                                    audioData = null
                                }
                            }
                        },
                        modifier = Modifier
                            .padding(all = 4.dp)
                            .align(Alignment.CenterVertically),
                    ) {
                        Icon(
                            imageVector = when {
                                isRecording -> ImageVector.vectorResource(R.drawable.stop)
                                audioData == null -> ImageVector.vectorResource(R.drawable.mic)
                                else -> Icons.Filled.Delete
                            },
                            contentDescription = stringResource(
                                when {
                                    isRecording -> R.string.stop_recording
                                    audioData == null -> R.string.start_recording
                                    else -> R.string.delete_clip
                                }
                            ),
                        )
                    }
                    OutlinedTextField(
                        value = userQuestion,
                        label = { Text(stringResource(R.string.audio_label)) },
                        placeholder = { Text(stringResource(R.string.audio_hint)) },
                        onValueChange = { userQuestion = it },
                        modifier = Modifier.fillMaxWidth(0.8f),
                    )
                    TextButton(
                        onClick = {
                            // Safe call instead of `!!`: a delegated property cannot be smart-cast.
                            audioData?.let { clip -> onReasonClicked(userQuestion, clip) }
                        },
                        modifier = Modifier
                            .padding(all = 4.dp)
                            .align(Alignment.CenterVertically),
                    ) {
                        Text(
                            stringResource(R.string.action_go),
                            // The button is tinted differently when there is no clip to submit.
                            color = if (audioData == null) {
                                MaterialTheme.colorScheme.secondary
                            } else {
                                MaterialTheme.colorScheme.primary
                            }
                        )
                    }
                }
            }
        }
        when (uiState) {
            AudioUiState.Initial -> {
                // Nothing is shown
            }

            AudioUiState.Loading -> {
                Box(
                    contentAlignment = Alignment.Center,
                    modifier = Modifier
                        .padding(all = 8.dp)
                        .align(Alignment.CenterHorizontally),
                ) {
                    CircularProgressIndicator()
                }
            }

            is AudioUiState.Success -> {
                Card(
                    modifier = Modifier
                        .padding(vertical = 16.dp)
                        .fillMaxWidth(),
                    shape = MaterialTheme.shapes.large,
                    colors = CardDefaults.cardColors(containerColor = MaterialTheme.colorScheme.onSecondaryContainer),
                ) {
                    Row(
                        modifier = Modifier
                            .padding(all = 16.dp)
                            .fillMaxWidth()
                    ) {
                        Icon(
                            Icons.Outlined.Person,
                            contentDescription = "Person Icon",
                            tint = MaterialTheme.colorScheme.onSecondary,
                            modifier = Modifier
                                .requiredSize(36.dp)
                                .drawBehind { drawCircle(color = Color.White) },
                        )
                        Text(
                            text = uiState.outputText,
                            color = MaterialTheme.colorScheme.onSecondary,
                            modifier = Modifier
                                .padding(start = 16.dp)
                                .fillMaxWidth(),
                        )
                    }
                }
            }

            is AudioUiState.Error -> {
                Card(
                    modifier = Modifier
                        .padding(vertical = 16.dp)
                        .fillMaxWidth(),
                    shape = MaterialTheme.shapes.large,
                    colors = CardDefaults.cardColors(containerColor = MaterialTheme.colorScheme.errorContainer),
                ) {
                    Text(
                        text = uiState.errorMessage,
                        color = MaterialTheme.colorScheme.error,
                        modifier = Modifier.padding(all = 16.dp),
                    )
                }
            }
        }
    }
}
247+
248+
/**
 * Design-time preview of [AudioScreen] rendered with all of its default arguments,
 * shown with the system UI decorations enabled.
 */
@Preview(showSystemUi = true)
@Composable
fun AudioScreenPreview() = AudioScreen()

0 commit comments

Comments
 (0)