@@ -273,6 +273,9 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
273273 jint width,
274274 jint height,
275275 jint channels) {
276+ if (image == nullptr ) {
277+ return Error::InvalidArgument;
278+ }
276279 std::vector<llm::Image> images;
277280 auto image_size = image->size ();
278281 if (image_size != 0 ) {
@@ -290,6 +293,29 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
290293 return 0 ;
291294 }
292295
296+ jint append_audio_input (
297+ facebook::jni::alias_ref<jintArray> audio,
298+ jint batch_size,
299+ jint n_channels,
300+ jint n_samples) {
301+ if (audio == nullptr ) {
302+ return Error::InvalidArgument;
303+ }
304+ auto audio_size = audio->size ();
305+ std::vector<uint8_t > audio_data (audio_size);
306+ if (audio_size != 0 ) {
307+ std::vector<jint> audio_data_jint (audio_size);
308+ audio->getRegion (0 , audio_size, audio_data_jint.data ());
309+ for (int i = 0 ; i < audio_size; i++) {
310+ audio_data[i] = audio_data_jint[i];
311+ }
312+ llm::RawAudio audio_input{audio_data, batch_size, n_channels, n_samples};
313+ prefill_inputs_.emplace_back (
314+ llm::MultimodalInput{std::move (audio_input)});
315+ }
316+ return 0 ;
317+ }
318+
293319 void stop () {
294320 if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) {
295321 multi_modal_runner_->stop ();
@@ -321,6 +347,8 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
321347 " appendImagesInput" , ExecuTorchLlmJni::append_images_input),
322348 makeNativeMethod (
323349 " appendTextInput" , ExecuTorchLlmJni::append_text_input),
350+ makeNativeMethod (
351+ " appendAudioInput" , ExecuTorchLlmJni::append_audio_input),
324352 makeNativeMethod (" resetContext" , ExecuTorchLlmJni::reset_context),
325353 });
326354 }
0 commit comments