@@ -273,6 +273,9 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
273
273
jint width,
274
274
jint height,
275
275
jint channels) {
276
+ if (image == nullptr ) {
277
+ return Error::InvalidArgument;
278
+ }
276
279
std::vector<llm::Image> images;
277
280
auto image_size = image->size ();
278
281
if (image_size != 0 ) {
@@ -290,6 +293,29 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
290
293
return 0 ;
291
294
}
292
295
296
+ jint append_audio_input (
297
+ facebook::jni::alias_ref<jintArray> audio,
298
+ jint batch_size,
299
+ jint n_channels,
300
+ jint n_samples) {
301
+ if (audio == nullptr ) {
302
+ return Error::InvalidArgument;
303
+ }
304
+ auto audio_size = audio->size ();
305
+ std::vector<uint8_t > audio_data (audio_size);
306
+ if (audio_size != 0 ) {
307
+ std::vector<jint> audio_data_jint (audio_size);
308
+ audio->getRegion (0 , audio_size, audio_data_jint.data ());
309
+ for (int i = 0 ; i < audio_size; i++) {
310
+ audio_data[i] = audio_data_jint[i];
311
+ }
312
+ llm::RawAudio audio_input{audio_data, batch_size, n_channels, n_samples};
313
+ prefill_inputs_.emplace_back (
314
+ llm::MultimodalInput{std::move (audio_input)});
315
+ }
316
+ return 0 ;
317
+ }
318
+
293
319
void stop () {
294
320
if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) {
295
321
multi_modal_runner_->stop ();
@@ -321,6 +347,8 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
321
347
" appendImagesInput" , ExecuTorchLlmJni::append_images_input),
322
348
makeNativeMethod (
323
349
" appendTextInput" , ExecuTorchLlmJni::append_text_input),
350
+ makeNativeMethod (
351
+ " appendAudioInput" , ExecuTorchLlmJni::append_audio_input),
324
352
makeNativeMethod (" resetContext" , ExecuTorchLlmJni::reset_context),
325
353
});
326
354
}
0 commit comments