1212#include < string>
1313#include < unordered_map>
1414#include < vector>
15+ #include < fstream>
1516
1617#include < executorch/extension/llm/runner/image.h>
1718#include < executorch/extension/llm/runner/irunner.h>
4142
4243namespace llm = ::executorch::extension::llm;
4344using ::executorch::runtime::Error;
45+ using executorch::extension::Module;
4446
4547namespace {
4648bool utf8_check_validity (const char * str, size_t length) {
@@ -285,6 +287,101 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
285287 return 0 ;
286288 }
287289
290+ llm::MultimodalInput processRawAudioFile (
291+ const std::string& audio_path,
292+ const std::string& processor_path) {
293+ if (processor_path.empty ()) {
294+ ET_LOG (Error, " Processor path is required for raw audio processing" );
295+ throw std::runtime_error (
296+ " Processor path is required for raw audio processing" );
297+ }
298+
299+ // Load the audio processor .pte.
300+ std::unique_ptr<Module> processor_module;
301+ try {
302+ processor_module =
303+ std::make_unique<Module>(processor_path, Module::LoadMode::File);
304+ auto load_error = processor_module->load ();
305+ if (load_error != ::executorch::runtime::Error::Ok) {
306+ ET_LOG (
307+ Error,
308+ " Failed to load processor module from: %s" ,
309+ processor_path.c_str ());
310+ throw std::runtime_error (" Failed to load processor module" );
311+ }
312+ } catch (const std::exception& e) {
313+ ET_LOG (Error, " Exception while loading processor module: %s" , e.what ());
314+ throw std::runtime_error (" Exception while loading processor module" );
315+ }
316+
317+ // Load the audio data from file.
318+ std::ifstream f (audio_path, std::ios::binary | std::ios::ate);
319+ if (!f.is_open ()) {
320+ ET_LOG (Error, " Failed to open audio file: %s" , audio_path.c_str ());
321+ throw std::runtime_error (" Failed to open audio file" );
322+ }
323+
324+ std::size_t n_floats = f.tellg () / sizeof (float );
325+ f.seekg (0 , std::ios::beg);
326+
327+ std::vector<float > audio_data (n_floats);
328+ f.read (
329+ reinterpret_cast <char *>(audio_data.data ()),
330+ audio_data.size () * sizeof (float ));
331+ f.close ();
332+
333+ ET_LOG (
334+ Info, " Loaded .bin file: %s, %zu floats" , audio_path.c_str (), n_floats);
335+
336+ // Execute the processor
337+ std::vector<executorch::aten::SizesType> tensor_shape = {
338+ static_cast <executorch::aten::SizesType>(audio_data.size ())};
339+ auto input_tensor = executorch::extension::from_blob (
340+ audio_data.data (), tensor_shape, ::executorch::aten::ScalarType::Float);
341+
342+ ET_LOG (Info, " Processing audio through processor module..." );
343+ auto result = processor_module->execute (" forward" , input_tensor);
344+ if (!result.ok ()) {
345+ ET_LOG (Error, " Failed to execute processor's forward method" );
346+ throw std::runtime_error (" Failed to execute processor forward method" );
347+ }
348+
349+ auto outputs = result.get ();
350+ if (outputs.empty ()) {
351+ ET_LOG (Error, " Processor returned no outputs" );
352+ throw std::runtime_error (" Processor returned no outputs" );
353+ }
354+
355+ // Extract processed audio features
356+ const auto & processed_tensor = outputs[0 ].toTensor ();
357+ const float * processed_data = processed_tensor.const_data_ptr <float >();
358+ const auto & sizes = processed_tensor.sizes ();
359+
360+ ET_LOG (
361+ Info,
362+ " Processed audio tensor shape: [%d, %d, %d]" ,
363+ static_cast <int >(sizes[0 ]),
364+ static_cast <int >(sizes[1 ]),
365+ static_cast <int >(sizes[2 ]));
366+
367+ // Create Audio multimodal input from processed features
368+ int32_t batch_size = static_cast <int32_t >(sizes[0 ]);
369+ int32_t n_bins = static_cast <int32_t >(sizes[1 ]);
370+ int32_t n_frames = static_cast <int32_t >(sizes[2 ]);
371+ size_t total_elements = batch_size * n_bins * n_frames;
372+ std::vector<float > audio_vec (processed_data, processed_data + total_elements);
373+ auto processed_audio = ::executorch::extension::llm::Audio (
374+ std::move (audio_vec), batch_size, n_bins, n_frames);
375+ ET_LOG (
376+ Info,
377+ " Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d" ,
378+ batch_size,
379+ n_bins,
380+ n_frames);
381+ return ::executorch::extension::llm::make_audio_input (
382+ std::move (processed_audio));
383+ }
384+
288385 jint prefill_audio_input (
289386 facebook::jni::alias_ref<jbyteArray> audio,
290387 jint batch_size,
@@ -306,7 +403,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
306403 }
307404 llm::Audio audio_input{std::move (audio_data), batch_size, n_bins, n_frames};
308405 multi_modal_runner_->prefill (
309- {llm::MultimodalInput{ std::move (audio_input)} });
406+ {processRawAudioFile ( " /data/local/tmp/llama/audio.bin " , " /data/local/tmp/llama/voxtral_preprocessor.pte " ) });
310407 }
311408 return 0 ;
312409 }
0 commit comments