Skip to content

Commit f8b1c47

Browse files
committed
let me have a try
1 parent d020306 commit f8b1c47

File tree

2 files changed

+99
-2
lines changed

2 files changed

+99
-2
lines changed

backends/xnnpack/cmake/Dependencies.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ set(XNNPACK_ENABLE_AVX512VNNIGFNI
4343
CACHE BOOL ""
4444
)
4545
set(XNNPACK_ENABLE_ARM_SME2
46-
ON
46+
OFF
4747
CACHE BOOL ""
4848
)
4949
if(EXECUTORCH_XNNPACK_ENABLE_KLEIDI)

extension/android/jni/jni_layer_llama.cpp

Lines changed: 98 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <string>
1313
#include <unordered_map>
1414
#include <vector>
15+
#include <fstream>
1516

1617
#include <executorch/extension/llm/runner/image.h>
1718
#include <executorch/extension/llm/runner/irunner.h>
@@ -41,6 +42,7 @@
4142

4243
namespace llm = ::executorch::extension::llm;
4344
using ::executorch::runtime::Error;
45+
using executorch::extension::Module;
4446

4547
namespace {
4648
bool utf8_check_validity(const char* str, size_t length) {
@@ -285,6 +287,101 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
285287
return 0;
286288
}
287289

290+
/**
 * Runs a raw audio file through a preprocessor program and wraps the result
 * as an audio multimodal input.
 *
 * The file at `audio_path` is read byte-for-byte into a float buffer (no
 * header parsing or endianness conversion is performed), passed as a 1-D
 * Float tensor to the "forward" method of the module loaded from
 * `processor_path`, and the first output is copied into an `llm::Audio`.
 * The first output must be a rank-3 Float tensor; its dimensions are
 * interpreted, in order, as batch size, number of bins and number of frames.
 *
 * @param audio_path Path to the raw float audio file.
 * @param processor_path Path to the preprocessor .pte file; must be non-empty.
 * @return A MultimodalInput holding the processed audio features.
 * @throws std::runtime_error if the processor cannot be loaded or executed,
 *     the audio file cannot be opened or read, or the processor output does
 *     not have the expected type, dtype or rank.
 */
llm::MultimodalInput processRawAudioFile(
    const std::string& audio_path,
    const std::string& processor_path) {
  if (processor_path.empty()) {
    ET_LOG(Error, "Processor path is required for raw audio processing");
    throw std::runtime_error(
        "Processor path is required for raw audio processing");
  }

  // Load the audio processor .pte. Only construction and load() sit inside
  // the try block, so the specific load-failure error below is not swallowed
  // and replaced by the generic handler.
  std::unique_ptr<Module> processor_module;
  auto load_error = ::executorch::runtime::Error::Ok;
  try {
    processor_module =
        std::make_unique<Module>(processor_path, Module::LoadMode::File);
    load_error = processor_module->load();
  } catch (const std::exception& e) {
    ET_LOG(Error, "Exception while loading processor module: %s", e.what());
    throw std::runtime_error("Exception while loading processor module");
  }
  if (load_error != ::executorch::runtime::Error::Ok) {
    ET_LOG(
        Error,
        "Failed to load processor module from: %s",
        processor_path.c_str());
    throw std::runtime_error("Failed to load processor module");
  }

  // Load the audio data from file. Opening at the end lets tellg() report the
  // file size.
  std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
  if (!f.is_open()) {
    ET_LOG(Error, "Failed to open audio file: %s", audio_path.c_str());
    throw std::runtime_error("Failed to open audio file");
  }

  // tellg() reports failure as -1; check before doing unsigned arithmetic.
  const std::streamoff file_bytes = f.tellg();
  if (file_bytes < 0) {
    ET_LOG(
        Error, "Failed to determine size of audio file: %s", audio_path.c_str());
    throw std::runtime_error("Failed to determine audio file size");
  }

  const std::size_t n_floats =
      static_cast<std::size_t>(file_bytes) / sizeof(float);
  if (n_floats == 0) {
    ET_LOG(Error, "Audio file contains no samples: %s", audio_path.c_str());
    throw std::runtime_error("Audio file contains no samples");
  }
  if (static_cast<std::size_t>(file_bytes) % sizeof(float) != 0) {
    ET_LOG(
        Info,
        "Audio file size is not a multiple of %zu bytes; ignoring trailing bytes",
        sizeof(float));
  }

  // The tensor dimension type may be narrower than size_t; a round-trip cast
  // detects a sample count that would not fit.
  const auto n_samples = static_cast<executorch::aten::SizesType>(n_floats);
  if (n_samples < 0 || static_cast<std::size_t>(n_samples) != n_floats) {
    ET_LOG(Error, "Audio file is too large: %zu floats", n_floats);
    throw std::runtime_error("Audio file is too large");
  }

  f.seekg(0, std::ios::beg);
  std::vector<float> audio_data(n_floats);
  if (!f.read(
          reinterpret_cast<char*>(audio_data.data()),
          static_cast<std::streamsize>(audio_data.size() * sizeof(float)))) {
    ET_LOG(Error, "Failed to read audio file: %s", audio_path.c_str());
    throw std::runtime_error("Failed to read audio file");
  }
  f.close();

  ET_LOG(
      Info, "Loaded .bin file: %s, %zu floats", audio_path.c_str(), n_floats);

  // Execute the processor. audio_data is kept alive until after execute()
  // because the tensor is built directly on top of its buffer.
  std::vector<executorch::aten::SizesType> tensor_shape = {n_samples};
  auto input_tensor = executorch::extension::from_blob(
      audio_data.data(), tensor_shape, ::executorch::aten::ScalarType::Float);

  ET_LOG(Info, "Processing audio through processor module...");
  auto result = processor_module->execute("forward", input_tensor);
  if (!result.ok()) {
    ET_LOG(Error, "Failed to execute processor's forward method");
    throw std::runtime_error("Failed to execute processor forward method");
  }

  const auto& outputs = result.get();
  if (outputs.empty()) {
    ET_LOG(Error, "Processor returned no outputs");
    throw std::runtime_error("Processor returned no outputs");
  }
  if (!outputs[0].isTensor()) {
    ET_LOG(Error, "Processor output is not a tensor");
    throw std::runtime_error("Processor output is not a tensor");
  }

  // Extract processed audio features, validating dtype and rank before
  // touching the data pointer or indexing sizes.
  const auto& processed_tensor = outputs[0].toTensor();
  if (processed_tensor.scalar_type() !=
      ::executorch::aten::ScalarType::Float) {
    ET_LOG(Error, "Processor output tensor is not of type Float");
    throw std::runtime_error("Processor output tensor is not of type Float");
  }
  if (processed_tensor.dim() != 3) {
    ET_LOG(
        Error,
        "Expected a 3-dimensional processor output, got %d dimensions",
        static_cast<int>(processed_tensor.dim()));
    throw std::runtime_error("Processor output tensor must be 3-dimensional");
  }
  const float* processed_data = processed_tensor.const_data_ptr<float>();
  const auto sizes = processed_tensor.sizes();

  ET_LOG(
      Info,
      "Processed audio tensor shape: [%d, %d, %d]",
      static_cast<int>(sizes[0]),
      static_cast<int>(sizes[1]),
      static_cast<int>(sizes[2]));

  // Create Audio multimodal input from processed features.
  const int32_t batch_size = static_cast<int32_t>(sizes[0]);
  const int32_t n_bins = static_cast<int32_t>(sizes[1]);
  const int32_t n_frames = static_cast<int32_t>(sizes[2]);
  if (batch_size < 0 || n_bins < 0 || n_frames < 0) {
    ET_LOG(Error, "Processor output tensor has a negative dimension");
    throw std::runtime_error(
        "Processor output tensor has a negative dimension");
  }
  // Widen before multiplying so the product cannot overflow int32_t.
  const size_t total_elements = static_cast<size_t>(batch_size) *
      static_cast<size_t>(n_bins) * static_cast<size_t>(n_frames);
  std::vector<float> audio_vec(processed_data, processed_data + total_elements);
  auto processed_audio = ::executorch::extension::llm::Audio(
      std::move(audio_vec), batch_size, n_bins, n_frames);
  ET_LOG(
      Info,
      "Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d",
      batch_size,
      n_bins,
      n_frames);
  return ::executorch::extension::llm::make_audio_input(
      std::move(processed_audio));
}
384+
288385
jint prefill_audio_input(
289386
facebook::jni::alias_ref<jbyteArray> audio,
290387
jint batch_size,
@@ -306,7 +403,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
306403
}
307404
llm::Audio audio_input{std::move(audio_data), batch_size, n_bins, n_frames};
308405
multi_modal_runner_->prefill(
309-
{llm::MultimodalInput{std::move(audio_input)}});
406+
{processRawAudioFile("/data/local/tmp/llama/audio.bin", "/data/local/tmp/llama/voxtral_preprocessor.pte")});
310407
}
311408
return 0;
312409
}

0 commit comments

Comments
 (0)