From 2d7c1163d7aaecb831bb497f2a840e0aac1e5c44 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Thu, 16 Oct 2025 17:55:38 -0700 Subject: [PATCH 1/3] audio float API --- .../executorch/extension/llm/LlmModule.java | 22 ++++++++++++++++ extension/android/jni/jni_layer_llama.cpp | 26 +++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java index 40e38afb8b9..cbd1c474ed3 100644 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java @@ -233,6 +233,28 @@ public long prefillAudio(byte[] audio, int batch_size, int n_bins, int n_frames) private native int appendAudioInput(byte[] audio, int batch_size, int n_bins, int n_frames); + /** + * Prefill a multimodal Module with the given audio input. + * + * @param audio Input preprocessed audio as a float array + * @param batch_size Input batch size + * @param n_bins Input number of bins + * @param n_frames Input number of frames + * @return 0, as the updated starting position in KV cache of the input in the LLM is no longer + * exposed to user. + * @throws RuntimeException if the prefill failed + */ + @Experimental + public long prefillAudio(float[] audio, int batch_size, int n_bins, int n_frames) { + int nativeResult = appendAudioInputFloat(audio, batch_size, n_bins, n_frames); + if (nativeResult != 0) { + throw new RuntimeException("Prefill failed with error code: " + nativeResult); + } + return 0; + } + + private native int appendAudioInputFloat(float[] audio, int batch_size, int n_bins, int n_frames); + /** * Prefill a multimodal Module with the given raw audio input. 
* diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index ccb0d55dc41..aa9a6ee58c5 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -325,6 +325,29 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { return 0; } + // Returns status_code + jint append_audio_input_float( + facebook::jni::alias_ref data, + jint batch_size, + jint n_bins, + jint n_frames) { + if (data == nullptr) { + return static_cast(Error::EndOfMethod); + } + auto data_size = data->size(); + if (data_size != 0) { + std::vector data_jfloat(data_size); + std::vector data_f(data_size); + data->getRegion(0, data_size, data_jfloat.data()); + for (int i = 0; i < data_size; i++) { + data_f[i] = data_jfloat[i]; + } + llm::Audio audio{std::move(data_f), batch_size, n_bins, n_frames}; + prefill_inputs_.emplace_back(llm::MultimodalInput{std::move(audio)}); + } + return 0; + } + // Returns status_code jint append_raw_audio_input( facebook::jni::alias_ref data, @@ -388,6 +411,9 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { ExecuTorchLlmJni::append_normalized_images_input), makeNativeMethod( "appendAudioInput", ExecuTorchLlmJni::append_audio_input), + makeNativeMethod( + "appendAudioInputFloat", + ExecuTorchLlmJni::append_audio_input_float), makeNativeMethod( "appendRawAudioInput", ExecuTorchLlmJni::append_raw_audio_input), makeNativeMethod( From e104edf29e24e10f6567beea3a839eb29b830bc7 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Thu, 16 Oct 2025 18:01:10 -0700 Subject: [PATCH 2/3] helper --- examples/models/voxtral/multimodal.cpp | 10 ++++++++++ extension/android/jni/jni_layer_llama.cpp | 14 ++++++++++++-- extension/llm/runner/audio.h | 20 ++++++++++++++++++++ extension/llm/runner/image.h | 15 +++++++++++++++ extension/llm/runner/multimodal_input.h | 14 ++++++++++++++ 5 files changed, 71 insertions(+), 2 deletions(-) diff --git a/examples/models/voxtral/multimodal.cpp 
b/examples/models/voxtral/multimodal.cpp index 081df27cd67..c60f0aaddc5 100644 --- a/examples/models/voxtral/multimodal.cpp +++ b/examples/models/voxtral/multimodal.cpp @@ -104,7 +104,11 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) { ET_LOG(Info, "audio_data len = %zu", n_floats); std::vector audio_data(n_floats); + ET_LOG(Info, "audio_data size = %zu", audio_data.size()); f.read(reinterpret_cast(audio_data.data()), n_floats * sizeof(float)); + ET_LOG(Info, "First 5 floats in audio_data is %f, %f, %f, %f, %f", + audio_data[10000], audio_data[10001], audio_data[10002], audio_data[10003], audio_data[10004]); + f.close(); auto audio = ::executorch::extension::llm::Audio( @@ -331,6 +335,12 @@ int32_t main(int32_t argc, char** argv) { // Generate ET_LOG(Info, "Starting generation..."); + for (const auto& input : inputs) { + ET_LOG( + Info, + "Input : %s", + input.to_string().c_str()); + } auto error = runner->generate(inputs, config); if (error != ::executorch::runtime::Error::Ok) { ET_LOG(Error, "Failed to generate with multimodal runner"); diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index aa9a6ee58c5..cd8b8dd01a5 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -222,7 +222,15 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { .echo = static_cast(echo), .seq_len = seq_len, .temperature = temperature_, + .max_new_tokens = 256, }; + for (const auto& input : inputs) { + ET_LOG( + Error, + "Prefill input: %s", + input.to_string().c_str()); + } + multi_modal_runner_->generate( std::move(inputs), config, @@ -342,6 +350,9 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { for (int i = 0; i < data_size; i++) { data_f[i] = data_jfloat[i]; } + ET_LOG(Error, "First 5 elements of data_f: %f, %f, %f, %f, %f", + data_f[10000], data_f[10001], data_f[10002], data_f[10003], data_f[ + 10004]); llm::Audio 
audio{std::move(data_f), batch_size, n_bins, n_frames}; prefill_inputs_.emplace_back(llm::MultimodalInput{std::move(audio)}); } @@ -412,8 +423,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { makeNativeMethod( "appendAudioInput", ExecuTorchLlmJni::append_audio_input), makeNativeMethod( - "appendAudioInputFloat", - ExecuTorchLlmJni::append_audio_input_float), + "appendAudioInputFloat", ExecuTorchLlmJni::append_audio_input_float), makeNativeMethod( "appendRawAudioInput", ExecuTorchLlmJni::append_raw_audio_input), makeNativeMethod( diff --git a/extension/llm/runner/audio.h b/extension/llm/runner/audio.h index ce71513ed17..5820c05ba59 100644 --- a/extension/llm/runner/audio.h +++ b/extension/llm/runner/audio.h @@ -29,6 +29,11 @@ struct ET_EXPERIMENTAL RawAudio { int32_t batch_size; int32_t n_channels; // For mono, use n_channels = 1. int32_t n_samples; + + std::string to_string() const { + return "RawAudio: " + std::to_string(batch_size) + "x" + + std::to_string(n_channels) + "x" + std::to_string(n_samples); + } }; /** @@ -146,6 +151,21 @@ class ET_EXPERIMENTAL Audio final { return ::executorch::runtime::Error::NotSupported; } + std::string to_string() const { + std::string result = "Audio: "; + if (is_uint8()) { + result += "uint8_t"; + } else if (is_float()) { + result += "float"; + } else { + result += "unknown"; + } + result += " data, batch_size: " + std::to_string(get_batch_size()) + + ", n_bins: " + std::to_string(get_n_bins()) + + ", n_frames: " + std::to_string(get_n_frames()); + return result; + } + private: // Members std::variant, std::vector> data_; diff --git a/extension/llm/runner/image.h b/extension/llm/runner/image.h index dbdba273536..697e3b03ecf 100644 --- a/extension/llm/runner/image.h +++ b/extension/llm/runner/image.h @@ -110,6 +110,21 @@ class ET_EXPERIMENTAL Image { return ::executorch::runtime::Error::NotSupported; } + std::string to_string() const { + std::string result = "Image: "; + if (is_uint8()) { + result += "uint8_t"; + 
} else if (is_float()) { + result += "float"; + } else { + result += "unknown"; + } + result += "width: " + std::to_string(width_) + ", "; + result += "height: " + std::to_string(height_) + ", "; + result += "channels: " + std::to_string(channels_); + return result; + } + private: // Assuming NCHW format std::variant, std::vector> data_; diff --git a/extension/llm/runner/multimodal_input.h b/extension/llm/runner/multimodal_input.h index 728d8aef08f..f17d7795554 100644 --- a/extension/llm/runner/multimodal_input.h +++ b/extension/llm/runner/multimodal_input.h @@ -57,6 +57,20 @@ class ET_EXPERIMENTAL MultimodalInput { // Destructor ~MultimodalInput() = default; + std::string to_string() const noexcept { + if (is_text()) { + return "Text: \"" + get_text() + "\""; + } else if (is_image()) { + return get_image().to_string(); + } else if (is_audio()) { + return get_audio().to_string(); + } else if (is_raw_audio()) { + return get_raw_audio().to_string(); + } else { + return "Unsupported input type"; + } + } + /** * Check if this input contains text data. * @return true if this input contains text, false otherwise. From eec63eb7b926928b0b0eaf6771bd3c90aa99f726 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Fri, 17 Oct 2025 11:45:56 -0700 Subject: [PATCH 3/3] Remove hardcoded max_new_tokens debug override from GenerationConfig --- extension/android/jni/jni_layer_llama.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index cd8b8dd01a5..481cb552c0f 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -222,7 +222,6 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { .echo = static_cast(echo), .seq_len = seq_len, .temperature = temperature_, - .max_new_tokens = 256, }; for (const auto& input : inputs) { ET_LOG(