diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
index 081df27cd67..c60f0aaddc5 100644
--- a/examples/models/voxtral/multimodal.cpp
+++ b/examples/models/voxtral/multimodal.cpp
@@ -104,7 +104,11 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
   ET_LOG(Info, "audio_data len = %zu", n_floats);
 
   std::vector<float> audio_data(n_floats);
+  ET_LOG(Info, "audio_data size = %zu", audio_data.size());
   f.read(reinterpret_cast<char*>(audio_data.data()), n_floats * sizeof(float));
+  ET_LOG(Info, "First 5 floats in audio_data are %f, %f, %f, %f, %f",
+      audio_data[0], audio_data[1], audio_data[2], audio_data[3], audio_data[4]);
+
   f.close();
 
   auto audio = ::executorch::extension::llm::Audio(
@@ -331,6 +335,12 @@ int32_t main(int32_t argc, char** argv) {
   // Generate
   ET_LOG(Info, "Starting generation...");
 
+  for (const auto& input : inputs) {
+    ET_LOG(
+        Info,
+        "Input: %s",
+        input.to_string().c_str());
+  }
   auto error = runner->generate(inputs, config);
   if (error != ::executorch::runtime::Error::Ok) {
     ET_LOG(Error, "Failed to generate with multimodal runner");
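A note on the sample dump above: the new ET_LOG call indexes the buffer unconditionally, which is undefined behavior when the clip holds fewer than five floats. Below is a minimal bounds-checked sketch, not part of the diff; log_leading_samples is a hypothetical helper, and ET_LOG is assumed to come from executorch/runtime/platform/log.h.

// Sketch: dump up to the first 5 samples without risking an
// out-of-bounds read on short or empty clips.
#include <algorithm>
#include <cstddef>
#include <vector>

#include <executorch/runtime/platform/log.h>

void log_leading_samples(const std::vector<float>& audio_data) {
  const size_t n = std::min<size_t>(audio_data.size(), 5);
  for (size_t i = 0; i < n; ++i) {
    ET_LOG(Info, "audio_data[%zu] = %f", i, audio_data[i]);
  }
}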
diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
index 40e38afb8b9..cbd1c474ed3 100644
--- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
+++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
@@ -233,6 +233,28 @@ public long prefillAudio(byte[] audio, int batch_size, int n_bins, int n_frames)
 
   private native int appendAudioInput(byte[] audio, int batch_size, int n_bins, int n_frames);
 
+  /**
+   * Prefill a multimodal Module with the given audio input.
+   *
+   * @param audio Input preprocessed audio as a float array
+   * @param batch_size Input batch size
+   * @param n_bins Input number of bins
+   * @param n_frames Input number of frames
+   * @return 0, as the updated starting position in the KV cache of the input in the LLM is no
+   *     longer exposed to the user.
+   * @throws RuntimeException if the prefill failed
+   */
+  @Experimental
+  public long prefillAudio(float[] audio, int batch_size, int n_bins, int n_frames) {
+    int nativeResult = appendAudioInputFloat(audio, batch_size, n_bins, n_frames);
+    if (nativeResult != 0) {
+      throw new RuntimeException("Prefill failed with error code: " + nativeResult);
+    }
+    return 0;
+  }
+
+  private native int appendAudioInputFloat(float[] audio, int batch_size, int n_bins, int n_frames);
+
   /**
    * Prefill a multimodal Module with the given raw audio input.
    *
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index ccb0d55dc41..481cb552c0f 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -223,6 +223,13 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
         .seq_len = seq_len,
         .temperature = temperature_,
     };
+    for (const auto& input : inputs) {
+      ET_LOG(
+          Info,
+          "Prefill input: %s",
+          input.to_string().c_str());
+    }
+
     multi_modal_runner_->generate(
         std::move(inputs),
         config,
@@ -325,6 +332,32 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
     return 0;
   }
 
+  // Returns status_code
+  jint append_audio_input_float(
+      facebook::jni::alias_ref<jfloatArray> data,
+      jint batch_size,
+      jint n_bins,
+      jint n_frames) {
+    if (data == nullptr) {
+      return static_cast<jint>(Error::EndOfMethod);
+    }
+    auto data_size = data->size();
+    if (data_size != 0) {
+      std::vector<jfloat> data_jfloat(data_size);
+      std::vector<float> data_f(data_size);
+      data->getRegion(0, data_size, data_jfloat.data());
+      for (int i = 0; i < data_size; i++) {
+        data_f[i] = data_jfloat[i];
+      }
+      ET_LOG(Info,
+          "First 5 elements of data_f: %f, %f, %f, %f, %f",
+          data_f[0], data_f[1], data_f[2], data_f[3], data_f[4]);
+      llm::Audio audio{std::move(data_f), batch_size, n_bins, n_frames};
+      prefill_inputs_.emplace_back(llm::MultimodalInput{std::move(audio)});
+    }
+    return 0;
+  }
+
   // Returns status_code
   jint append_raw_audio_input(
       facebook::jni::alias_ref<jbyteArray> data,
@@ -388,6 +421,8 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
             ExecuTorchLlmJni::append_normalized_images_input),
         makeNativeMethod(
             "appendAudioInput", ExecuTorchLlmJni::append_audio_input),
+        makeNativeMethod(
+            "appendAudioInputFloat", ExecuTorchLlmJni::append_audio_input_float),
         makeNativeMethod(
             "appendRawAudioInput", ExecuTorchLlmJni::append_raw_audio_input),
         makeNativeMethod(
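One possible simplification for append_audio_input_float, sketched outside the diff: the JNI spec defines jfloat as a 32-bit IEEE float, so the element-wise widening loop can collapse into a single range-constructed copy. to_float_vector is a hypothetical helper, not part of this patch.

// Sketch: convert the jfloat region buffer into std::vector<float>
// in one copy instead of a resize-then-assign loop.
#include <jni.h>

#include <type_traits>
#include <vector>

std::vector<float> to_float_vector(const std::vector<jfloat>& data_jfloat) {
  static_assert(
      std::is_same<jfloat, float>::value,
      "jfloat is expected to alias float on this platform");
  return std::vector<float>(data_jfloat.begin(), data_jfloat.end());
}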
diff --git a/extension/llm/runner/audio.h b/extension/llm/runner/audio.h
index ce71513ed17..5820c05ba59 100644
--- a/extension/llm/runner/audio.h
+++ b/extension/llm/runner/audio.h
@@ -29,6 +29,11 @@ struct ET_EXPERIMENTAL RawAudio {
   int32_t batch_size;
   int32_t n_channels; // For mono, use n_channels = 1.
   int32_t n_samples;
+
+  std::string to_string() const {
+    return "RawAudio: " + std::to_string(batch_size) + "x" +
+        std::to_string(n_channels) + "x" + std::to_string(n_samples);
+  }
 };
 
 /**
@@ -146,6 +151,21 @@ class ET_EXPERIMENTAL Audio final {
     return ::executorch::runtime::Error::NotSupported;
   }
 
+  std::string to_string() const {
+    std::string result = "Audio: ";
+    if (is_uint8()) {
+      result += "uint8_t";
+    } else if (is_float()) {
+      result += "float";
+    } else {
+      result += "unknown";
+    }
+    result += " data, batch_size: " + std::to_string(get_batch_size()) +
+        ", n_bins: " + std::to_string(get_n_bins()) +
+        ", n_frames: " + std::to_string(get_n_frames());
+    return result;
+  }
+
  private:
   // Members
   std::variant<std::vector<uint8_t>, std::vector<float>> data_;
diff --git a/extension/llm/runner/image.h b/extension/llm/runner/image.h
index dbdba273536..697e3b03ecf 100644
--- a/extension/llm/runner/image.h
+++ b/extension/llm/runner/image.h
@@ -110,6 +110,21 @@ class ET_EXPERIMENTAL Image {
     return ::executorch::runtime::Error::NotSupported;
   }
 
+  std::string to_string() const {
+    std::string result = "Image: ";
+    if (is_uint8()) {
+      result += "uint8_t";
+    } else if (is_float()) {
+      result += "float";
+    } else {
+      result += "unknown";
+    }
+    result += " data, width: " + std::to_string(width_) + ", ";
+    result += "height: " + std::to_string(height_) + ", ";
+    result += "channels: " + std::to_string(channels_);
+    return result;
+  }
+
  private:
   // Assuming NCHW format
   std::variant<std::vector<uint8_t>, std::vector<float>> data_;
diff --git a/extension/llm/runner/multimodal_input.h b/extension/llm/runner/multimodal_input.h
index 728d8aef08f..f17d7795554 100644
--- a/extension/llm/runner/multimodal_input.h
+++ b/extension/llm/runner/multimodal_input.h
@@ -57,6 +57,20 @@ class ET_EXPERIMENTAL MultimodalInput {
   // Destructor
   ~MultimodalInput() = default;
 
+  std::string to_string() const {
+    if (is_text()) {
+      return "Text: \"" + get_text() + "\"";
+    } else if (is_image()) {
+      return get_image().to_string();
+    } else if (is_audio()) {
+      return get_audio().to_string();
+    } else if (is_raw_audio()) {
+      return get_raw_audio().to_string();
+    } else {
+      return "Unsupported input type";
+    }
+  }
+
   /**
    * Check if this input contains text data.
    * @return true if this input contains text, false otherwise.
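For reference, a standalone sketch of how the new to_string() helpers might be exercised from runner code. It assumes MultimodalInput is constructible from std::string and from Audio (the JNI code above relies on the latter) and that the usual ExecuTorch include paths are set up; the 1 x 128 x 3000 mel shape is a made-up placeholder.

// Sketch: build a text input and a float Audio input, then print
// the human-readable summaries the new to_string() methods produce.
#include <cstdio>
#include <string>
#include <vector>

#include <executorch/extension/llm/runner/audio.h>
#include <executorch/extension/llm/runner/multimodal_input.h>

namespace llm = ::executorch::extension::llm;

int main() {
  std::vector<llm::MultimodalInput> inputs;
  inputs.emplace_back(llm::MultimodalInput{std::string("Transcribe this clip.")});
  std::vector<float> mel(1 * 128 * 3000, 0.0f);
  llm::Audio audio{std::move(mel), 1, 128, 3000};
  inputs.emplace_back(llm::MultimodalInput{std::move(audio)});
  for (const auto& input : inputs) {
    printf("%s\n", input.to_string().c_str()); // e.g. "Audio: float data, ..."
  }
  return 0;
}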