From 463c4b559b6a320018c94caf178f32f167053cf6 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Thu, 28 Aug 2025 17:03:29 -0700 Subject: [PATCH 1/4] Remove unused line --- extension/android/jni/jni_layer_llama.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index a27b8194530..c64cb516e41 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -15,7 +15,6 @@ #include #include -#include #include #include #include From 63e407ea85b16c14dd932c3432f5f1d144d1ce71 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Thu, 28 Aug 2025 18:05:14 -0700 Subject: [PATCH 2/4] test --- extension/android/jni/jni_layer_llama.cpp | 65 ++++++----------------- 1 file changed, 16 insertions(+), 49 deletions(-) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index c64cb516e41..1af5f56215a 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -13,10 +13,12 @@ #include #include -#include -#include #include #include +#include +#include +#include +#include #include #include #include @@ -119,7 +121,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { float temperature_ = 0.0f; int model_type_category_; std::unique_ptr runner_; - std::unique_ptr multi_modal_runner_; + std::unique_ptr multi_modal_runner_; public: constexpr static auto kJavaDescriptor = @@ -165,19 +167,16 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { model_type_category_ = model_type_category; if (model_type_category == MODEL_TYPE_CATEGORY_MULTIMODAL) { - multi_modal_runner_ = std::make_unique( + multi_modal_runner_ = llm::create_multimodal_runner( model_path->toStdString().c_str(), - tokenizer_path->toStdString().c_str(), - temperature); + llm::load_tokenizer(tokenizer_path->toStdString())); } else if (model_type_category == MODEL_TYPE_CATEGORY_LLM) { std::optional data_path_str = data_path ? std::optional{data_path->toStdString()} : std::nullopt; - // TODO(larryliu0820): Use the API in text_llm_runner.h to create the - // runner. - runner_ = example::create_llama_runner( + runner_ = executorch::extension::llm::create_text_llm_runner( model_path->toStdString(), - tokenizer_path->toStdString(), + llm::load_tokenizer(tokenizer_path->toStdString()), data_path_str); #if defined(EXECUTORCH_BUILD_QNN) } else if (model_type_category == MODEL_TYPE_QNN_LLAMA) { @@ -260,17 +259,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { jint eos) { facebook::jni::local_ref tuple_result = facebook::jni::make_long_array(2); - if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) { - tuple_result->pin()[0] = static_cast(Error::NotSupported); - return tuple_result; - } - - auto&& result = multi_modal_runner_->prefill_prompt( - prompt->toStdString(), start_pos, bos, eos); - tuple_result->pin()[0] = static_cast(Error::Ok); - if (result.ok()) { - tuple_result->pin()[1] = static_cast(start_pos); - } + tuple_result->pin()[0] = static_cast(Error::NotSupported); return tuple_result; } @@ -287,28 +276,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { facebook::jni::local_ref tuple_result = facebook::jni::make_long_array(2); - if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) { - tuple_result->pin()[0] = static_cast(Error::NotSupported); - return tuple_result; - } - - auto image_size = image->size(); - std::vector images; - if (image_size != 0) { - std::vector image_data_jint(image_size); - std::vector image_data(image_size); - image->getRegion(0, image_size, image_data_jint.data()); - for (int i = 0; i < image_size; i++) { - image_data[i] = image_data_jint[i]; - } - llm::Image image_runner{image_data, width, height, channels}; - images.push_back(image_runner); - } - // TODO(hsz): make start_pos a reference and update it here - jint result = static_cast( - multi_modal_runner_->prefill_images(images, start_pos)); - tuple_result->pin()[0] = result; - tuple_result->pin()[1] = static_cast(start_pos); + tuple_result->pin()[0] = static_cast(Error::NotSupported); return tuple_result; } @@ -319,13 +287,12 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { facebook::jni::alias_ref callback, jboolean echo) { if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) { - return static_cast(multi_modal_runner_->generate_from_pos( - prompt->toStdString(), - seq_len, - start_pos, + + return static_cast(multi_modal_runner_->generate( + std::vector{llm::MultimodalInput{prompt->toStdString()}}, + llm::GenerationConfig {.echo = static_cast(echo), .seq_len = seq_len}, [callback](const std::string& result) { callback->onResult(result); }, - [callback](const llm::Stats& stats) { callback->onStats(stats); }, - echo)); + [callback](const llm::Stats& stats) { callback->onStats(stats); })); } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) { executorch::extension::llm::GenerationConfig config{ .echo = static_cast(echo), From 43d8e5edc09fa8f1e3b2f5566f5f279a4f194063 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Wed, 3 Sep 2025 16:08:17 -0700 Subject: [PATCH 3/4] Prefill --- extension/android/jni/jni_layer_llama.cpp | 52 ++++++++++++++++------ extension/llm/runner/multimodal_runner.cpp | 4 +- extension/llm/runner/multimodal_runner.h | 4 +- 3 files changed, 43 insertions(+), 17 deletions(-) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 1af5f56215a..0c3550f151a 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -121,7 +121,9 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { float temperature_ = 0.0f; int model_type_category_; std::unique_ptr runner_; - std::unique_ptr multi_modal_runner_; + std::unique_ptr + multi_modal_runner_; + std::vector prefill_inputs_; public: constexpr static auto kJavaDescriptor = @@ -215,6 +217,9 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { facebook::jni::alias_ref callback, jboolean echo) { if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) { + std::vector inputs = prefill_inputs_; + prefill_inputs_.clear(); + inputs.emplace_back(llm::MultimodalInput{prompt->toStdString()}); auto image_size = image->size(); std::vector images; if (image_size != 0) { @@ -225,15 +230,18 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { image_data[i] = image_data_jint[i]; } llm::Image image_runner{image_data, width, height, channels}; - images.push_back(image_runner); + inputs.emplace_back(llm::MultimodalInput{std::move(image_runner)}); } + executorch::extension::llm::GenerationConfig config{ + .echo = static_cast(echo), + .seq_len = seq_len, + .temperature = temperature_, + }; multi_modal_runner_->generate( - std::move(images), - prompt->toStdString(), - seq_len, - [callback](std::string result) { callback->onResult(result); }, - [callback](const llm::Stats& result) { callback->onStats(result); }, - echo); + std::move(inputs), + config, + [callback](const std::string& result) { callback->onResult(result); }, + [callback](const llm::Stats& result) { callback->onStats(result); }); } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) { executorch::extension::llm::GenerationConfig config{ .echo = static_cast(echo), @@ -257,9 +265,10 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { jlong start_pos, jint bos, jint eos) { + prefill_inputs_.emplace_back(llm::MultimodalInput{prompt->toStdString()}); facebook::jni::local_ref tuple_result = facebook::jni::make_long_array(2); - tuple_result->pin()[0] = static_cast(Error::NotSupported); + tuple_result->pin()[0] = static_cast(Error::Ok); return tuple_result; } @@ -273,10 +282,24 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { jint height, jint channels, jlong start_pos) { + std::vector images; + auto image_size = image->size(); + if (image_size != 0) { + std::vector image_data_jint(image_size); + std::vector image_data(image_size); + image->getRegion(0, image_size, image_data_jint.data()); + for (int i = 0; i < image_size; i++) { + image_data[i] = image_data_jint[i]; + } + llm::Image image_runner{image_data, width, height, channels}; + prefill_inputs_.emplace_back( + llm::MultimodalInput{std::move(image_runner)}); + } + facebook::jni::local_ref tuple_result = facebook::jni::make_long_array(2); - tuple_result->pin()[0] = static_cast(Error::NotSupported); + tuple_result->pin()[0] = static_cast(Error::Ok); return tuple_result; } @@ -287,10 +310,13 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { facebook::jni::alias_ref callback, jboolean echo) { if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) { - + std::vector inputs = prefill_inputs_; + prefill_inputs_.clear(); + inputs.emplace_back(llm::MultimodalInput{prompt->toStdString()}); return static_cast(multi_modal_runner_->generate( - std::vector{llm::MultimodalInput{prompt->toStdString()}}, - llm::GenerationConfig {.echo = static_cast(echo), .seq_len = seq_len}, + inputs, + llm::GenerationConfig{ + .echo = static_cast(echo), .seq_len = seq_len}, [callback](const std::string& result) { callback->onResult(result); }, [callback](const llm::Stats& stats) { callback->onStats(stats); })); } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) { diff --git a/extension/llm/runner/multimodal_runner.cpp b/extension/llm/runner/multimodal_runner.cpp index 2bc658692da..f6b29d42c09 100644 --- a/extension/llm/runner/multimodal_runner.cpp +++ b/extension/llm/runner/multimodal_runner.cpp @@ -65,8 +65,8 @@ Error MultimodalRunner::load() { Error MultimodalRunner::generate( const std::vector& inputs, const GenerationConfig& config, - std::function& token_callback, - std::function& stats_callback) { + std::function token_callback, + std::function stats_callback) { if (inputs.empty()) { ET_LOG(Error, "MultimodalInput vector cannot be empty"); return Error::InvalidArgument; diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h index 186a5bf70e4..fc87a9ab18a 100644 --- a/extension/llm/runner/multimodal_runner.h +++ b/extension/llm/runner/multimodal_runner.h @@ -116,8 +116,8 @@ class ET_EXPERIMENTAL MultimodalRunner { virtual ::executorch::runtime::Error generate( const std::vector& inputs, const GenerationConfig& config, - std::function& token_callback, - std::function& stats_callback); + std::function token_callback, + std::function stats_callback); inline void stop() { text_token_generator_->stop(); From 3a606e217840111b1772ed4af961bf32ccf74789 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Wed, 3 Sep 2025 17:58:31 -0700 Subject: [PATCH 4/4] Java allow adding audio input --- .../executorch/extension/llm/LlmModule.java | 17 +++++++--- extension/android/jni/jni_layer_llama.cpp | 32 ++++++++++++++++--- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java index b014ceb75d8..d8ee0ab7482 100644 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java @@ -177,7 +177,7 @@ public native int generate( * @throws RuntimeException if the prefill failed */ public long prefillImages(int[] image, int width, int height, int channels, long startPos) { - long[] nativeResult = prefillImagesNative(image, width, height, channels, startPos); + long[] nativeResult = addImageInputNative(image, width, height, channels, startPos); if (nativeResult[0] != 0) { throw new RuntimeException("Prefill failed with error code: " + nativeResult[0]); } @@ -185,7 +185,7 @@ public long prefillImages(int[] image, int width, int height, int channels, long } // returns a tuple of (status, updated startPos) - private native long[] prefillImagesNative( + private native long[] addImageInputNative( int[] image, int width, int height, int channels, long startPos); /** @@ -200,7 +200,7 @@ private native long[] prefillImagesNative( * @throws RuntimeException if the prefill failed */ public long prefillPrompt(String prompt, long startPos, int bos, int eos) { - long[] nativeResult = prefillPromptNative(prompt, startPos, bos, eos); + long[] nativeResult = addTextInputNative(prompt, startPos, bos, eos); if (nativeResult[0] != 0) { throw new RuntimeException("Prefill failed with error code: " + nativeResult[0]); } @@ -208,7 +208,10 @@ public long prefillPrompt(String prompt, long startPos, int bos, int eos) { } // returns a tuple of (status, updated startPos) - private native long[] prefillPromptNative(String prompt, long startPos, int bos, int eos); + private native long[] addTextInputNative(String prompt, long startPos, int bos, int eos); + + // returns the status code + private native int addAudioInputNative(int[] audio, int batch_size, int n_bins, int n_frames); /** * Generate tokens from the given prompt, starting from the given position. @@ -217,6 +220,12 @@ public long prefillPrompt(String prompt, long startPos, int bos, int eos) { * @param seqLen The total sequence length, including the prompt tokens and new tokens. * @param startPos The starting position in KV cache of the input in the LLM. * @param callback callback object to receive results. + * @param echo indicate whether to echo the + *

/** Generate tokens from the given prompt, starting from the given position. + * @param prompt The text prompt to LLaVA. + * @param seqLen The total sequence length, including the prompt tokens and new tokens. + * @param startPos The starting position in KV cache of the input in the LLM. + * @param callback callback object to receive results. * @param echo indicate whether to echo the input prompt or not. * @return The error code. */ diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 0c3550f151a..aa5f6052225 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -260,7 +260,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { // Returns a tuple of (error, start_pos) // Contract is valid within an AAR (JNI + corresponding Java code) // If the first element is not Error::Ok, the other element is undefined. - facebook::jni::local_ref prefill_prompt( + facebook::jni::local_ref add_text_input( facebook::jni::alias_ref prompt, jlong start_pos, jint bos, @@ -276,7 +276,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { // Contract is valid within an AAR (JNI + corresponding Java code) // If the first element is not Error::Ok, the other element is undefined. - facebook::jni::local_ref prefill_images( + facebook::jni::local_ref add_images_input( facebook::jni::alias_ref image, jint width, jint height, @@ -303,6 +303,28 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { return tuple_result; } + // Returns the status code + jint add_audio_input( + facebook::jni::alias_ref audio, + jint batch_size, + jint n_bins, + jint n_frames) { + auto audio_size = audio->size(); + if (audio_size != 0) { + std::vector audio_data_jint(audio_size); + std::vector audio_data(audio_size); + audio->getRegion(0, audio_size, audio_data_jint.data()); + for (int i = 0; i < audio_size; i++) { + audio_data[i] = audio_data_jint[i]; + } + auto&& audio_input = llm::make_audio_input( + llm::Audio{audio_data, batch_size, n_bins, n_frames}); + prefill_inputs_.emplace_back(audio_input); + } + + return 0; + } + jint generate_from_pos( facebook::jni::alias_ref prompt, jint seq_len, @@ -359,9 +381,11 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { makeNativeMethod("stop", ExecuTorchLlmJni::stop), makeNativeMethod("load", ExecuTorchLlmJni::load), makeNativeMethod( - "prefillImagesNative", ExecuTorchLlmJni::prefill_images), + "addImageInputNative", ExecuTorchLlmJni::add_images_input), + makeNativeMethod( + "addTextInputNative", ExecuTorchLlmJni::add_text_input), makeNativeMethod( - "prefillPromptNative", ExecuTorchLlmJni::prefill_prompt), + "addAudioInputNative", ExecuTorchLlmJni::add_audio_input), makeNativeMethod( "generateFromPos", ExecuTorchLlmJni::generate_from_pos), });