From 463c4b559b6a320018c94caf178f32f167053cf6 Mon Sep 17 00:00:00 2001
From: Hansong Zhang <hsz@meta.com>
Date: Thu, 28 Aug 2025 17:03:29 -0700
Subject: [PATCH 1/4] Remove unused line

---
 extension/android/jni/jni_layer_llama.cpp | 1 -
 1 file changed, 1 deletion(-)
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index a27b8194530..c64cb516e41 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -15,7 +15,6 @@
 
 #include <executorch/examples/models/llama/runner/runner.h>
 #include <executorch/examples/models/llava/runner/llava_runner.h>
-#include <executorch/examples/qualcomm/oss_scripts/llama/runner/runner.h>
 #include <executorch/extension/llm/runner/image.h>
 #include <executorch/extension/llm/runner/irunner.h>
 #include <executorch/runtime/platform/log.h>

From 63e407ea85b16c14dd932c3432f5f1d144d1ce71 Mon Sep 17 00:00:00 2001
From: Hansong Zhang <hsz@meta.com>
Date: Thu, 28 Aug 2025 18:05:14 -0700
Subject: [PATCH 2/4] Switch JNI layer to extension/llm text and multimodal runners

---
 extension/android/jni/jni_layer_llama.cpp | 65 ++++++-----------------
 1 file changed, 16 insertions(+), 49 deletions(-)

diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index c64cb516e41..1af5f56215a 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -13,10 +13,12 @@
 #include <unordered_map>
 #include <vector>
 
-#include <executorch/examples/models/llama/runner/runner.h>
-#include <executorch/examples/models/llava/runner/llava_runner.h>
 #include <executorch/extension/llm/runner/image.h>
 #include <executorch/extension/llm/runner/irunner.h>
+#include <executorch/extension/llm/runner/llm_runner_helper.h>
+#include <executorch/extension/llm/runner/multimodal_input.h>
+#include <executorch/extension/llm/runner/multimodal_runner.h>
+#include <executorch/extension/llm/runner/text_llm_runner.h>
 #include <executorch/runtime/platform/log.h>
 #include <executorch/runtime/platform/platform.h>
 #include <executorch/runtime/platform/runtime.h>
@@ -119,7 +121,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
   float temperature_ = 0.0f;
   int model_type_category_;
   std::unique_ptr<llm::IRunner> runner_;
-  std::unique_ptr<example::LlavaRunner> multi_modal_runner_;
+  std::unique_ptr<executorch::extension::llm::MultimodalRunner> multi_modal_runner_;
 
  public:
   constexpr static auto kJavaDescriptor =
@@ -165,19 +167,16 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
 
     model_type_category_ = model_type_category;
     if (model_type_category == MODEL_TYPE_CATEGORY_MULTIMODAL) {
-      multi_modal_runner_ = std::make_unique<example::LlavaRunner>(
+      multi_modal_runner_ = llm::create_multimodal_runner(
           model_path->toStdString().c_str(),
-          tokenizer_path->toStdString().c_str(),
-          temperature);
+          llm::load_tokenizer(tokenizer_path->toStdString()));
     } else if (model_type_category == MODEL_TYPE_CATEGORY_LLM) {
       std::optional<const std::string> data_path_str = data_path
           ? std::optional<const std::string>{data_path->toStdString()}
           : std::nullopt;
-      // TODO(larryliu0820): Use the API in text_llm_runner.h to create the
-      // runner.
-      runner_ = example::create_llama_runner(
+      runner_ = executorch::extension::llm::create_text_llm_runner(
           model_path->toStdString(),
-          tokenizer_path->toStdString(),
+          llm::load_tokenizer(tokenizer_path->toStdString()),
           data_path_str);
 #if defined(EXECUTORCH_BUILD_QNN)
     } else if (model_type_category == MODEL_TYPE_QNN_LLAMA) {
@@ -260,17 +259,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
       jint eos) {
     facebook::jni::local_ref<jlongArray> tuple_result =
         facebook::jni::make_long_array(2);
-    if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) {
-      tuple_result->pin()[0] = static_cast<jint>(Error::NotSupported);
-      return tuple_result;
-    }
-
-    auto&& result = multi_modal_runner_->prefill_prompt(
-        prompt->toStdString(), start_pos, bos, eos);
-    tuple_result->pin()[0] = static_cast<jint>(Error::Ok);
-    if (result.ok()) {
-      tuple_result->pin()[1] = static_cast<jlong>(start_pos);
-    }
+    tuple_result->pin()[0] = static_cast<jint>(Error::NotSupported);
     return tuple_result;
   }
 
@@ -287,28 +276,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
     facebook::jni::local_ref<jlongArray> tuple_result =
         facebook::jni::make_long_array(2);
 
-    if (model_type_category_ != MODEL_TYPE_CATEGORY_MULTIMODAL) {
-      tuple_result->pin()[0] = static_cast<jint>(Error::NotSupported);
-      return tuple_result;
-    }
-
-    auto image_size = image->size();
-    std::vector<llm::Image> images;
-    if (image_size != 0) {
-      std::vector<jint> image_data_jint(image_size);
-      std::vector<uint8_t> image_data(image_size);
-      image->getRegion(0, image_size, image_data_jint.data());
-      for (int i = 0; i < image_size; i++) {
-        image_data[i] = image_data_jint[i];
-      }
-      llm::Image image_runner{image_data, width, height, channels};
-      images.push_back(image_runner);
-    }
-    // TODO(hsz): make  start_pos a reference and update it here
-    jint result = static_cast<jint>(
-        multi_modal_runner_->prefill_images(images, start_pos));
-    tuple_result->pin()[0] = result;
-    tuple_result->pin()[1] = static_cast<jlong>(start_pos);
+    tuple_result->pin()[0] = static_cast<jint>(Error::NotSupported);
     return tuple_result;
   }
 
@@ -319,13 +287,12 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
       facebook::jni::alias_ref<ExecuTorchLlmCallbackJni> callback,
       jboolean echo) {
     if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) {
-      return static_cast<jint>(multi_modal_runner_->generate_from_pos(
-          prompt->toStdString(),
-          seq_len,
-          start_pos,
+
+      return static_cast<jint>(multi_modal_runner_->generate(
+          std::vector<llm::MultimodalInput>{llm::MultimodalInput{prompt->toStdString()}},
+          llm::GenerationConfig {.echo = static_cast<bool>(echo), .seq_len = seq_len},
           [callback](const std::string& result) { callback->onResult(result); },
-          [callback](const llm::Stats& stats) { callback->onStats(stats); },
-          echo));
+          [callback](const llm::Stats& stats) { callback->onStats(stats); }));
     } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) {
       executorch::extension::llm::GenerationConfig config{
           .echo = static_cast<bool>(echo),

From 43d8e5edc09fa8f1e3b2f5566f5f279a4f194063 Mon Sep 17 00:00:00 2001
From: Hansong Zhang <hsz@meta.com>
Date: Wed, 3 Sep 2025 16:08:17 -0700
Subject: [PATCH 3/4] Prefill

---
 extension/android/jni/jni_layer_llama.cpp  | 52 ++++++++++++++++------
 extension/llm/runner/multimodal_runner.cpp |  4 +-
 extension/llm/runner/multimodal_runner.h   |  4 +-
 3 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index 1af5f56215a..0c3550f151a 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -121,7 +121,9 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
   float temperature_ = 0.0f;
   int model_type_category_;
   std::unique_ptr<llm::IRunner> runner_;
-  std::unique_ptr<executorch::extension::llm::MultimodalRunner> multi_modal_runner_;
+  std::unique_ptr<executorch::extension::llm::MultimodalRunner>
+      multi_modal_runner_;
+  std::vector<llm::MultimodalInput> prefill_inputs_;
 
  public:
   constexpr static auto kJavaDescriptor =
@@ -215,6 +217,9 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
       facebook::jni::alias_ref<ExecuTorchLlmCallbackJni> callback,
       jboolean echo) {
     if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) {
+      std::vector<llm::MultimodalInput> inputs = prefill_inputs_;
+      prefill_inputs_.clear();
+      inputs.emplace_back(llm::MultimodalInput{prompt->toStdString()});
       auto image_size = image->size();
       std::vector<llm::Image> images;
       if (image_size != 0) {
@@ -225,15 +230,18 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
           image_data[i] = image_data_jint[i];
         }
         llm::Image image_runner{image_data, width, height, channels};
-        images.push_back(image_runner);
+        inputs.emplace_back(llm::MultimodalInput{std::move(image_runner)});
       }
+      executorch::extension::llm::GenerationConfig config{
+          .echo = static_cast<bool>(echo),
+          .seq_len = seq_len,
+          .temperature = temperature_,
+      };
       multi_modal_runner_->generate(
-          std::move(images),
-          prompt->toStdString(),
-          seq_len,
-          [callback](std::string result) { callback->onResult(result); },
-          [callback](const llm::Stats& result) { callback->onStats(result); },
-          echo);
+          std::move(inputs),
+          config,
+          [callback](const std::string& result) { callback->onResult(result); },
+          [callback](const llm::Stats& result) { callback->onStats(result); });
     } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) {
       executorch::extension::llm::GenerationConfig config{
           .echo = static_cast<bool>(echo),
@@ -257,9 +265,10 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
       jlong start_pos,
       jint bos,
       jint eos) {
+    prefill_inputs_.emplace_back(llm::MultimodalInput{prompt->toStdString()});
     facebook::jni::local_ref<jlongArray> tuple_result =
         facebook::jni::make_long_array(2);
-    tuple_result->pin()[0] = static_cast<jint>(Error::NotSupported);
+    tuple_result->pin()[0] = static_cast<jint>(Error::Ok);
     return tuple_result;
   }
 
@@ -273,10 +282,24 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
       jint height,
       jint channels,
       jlong start_pos) {
+    std::vector<llm::Image> images;
+    auto image_size = image->size();
+    if (image_size != 0) {
+      std::vector<jint> image_data_jint(image_size);
+      std::vector<uint8_t> image_data(image_size);
+      image->getRegion(0, image_size, image_data_jint.data());
+      for (int i = 0; i < image_size; i++) {
+        image_data[i] = image_data_jint[i];
+      }
+      llm::Image image_runner{image_data, width, height, channels};
+      prefill_inputs_.emplace_back(
+          llm::MultimodalInput{std::move(image_runner)});
+    }
+
     facebook::jni::local_ref<jlongArray> tuple_result =
         facebook::jni::make_long_array(2);
 
-    tuple_result->pin()[0] = static_cast<jint>(Error::NotSupported);
+    tuple_result->pin()[0] = static_cast<jint>(Error::Ok);
     return tuple_result;
   }
 
@@ -287,10 +310,13 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
       facebook::jni::alias_ref<ExecuTorchLlmCallbackJni> callback,
       jboolean echo) {
     if (model_type_category_ == MODEL_TYPE_CATEGORY_MULTIMODAL) {
-
+      std::vector<llm::MultimodalInput> inputs = prefill_inputs_;
+      prefill_inputs_.clear();
+      inputs.emplace_back(llm::MultimodalInput{prompt->toStdString()});
       return static_cast<jint>(multi_modal_runner_->generate(
-          std::vector<llm::MultimodalInput>{llm::MultimodalInput{prompt->toStdString()}},
-          llm::GenerationConfig {.echo = static_cast<bool>(echo), .seq_len = seq_len},
+          inputs,
+          llm::GenerationConfig{
+              .echo = static_cast<bool>(echo), .seq_len = seq_len},
           [callback](const std::string& result) { callback->onResult(result); },
           [callback](const llm::Stats& stats) { callback->onStats(stats); }));
     } else if (model_type_category_ == MODEL_TYPE_CATEGORY_LLM) {
diff --git a/extension/llm/runner/multimodal_runner.cpp b/extension/llm/runner/multimodal_runner.cpp
index 2bc658692da..f6b29d42c09 100644
--- a/extension/llm/runner/multimodal_runner.cpp
+++ b/extension/llm/runner/multimodal_runner.cpp
@@ -65,8 +65,8 @@ Error MultimodalRunner::load() {
 Error MultimodalRunner::generate(
     const std::vector<MultimodalInput>& inputs,
     const GenerationConfig& config,
-    std::function<void(const std::string&)>& token_callback,
-    std::function<void(const Stats&)>& stats_callback) {
+    std::function<void(const std::string&)> token_callback,
+    std::function<void(const Stats&)> stats_callback) {
   if (inputs.empty()) {
     ET_LOG(Error, "MultimodalInput vector cannot be empty");
     return Error::InvalidArgument;
diff --git a/extension/llm/runner/multimodal_runner.h b/extension/llm/runner/multimodal_runner.h
index 186a5bf70e4..fc87a9ab18a 100644
--- a/extension/llm/runner/multimodal_runner.h
+++ b/extension/llm/runner/multimodal_runner.h
@@ -116,8 +116,8 @@ class ET_EXPERIMENTAL MultimodalRunner {
   virtual ::executorch::runtime::Error generate(
       const std::vector<MultimodalInput>& inputs,
       const GenerationConfig& config,
-      std::function<void(const std::string&)>& token_callback,
-      std::function<void(const Stats&)>& stats_callback);
+      std::function<void(const std::string&)> token_callback,
+      std::function<void(const Stats&)> stats_callback);
 
   inline void stop() {
     text_token_generator_->stop();

From 3a606e217840111b1772ed4af961bf32ccf74789 Mon Sep 17 00:00:00 2001
From: Hansong Zhang <hsz@meta.com>
Date: Wed, 3 Sep 2025 17:58:31 -0700
Subject: [PATCH 4/4] Java allow adding audio input

---
 .../executorch/extension/llm/LlmModule.java   | 17 +++++++---
 extension/android/jni/jni_layer_llama.cpp     | 32 ++++++++++++++++---
 2 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
index b014ceb75d8..d8ee0ab7482 100644
--- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
+++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java
@@ -177,7 +177,7 @@ public native int generate(
    * @throws RuntimeException if the prefill failed
    */
   public long prefillImages(int[] image, int width, int height, int channels, long startPos) {
-    long[] nativeResult = prefillImagesNative(image, width, height, channels, startPos);
+    long[] nativeResult = addImageInputNative(image, width, height, channels, startPos);
     if (nativeResult[0] != 0) {
       throw new RuntimeException("Prefill failed with error code: " + nativeResult[0]);
     }
@@ -185,7 +185,7 @@ public long prefillImages(int[] image, int width, int height, int channels, long
   }
 
   // returns a tuple of (status, updated startPos)
-  private native long[] prefillImagesNative(
+  private native long[] addImageInputNative(
       int[] image, int width, int height, int channels, long startPos);
 
   /**
@@ -200,7 +200,7 @@ private native long[] prefillImagesNative(
    * @throws RuntimeException if the prefill failed
    */
   public long prefillPrompt(String prompt, long startPos, int bos, int eos) {
-    long[] nativeResult = prefillPromptNative(prompt, startPos, bos, eos);
+    long[] nativeResult = addTextInputNative(prompt, startPos, bos, eos);
     if (nativeResult[0] != 0) {
       throw new RuntimeException("Prefill failed with error code: " + nativeResult[0]);
     }
@@ -208,7 +208,10 @@ public long prefillPrompt(String prompt, long startPos, int bos, int eos) {
   }
 
   // returns a tuple of (status, updated startPos)
-  private native long[] prefillPromptNative(String prompt, long startPos, int bos, int eos);
+  private native long[] addTextInputNative(String prompt, long startPos, int bos, int eos);
+
+  // Queues audio for the next generate call; returns the status code (0 means success).
+  private native int addAudioInputNative(int[] audio, int batchSize, int nBins, int nFrames);
 
   /**
    * Generate tokens from the given prompt, starting from the given position.
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index 0c3550f151a..aa5f6052225 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -260,7 +260,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
   // Returns a tuple of (error, start_pos)
   // Contract is valid within an AAR (JNI + corresponding Java code)
   // If the first element is not Error::Ok, the other element is undefined.
-  facebook::jni::local_ref<jlongArray> prefill_prompt(
+  facebook::jni::local_ref<jlongArray> add_text_input(
       facebook::jni::alias_ref<jstring> prompt,
       jlong start_pos,
       jint bos,
@@ -276,7 +276,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
   // Contract is valid within an AAR (JNI + corresponding Java code)
   // If the first element is not Error::Ok, the other element is undefined.
 
-  facebook::jni::local_ref<jlongArray> prefill_images(
+  facebook::jni::local_ref<jlongArray> add_images_input(
       facebook::jni::alias_ref<jintArray> image,
       jint width,
       jint height,
@@ -303,6 +303,28 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
     return tuple_result;
   }
 
+  // Queues an audio clip for the next generate call and returns the status
+  // code. An empty array queues nothing. Always returns Error::Ok.
+  jint add_audio_input(
+      facebook::jni::alias_ref<jintArray> audio,
+      jint batch_size,
+      jint n_bins,
+      jint n_frames) {
+    const auto audio_size = audio->size();
+    if (audio_size != 0) {
+      std::vector<jint> audio_data_jint(audio_size);
+      audio->getRegion(0, audio_size, audio_data_jint.data());
+      // Narrow every jint sample to one byte (values wrap modulo 256).
+      std::vector<uint8_t> audio_data(
+          audio_data_jint.begin(), audio_data_jint.end());
+      // Move the buffer and pass the input as a temporary to avoid copies.
+      prefill_inputs_.emplace_back(llm::make_audio_input(
+          llm::Audio{std::move(audio_data), batch_size, n_bins, n_frames}));
+    }
+
+    return static_cast<jint>(Error::Ok);
+  }
+
   jint generate_from_pos(
       facebook::jni::alias_ref<jstring> prompt,
       jint seq_len,
@@ -359,9 +381,11 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
         makeNativeMethod("stop", ExecuTorchLlmJni::stop),
         makeNativeMethod("load", ExecuTorchLlmJni::load),
         makeNativeMethod(
-            "prefillImagesNative", ExecuTorchLlmJni::prefill_images),
+            "addImageInputNative", ExecuTorchLlmJni::add_images_input),
+        makeNativeMethod(
+            "addTextInputNative", ExecuTorchLlmJni::add_text_input),
         makeNativeMethod(
-            "prefillPromptNative", ExecuTorchLlmJni::prefill_prompt),
+            "addAudioInputNative", ExecuTorchLlmJni::add_audio_input),
         makeNativeMethod(
             "generateFromPos", ExecuTorchLlmJni::generate_from_pos),
     });