From 154850e3ded826cf7109cbd5ca57a93ecaa7d7e8 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Tue, 16 Sep 2025 16:40:26 -0700
Subject: [PATCH 1/3] [multimodal] Allow float32 image input

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
---
 examples/models/llava/main.cpp              |  16 +-
 extension/llm/runner/image.h                | 103 +++++++-
 extension/llm/runner/multimodal_prefiller.h | 245 ++++++++++++++++----
 3 files changed, 302 insertions(+), 62 deletions(-)
diff --git a/examples/models/llava/main.cpp b/examples/models/llava/main.cpp
index 6cb84aa088e..3946a629ade 100644
--- a/examples/models/llava/main.cpp
+++ b/examples/models/llava/main.cpp
@@ -81,24 +81,21 @@ void load_image(const std::string& image_path, Image& image) {
       new_height,
       0,
       channels);
   // transpose to CHW
-  image.data.resize(channels * new_width * new_height);
+  std::vector<uint8_t> chw_data(channels * new_width * new_height);
   for (int i = 0; i < new_width * new_height; ++i) {
     for (int c = 0; c < channels; ++c) {
-      image.data[c * new_width * new_height + i] =
-          resized_data[i * channels + c];
+      chw_data[c * new_width * new_height + i] = resized_data[i * channels + c];
     }
   }
-  image.width = new_width;
-  image.height = new_height;
-  image.channels = channels;
+  image = Image(std::move(chw_data), new_width, new_height, channels);
   // convert to tensor
   ET_LOG(
       Info,
       "image Channels: %" PRId32 ", Height: %" PRId32 ", Width: %" PRId32,
-      image.channels,
-      image.height,
-      image.width);
+      image.channels(),
+      image.height(),
+      image.width());
   stbi_image_free(data);
 }
 
diff --git a/extension/llm/runner/image.h b/extension/llm/runner/image.h
index 67fb8939518..dbdba273536 100644
--- a/extension/llm/runner/image.h
+++ b/extension/llm/runner/image.h
@@ -10,19 +10,159 @@
 
 #pragma once
 #include <executorch/runtime/platform/compiler.h>
+#include <cstddef>
 #include <cstdint>
+#include <utility>
+#include <variant>
 #include <vector>
 
+#include <executorch/extension/tensor/tensor.h>
+#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
+
 namespace executorch {
 namespace extension {
 namespace llm {
 
-struct ET_EXPERIMENTAL Image {
+/**
+ * An image whose pixels are stored as either uint8_t or float values. The
+ * element type is chosen by the constructor used; toTensor() reads the
+ * buffer in CHW order.
+ */
+class ET_EXPERIMENTAL Image {
+ public:
+  /// Creates an empty image: zero dimensions and an empty uint8_t buffer.
+  Image() : width_(0), height_(0), channels_(0) {}
+
+  /// Takes ownership of uint8_t pixel data.
+  Image(
+      std::vector<uint8_t>&& data,
+      int32_t width,
+      int32_t height,
+      int32_t channels)
+      : data_(std::move(data)),
+        width_(width),
+        height_(height),
+        channels_(channels) {}
+
+  /// Takes ownership of float pixel data.
+  Image(
+      std::vector<float>&& data,
+      int32_t width,
+      int32_t height,
+      int32_t channels)
+      : data_(std::move(data)),
+        width_(width),
+        height_(height),
+        channels_(channels) {}
+
+  int32_t width() const {
+    return width_;
+  }
+  int32_t height() const {
+    return height_;
+  }
+  int32_t channels() const {
+    return channels_;
+  }
+
+  /// True if the pixels are stored as uint8_t.
+  bool is_uint8() const {
+    return std::holds_alternative<std::vector<uint8_t>>(data_);
+  }
+
+  /// True if the pixels are stored as float.
+  bool is_float() const {
+    return std::holds_alternative<std::vector<float>>(data_);
+  }
+
+  // The typed accessors below use std::get, which fails if the image holds the
+  // other element type. Check is_uint8() / is_float() before calling them.
+  const std::vector<uint8_t>& get_uint8_data() const& {
+    return std::get<std::vector<uint8_t>>(data_);
+  }
+
+  std::vector<uint8_t>& get_uint8_data() & {
+    return std::get<std::vector<uint8_t>>(data_);
+  }
+
+  const std::vector<float>& get_float_data() const& {
+    return std::get<std::vector<float>>(data_);
+  }
+
+  std::vector<float>& get_float_data() & {
+    return std::get<std::vector<float>>(data_);
+  }
+
+  /**
+   * Wraps the pixel buffer in a tensor of shape {C, H, W}, or {1, C, H, W}
+   * when with_batch is true. The tensor is created with from_blob() over this
+   * image's own buffer, so the image must outlive the returned tensor.
+   *
+   * @param with_batch Whether to prepend a batch dimension of size 1.
+   * @return The tensor; Error::InvalidArgument if a dimension is negative or
+   * the buffer holds fewer than C * H * W elements; Error::NotSupported if the
+   * image holds neither uint8_t nor float data.
+   */
+  executorch::runtime::Result<executorch::extension::TensorPtr> toTensor(
+      bool with_batch = false) const {
+    std::vector<executorch::aten::SizesType> sizes = {
+        channels(), height(), width()};
+    if (with_batch) {
+      sizes.insert(sizes.begin(), 1);
+    }
+    if (is_float()) {
+      const std::vector<float>& pixels = get_float_data();
+      if (!covers_shape(pixels.size())) {
+        return ::executorch::runtime::Error::InvalidArgument;
+      }
+      // from_blob() takes a non-const pointer, hence the const_cast.
+      return executorch::extension::from_blob(
+          const_cast<float*>(pixels.data()),
+          sizes,
+          ::executorch::aten::ScalarType::Float);
+    } else if (is_uint8()) {
+      const std::vector<uint8_t>& pixels = get_uint8_data();
+      if (!covers_shape(pixels.size())) {
+        return ::executorch::runtime::Error::InvalidArgument;
+      }
+      return executorch::extension::from_blob(
+          const_cast<uint8_t*>(pixels.data()),
+          sizes,
+          ::executorch::aten::ScalarType::Byte);
+    }
+    ET_LOG(
+        Error, "Image data is not initialized with uint8_t or float vector.");
+    return ::executorch::runtime::Error::NotSupported;
+  }
+
+ private:
+  /// Returns true if a buffer of `numel` elements can back a C x H x W tensor,
+  /// logging an error otherwise. Guards from_blob() against describing more
+  /// elements than the buffer actually holds.
+  bool covers_shape(size_t numel) const {
+    if (channels_ < 0 || height_ < 0 || width_ < 0) {
+      ET_LOG(Error, "Image dimensions must not be negative.");
+      return false;
+    }
+    // H * W fits in 64 bits for int32_t inputs; dividing by it instead of
+    // multiplying by C keeps the comparison free of overflow.
+    const uint64_t plane =
+        static_cast<uint64_t>(height_) * static_cast<uint64_t>(width_);
+    if (plane == 0 || channels_ == 0) {
+      return true;
+    }
+    if (numel / plane < static_cast<uint64_t>(channels_)) {
+      ET_LOG(Error, "Image buffer is smaller than channels * height * width.");
+      return false;
+    }
+    return true;
+  }
+
-  // Assuming NCHW format
-  std::vector<uint8_t> data;
-  int32_t width;
-  int32_t height;
-  int32_t channels;
+  // Pixel storage; toTensor() reads it in CHW order.
+  std::variant<std::vector<uint8_t>, std::vector<float>> data_;
+  int32_t width_;
+  int32_t height_;
+  int32_t channels_;
 };
 
 } // namespace llm
diff --git a/extension/llm/runner/multimodal_prefiller.h b/extension/llm/runner/multimodal_prefiller.h
index dbfa2ec7ca3..3f8777d4acf 100644
--- a/extension/llm/runner/multimodal_prefiller.h
+++ b/extension/llm/runner/multimodal_prefiller.h
@@ -6,56 +6,207 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// Generic encoder prefiller that handles multimodal inputs (image and audio)
-// to prefill the KV cache of a multimodal LLM.
+// Generic encoder prefiller that handles multimodal inputs (text, image and
+// audio (to be implemented)) to prefill the KV cache of a multimodal LLM.
+// @lint-ignore-every CLANGTIDY facebook-hte-Deprecated
 
-#pragma once
-
-#include <executorch/extension/llm/runner/multimodal_decoder_runner.h>
-#include <executorch/extension/llm/runner/multimodal_input.h>
-#include <executorch/extension/llm/runner/text_decoder_runner.h>
-#include <executorch/extension/llm/sampler/sampler.h>
-#include <executorch/extension/module/module.h>
-#include <executorch/runtime/platform/compiler.h>
-#include <pytorch/tokenizers/tokenizer.h>
+#include <executorch/extension/llm/runner/constants.h>
+#include <executorch/extension/llm/runner/multimodal_prefiller.h>
+#include <executorch/extension/llm/runner/util.h>
+#include <executorch/extension/tensor/tensor.h>
 
 namespace executorch::extension::llm {
 
-using runtime::Error;
-using runtime::Result;
-using tokenizers::Tokenizer;
-
-// Assuming kv cache and parallel prefill are enabled.
-// This prefiller supports both image and audio inputs
-class ET_EXPERIMENTAL MultimodalPrefiller {
- public:
-  explicit MultimodalPrefiller(
-      Module* module,
-      MultimodalDecoderRunner* decoder_runner,
-      Tokenizer* tokenizer,
-      IOManager* io_manager);
-
-  /**
-   * Prefill an LLM Module with the given multimodal input.
-   * @param input The multimodal input (image or audio) to the multimodal LLM.
-   * @param start_pos The starting position in KV cache of the input in the LLM.
-   * It's passed as reference and will be updated inside this function.
-   * @return The next token of the LLM Module after prefill.
-   */
-  virtual Result<uint64_t> prefill(
-      const MultimodalInput& input,
-      int64_t& start_pos);
-
-  virtual Error load();
-  virtual bool is_method_loaded();
-
-  virtual ~MultimodalPrefiller() = default;
-
- protected:
-  Module* module_;
-  MultimodalDecoderRunner* text_decoder_runner_;
-  Tokenizer* tokenizer_;
-  IOManager* io_manager_;
-};
+MultimodalPrefiller::MultimodalPrefiller(
+    Module* module,
+    MultimodalDecoderRunner* decoder_runner,
+    Tokenizer* tokenizer,
+    IOManager* io_manager)
+    : module_(module),
+      text_decoder_runner_(decoder_runner),
+      tokenizer_(tokenizer),
+      io_manager_(io_manager) {}
+
+/**
+ * Prefill an LLM Module with the given multimodal input.
+ * @param input The multimodal input (text, image or audio) to the multimodal
+ * LLM.
+ * @param start_pos The starting position in KV cache of the input in the LLM
+ * @return logits of the prefill.
+ */
+Result<uint64_t> MultimodalPrefiller::prefill(
+    const MultimodalInput& input,
+    int64_t& start_pos) {
+  // 1. Run encoder model.
+  ::executorch::runtime::EValue encoder_output;
+  if (input.is_image()) {
+    Image image = input.get_image();
+
+    auto method_meta = ET_UNWRAP(
+        module_->method_meta(kImageEncoderMethod),
+        "Failed to get method_meta for %s",
+        kImageEncoderMethod);
+
+    ET_CHECK_MSG(
+        method_meta.num_inputs() > 0,
+        "Image encoder should have at least 1 input");
+    auto input_meta = ET_UNWRAP(
+        method_meta.input_tensor_meta(0),
+        "Cannot get input tensor meta at index 0");
+    auto expected_dtype = input_meta.scalar_type();
+
+    if (expected_dtype == ::executorch::aten::ScalarType::Float) {
+      ET_CHECK_MSG(
+          image.is_float(),
+          "Model expects float image data, but image has uint8_t data.");
+    } else if (expected_dtype == ::executorch::aten::ScalarType::Byte) {
+      ET_CHECK_MSG(
+          image.is_uint8(),
+          "Model expects uint8_t image data, but image has float data.");
+    } else {
+      ET_LOG(
+          Error,
+          "Unsupported image encoder input dtype: %s",
+          ::executorch::runtime::toString(expected_dtype));
+      return ::executorch::runtime::Error::NotSupported;
+    }
+
+    // The model might expect a 4D tensor (NCHW), but toTensor() returns a 3D
+    // tensor (CHW). Add a batch dimension of 1 if needed.
+    auto expected_dims = input_meta.sizes();
+    auto image_tensor = ET_UNWRAP(
+        image.toTensor(/*with_batch*/ expected_dims.size() == 4),
+        "Failed to convert image to tensor");
+
+    // Run image encoder
+    auto image_encoder_outputs =
+        ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor));
+
+    encoder_output = image_encoder_outputs[0];
+  } else if (input.is_audio()) {
+    Audio audio = input.get_audio();
+
+    // Use the original tensor shape as intended
+    auto audio_tensor = executorch::extension::from_blob(
+        audio.data.data(),
+        {audio.batch_size, audio.n_bins, audio.n_frames},
+        ::executorch::aten::ScalarType::Float);
+
+    // Run audio encoder
+    auto audio_encoder_result =
+        module_->execute(kAudioEncoderMethod, audio_tensor);
+    if (audio_encoder_result.error() != ::executorch::runtime::Error::Ok) {
+      return ::executorch::runtime::Error::Internal;
+    }
+    auto audio_encoder_outputs = audio_encoder_result.get();
+
+    encoder_output = audio_encoder_outputs[0];
+  } else if (input.is_text()) {
+    auto& text = input.get_text();
+    std::vector<uint64_t> tokens =
+        ET_UNWRAP_TOKENIZER(tokenizer_->encode(text));
+
+    auto text_tensor = executorch::extension::from_blob(
+        tokens.data(),
+        {1, static_cast<aten::SizesType>(tokens.size())},
+        ::executorch::aten::ScalarType::Long);
+
+    // Run text encoder (token embeddings)
+    auto token_embedding_outputs =
+        ET_UNWRAP(module_->execute(kTokenEmbeddingMethod, text_tensor));
+
+    encoder_output = token_embedding_outputs[0];
+  } else {
+    ET_LOG(Error, "Unsupported input type");
+    // For any other input types, return error
+    return ::executorch::runtime::Error::NotSupported;
+  }
+
+  // 2. Run decoder model for prefill.
+
+  // Get expected shape of cache position tensor, which should be the second
+  // argument
+
+  int64_t seq_len = encoder_output.toTensor().size(1);
+  if (seq_len == 0) {
+    ET_LOG(Error, "The encoder returned an empty output.");
+    return ::executorch::runtime::Error::InvalidState;
+  }
+  std::vector<int64_t> cache_positions;
+
+  auto cache_position_tensor = ET_UNWRAP(populate_start_pos_or_cache_position(
+      module_, start_pos, cache_positions, seq_len, kTextModelMethod));
+
+  auto prefill_result = module_->execute(
+      kTextModelMethod, {encoder_output, cache_position_tensor});
+  if (prefill_result.error() != ::executorch::runtime::Error::Ok) {
+    return prefill_result.error();
+  }
+  // Check if prefill_outputs is empty, if it is return error and log that the
+  // specified encoder returned empty results when used to prefill decoder.
+  auto prefill_outputs = prefill_result.get();
+  if (prefill_outputs.empty()) {
+    ET_LOG(
+        Error, "Encoder returned empty results when used to prefill decoder");
+    return ::executorch::runtime::Error::InvalidState;
+  }
+  auto outputs_res = prefill_outputs[0].toTensor();
+
+  // Update start_pos, tracking the current cache position.
+  start_pos += seq_len;
+
+  return static_cast<uint64_t>(
+      text_decoder_runner_->logits_to_token(outputs_res));
+}
+
+/**
+ * Load the Module for encoder prefill purpose.
+ * @return The error code.
+ */
+::executorch::runtime::Error MultimodalPrefiller::load() {
+  if (is_method_loaded()) {
+    return ::executorch::runtime::Error::Ok;
+  }
+  // token_embeddings and text_model have to show up in method names.
+  ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTokenEmbeddingMethod));
+  ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod));
+
+  std::unordered_set<std::string> methods =
+      ET_UNWRAP(module_->method_names(), "Failed to get method names");
+
+  // Load image_encoder method if exists.
+  if (methods.find(kImageEncoderMethod) != methods.end()) {
+    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kImageEncoderMethod));
+  }
+
+  if (methods.find(kAudioEncoderMethod) != methods.end()) {
+    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kAudioEncoderMethod));
+  }
+
+  return ::executorch::runtime::Error::Ok;
+}
+
+/**
+ * Check if the required methods in the Module is loaded.
+ * @return True if the Module is loaded, false otherwise.
+ */
+bool MultimodalPrefiller::is_method_loaded() {
+  ::executorch::runtime::Result<std::unordered_set<std::string>> methods_res =
+      module_->method_names();
+  if (!module_->is_method_loaded(kTokenEmbeddingMethod)) {
+    return false;
+  }
+  if (!module_->is_method_loaded(kTextModelMethod)) {
+    return false;
+  }
+  if (methods_res.error() != ::executorch::runtime::Error::Ok) {
+    ET_CHECK_MSG(false, "Failed to get method names");
+  }
+  std::unordered_set<std::string> methods = methods_res.get();
+  if (methods.find(kImageEncoderMethod) != methods.end()) {
+    return module_->is_method_loaded(kImageEncoderMethod);
+  }
+  return true;
+}
 
 } // namespace executorch::extension::llm

From 78fa105a9109c39a997872de93faccc708323c6e Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Tue, 16 Sep 2025 16:44:01 -0700
Subject: [PATCH 2/3] Fix

---
 extension/llm/runner/multimodal_prefiller.cpp |  40 ++-
 extension/llm/runner/multimodal_prefiller.h   | 245 ++++--------------
 2 files changed, 83 insertions(+), 202 deletions(-)

diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp
index 2705a9eadff..3f8777d4acf 100644
--- a/extension/llm/runner/multimodal_prefiller.cpp
+++ b/extension/llm/runner/multimodal_prefiller.cpp
@@ -41,10 +41,48 @@ Result<uint64_t> MultimodalPrefiller::prefill(
   ::executorch::runtime::EValue encoder_output;
   if (input.is_image()) {
-    Image image = input.get_image();
-    auto image_tensor = executorch::extension::from_blob(
-        image.data.data(),
-        {3, image.height, image.width},
-        ::executorch::aten::ScalarType::Byte);
+    // Borrow the image instead of copying its pixel buffer.
+    const Image& image = input.get_image();
+
+    auto method_meta = ET_UNWRAP(
+        module_->method_meta(kImageEncoderMethod),
+        "Failed to get method_meta for %s",
+        kImageEncoderMethod);
+
+    // Report bad inputs to the caller as an error instead of aborting the
+    // process, which is what ET_CHECK_MSG would do.
+    ET_CHECK_OR_RETURN_ERROR(
+        method_meta.num_inputs() > 0,
+        InvalidArgument,
+        "Image encoder should have at least 1 input");
+    auto input_meta = ET_UNWRAP(
+        method_meta.input_tensor_meta(0),
+        "Cannot get input tensor meta at index 0");
+    auto expected_dtype = input_meta.scalar_type();
+
+    if (expected_dtype == ::executorch::aten::ScalarType::Float) {
+      ET_CHECK_OR_RETURN_ERROR(
+          image.is_float(),
+          InvalidArgument,
+          "Model expects float image data, but image has uint8_t data.");
+    } else if (expected_dtype == ::executorch::aten::ScalarType::Byte) {
+      ET_CHECK_OR_RETURN_ERROR(
+          image.is_uint8(),
+          InvalidArgument,
+          "Model expects uint8_t image data, but image has float data.");
+    } else {
+      ET_LOG(
+          Error,
+          "Unsupported image encoder input dtype: %s",
+          ::executorch::runtime::toString(expected_dtype));
+      return ::executorch::runtime::Error::NotSupported;
+    }
+
+    // The model might expect a 4D tensor (NCHW), but toTensor() returns a 3D
+    // tensor (CHW). Add a batch dimension of 1 if needed.
+    auto expected_dims = input_meta.sizes();
+    auto image_tensor = ET_UNWRAP(
+        image.toTensor(/*with_batch*/ expected_dims.size() == 4),
+        "Failed to convert image to tensor");
 
     // Run image encoder
     auto image_encoder_outputs =
diff --git a/extension/llm/runner/multimodal_prefiller.h b/extension/llm/runner/multimodal_prefiller.h
index 3f8777d4acf..dbfa2ec7ca3 100644
--- a/extension/llm/runner/multimodal_prefiller.h
+++ b/extension/llm/runner/multimodal_prefiller.h
@@ -6,207 +6,56 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// Generic encoder prefiller that handles multimodal inputs (text, image and
-// audio (to be implemented)) to prefill the KV cache of a multimodal LLM.
-// @lint-ignore-every CLANGTIDY facebook-hte-Deprecated
+// Generic encoder prefiller that handles multimodal inputs (image and audio)
+// to prefill the KV cache of a multimodal LLM.
 
-#include <executorch/extension/llm/runner/constants.h>
-#include <executorch/extension/llm/runner/multimodal_prefiller.h>
-#include <executorch/extension/llm/runner/util.h>
-#include <executorch/extension/tensor/tensor.h>
+#pragma once
 
-namespace executorch::extension::llm {
-
-MultimodalPrefiller::MultimodalPrefiller(
-    Module* module,
-    MultimodalDecoderRunner* decoder_runner,
-    Tokenizer* tokenizer,
-    IOManager* io_manager)
-    : module_(module),
-      text_decoder_runner_(decoder_runner),
-      tokenizer_(tokenizer),
-      io_manager_(io_manager) {}
-
-/**
- * Prefill an LLM Module with the given multimodal input.
- * @param input The multimodal input (text, image or audio) to the multimodal
- * LLM.
- * @param start_pos The starting position in KV cache of the input in the LLM
- * @return logits of the prefill.
- */
-Result<uint64_t> MultimodalPrefiller::prefill(
-    const MultimodalInput& input,
-    int64_t& start_pos) {
-  // 1. Run encoder model.
-  ::executorch::runtime::EValue encoder_output;
-  if (input.is_image()) {
-    Image image = input.get_image();
-
-    auto method_meta = ET_UNWRAP(
-        module_->method_meta(kImageEncoderMethod),
-        "Failed to get method_meta for %s",
-        kImageEncoderMethod);
-
-    ET_CHECK_MSG(
-        method_meta.num_inputs() > 0,
-        "Image encoder should have at least 1 input");
-    auto input_meta = ET_UNWRAP(
-        method_meta.input_tensor_meta(0),
-        "Cannot get input tensor meta at index 0");
-    auto expected_dtype = input_meta.scalar_type();
-
-    if (expected_dtype == ::executorch::aten::ScalarType::Float) {
-      ET_CHECK_MSG(
-          image.is_float(),
-          "Model expects float image data, but image has uint8_t data.");
-    } else if (expected_dtype == ::executorch::aten::ScalarType::Byte) {
-      ET_CHECK_MSG(
-          image.is_uint8(),
-          "Model expects uint8_t image data, but image has float data.");
-    } else {
-      ET_LOG(
-          Error,
-          "Unsupported image encoder input dtype: %s",
-          ::executorch::runtime::toString(expected_dtype));
-      return ::executorch::runtime::Error::NotSupported;
-    }
-
-    // The model might expect a 4D tensor (NCHW), but toTensor() returns a 3D
-    // tensor (CHW). Add a batch dimension of 1 if needed.
-    auto expected_dims = input_meta.sizes();
-    auto image_tensor = ET_UNWRAP(
-        image.toTensor(/*with_batch*/ expected_dims.size() == 4),
-        "Failed to convert image to tensor");
-
-    // Run image encoder
-    auto image_encoder_outputs =
-        ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor));
-
-    encoder_output = image_encoder_outputs[0];
-  } else if (input.is_audio()) {
-    Audio audio = input.get_audio();
-
-    // Use the original tensor shape as intended
-    auto audio_tensor = executorch::extension::from_blob(
-        audio.data.data(),
-        {audio.batch_size, audio.n_bins, audio.n_frames},
-        ::executorch::aten::ScalarType::Float);
-
-    // Run audio encoder
-    auto audio_encoder_result =
-        module_->execute(kAudioEncoderMethod, audio_tensor);
-    if (audio_encoder_result.error() != ::executorch::runtime::Error::Ok) {
-      return ::executorch::runtime::Error::Internal;
-    }
-    auto audio_encoder_outputs = audio_encoder_result.get();
-
-    encoder_output = audio_encoder_outputs[0];
-  } else if (input.is_text()) {
-    auto& text = input.get_text();
-    std::vector<uint64_t> tokens =
-        ET_UNWRAP_TOKENIZER(tokenizer_->encode(text));
-
-    auto text_tensor = executorch::extension::from_blob(
-        tokens.data(),
-        {1, static_cast<aten::SizesType>(tokens.size())},
-        ::executorch::aten::ScalarType::Long);
-
-    // Run text encoder (token embeddings)
-    auto token_embedding_outputs =
-        ET_UNWRAP(module_->execute(kTokenEmbeddingMethod, text_tensor));
+#include <executorch/extension/llm/runner/multimodal_decoder_runner.h>
+#include <executorch/extension/llm/runner/multimodal_input.h>
+#include <executorch/extension/llm/runner/text_decoder_runner.h>
+#include <executorch/extension/llm/sampler/sampler.h>
+#include <executorch/extension/module/module.h>
+#include <executorch/runtime/platform/compiler.h>
+#include <pytorch/tokenizers/tokenizer.h>
 
-    encoder_output = token_embedding_outputs[0];
-  } else {
-    ET_LOG(Error, "Unsupported input type");
-    // For any other input types, return error
-    return ::executorch::runtime::Error::NotSupported;
-  }
-
-  // 2. Run decoder model for prefill.
-
-  // Get expected shape of cache position tensor, which should be the second
-  // argument
-
-  int64_t seq_len = encoder_output.toTensor().size(1);
-  if (seq_len == 0) {
-    ET_LOG(Error, "The encoder returned an empty output.");
-    return ::executorch::runtime::Error::InvalidState;
-  }
-  std::vector<int64_t> cache_positions;
-
-  auto cache_position_tensor = ET_UNWRAP(populate_start_pos_or_cache_position(
-      module_, start_pos, cache_positions, seq_len, kTextModelMethod));
-
-  auto prefill_result = module_->execute(
-      kTextModelMethod, {encoder_output, cache_position_tensor});
-  if (prefill_result.error() != ::executorch::runtime::Error::Ok) {
-    return prefill_result.error();
-  }
-  // Check if prefill_outputs is empty, if it is return error and log that the
-  // specified encoder returned empty results when used to prefill decoder.
-  auto prefill_outputs = prefill_result.get();
-  if (prefill_outputs.empty()) {
-    ET_LOG(
-        Error, "Encoder returned empty results when used to prefill decoder");
-    return ::executorch::runtime::Error::InvalidState;
-  }
-  auto outputs_res = prefill_outputs[0].toTensor();
-
-  // Update start_pos, tracking the current cache position.
-  start_pos += seq_len;
-
-  return static_cast<uint64_t>(
-      text_decoder_runner_->logits_to_token(outputs_res));
-}
-
-/**
- * Load the Module for encoder prefill purpose.
- * @return The error code.
- */
-::executorch::runtime::Error MultimodalPrefiller::load() {
-  if (is_method_loaded()) {
-    return ::executorch::runtime::Error::Ok;
-  }
-  // token_embeddings and text_model have to show up in method names.
-  ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTokenEmbeddingMethod));
-  ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod));
-
-  std::unordered_set<std::string> methods =
-      ET_UNWRAP(module_->method_names(), "Failed to get method names");
-
-  // Load image_encoder method if exists.
-  if (methods.find(kImageEncoderMethod) != methods.end()) {
-    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kImageEncoderMethod));
-  }
-
-  if (methods.find(kAudioEncoderMethod) != methods.end()) {
-    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kAudioEncoderMethod));
-  }
-
-  return ::executorch::runtime::Error::Ok;
-}
+namespace executorch::extension::llm {
 
-/**
- * Check if the required methods in the Module is loaded.
- * @return True if the Module is loaded, false otherwise.
- */
-bool MultimodalPrefiller::is_method_loaded() {
-  ::executorch::runtime::Result<std::unordered_set<std::string>> methods_res =
-      module_->method_names();
-  if (!module_->is_method_loaded(kTokenEmbeddingMethod)) {
-    return false;
-  }
-  if (!module_->is_method_loaded(kTextModelMethod)) {
-    return false;
-  }
-  if (methods_res.error() != ::executorch::runtime::Error::Ok) {
-    ET_CHECK_MSG(false, "Failed to get method names");
-  }
-  std::unordered_set<std::string> methods = methods_res.get();
-  if (methods.find(kImageEncoderMethod) != methods.end()) {
-    return module_->is_method_loaded(kImageEncoderMethod);
-  }
-  return true;
-}
+using runtime::Error;
+using runtime::Result;
+using tokenizers::Tokenizer;
+
+// Assuming kv cache and parallel prefill are enabled.
+// This prefiller supports text, image and audio inputs.
+class ET_EXPERIMENTAL MultimodalPrefiller {
+ public:
+  explicit MultimodalPrefiller(
+      Module* module,
+      MultimodalDecoderRunner* decoder_runner,
+      Tokenizer* tokenizer,
+      IOManager* io_manager);
+
+  /**
+   * Prefill an LLM Module with the given multimodal input.
+   * @param input The multimodal input (text, image or audio) to prefill with.
+   * @param start_pos The starting position in KV cache of the input in the LLM.
+   * It's passed as reference and will be updated inside this function.
+   * @return The next token of the LLM Module after prefill.
+   */
+  virtual Result<uint64_t> prefill(
+      const MultimodalInput& input,
+      int64_t& start_pos);
+
+  virtual Error load();
+  virtual bool is_method_loaded();
+
+  virtual ~MultimodalPrefiller() = default;
+
+ protected:
+  Module* module_;
+  MultimodalDecoderRunner* text_decoder_runner_;
+  Tokenizer* tokenizer_;
+  IOManager* io_manager_;
+};
 
 } // namespace executorch::extension::llm

From 4d6d3be13bf555684c6dc73672479cabfc875d04 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Tue, 16 Sep 2025 23:50:11 -0700
Subject: [PATCH 3/3] Fix android and ios usages of image

---
 extension/android/jni/jni_layer_llama.cpp     |   2 +-
 .../Exported/ExecuTorchLLMMultimodalRunner.mm |  12 +-
 .../llm/runner/test/test_multimodal_input.cpp | 133 ++++++++----------
 3 files changed, 65 insertions(+), 82 deletions(-)

diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index 23686f01ee7..cabf30c42e4 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -268,7 +268,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
       for (int i = 0; i < image_size; i++) {
         image_data[i] = image_data_jint[i];
       }
-      llm::Image image_runner{image_data, width, height, channels};
+      llm::Image image_runner{std::move(image_data), width, height, channels};
       prefill_inputs_.emplace_back(
           llm::MultimodalInput{std::move(image_runner)});
     }
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
index dcc5dc98806..b95e480aded 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
@@ -172,12 +172,12 @@ - (BOOL)generate:(NSArray<ExecuTorchLLMMultimodalInput *> *)inputs
       case ExecuTorchLLMMultimodalInputTypeImage: {
         ExecuTorchLLMImage *image = input.image;
         std::vector<uint8_t> data((uint8_t *)image.data.bytes, (uint8_t *)image.data.bytes + image.data.length);
-        nativeInputs.emplace_back(llm::MultimodalInput(llm::Image{
-          .data = std::move(data),
-          .width = (int32_t)image.width,
-          .height = (int32_t)image.height,
-          .channels = (int32_t)image.channels
-        }));
+        nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
+          std::move(data),
+          (int32_t)image.width,
+          (int32_t)image.height,
+          (int32_t)image.channels
+        )));
         break;
       }
       default: {
diff --git a/extension/llm/runner/test/test_multimodal_input.cpp b/extension/llm/runner/test/test_multimodal_input.cpp
index 97b9cc1379e..486515175e8 100644
--- a/extension/llm/runner/test/test_multimodal_input.cpp
+++ b/extension/llm/runner/test/test_multimodal_input.cpp
@@ -16,7 +16,6 @@ using executorch::extension::llm::make_image_input;
 using executorch::extension::llm::make_text_input;
 using executorch::extension::llm::MultimodalInput;
 
-namespace {
 class MultimodalInputTest : public Test {
  protected:
   std::string createTestText() {
@@ -28,21 +27,13 @@ class MultimodalInputTest : public Test {
   }
 
   Image createTestImage() {
-    Image img;
-    img.width = 224;
-    img.height = 224;
-    img.channels = 3;
-    img.data = std::vector<uint8_t>(224 * 224 * 3, 128); // Fill with gray
-    return img;
+    std::vector<uint8_t> data(224 * 224 * 3, 128); // Fill with gray
+    return Image(std::move(data), 224, 224, 3);
   }
 
   Image createTestImageSmall() {
-    Image img;
-    img.width = 32;
-    img.height = 32;
-    img.channels = 1;
-    img.data = std::vector<uint8_t>(32 * 32, 255); // Fill with white
-    return img;
+    std::vector<uint8_t> data(32 * 32, 255); // Fill with white
+    return Image(std::move(data), 32, 32, 1);
   }
 };
 
@@ -76,28 +67,28 @@ TEST_F(MultimodalInputTest, ImageConstructorFromImage) {
   EXPECT_FALSE(input.is_text());
   EXPECT_TRUE(input.is_image());
   EXPECT_EQ(input.get_type(), MultimodalInput::Type::IMAGE);
-  EXPECT_EQ(input.get_image().width, 224);
-  EXPECT_EQ(input.get_image().height, 224);
-  EXPECT_EQ(input.get_image().channels, 3);
-  EXPECT_EQ(input.get_image().data.size(), 224 * 224 * 3);
+  EXPECT_EQ(input.get_image().width(), 224);
+  EXPECT_EQ(input.get_image().height(), 224);
+  EXPECT_EQ(input.get_image().channels(), 3);
+  EXPECT_EQ(input.get_image().get_uint8_data().size(), 224 * 224 * 3);
 }
 
 TEST_F(MultimodalInputTest, ImageConstructorFromRvalueImage) {
   Image img = createTestImage();
-  int width = img.width;
-  int height = img.height;
-  int channels = img.channels;
-  size_t data_size = img.data.size();
+  int width = img.width();
+  int height = img.height();
+  int channels = img.channels();
+  size_t data_size = img.get_uint8_data().size();
 
   MultimodalInput input(std::move(img));
 
   EXPECT_FALSE(input.is_text());
   EXPECT_TRUE(input.is_image());
   EXPECT_EQ(input.get_type(), MultimodalInput::Type::IMAGE);
-  EXPECT_EQ(input.get_image().width, width);
-  EXPECT_EQ(input.get_image().height, height);
-  EXPECT_EQ(input.get_image().channels, channels);
-  EXPECT_EQ(input.get_image().data.size(), data_size);
+  EXPECT_EQ(input.get_image().width(), width);
+  EXPECT_EQ(input.get_image().height(), height);
+  EXPECT_EQ(input.get_image().channels(), channels);
+  EXPECT_EQ(input.get_image().get_uint8_data().size(), data_size);
 }
 
 // Test copy constructor and assignment
@@ -129,10 +120,10 @@ TEST_F(MultimodalInputTest, CopyConstructorImage) {
   MultimodalInput copy(original);
 
   EXPECT_TRUE(copy.is_image());
-  EXPECT_EQ(copy.get_image().width, 224);
-  EXPECT_EQ(copy.get_image().height, 224);
-  EXPECT_EQ(copy.get_image().channels, 3);
-  EXPECT_EQ(original.get_image().width, 224); // Original should be unchanged
+  EXPECT_EQ(copy.get_image().width(), 224);
+  EXPECT_EQ(copy.get_image().height(), 224);
+  EXPECT_EQ(copy.get_image().channels(), 3);
+  EXPECT_EQ(original.get_image().width(), 224); // Original should be unchanged
 }
 
 TEST_F(MultimodalInputTest, CopyAssignmentImage) {
@@ -143,10 +134,10 @@ TEST_F(MultimodalInputTest, CopyAssignmentImage) {
   copy = original;
 
   EXPECT_TRUE(copy.is_image());
-  EXPECT_EQ(copy.get_image().width, 224);
-  EXPECT_EQ(copy.get_image().height, 224);
-  EXPECT_EQ(copy.get_image().channels, 3);
-  EXPECT_EQ(original.get_image().width, 224); // Original should be unchanged
+  EXPECT_EQ(copy.get_image().width(), 224);
+  EXPECT_EQ(copy.get_image().height(), 224);
+  EXPECT_EQ(copy.get_image().channels(), 3);
+  EXPECT_EQ(original.get_image().width(), 224); // Original should be unchanged
 }
 
 // Test move constructor and assignment
@@ -174,32 +165,32 @@ TEST_F(MultimodalInputTest, MoveAssignmentText) {
 
 TEST_F(MultimodalInputTest, MoveConstructorImage) {
   Image img = createTestImage();
-  int width = img.width;
-  int height = img.height;
-  int channels = img.channels;
+  int width = img.width();
+  int height = img.height();
+  int channels = img.channels();
   MultimodalInput original(std::move(img));
   MultimodalInput moved(std::move(original));
 
   EXPECT_TRUE(moved.is_image());
-  EXPECT_EQ(moved.get_image().width, width);
-  EXPECT_EQ(moved.get_image().height, height);
-  EXPECT_EQ(moved.get_image().channels, channels);
+  EXPECT_EQ(moved.get_image().width(), width);
+  EXPECT_EQ(moved.get_image().height(), height);
+  EXPECT_EQ(moved.get_image().channels(), channels);
 }
 
 TEST_F(MultimodalInputTest, MoveAssignmentImage) {
   Image img = createTestImage();
-  int width = img.width;
-  int height = img.height;
-  int channels = img.channels;
+  int width = img.width();
+  int height = img.height();
+  int channels = img.channels();
   MultimodalInput original(std::move(img));
   MultimodalInput moved(createTestText()); // Start with different type
 
   moved = std::move(original);
 
   EXPECT_TRUE(moved.is_image());
-  EXPECT_EQ(moved.get_image().width, width);
-  EXPECT_EQ(moved.get_image().height, height);
-  EXPECT_EQ(moved.get_image().channels, channels);
+  EXPECT_EQ(moved.get_image().width(), width);
+  EXPECT_EQ(moved.get_image().height(), height);
+  EXPECT_EQ(moved.get_image().channels(), channels);
 }
 
 // Test getter methods with correct types
@@ -227,16 +218,13 @@ TEST_F(MultimodalInputTest, GetImageWithImageInput) {
 
   // Test const lvalue reference version
   const MultimodalInput& const_input = input;
-  EXPECT_EQ(const_input.get_image().width, 224);
-
-  // Test mutable lvalue reference version
-  Image& mutable_image = input.get_image();
-  mutable_image.width = 448;
-  EXPECT_EQ(input.get_image().width, 448);
+  EXPECT_EQ(const_input.get_image().width(), 224);
+  EXPECT_EQ(const_input.get_image().height(), 224);
+  EXPECT_EQ(const_input.get_image().channels(), 3);
 
   // Test rvalue reference version
   Image moved_image = std::move(input).get_image();
-  EXPECT_EQ(moved_image.width, 448);
+  EXPECT_EQ(moved_image.width(), 224);
 }
 
 // Test getter methods with wrong types (should throw)
@@ -296,18 +284,14 @@ TEST_F(MultimodalInputTest, TryGetImageWithImageInput) {
   const MultimodalInput& const_input = input;
   const Image* image_ptr = const_input.try_get_image();
   ASSERT_NE(image_ptr, nullptr);
-  EXPECT_EQ(image_ptr->width, 224);
-  EXPECT_EQ(image_ptr->height, 224);
-  EXPECT_EQ(image_ptr->channels, 3);
+  EXPECT_EQ(image_ptr->width(), 224);
+  EXPECT_EQ(image_ptr->height(), 224);
+  EXPECT_EQ(image_ptr->channels(), 3);
 
   // Test mutable version
   Image* mutable_image_ptr = input.try_get_image();
   ASSERT_NE(mutable_image_ptr, nullptr);
-  EXPECT_EQ(mutable_image_ptr->width, 224);
-
-  // Modify through pointer
-  mutable_image_ptr->width = 448;
-  EXPECT_EQ(input.get_image().width, 448);
+  EXPECT_EQ(mutable_image_ptr->width(), 224);
 }
 
 TEST_F(MultimodalInputTest, TryGetImageWithTextInput) {
@@ -344,22 +328,22 @@ TEST_F(MultimodalInputTest, MakeImageInputFromImage) {
   MultimodalInput input = make_image_input(img);
 
   EXPECT_TRUE(input.is_image());
-  EXPECT_EQ(input.get_image().width, 224);
-  EXPECT_EQ(input.get_image().height, 224);
-  EXPECT_EQ(input.get_image().channels, 3);
+  EXPECT_EQ(input.get_image().width(), 224);
+  EXPECT_EQ(input.get_image().height(), 224);
+  EXPECT_EQ(input.get_image().channels(), 3);
 }
 
 TEST_F(MultimodalInputTest, MakeImageInputFromRvalueImage) {
   Image img = createTestImage();
-  int width = img.width;
-  int height = img.height;
-  int channels = img.channels;
+  int width = img.width();
+  int height = img.height();
+  int channels = img.channels();
   MultimodalInput input = make_image_input(std::move(img));
 
   EXPECT_TRUE(input.is_image());
-  EXPECT_EQ(input.get_image().width, width);
-  EXPECT_EQ(input.get_image().height, height);
-  EXPECT_EQ(input.get_image().channels, channels);
+  EXPECT_EQ(input.get_image().width(), width);
+  EXPECT_EQ(input.get_image().height(), height);
+  EXPECT_EQ(input.get_image().channels(), channels);
 }
 
 // Test with different image sizes
@@ -368,10 +352,10 @@ TEST_F(MultimodalInputTest, DifferentImageSizes) {
   MultimodalInput input(small_img);
 
   EXPECT_TRUE(input.is_image());
-  EXPECT_EQ(input.get_image().width, 32);
-  EXPECT_EQ(input.get_image().height, 32);
-  EXPECT_EQ(input.get_image().channels, 1);
-  EXPECT_EQ(input.get_image().data.size(), 32 * 32);
+  EXPECT_EQ(input.get_image().width(), 32);
+  EXPECT_EQ(input.get_image().height(), 32);
+  EXPECT_EQ(input.get_image().channels(), 1);
+  EXPECT_EQ(input.get_image().get_uint8_data().size(), 32 * 32);
 }
 
 // Test with empty text
@@ -424,11 +408,10 @@ TEST_F(MultimodalInputTest, AssignmentBetweenTypes) {
   // Assign image to text input
   input = MultimodalInput(img);
   EXPECT_TRUE(input.is_image());
-  EXPECT_EQ(input.get_image().width, 224);
+  EXPECT_EQ(input.get_image().width(), 224);
 
   // Assign text back to image input
   input = MultimodalInput(text);
   EXPECT_TRUE(input.is_text());
   EXPECT_EQ(input.get_text(), text);
 }
-} // namespace