|
6 | 6 | * LICENSE file in the root directory of this source tree. |
7 | 7 | */ |
8 | 8 |
|
9 | | -// Generic encoder prefiller that handles multimodal inputs (text, image and |
10 | | -// audio (to be implemented)) to prefill the KV cache of a multimodal LLM. |
11 | | -// @lint-ignore-every CLANGTIDY facebook-hte-Deprecated |
| 9 | +// Generic encoder prefiller that handles multimodal inputs (image and audio) |
| 10 | +// to prefill the KV cache of a multimodal LLM. |
12 | 11 |
|
13 | | -#include <executorch/extension/llm/runner/constants.h> |
14 | | -#include <executorch/extension/llm/runner/multimodal_prefiller.h> |
15 | | -#include <executorch/extension/llm/runner/util.h> |
16 | | -#include <executorch/extension/tensor/tensor.h> |
| 12 | +#pragma once |
17 | 13 |
|
18 | | -namespace executorch::extension::llm { |
19 | | - |
20 | | -MultimodalPrefiller::MultimodalPrefiller( |
21 | | - Module* module, |
22 | | - MultimodalDecoderRunner* decoder_runner, |
23 | | - Tokenizer* tokenizer, |
24 | | - IOManager* io_manager) |
25 | | - : module_(module), |
26 | | - text_decoder_runner_(decoder_runner), |
27 | | - tokenizer_(tokenizer), |
28 | | - io_manager_(io_manager) {} |
29 | | - |
30 | | -/** |
31 | | - * Prefill an LLM Module with the given multimodal input. |
32 | | - * @param input The multimodal input (text, image or audio) to the multimodal |
33 | | - * LLM. |
34 | | - * @param start_pos The starting position in KV cache of the input in the LLM |
35 | | - * @return logits of the prefill. |
36 | | - */ |
37 | | -Result<uint64_t> MultimodalPrefiller::prefill( |
38 | | - const MultimodalInput& input, |
39 | | - int64_t& start_pos) { |
40 | | - // 1. Run encoder model. |
41 | | - ::executorch::runtime::EValue encoder_output; |
42 | | - if (input.is_image()) { |
43 | | - Image image = input.get_image(); |
44 | | - |
45 | | - auto method_meta = ET_UNWRAP( |
46 | | - module_->method_meta(kImageEncoderMethod), |
47 | | - "Failed to get method_meta for %s", |
48 | | - kImageEncoderMethod); |
49 | | - |
50 | | - ET_CHECK_MSG( |
51 | | - method_meta.num_inputs() > 0, |
52 | | - "Image encoder should have at least 1 input"); |
53 | | - auto input_meta = ET_UNWRAP( |
54 | | - method_meta.input_tensor_meta(0), |
55 | | - "Cannot get input tensor meta at index 0"); |
56 | | - auto expected_dtype = input_meta.scalar_type(); |
57 | | - |
58 | | - if (expected_dtype == ::executorch::aten::ScalarType::Float) { |
59 | | - ET_CHECK_MSG( |
60 | | - image.is_float(), |
61 | | - "Model expects float image data, but image has uint8_t data."); |
62 | | - } else if (expected_dtype == ::executorch::aten::ScalarType::Byte) { |
63 | | - ET_CHECK_MSG( |
64 | | - image.is_uint8(), |
65 | | - "Model expects uint8_t image data, but image has float data."); |
66 | | - } else { |
67 | | - ET_LOG( |
68 | | - Error, |
69 | | - "Unsupported image encoder input dtype: %s", |
70 | | - ::executorch::runtime::toString(expected_dtype)); |
71 | | - return ::executorch::runtime::Error::NotSupported; |
72 | | - } |
73 | | - |
74 | | - // The model might expect a 4D tensor (NCHW), but toTensor() returns a 3D |
75 | | - // tensor (CHW). Add a batch dimension of 1 if needed. |
76 | | - auto expected_dims = input_meta.sizes(); |
77 | | - auto image_tensor = ET_UNWRAP( |
78 | | - image.toTensor(/*with_batch*/ expected_dims.size() == 4), |
79 | | - "Failed to convert image to tensor"); |
80 | | - |
81 | | - // Run image encoder |
82 | | - auto image_encoder_outputs = |
83 | | - ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor)); |
84 | | - |
85 | | - encoder_output = image_encoder_outputs[0]; |
86 | | - } else if (input.is_audio()) { |
87 | | - Audio audio = input.get_audio(); |
88 | | - |
89 | | - // Use the original tensor shape as intended |
90 | | - auto audio_tensor = executorch::extension::from_blob( |
91 | | - audio.data.data(), |
92 | | - {audio.batch_size, audio.n_bins, audio.n_frames}, |
93 | | - ::executorch::aten::ScalarType::Float); |
94 | | - |
95 | | - // Run audio encoder |
96 | | - auto audio_encoder_result = |
97 | | - module_->execute(kAudioEncoderMethod, audio_tensor); |
98 | | - if (audio_encoder_result.error() != ::executorch::runtime::Error::Ok) { |
99 | | - return ::executorch::runtime::Error::Internal; |
100 | | - } |
101 | | - auto audio_encoder_outputs = audio_encoder_result.get(); |
102 | | - |
103 | | - encoder_output = audio_encoder_outputs[0]; |
104 | | - } else if (input.is_text()) { |
105 | | - auto& text = input.get_text(); |
106 | | - std::vector<uint64_t> tokens = |
107 | | - ET_UNWRAP_TOKENIZER(tokenizer_->encode(text)); |
108 | | - |
109 | | - auto text_tensor = executorch::extension::from_blob( |
110 | | - tokens.data(), |
111 | | - {1, static_cast<aten::SizesType>(tokens.size())}, |
112 | | - ::executorch::aten::ScalarType::Long); |
113 | | - |
114 | | - // Run text encoder (token embeddings) |
115 | | - auto token_embedding_outputs = |
116 | | - ET_UNWRAP(module_->execute(kTokenEmbeddingMethod, text_tensor)); |
| 14 | +#include <executorch/extension/llm/runner/multimodal_decoder_runner.h> |
| 15 | +#include <executorch/extension/llm/runner/multimodal_input.h> |
| 16 | +#include <executorch/extension/llm/runner/text_decoder_runner.h> |
| 17 | +#include <executorch/extension/llm/sampler/sampler.h> |
| 18 | +#include <executorch/extension/module/module.h> |
| 19 | +#include <executorch/runtime/platform/compiler.h> |
| 20 | +#include <pytorch/tokenizers/tokenizer.h> |
117 | 21 |
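Note: as a hedged illustration of the contract the image branch above relies on, the sketch below shows how the encoder's MethodMeta could be queried up front for both the expected pixel dtype and whether `toTensor()` needs to add a batch dimension. The helper name and signature are hypothetical, not part of this change.

```cpp
#include <executorch/extension/llm/runner/constants.h>
#include <executorch/extension/module/module.h>

using executorch::aten::ScalarType;
using executorch::extension::Module;
using executorch::extension::llm::kImageEncoderMethod;

// Hypothetical helper: report the pixel dtype the exported image_encoder
// expects, and whether its first input is 4-D (NCHW), in which case
// Image::toTensor() should be called with with_batch = true.
executorch::runtime::Result<ScalarType> expected_image_input(
    Module& module,
    bool& needs_batch_dim) {
  auto method_meta = ET_UNWRAP(module.method_meta(kImageEncoderMethod));
  if (method_meta.num_inputs() == 0) {
    return executorch::runtime::Error::InvalidArgument;
  }
  auto input_meta = ET_UNWRAP(method_meta.input_tensor_meta(0));
  needs_batch_dim = (input_meta.sizes().size() == 4);
  return input_meta.scalar_type();
}
```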
|
118 | | - encoder_output = token_embedding_outputs[0]; |
119 | | - } else { |
120 | | - ET_LOG(Error, "Unsupported input type"); |
121 | | - // For any other input types, return error |
122 | | - return ::executorch::runtime::Error::NotSupported; |
123 | | - } |
124 | | - |
125 | | - // 2. Run decoder model for prefill. |
126 | | - |
127 | | - // Get expected shape of cache position tensor, which should be the second |
128 | | - // argument |
129 | | - |
130 | | - int64_t seq_len = encoder_output.toTensor().size(1); |
131 | | - if (seq_len == 0) { |
132 | | - ET_LOG(Error, "The encoder returned an empty output."); |
133 | | - return ::executorch::runtime::Error::InvalidState; |
134 | | - } |
135 | | - std::vector<int64_t> cache_positions; |
136 | | - |
137 | | - auto cache_position_tensor = ET_UNWRAP(populate_start_pos_or_cache_position( |
138 | | - module_, start_pos, cache_positions, seq_len, kTextModelMethod)); |
139 | | - |
140 | | - auto prefill_result = module_->execute( |
141 | | - kTextModelMethod, {encoder_output, cache_position_tensor}); |
142 | | - if (prefill_result.error() != ::executorch::runtime::Error::Ok) { |
143 | | - return prefill_result.error(); |
144 | | - } |
145 | | - // Check if prefill_outputs is empty, if it is return error and log that the |
146 | | - // specified encoder returned empty results when used to prefill decoder. |
147 | | - auto prefill_outputs = prefill_result.get(); |
148 | | - if (prefill_outputs.empty()) { |
149 | | - ET_LOG( |
150 | | - Error, "Encoder returned empty results when used to prefill decoder"); |
151 | | - return ::executorch::runtime::Error::InvalidState; |
152 | | - } |
153 | | - auto outputs_res = prefill_outputs[0].toTensor(); |
154 | | - |
155 | | - // Update start_pos, tracking the current cache position. |
156 | | - start_pos += seq_len; |
157 | | - |
158 | | - return static_cast<uint64_t>( |
159 | | - text_decoder_runner_->logits_to_token(outputs_res)); |
160 | | -} |
161 | | - |
162 | | -/** |
163 | | - * Load the Module for encoder prefill purpose. |
164 | | - * @return The error code. |
165 | | - */ |
166 | | -::executorch::runtime::Error MultimodalPrefiller::load() { |
167 | | - if (is_method_loaded()) { |
168 | | - return ::executorch::runtime::Error::Ok; |
169 | | - } |
170 | | - // token_embeddings and text_model have to show up in method names. |
171 | | - ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTokenEmbeddingMethod)); |
172 | | - ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod)); |
173 | | - |
174 | | - std::unordered_set<std::string> methods = |
175 | | - ET_UNWRAP(module_->method_names(), "Failed to get method names"); |
176 | | - |
177 | | - // Load image_encoder method if exists. |
178 | | - if (methods.find(kImageEncoderMethod) != methods.end()) { |
179 | | - ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kImageEncoderMethod)); |
180 | | - } |
181 | | - |
182 | | - if (methods.find(kAudioEncoderMethod) != methods.end()) { |
183 | | - ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kAudioEncoderMethod)); |
184 | | - } |
185 | | - |
186 | | - return ::executorch::runtime::Error::Ok; |
187 | | -} |
| 22 | +namespace executorch::extension::llm { |
188 | 23 |
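Note: the cache-position tensor produced by `populate_start_pos_or_cache_position` is not shown in this diff. The sketch below is an assumed, minimal reconstruction of what a parallel prefill of `seq_len` tokens starting at `start_pos` would pass as the decoder's second argument; the helper name and out-parameter are illustrative only.

```cpp
#include <numeric>
#include <vector>

#include <executorch/extension/tensor/tensor.h>

// Illustrative sketch (assumed behavior, not the util.h implementation):
// build a cache-position tensor enumerating the KV-cache slots this prefill
// chunk occupies, i.e. [start_pos, start_pos + seq_len).
executorch::extension::TensorPtr make_cache_positions(
    std::vector<int64_t>& storage,
    int64_t start_pos,
    int64_t seq_len) {
  storage.resize(seq_len);
  std::iota(storage.begin(), storage.end(), start_pos);
  return executorch::extension::from_blob(
      storage.data(),
      {static_cast<executorch::aten::SizesType>(seq_len)},
      executorch::aten::ScalarType::Long);
}
```

The caller keeps `storage` alive until `execute()` returns, mirroring how the removed implementation keeps `cache_positions` in scope for the duration of the prefill call.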
|
189 | | -/** |
190 | | - * Check if the required methods in the Module is loaded. |
191 | | - * @return True if the Module is loaded, false otherwise. |
192 | | - */ |
193 | | -bool MultimodalPrefiller::is_method_loaded() { |
194 | | - ::executorch::runtime::Result<std::unordered_set<std::string>> methods_res = |
195 | | - module_->method_names(); |
196 | | - if (!module_->is_method_loaded(kTokenEmbeddingMethod)) { |
197 | | - return false; |
198 | | - } |
199 | | - if (!module_->is_method_loaded(kTextModelMethod)) { |
200 | | - return false; |
201 | | - } |
202 | | - if (methods_res.error() != ::executorch::runtime::Error::Ok) { |
203 | | - ET_CHECK_MSG(false, "Failed to get method names"); |
204 | | - } |
205 | | - std::unordered_set<std::string> methods = methods_res.get(); |
206 | | - if (methods.find(kImageEncoderMethod) != methods.end()) { |
207 | | - return module_->is_method_loaded(kImageEncoderMethod); |
208 | | - } |
209 | | - return true; |
210 | | -} |
| 24 | +using runtime::Error; |
| 25 | +using runtime::Result; |
| 26 | +using tokenizers::Tokenizer; |
| 27 | + |
| 28 | +// Assumes KV cache and parallel prefill are enabled. |
| 29 | +// This prefiller supports both image and audio inputs. |
| 30 | +class ET_EXPERIMENTAL MultimodalPrefiller { |
| 31 | + public: |
| 32 | + explicit MultimodalPrefiller( |
| 33 | + Module* module, |
| 34 | + MultimodalDecoderRunner* decoder_runner, |
| 35 | + Tokenizer* tokenizer, |
| 36 | + IOManager* io_manager); |
| 37 | + |
| 38 | + /** |
| 39 | + * Prefill an LLM Module with the given multimodal input. |
| 40 | + * @param input The multimodal input (image or audio) to the multimodal LLM. |
| 41 | + * @param start_pos The starting position of the input in the LLM's KV cache. |
| 42 | + * It is passed by reference and updated by this function. |
| 43 | + * @return The next token predicted by the LLM Module after prefill. |
| 44 | + */ |
| 45 | + virtual Result<uint64_t> prefill( |
| 46 | + const MultimodalInput& input, |
| 47 | + int64_t& start_pos); |
| 48 | + |
| 49 | + virtual Error load(); |
| 50 | + virtual bool is_method_loaded(); |
| 51 | + |
| 52 | + virtual ~MultimodalPrefiller() = default; |
| 53 | + |
| 54 | + protected: |
| 55 | + Module* module_; |
| 56 | + MultimodalDecoderRunner* text_decoder_runner_; |
| 57 | + Tokenizer* tokenizer_; |
| 58 | + IOManager* io_manager_; |
| 59 | +}; |
211 | 60 |
|
212 | 61 | } // namespace executorch::extension::llm |
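Note: a minimal, hypothetical caller-side sketch of the new header, assuming the owning runner has already constructed the prefiller with a Module, MultimodalDecoderRunner, Tokenizer, and IOManager, and has collected the ordered multimodal inputs elsewhere.

```cpp
#include <cstdint>
#include <vector>

#include <executorch/extension/llm/runner/multimodal_prefiller.h>

using executorch::extension::llm::MultimodalInput;
using executorch::extension::llm::MultimodalPrefiller;

// Hypothetical driver (not part of this diff): prefill a mixed sequence of
// inputs, then hand the returned token to the autoregressive decode loop.
executorch::runtime::Error prefill_all(
    MultimodalPrefiller& prefiller,
    const std::vector<MultimodalInput>& inputs,
    int64_t& start_pos,
    uint64_t& next_token) {
  ET_CHECK_OK_OR_RETURN_ERROR(prefiller.load());
  for (const MultimodalInput& input : inputs) {
    // prefill() advances start_pos by the encoded sequence length.
    auto result = prefiller.prefill(input, start_pos);
    if (!result.ok()) {
      return result.error();
    }
    next_token = result.get();
  }
  return executorch::runtime::Error::Ok;
}
```

After the loop, `next_token` seeds autoregressive decoding and `start_pos` points at the first free KV-cache slot.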