pytorch
diff --git a/‎extension/llm/runner/image_prefiller.cpp‎
Lines changed: 92 additions & 0 deletions b/‎extension/llm/runner/image_prefiller.cpp‎
Lines changed: 92 additions & 0 deletions
diff --git a/‎extension/llm/runner/image_prefiller.h‎
Lines changed: 43 additions & 6 deletions b/‎extension/llm/runner/image_prefiller.h‎
Lines changed: 43 additions & 6 deletions
diff --git a/‎extension/llm/runner/targets.bzl‎
Lines changed: 3 additions & 0 deletions b/‎extension/llm/runner/targets.bzl‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎extension/llm/runner/test/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎extension/llm/runner/test/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎extension/llm/runner/test/targets.bzl‎
Lines changed: 9 additions & 0 deletions b/‎extension/llm/runner/test/targets.bzl‎
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Given an image tensor, prefill the KV cache of LLaVA.
+
+#include <executorch/extension/llm/runner/constants.h>
+#include <executorch/extension/llm/runner/image_prefiller.h>
+#include <executorch/extension/tensor/tensor.h>
+
+namespace executorch::extension::llm {
+/**
+ * Prefill an LLM Module with the given image input.
+ *
+ * Runs the image encoder method on the image, feeds the first encoder output
+ * together with `start_pos` to the text model method, and samples the next
+ * token from the first output of the text model.
+ *
+ * @param image The image input to LLaVa. Its data is handed to from_blob as a
+ * Byte tensor of shape {3, image.height, image.width}.
+ * @param start_pos The starting position in KV cache of the input in the LLM.
+ * It's passed as reference and, on success, advanced by size(1) of the image
+ * encoder output.
+ * @return The next token sampled from the logits of the image prefill, or an
+ * error if either method fails to execute or yields a non-tensor output.
+ */
+::executorch::runtime::Result<uint64_t> ImagePrefiller::prefill(
+    ::executorch::extension::llm::Image& image,
+    int64_t& start_pos) {
+  auto image_tensor = executorch::extension::from_blob(
+      image.data.data(),
+      {3, image.height, image.width},
+      ::executorch::aten::ScalarType::Byte);
+  // Run image encoder
+  auto image_encoder_outputs =
+      ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor));
+  // Validate before indexing and before toTensor() is called further down, so
+  // a malformed model is reported to the caller instead of aborting.
+  ET_CHECK_OR_RETURN_ERROR(
+      !image_encoder_outputs.empty() && image_encoder_outputs[0].isTensor(),
+      InvalidState,
+      "Non Tensor Output returned from executing image encoder");
+
+  // inputs:[start_pos, embeds]
+  auto start_pos_tensor = executorch::extension::from_blob(
+      &start_pos, {1}, ::executorch::aten::ScalarType::Long);
+
+  // Run text model
+  auto outputs_res = ET_UNWRAP(module_->execute(
+      kTextModelMethod, {start_pos_tensor, image_encoder_outputs[0]}));
+  ET_CHECK_OR_RETURN_ERROR(
+      !outputs_res.empty() && outputs_res[0].isTensor(),
+      InvalidState,
+      "Non Tensor Output returned from executing image prefill");
+
+  // Update the start_pos, which is only available inside this function.
+  // This happens only after the text model ran successfully, so a failed
+  // prefill leaves the caller's position untouched.
+  start_pos += image_encoder_outputs[0].toTensor().size(1);
+
+  // Only the first output (the logits) of the text model is consumed.
+  return logits_to_token(outputs_res[0].toTensor());
+}
+
+/**
+ * Make sure both methods needed for image prefill are loaded in the Module.
+ * Does nothing when is_method_loaded() already reports true.
+ * @return Error::Ok on success, otherwise the error from the first
+ * load_method call that failed.
+ */
+::executorch::runtime::Error ImagePrefiller::load() {
+  if (!is_method_loaded()) {
+    // Encoder first, then the text model; bail out on the first failure.
+    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kImageEncoderMethod));
+    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod));
+  }
+  return ::executorch::runtime::Error::Ok;
+}
+
+/**
+ * Check if the required methods in the Module are loaded.
+ *
+ * Aborts via ET_CHECK_MSG when the method names cannot be retrieved, or when
+ * either required method is missing from the Module; in the latter case the
+ * names of the methods that are present are logged first.
+ *
+ * @return True if both the image encoder method and the text model method
+ * are loaded, false otherwise.
+ */
+bool ImagePrefiller::is_method_loaded() {
+  ::executorch::runtime::Result<std::unordered_set<std::string>> methods_res =
+      module_->method_names();
+  ET_CHECK_MSG(methods_res.ok(), "Failed to get method names");
+
+  // Bind by reference: methods_res outlives every use, so no copy of the set
+  // is needed.
+  const std::unordered_set<std::string>& methods = methods_res.get();
+  const bool methods_exist = methods.count(kImageEncoderMethod) != 0 &&
+      methods.count(kTextModelMethod) != 0;
+  if (!methods_exist) {
+    // Log what the model does provide to make the failure below actionable.
+    for (const auto& method : methods) {
+      ET_LOG(Error, "Method: %s", method.c_str());
+    }
+    ET_CHECK_MSG(
+        methods_exist,
+        "Missing required methods (%s, %s) in the model",
+        kImageEncoderMethod,
+        kTextModelMethod);
+  }
+  return module_->is_method_loaded(kImageEncoderMethod) &&
+      module_->is_method_loaded(kTextModelMethod);
+}
+
+} // namespace executorch::extension::llm
@@ -11,6 +11,7 @@
 #pragma once
 
 #include <executorch/extension/llm/runner/image.h>
+#include <executorch/extension/llm/sampler/sampler.h>
 #include <executorch/extension/module/module.h>
 #include <executorch/runtime/platform/compiler.h>
 
@@ -21,7 +22,7 @@ namespace llm {
 // Assuming kv cache and parallel prefill are enabled.
 class ET_EXPERIMENTAL ImagePrefiller {
  public:
-  explicit ImagePrefiller(::executorch::extension::Module* module)
+  explicit ImagePrefiller(::executorch::extension::ET_MODULE_NAMESPACE::IModule* module)
       : module_(module) {}
 
   /**
@@ -31,17 +32,53 @@ class ET_EXPERIMENTAL ImagePrefiller {
    * It's passed as reference and will be updated inside this function.
    * @return The next token of the LLM Module after prefill.
    */
-  virtual ::executorch::runtime::Result<executorch::aten::Tensor> prefill(
+  virtual ::executorch::runtime::Result<uint64_t> prefill(
       Image& image,
-      int64_t& start_pos) = 0;
+      int64_t& start_pos);
 
-  virtual ::executorch::runtime::Error load() = 0;
-  virtual bool is_method_loaded() = 0;
+  virtual ::executorch::runtime::Error load();
+  virtual bool is_method_loaded();
 
   virtual ~ImagePrefiller() = default;
 
  protected:
-  Module* module_;
+  /**
+   * Pick the next token from the logits produced by the model.
+   * @param logits_tensor Logits of scalar type Float, Half or BFloat16. When
+   * its rank is 3 it is read as [batch, seq_length, vocab_size] and the
+   * logits of the last sequence position are sampled; otherwise sampling
+   * starts at the beginning of the tensor data.
+   * @param temperature Forwarded to the Sampler to control randomness in
+   * sampling.
+   * @return The sampled token.
+   */
+  inline uint64_t logits_to_token(
+      const executorch::aten::Tensor& logits_tensor,
+      const float temperature = 0.0f) {
+    uint64_t next_token = 0;
+    ET_SWITCH_THREE_TYPES(
+        Float,
+        Half,
+        BFloat16,
+        logits_tensor.scalar_type(),
+        unused,
+        "logits_to_token",
+        CTYPE,
+        [&]() {
+          const auto rank = logits_tensor.dim();
+          // The innermost dimension holds one score per vocabulary entry.
+          const ssize_t vocab_size = logits_tensor.size(rank - 1);
+          CTYPE* last_logits = logits_tensor.mutable_data_ptr<CTYPE>();
+          if (rank == 3) {
+            // Skip ahead to the row of the final sequence position.
+            last_logits += (logits_tensor.size(1) - 1) * vocab_size;
+          }
+          // @lint-ignore CLANGTIDY facebook-hte-Deprecated
+          Sampler sampler(vocab_size, temperature);
+          next_token = sampler.sample(last_logits);
+        });
+    return next_token;
+  }
+
+  ::executorch::extension::ET_MODULE_NAMESPACE::IModule* module_;
 };
 
 } // namespace llm
 
@@ -84,12 +84,15 @@ def define_common_targets():
         runtime.cxx_library(
             name = "image_prefiller" + aten_suffix,
             exported_headers = ["image_prefiller.h", "image.h"],
+            srcs = ["image_prefiller.cpp"],
             visibility = [
                 "@EXECUTORCH_CLIENTS",
             ],
             exported_deps = [
                 ":constants",
                 "//executorch/extension/module:module" + aten_suffix,
+                "//executorch/extension/tensor:tensor" + aten_suffix,
+                "//executorch/extension/llm/sampler:sampler" + aten_suffix,
             ],
         )
 
 
@@ -19,6 +19,7 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
 
 set(_test_srcs test_generation_config.cpp test_text_llm_runner.cpp
                test_text_prefiller.cpp test_text_decoder_runner.cpp
+               test_image_prefiller.cpp
 )
 
 et_cxx_test(
 
@@ -36,3 +36,12 @@ def define_common_targets():
             "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
         ],
     )
+
+    runtime.cxx_test(
+        name = "test_image_prefiller",
+        srcs = ["test_image_prefiller.cpp"],
+        deps = [
+            "//executorch/extension/llm/runner:runner_lib",
+            "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
+        ],
+    )
Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,7 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)`
`19`	`19`
`20`	`20`	`set(_test_srcs test_generation_config.cpp test_text_llm_runner.cpp`
`21`	`21`	`test_text_prefiller.cpp test_text_decoder_runner.cpp`
	`22`	`+ test_image_prefiller.cpp`
`22`	`23`	`)`
`23`	`24`
`24`	`25`	`et_cxx_test(`