Skip to content

Commit 7508ccb

Browse files
larryliu0820 and facebook-github-bot
authored and committed
Add a generic multimodal runner
Summary: This diff adds a generic multimodal runner for Executorch. It includes changes to the `image_prefiller.h` file, which adds a `prefill` method that takes an `Image` object and returns the next token of the LLM module after prefill. It also includes changes to the `multimodal_runner.cpp` file, which implements the `MultimodalRunner` class for multimodal input and text output LLMs. The `MultimodalRunner` class uses the `ImagePrefiller`, `TextPrefiller` classes to prefill the KV cache of the model, then uses `TextTokenGenerator` to run the autoregressive generation loop. See diagram: ``` ┌─────────────────┐ │ IRunner │ │ <<interface>> │ │ │ │ + is_loaded() │ │ + load() │ │ + generate() │ │ + stop() │ └─────────────────┘ △ │ │ implements │ │ │ │ ┌──────┴──────────┐ ┌─────────────────┐ │ TextLLMRunner │ │MultimodalRunner │ │ │ │ │ │ - tokenizer_ │ │ - tokenizer_ ┼───────┐ ┌─────┼ - module_ │ │ - module_ ┼─────┐ │ │ ┌───┼ - stats_ │ │ - stats_ ┼───┐ │ │ │ │ ┌─┼ - metadata_ │ │ - metadata_ ┼─┐ │ │ │ │ │ │ │ - temperature_ │ │ - pos_ │ │ │ │ │ │ │ │ └─────────────────┘ └─────────────────┘ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ ┌─────────────────┐ │ │ │ │ │ │ │ │TextTokenGenerat-│ │ │ │ │ │ │ │ │or │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ - tokenizer_* │ │ │ │ │ │ │ │ consists │ - text_decoder_ │ consists │ │ │ │ │ │ └──────────────►│ runner_ │◄───────────────┘ │ │ │ │ │ │ - eos_ids_ │ │ │ │ │ │ │ - use_kv_cache_ │ │ │ │ │ │ │ - stats_* │ │ │ │ │ │ │ │ │ │ │ │ │consists │ + generate() │ consists │ │ │ │ │ └────────┬────────┘ │ │ │ │ │ ┌──────────────┴───────────────┐ │ │ │ │ │ ▼ uses ▼ │ │ │ │ │ ┌─────────────────┐ ┌─────────────────┐ │ │ │ │ │ │TextDecoderRunner│ │MultimodalTextDe-│ │ │ │ │ │ │ │ │coderRunner │ │ │ │ │ │ │ - module_* │ extends │ - module_* │ │ │ │ │ └──►│ - should_stop_ │◄─────────┼ - should_stop_ │◄──┘ │ │ │ │ │ │ │ │ │ │ │ + step() │ │ + step() │ │ │ │ │ + logits_to_ │ │ + logits_to_ │ │ │ │ │ token() │ │ token() │ │ │ │ 
└─────────────────┘ └─────────────────┘ │ │ │ ▲ ▲ │ │ │ │ uses │ │ │ │ └──────────────┬──────────────┘ │ │ │ ┌───────┴─────────┐ │ │ │ │ TextPrefiller │ │ │ │ │ │ │ │ │ │ - text_decoder_ │ │ │ │ consists │ runner_ │ consists │ │ └───────────────────►│ - use_kv_cache_ │◄──────────────────┘ │ │ - enable_ │ │ │ parallel_ │ │ │ prefill_ │ │ │ │ │ │ + prefill() │ │ └─────────────────┘ consists │ │ │ ┌─────────────────┐ │ │ ImagePrefiller │ │ │ │ │ │ - module_* │ │ │ │◄──────┘ │ + prefill() │ │ + logits_to_ │ │ token() │ └─────────────────┘ ``` Differential Revision: D79231625
1 parent 72ef7b1 commit 7508ccb

File tree

10 files changed

+1377
-88
lines changed

10 files changed

+1377
-88
lines changed

extension/llm/runner/README.md

Lines changed: 541 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
// Given an image tensor, prefill the KV cache of LLaVA.
10+
11+
#include <executorch/extension/llm/runner/constants.h>
12+
#include <executorch/extension/llm/runner/image_prefiller.h>
13+
#include <executorch/extension/tensor/tensor.h>
14+
15+
namespace executorch::extension::llm {
16+
/**
 * Prefill an LLM Module with the given image input.
 *
 * Runs the image encoder method on the raw image bytes, then feeds the
 * resulting embeddings (together with the current KV-cache position) into
 * the text-model method, and finally samples the next token from the
 * returned logits.
 *
 * @param image The image input to LLaVa; its data is wrapped as a
 *        {3, height, width} Byte tensor (assumes 3 channels — TODO confirm).
 * @param start_pos The starting position in the KV cache of the input in the
 *        LLM. Passed by reference and advanced past the image embeddings
 *        before returning.
 * @return The next token sampled from the prefill logits (not the raw
 *         logits tensor).
 */
::executorch::runtime::Result<uint64_t> ImagePrefiller::prefill(
    ::executorch::extension::llm::Image& image,
    int64_t& start_pos) {
  // Zero-copy view over the caller-owned image buffer; `image` must outlive
  // this call.
  auto image_tensor = executorch::extension::from_blob(
      image.data.data(),
      {3, image.height, image.width},
      ::executorch::aten::ScalarType::Byte);
  // Run image encoder
  auto image_encoder_outputs =
      ET_UNWRAP(module_->execute(kImageEncoderMethod, image_tensor));

  // inputs:[start_pos, embeds]
  // NOTE: this tensor aliases `start_pos` itself, so it must be created
  // before the execute() call below and before start_pos is mutated.
  auto start_pos_tensor = executorch::extension::from_blob(
      &start_pos, {1}, ::executorch::aten::ScalarType::Long);

  // Run text model
  auto outputs_res = ET_UNWRAP(module_->execute(
      kTextModelMethod, {start_pos_tensor, image_encoder_outputs[0]}));
  ET_CHECK_MSG(
      outputs_res[0].isTensor(),
      "Non Tensor Output returned from executing image prefill");

  // Advance the caller's KV-cache position by the image-embedding sequence
  // length (size(1) — presumably [batch, seq_len, dim]; verify against the
  // encoder's output layout).
  start_pos += image_encoder_outputs[0].toTensor().size(1);

  return logits_to_token(outputs_res[0].toTensor());
}
50+
51+
/**
52+
* Load the Module for image prefill purpose.
53+
* @return The error code.
54+
*/
55+
::executorch::runtime::Error ImagePrefiller::load() {
56+
if (is_method_loaded()) {
57+
return ::executorch::runtime::Error::Ok;
58+
}
59+
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kImageEncoderMethod));
60+
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod));
61+
return ::executorch::runtime::Error::Ok;
62+
}
63+
64+
/**
 * Check if the required methods in the Module are loaded.
 *
 * NOTE: this check is fatal on malformed models — it aborts (via
 * ET_CHECK_MSG) if the method names cannot be read, or if either required
 * method is absent from the model, rather than returning false. False is
 * returned only when both methods exist but are not yet loaded.
 *
 * @return True if both kImageEncoderMethod and kTextModelMethod are loaded,
 *         false otherwise.
 */
bool ImagePrefiller::is_method_loaded() {
  ::executorch::runtime::Result<std::unordered_set<std::string>> methods_res =
      module_->method_names();
  if (methods_res.error() != ::executorch::runtime::Error::Ok) {
    // Fatal: cannot enumerate methods at all.
    ET_CHECK_MSG(false, "Failed to get method names");
  }
  std::unordered_set<std::string> methods = methods_res.get();
  bool methods_exist = methods.find(kImageEncoderMethod) != methods.end() &&
      methods.find(kTextModelMethod) != methods.end();
  if (!methods_exist) {
    // Log every method that *is* present to aid debugging the model file.
    for (const auto& method : methods) {
      ET_LOG(Error, "Method: %s", method.c_str());
    }
    // methods_exist is known false here, so this check always aborts.
    ET_CHECK_MSG(
        methods_exist,
        "Missing required methods (%s, %s) in the model",
        kImageEncoderMethod,
        kTextModelMethod);
  }
  bool methods_loaded = module_->is_method_loaded(kImageEncoderMethod) &&
      module_->is_method_loaded(kTextModelMethod);
  return methods_loaded;
}
91+
92+
} // namespace executorch::extension::llm

extension/llm/runner/image_prefiller.h

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#pragma once
1212

1313
#include <executorch/extension/llm/runner/image.h>
14+
#include <executorch/extension/llm/sampler/sampler.h>
1415
#include <executorch/extension/module/module.h>
1516
#include <executorch/runtime/platform/compiler.h>
1617

@@ -31,16 +32,52 @@ class ET_EXPERIMENTAL ImagePrefiller {
3132
* It's passed as reference and will be updated inside this function.
3233
* @return The next token of the LLM Module after prefill.
3334
*/
34-
virtual ::executorch::runtime::Result<executorch::aten::Tensor> prefill(
35+
virtual ::executorch::runtime::Result<uint64_t> prefill(
3536
Image& image,
36-
int64_t& start_pos) = 0;
37+
int64_t& start_pos);
3738

38-
virtual ::executorch::runtime::Error load() = 0;
39-
virtual bool is_method_loaded() = 0;
39+
virtual ::executorch::runtime::Error load();
40+
virtual bool is_method_loaded();
4041

4142
virtual ~ImagePrefiller() = default;
4243

4344
protected:
45+
/**
 * Sample the next token from the logits tensor.
 *
 * Supports Float, Half, and BFloat16 logits. If the tensor is rank 3 it is
 * treated as [batch, seq_length, vocab_size] and the logits of the last
 * position are sampled; otherwise the tensor is assumed to already hold the
 * last position's logits.
 *
 * @param logits_tensor The logits tensor.
 * @param temperature The temperature parameter used to control randomness in
 * sampling. Defaults to 0.0f — presumably argmax/greedy sampling; verify
 * against Sampler's semantics.
 * @return The next token.
 */
inline uint64_t logits_to_token(
    const executorch::aten::Tensor& logits_tensor,
    const float temperature = 0.0f) {
  uint64_t result = 0;
  ET_SWITCH_THREE_TYPES(
      Float,
      Half,
      BFloat16,
      logits_tensor.scalar_type(),
      unused,
      "logits_to_token",
      CTYPE,
      [&]() {
        // If the logit_tensor rank is 3, the shape is [batch, seq_length,
        // vocab_size], get the last logits, sample and return. Else the model
        // outputs the last logit, directly sample and return.
        // NOTE(review): mutable_data_ptr is used for read-only access here —
        // const_data_ptr may be the better fit; confirm Sampler's signature.
        auto* logits = logits_tensor.mutable_data_ptr<CTYPE>();
        ssize_t vocab_size = logits_tensor.size(logits_tensor.dim() - 1);
        if (logits_tensor.dim() == 3) {
          auto num_tokens = logits_tensor.size(1);
          // Advance to the logits of the final sequence position.
          logits += (num_tokens - 1) * vocab_size;
        }
        // @lint-ignore CLANGTIDY facebook-hte-Deprecated
        Sampler sampler(vocab_size, temperature);
        result = sampler.sample(logits);
      });
  return result;
}
80+
4481
Module* module_;
4582
};
4683

extension/llm/runner/llm_runner_helper.cpp

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@
88

99
// Implementation of helper utilities for creating and configuring LLM runners
1010

11+
#include <executorch/extension/llm/runner/image_prefiller.h>
1112
#include <executorch/extension/llm/runner/llm_runner_helper.h>
13+
#include <executorch/extension/llm/runner/multimodal_runner.h>
14+
#include <executorch/extension/llm/runner/multimodal_text_decoder_runner.h>
1215
#include <executorch/extension/llm/runner/stats.h>
1316
#include <executorch/extension/llm/runner/text_llm_runner.h>
1417
#include <executorch/extension/llm/runner/text_prefiller.h>
@@ -205,6 +208,68 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
205208
temperature);
206209
}
207210

211+
/**
 * Creates a fully wired MultimodalRunner: loads the Module from model_path,
 * reads LLM metadata and EOS ids from it, and assembles the decoder runner,
 * text/image prefillers, and token generator around a shared Module and
 * Stats instance.
 *
 * Returns nullptr (after logging) when the tokenizer is null or not loaded.
 *
 * NOTE(review): the `temperature` parameter is accepted but not forwarded to
 * any component in this function — confirm whether it should be passed to
 * the runner or sampler, or removed from the signature.
 */
std::unique_ptr<MultimodalRunner> create_multimodal_runner(
    const std::string& model_path,
    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
    std::optional<const std::string> data_path,
    float temperature) {
  // Sanity check tokenizer
  if (!tokenizer || !tokenizer->is_loaded()) {
    ET_LOG(Error, "Tokenizer is null or not loaded");
    return nullptr;
  }

  // Create the Module
  std::unique_ptr<Module> module;
  if (data_path.has_value()) {
    module = std::make_unique<Module>(
        model_path, data_path.value(), Module::LoadMode::File);
  } else {
    module = std::make_unique<Module>(model_path, Module::LoadMode::File);
  }

  // Get metadata from Module
  ET_LOG(Info, "Reading metadata from model");
  auto metadata = get_llm_metadata(tokenizer.get(), module.get());

  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>(
      get_eos_ids(tokenizer.get(), module.get()));

  // Create text_decoder_runner
  auto text_decoder_runner =
      std::make_unique<MultimodalTextDecoderRunner>(module.get());

  // Create text_prefiller
  auto text_prefiller = std::make_unique<TextPrefiller>(
      text_decoder_runner.get(),
      metadata.at(kUseKVCache),
      metadata.at(kEnableDynamicShape),
      metadata.at(kMaxSeqLen));

  // Create image_prefiller
  auto image_prefiller = std::make_unique<ImagePrefiller>(module.get());

  // Create text_token_generator with stats
  // The raw stats pointer handed to the generator stays valid because the
  // owning unique_ptr is moved into the MultimodalRunner below.
  auto stats = std::make_unique<Stats>();
  auto text_token_generator = std::make_unique<TextTokenGenerator>(
      tokenizer.get(),
      text_decoder_runner.get(),
      metadata.at(kUseKVCache),
      std::move(eos_ids),
      stats.get());

  // Create and return the MultimodalRunner instance
  return std::make_unique<MultimodalRunner>(
      std::move(metadata),
      std::move(tokenizer),
      std::move(module),
      std::move(text_decoder_runner),
      std::move(text_prefiller),
      std::move(image_prefiller),
      std::move(text_token_generator),
      std::move(stats));
}
272+
208273
} // namespace llm
209274
} // namespace extension
210275
} // namespace executorch

extension/llm/runner/llm_runner_helper.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,25 @@ ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner> create_text_llm_runner(
103103
std::optional<const std::string> data_path = std::nullopt,
104104
float temperature = -1.0f);
105105

106+
/**
 * @brief Creates a MultimodalRunner instance with dependency injection
 *
 * This factory function creates and initializes a MultimodalRunner with all
 * necessary components for multimodal text generation.
 *
 * @param model_path Path to the model file
 * @param tokenizer Initialized tokenizer instance; must already be loaded,
 *        otherwise nullptr is returned
 * @param data_path Optional path to additional data required by the model
 * @param temperature Optional temperature parameter for controlling randomness
 *        in sampling (defaults to 0.8f)
 * @return std::unique_ptr<MultimodalRunner> Initialized MultimodalRunner
 * instance, or nullptr on failure
 */
ET_EXPERIMENTAL std::unique_ptr<MultimodalRunner> create_multimodal_runner(
    const std::string& model_path,
    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
    std::optional<const std::string> data_path = std::nullopt,
    float temperature = 0.8f);
106125
} // namespace llm
107126
} // namespace extension
108127
} // namespace executorch

0 commit comments

Comments
 (0)