
Commit 69e5c40

larryliu0820 authored and facebook-github-bot committed
Add a generic multimodal runner (#13166)
Summary: This diff adds a generic multimodal runner for ExecuTorch. It includes changes to the `multimodal_prefiller.h` file, which adds a `prefill` method that takes a `MultimodalInput` object and returns the next token of the LLM module after prefill. It also includes changes to the `multimodal_runner.cpp` file, which implements the `MultimodalRunner` class for multimodal input and text output LLMs. The `MultimodalRunner` class uses the `MultimodalPrefiller` to prefill the KV cache of the model, then uses `TextTokenGenerator` to run the autoregressive generation loop. See diagram:

```
IRunner <<interface>>
  + is_loaded()  + load()  + generate()  + stop()
    △ implemented by TextLLMRunner and MultimodalRunner

TextLLMRunner (implements IRunner)
  - tokenizer_  - module_  - stats_  - metadata_  - temperature_
  consists of: TextTokenGenerator, TextDecoderRunner, TextPrefiller

MultimodalRunner (implements IRunner)
  - tokenizer_  - module_  - stats_  - metadata_  - pos_
  consists of: TextTokenGenerator, MultimodalDecoderRunner, MultimodalPrefiller

TextTokenGenerator (shared by both runners)
  - tokenizer_*  - text_decoder_runner_  - eos_ids_  - use_kv_cache_  - stats_*
  + generate()
  uses: TextDecoderRunner / MultimodalDecoderRunner

TextDecoderRunner
  - module_*  - should_stop_
  + step()  + logits_to_token()

MultimodalDecoderRunner (extends TextDecoderRunner)
  - module_*  - should_stop_
  + step()  + logits_to_token()

TextPrefiller (uses TextDecoderRunner)
  - text_decoder_runner_  - use_kv_cache_  - enable_parallel_prefill_
  + prefill()

MultimodalPrefiller (uses MultimodalDecoderRunner)
  - module_*
  + prefill()  + logits_to_token()
```

Reviewed By: jackzhxng

Differential Revision: D79231625
1 parent fe84495 · commit 69e5c40
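For orientation, the snippet below sketches how a client might drive the new runner end to end: load a tokenizer, build the runner via the new factory, and stream tokens from `generate()`. This is a minimal sketch, not code from this diff: the model and tokenizer paths are placeholders, and the `load_tokenizer` helper and the text constructor of `MultimodalInput` are assumed from the runner headers this stack touches.

```cpp
#include <executorch/extension/llm/runner/llm_runner_helper.h>
#include <executorch/extension/llm/runner/multimodal_runner.h>

using namespace ::executorch::extension::llm;

int main() {
  // Placeholder paths: any exported multimodal .pte and its tokenizer file.
  auto tokenizer = load_tokenizer("tokenizer.model");  // assumed helper
  auto runner = create_multimodal_runner(
      "multimodal_model.pte", std::move(tokenizer));
  if (!runner || runner->load() != ::executorch::runtime::Error::Ok) {
    return 1;
  }

  // Inputs are prefilled in order; image inputs would be appended the same
  // way. The text constructor of MultimodalInput is an assumption here.
  std::vector<MultimodalInput> inputs;
  inputs.emplace_back(MultimodalInput("What is the capital of France? "));

  GenerationConfig config;  // defaults
  std::function<void(const std::string&)> on_token =
      [](const std::string& piece) { (void)piece; /* runner also prints */ };
  std::function<void(const Stats&)> on_stats = [](const Stats&) {};

  // generate() prefills the KV cache, then runs the autoregressive loop.
  runner->generate(inputs, config, on_token, on_stats);
  return 0;
}
```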

File tree

8 files changed: +889 −91 lines


extension/llm/runner/README.md

Lines changed: 527 additions & 0 deletions
Large diffs are not rendered by default.

extension/llm/runner/llm_runner_helper.cpp

Lines changed: 63 additions & 0 deletions
@@ -8,7 +8,11 @@
 
 // Implementation of helper utilities for creating and configuring LLM runners
 
+#include <executorch/extension/llm/runner/image_prefiller.h>
 #include <executorch/extension/llm/runner/llm_runner_helper.h>
+#include <executorch/extension/llm/runner/multimodal_decoder_runner.h>
+#include <executorch/extension/llm/runner/multimodal_prefiller.h>
+#include <executorch/extension/llm/runner/multimodal_runner.h>
 #include <executorch/extension/llm/runner/stats.h>
 #include <executorch/extension/llm/runner/text_llm_runner.h>
 #include <executorch/extension/llm/runner/text_prefiller.h>
@@ -205,6 +209,65 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
       temperature);
 }
 
+std::unique_ptr<MultimodalRunner> create_multimodal_runner(
+    const std::string& model_path,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    std::optional<const std::string> data_path,
+    float temperature) {
+  // Sanity check tokenizer
+  if (!tokenizer || !tokenizer->is_loaded()) {
+    ET_LOG(Error, "Tokenizer is null or not loaded");
+    return nullptr;
+  }
+
+  // Create the Module
+  std::unique_ptr<Module> module;
+  if (data_path.has_value()) {
+    module = std::make_unique<Module>(
+        model_path, data_path.value(), Module::LoadMode::File);
+  } else {
+    module = std::make_unique<Module>(model_path, Module::LoadMode::File);
+  }
+
+  // Get metadata from Module
+  ET_LOG(Info, "Reading metadata from model");
+  auto metadata = get_llm_metadata(tokenizer.get(), module.get());
+
+  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>(
+      get_eos_ids(tokenizer.get(), module.get()));
+
+  // Create IOManager
+  std::unique_ptr<IOManager> io_manager = std::make_unique<IOManager>();
+
+  // Create text_decoder_runner
+  auto text_decoder_runner =
+      std::make_unique<MultimodalDecoderRunner>(module.get(), io_manager.get());
+
+  // Create multimodal_prefiller
+  auto multimodal_prefiller = std::make_unique<MultimodalPrefiller>(
+      module.get(), text_decoder_runner.get(), tokenizer.get(), io_manager.get());
+
+  // Create text_token_generator with stats
+  auto stats = std::make_unique<Stats>();
+  auto text_token_generator = std::make_unique<TextTokenGenerator>(
+      tokenizer.get(),
+      text_decoder_runner.get(),
+      metadata.at(kUseKVCache),
+      std::move(eos_ids),
+      stats.get());
+
+  // Create and return the MultimodalRunner instance
+  return std::make_unique<MultimodalRunner>(
+      std::move(metadata),
+      std::move(tokenizer),
+      std::move(module),
+      std::move(text_decoder_runner),
+      std::move(multimodal_prefiller),
+      std::move(io_manager),
+      std::move(text_token_generator),
+      std::move(stats));
+}
+
 } // namespace llm
 } // namespace extension
 } // namespace executorch

extension/llm/runner/llm_runner_helper.h

Lines changed: 17 additions & 0 deletions
@@ -103,6 +103,23 @@ ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner> create_text_llm_runner(
     std::optional<const std::string> data_path = std::nullopt,
     float temperature = -1.0f);
 
+/**
+ * @brief Creates a MultimodalRunner instance with dependency injection
+ *
+ * This factory function creates and initializes a MultimodalRunner with all
+ * necessary components for multimodal text generation.
+ *
+ * @param model_path Path to the model file
+ * @param tokenizer Initialized tokenizer instance
+ * @param data_path Optional path to additional .ptd required by the model
+ * @return std::unique_ptr<MultimodalRunner> Initialized MultimodalRunner
+ * instance, or nullptr on failure
+ */
+ET_EXPERIMENTAL std::unique_ptr<MultimodalRunner> create_multimodal_runner(
+    const std::string& model_path,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    std::optional<const std::string> data_path = std::nullopt);
+
 } // namespace llm
 } // namespace extension
 } // namespace executorch
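As a quick illustration of this factory's contract (it returns nullptr on a bad tokenizer, and `data_path` points at an optional `.ptd` weights sidecar), here is a hedged call-site sketch; the paths are placeholders and `load_tokenizer` is assumed to come from this same helper header:

```cpp
// Sketch only: placeholder paths, assumed load_tokenizer helper.
auto tokenizer = load_tokenizer("tokenizer.model");
auto runner = create_multimodal_runner(
    "model.pte", std::move(tokenizer), /*data_path=*/"model.ptd");
if (runner == nullptr) {
  // The tokenizer was null or not loaded, so the factory refused to build.
}
```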

extension/llm/runner/multimodal_decoder_runner.h

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+#pragma once
 
 #include <executorch/extension/llm/runner/constants.h>
 #include <executorch/extension/llm/runner/text_decoder_runner.h>
extension/llm/runner/multimodal_runner.cpp

Lines changed: 187 additions & 0 deletions
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Implementation of MultimodalRunner for multimodal input and text output LLMs
+
+#include <executorch/extension/llm/runner/constants.h>
+#include <executorch/extension/llm/runner/multimodal_runner.h>
+#include <executorch/extension/llm/runner/util.h>
+#include <executorch/runtime/platform/runtime.h>
+#include <pytorch/tokenizers/hf_tokenizer.h>
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
+#include <pytorch/tokenizers/sentencepiece.h>
+#include <pytorch/tokenizers/tiktoken.h>
+
+namespace executorch {
+namespace extension {
+namespace llm {
+
+using ::executorch::extension::Module;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::Result;
+
+namespace {
+// Default preset prompt for multimodal models
+const std::string kDefaultPresetPrompt =
+    "A chat between a curious human and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the human's questions. USER: ";
+} // namespace
+
+MultimodalRunner::MultimodalRunner(
+    std::unordered_map<std::string, int64_t> metadata,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    std::unique_ptr<Module> module,
+    std::unique_ptr<MultimodalDecoderRunner> text_decoder_runner,
+    std::unique_ptr<MultimodalPrefiller> multimodal_prefiller,
+    std::unique_ptr<IOManager> io_manager,
+    std::unique_ptr<TextTokenGenerator> text_token_generator,
+    std::unique_ptr<Stats> stats)
+    : metadata_(std::move(metadata)),
+      tokenizer_(std::move(tokenizer)),
+      module_(std::move(module)),
+      text_decoder_runner_(std::move(text_decoder_runner)),
+      multimodal_prefiller_(std::move(multimodal_prefiller)),
+      io_manager_(std::move(io_manager)),
+      text_token_generator_(std::move(text_token_generator)),
+      stats_(std::move(stats)),
+      pos_(0) {}
+
+bool MultimodalRunner::is_loaded() {
+  return multimodal_prefiller_->is_method_loaded() &&
+      text_token_generator_->is_loaded();
+}
+
+Error MultimodalRunner::load() {
+  if (is_loaded()) {
+    return Error::Ok;
+  }
+  ET_CHECK_OK_OR_RETURN_ERROR(multimodal_prefiller_->load());
+  ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load());
+  return Error::Ok;
+}
+
+// Don't print with the same priority during warmup
+#define RUNNER_ET_LOG(warmup, format, ...) \
+  if (warmup) {                            \
+    ET_LOG(Debug, format, __VA_ARGS__);    \
+  } else {                                 \
+    ET_LOG(Info, format, __VA_ARGS__);     \
+  }
+
+Error MultimodalRunner::generate(
+    const std::vector<MultimodalInput>& inputs,
+    const GenerationConfig& config,
+    std::function<void(const std::string&)>& token_callback,
+    std::function<void(const Stats&)>& stats_callback) {
+  if (inputs.empty()) {
+    ET_LOG(Error, "MultimodalInput vector cannot be empty");
+    return Error::InvalidArgument;
+  }
+
+  if (!is_loaded()) {
+    stats_->model_load_start_ms = time_in_ms();
+    ET_CHECK_OK_OR_RETURN_ERROR(load());
+    stats_->model_load_end_ms = time_in_ms();
+  }
+
+  if (config.warming) {
+    ET_LOG(Info, "Doing a warmup run...");
+  }
+
+  RUNNER_ET_LOG(
+      config.warming,
+      "RSS after loading model: %f MiB (0 if unsupported)",
+      get_rss_bytes() / 1024.0 / 1024.0);
+
+  // Wrap the token_callback with print function
+  std::function<void(const std::string&)> wrapped_callback =
+      [token_callback, config](const std::string& piece) {
+        if (!config.warming) {
+          safe_printf(piece.c_str());
+          fflush(stdout);
+        }
+        if (token_callback) {
+          token_callback(piece);
+        }
+      };
+
+  // Reset internal state and start inference
+  stats_->inference_start_ms = time_in_ms();
+
+  uint64_t prefill_next_token = 0;
+  // Process multimodal inputs in order
+  for (const MultimodalInput& input : inputs) {
+    prefill_next_token = ET_UNWRAP(multimodal_prefiller_->prefill(input, pos_));
+  }
+
+  stats_->first_token_ms = time_in_ms();
+  stats_->prompt_eval_end_ms = time_in_ms();
+  stats_->num_prompt_tokens = pos_;
+
+  wrapped_callback(ET_UNWRAP_TOKENIZER(
+      tokenizer_->decode(prefill_next_token, prefill_next_token)));
+
+  RUNNER_ET_LOG(
+      config.warming,
+      "RSS after multimodal input processing: %f MiB (0 if unsupported)",
+      get_rss_bytes() / 1024.0 / 1024.0);
+
+  // Resolve max_new_tokens based on config
+  int64_t max_context_len =
+      metadata_.at(kMaxContextLen) - 0; // No start_pos offset
+  int32_t max_new_tokens = config.resolve_max_new_tokens(max_context_len, pos_);
+
+  ET_LOG(
+      Info,
+      "Max new tokens resolved: %d, pos_ %" PRId64 ", max_context_len %" PRId64,
+      max_new_tokens,
+      pos_,
+      max_context_len);
+
+  ET_CHECK_OR_RETURN_ERROR(
+      max_new_tokens > 0,
+      InvalidArgument,
+      "Max new tokens %d is less than or equal to 0",
+      max_new_tokens);
+
+  // Generate tokens using the text token generator
+  std::vector<uint64_t> prompt_tokens = {prefill_next_token};
+  int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
+      /*tokens=*/prompt_tokens,
+      /*start_pos=*/pos_,
+      /*max_new_tokens=*/max_new_tokens -
+          1, // Subtract 1 because prefill already generated 1 token
+      /*temperature=*/config.temperature,
+      /*token_callback=*/wrapped_callback));
+
+  pos_ += num_generated_tokens;
+  // Update stats
+  stats_->num_generated_tokens = num_generated_tokens;
+  // Finalize stats and call callback
+  stats_->inference_end_ms = time_in_ms();
+  if (!config.warming) {
+    printf("\n");
+  }
+
+  if (config.warming) {
+    ET_LOG(Info, "Warmup run finished!");
+  } else {
+    // Do not print report during warmup
+    print_report(*stats_);
+  }
+
+  if (stats_callback) {
+    stats_callback(*stats_);
+  }
+
+  return Error::Ok;
+}
+
+} // namespace llm
+} // namespace extension
+} // namespace executorch
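One detail worth noting in the implementation above: `config.warming` routes logs to Debug, suppresses stdout printing, and skips the perf report, so the same `generate()` call doubles as a warmup pass. A hedged sketch follows, reusing `runner` and `inputs` from the sketch near the top of this page; the `max_new_tokens` field is assumed from `GenerationConfig`:

```cpp
GenerationConfig warmup_config;
warmup_config.warming = true;      // Debug-level logs, no stdout, no report
warmup_config.max_new_tokens = 8;  // assumed GenerationConfig field

std::function<void(const std::string&)> noop_token = [](const std::string&) {};
std::function<void(const Stats&)> noop_stats = [](const Stats&) {};

// Warms up caches and allocations; stats are still collected internally,
// but the report is not printed during warmup.
runner->generate(inputs, warmup_config, noop_token, noop_stats);
```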
