/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/extension/tensor/tensor.h>
#include <executorch/runtime/executor/method.h>
#include <executorch/runtime/executor/method_meta.h>
15+ namespace executorch {
16+ namespace extension {
17+ namespace llm {
18+
19+ /* *
20+ * @brief Base class for managing input/output operations for LLM inference.
21+ *
22+ * IOManagerBase provides an interface for handling the input preparation and
23+ * output processing for both prefill and decode phases of LLM inference.
24+ * Derived classes must implement the virtual methods to provide specific IO
25+ * management functionality.
26+ */
27+ class ET_EXPERIMENTAL IOManagerBase {
28+ public:
29+ /* *
30+ * @brief Virtual destructor to allow proper cleanup in derived classes.
31+ */
32+ ET_EXPERIMENTAL virtual ~IOManagerBase () = default ;
33+
34+ /* *
35+ * @brief Initialize the IO manager with method metadata for prefill and
36+ * decode operations.
37+ *
38+ * @param prefill_method The prefill method to initialize with.
39+ * @param decode_method The decode method to initialize with.
40+ */
41+ ET_EXPERIMENTAL ET_NODISCARD virtual runtime::Error init (
42+ executorch::runtime::Method& prefill_method,
43+ executorch::runtime::Method& decode_method) = 0;
44+
45+ /* *
46+ * @brief Reset the IO manager state.
47+ *
48+ * @param prefill_method The prefill method to reset with.
49+ * @param decode_method The decode method to reset with.
50+ */
51+ ET_EXPERIMENTAL ET_NODISCARD virtual runtime::Error reset (
52+ executorch::runtime::Method& prefill_method,
53+ executorch::runtime::Method& decode_method) = 0;
54+
55+ /* *
56+ * @brief Prepare inputs for the prefill phase of LLM inference.
57+ *
58+ * @param input The input tensor containing token IDs.
59+ * @param start_pos The tensor containing the starting position of the current
60+ * input within the context.
61+ * @param prefill_method The prefill method to prepare inputs for.
62+ * @return std::vector<executorch::runtime::EValue> Vector of prepared inputs
63+ * for the prefill method.
64+ */
65+ ET_EXPERIMENTAL virtual runtime::Result<
66+ std::vector<executorch::runtime::EValue>>
67+ prepare_prefill (
68+ const executorch::extension::TensorPtr& input,
69+ const executorch::extension::TensorPtr& start_pos,
70+ executorch::runtime::Method& prefill_method) = 0 ;
71+
72+ /* *
73+ * @brief Prepare inputs for the decode phase of LLM inference.
74+ *
75+ * @param input The input tensor containing token IDs.
76+ * @param start_pos The tensor containing the starting position of the current
77+ * input within the context.
78+ * @param decode_method The decode method to prepare inputs for.
79+ * @return std::vector<executorch::runtime::EValue> Vector of prepared inputs
80+ * for the decode method.
81+ */
82+ ET_EXPERIMENTAL virtual runtime::Result<
83+ std::vector<executorch::runtime::EValue>>
84+ prepare_decode (
85+ const executorch::extension::TensorPtr& input,
86+ const executorch::extension::TensorPtr& start_pos,
87+ executorch::runtime::Method& decode_method) = 0 ;
88+
89+ /* *
90+ * @brief Process and update internal state with outputs from the prefill
91+ * phase.
92+ *
93+ * @param prefill_method The prefill method to update with outputs.
94+ * @param model_outputs Vector of outputs from the prefill method execution.
95+ */
96+ ET_EXPERIMENTAL ET_NODISCARD virtual runtime::Error update_prefill (
97+ executorch::runtime::Method& prefill_method,
98+ const std::vector<executorch::runtime::EValue>& model_outputs) = 0;
99+
100+ /* *
101+ * @brief Process and update internal state with outputs from the decode
102+ * phase.
103+ *
104+ * @param decode_method The decode method to update with outputs.
105+ * @param model_outputs Vector of outputs from the decode method execution.
106+ */
107+ ET_EXPERIMENTAL ET_NODISCARD virtual runtime::Error update_decode (
108+ const executorch::runtime::Method& decode_method,
109+ const std::vector<executorch::runtime::EValue>& model_outputs) = 0;
110+ };
111+
112+ } // namespace llm
113+ } // namespace extension
114+ } // namespace executorch