From 9f8dac0649317c11744ada3cb497f098619ad216 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Wed, 16 Jul 2025 16:54:37 -0700 Subject: [PATCH] IOManager Interface (#10418) Summary: Hopefully this is sufficient for the contract. Going to do 2 follow up tests. Add a basic cpu implementation add a static attention implementation. Reviewed By: larryliu0820 Differential Revision: D73450877 --- extension/llm/runner/io_manager/TARGETS | 8 + extension/llm/runner/io_manager/io_manager.h | 158 ++++++++++++ extension/llm/runner/io_manager/targets.bzl | 22 ++ extension/llm/runner/io_manager/test/TARGETS | 25 ++ .../llm/runner/io_manager/test/targets.bzl | 10 + .../io_manager/test/test_io_manager.cpp | 230 ++++++++++++++++++ tools/cmake/cmake_deps.toml | 1 + 7 files changed, 454 insertions(+) create mode 100644 extension/llm/runner/io_manager/TARGETS create mode 100644 extension/llm/runner/io_manager/io_manager.h create mode 100644 extension/llm/runner/io_manager/targets.bzl create mode 100644 extension/llm/runner/io_manager/test/TARGETS create mode 100644 extension/llm/runner/io_manager/test/targets.bzl create mode 100644 extension/llm/runner/io_manager/test/test_io_manager.cpp diff --git a/extension/llm/runner/io_manager/TARGETS b/extension/llm/runner/io_manager/TARGETS new file mode 100644 index 00000000000..2341af9282f --- /dev/null +++ b/extension/llm/runner/io_manager/TARGETS @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. + +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/extension/llm/runner/io_manager/io_manager.h b/extension/llm/runner/io_manager/io_manager.h new file mode 100644 index 00000000000..ce158c23b6e --- /dev/null +++ b/extension/llm/runner/io_manager/io_manager.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace executorch { +namespace extension { +namespace llm { + +/** + * @brief Base class for managing input/output operations for LLM inference. + * + * IOManager provides an interface for handling the input preparation and + * output processing for both prefill and decode phases of LLM inference. + * Derived classes must implement the virtual methods to provide specific IO + * management functionality. + */ +class ET_EXPERIMENTAL IOManager { + public: + /** + * @brief Virtual destructor to allow proper cleanup in derived classes. + */ + virtual ~IOManager() = default; + + /** + * @brief Load the IO manager with method metadata for prefill and + * decode operations. + * + * @param program The program prefill and decode methods are loaded from. + * @param prefill_method The prefill method to initialize with. + * @param decode_method The decode method to initialize with. + */ + ET_NODISCARD virtual runtime::Error load( + const executorch::ET_RUNTIME_NAMESPACE::Program& program, + executorch::ET_RUNTIME_NAMESPACE::Method& prefill_method, + executorch::ET_RUNTIME_NAMESPACE::Method& decode_method) { + (void)program; + (void)prefill_method; + (void)decode_method; + return runtime::Error::Ok; + } + + /** + * @brief Reset the IO manager state. + * + * @param prefill_method The prefill method to reset with. + * @param decode_method The decode method to reset with. + */ + ET_NODISCARD virtual runtime::Error reset( + executorch::ET_RUNTIME_NAMESPACE::Method& prefill_method, + executorch::ET_RUNTIME_NAMESPACE::Method& decode_method) { + (void)prefill_method; + (void)decode_method; + return runtime::Error::Ok; + } + + /** + * @brief Prepare inputs for the prefill phase of LLM inference. + * + * @param input The input tensor containing token IDs. 
 * @param start_pos The tensor containing the starting position of the current
+   * input within the context.
+   * @param prefill_method The prefill method to prepare inputs for.
+   * @return std::vector<executorch::runtime::EValue> Vector of prepared inputs
+   * for the prefill method.
+   */
+  virtual runtime::Result<std::vector<executorch::runtime::EValue>>
+  prepare_prefill(
+      const executorch::extension::TensorPtr& input,
+      const executorch::extension::TensorPtr& start_pos,
+      executorch::ET_RUNTIME_NAMESPACE::Method& prefill_method) {
+    if (prefill_method.inputs_size() != 2) {
+      ET_LOG(
+          Error,
+          "Expected 2 inputs for prefill method, got %zu. Likely the model takes the caches or mask as an argument which this IOManager does not support.",
+          prefill_method.inputs_size());
+      return runtime::Error::InvalidState;
+    }
+    // Cpu IO Manager supports dynamic shapes for prefill, so no work to be done
+    // here.
+    return std::vector<executorch::runtime::EValue>{input, start_pos};
+  }
+
+  /**
+   * @brief Prepare inputs for the decode phase of LLM inference.
+   *
+   * @param input The input tensor containing token IDs.
+   * @param start_pos The tensor containing the starting position of the current
+   * input within the context.
+   * @param decode_method The decode method to prepare inputs for.
+   * @return std::vector<executorch::runtime::EValue> Vector of prepared inputs
+   * for the decode method.
+   */
+  virtual runtime::Result<std::vector<executorch::runtime::EValue>>
+  prepare_decode(
+      const executorch::extension::TensorPtr& input,
+      const executorch::extension::TensorPtr& start_pos,
+      executorch::ET_RUNTIME_NAMESPACE::Method& decode_method) {
+    if (decode_method.inputs_size() != 2) {
+      ET_LOG(
+          Error,
+          "Expected 2 inputs for decode method, got %zu. Likely the model takes the caches or mask as an argument which this IOManager does not support.",
+          decode_method.inputs_size());
+      return runtime::Error::InvalidState;
+    }
+    // Cpu IO Manager supports dynamic shapes for decode, so no work to be done
+    // here.
+ return std::vector{input, start_pos}; + } + + /** + * @brief Process and update internal state with outputs from the prefill + * phase. + * + * @param prefill_method The prefill method to update with outputs. + * @param model_outputs Vector of outputs from the prefill method execution. + */ + ET_NODISCARD virtual runtime::Error update_prefill( + executorch::ET_RUNTIME_NAMESPACE::Method& prefill_method, + const std::vector& model_outputs) { + (void)prefill_method; + (void)model_outputs; + // No post inference work to do. + return runtime::Error::Ok; + } + + /** + * @brief Process and update internal state with outputs from the decode + * phase. + * + * @param decode_method The decode method to update with outputs. + * @param model_outputs Vector of outputs from the decode method execution. + */ + ET_NODISCARD virtual runtime::Error update_decode( + const executorch::ET_RUNTIME_NAMESPACE::Method& decode_method, + const std::vector& model_outputs) { + (void)decode_method; + (void)model_outputs; + // No post inference work to do. + return runtime::Error::Ok; + } +}; + +} // namespace llm +} // namespace extension +} // namespace executorch diff --git a/extension/llm/runner/io_manager/targets.bzl b/extension/llm/runner/io_manager/targets.bzl new file mode 100644 index 00000000000..e538572c51b --- /dev/null +++ b/extension/llm/runner/io_manager/targets.bzl @@ -0,0 +1,22 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + + for aten in (True, False): + aten_suffix = "_aten" if aten else "" + + # Interface for IOManager. No concrete impl from this dep. 
+    runtime.cxx_library(
+        name = "io_manager" + aten_suffix,
+        exported_headers = [
+            "io_manager.h",
+        ],
+        deps = [
+            "//executorch/extension/tensor:tensor" + aten_suffix,
+            "//executorch/runtime/core/exec_aten:lib" + aten_suffix,
+            "//executorch/runtime/executor:program_no_prim_ops" + aten_suffix,
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
diff --git a/extension/llm/runner/io_manager/test/TARGETS b/extension/llm/runner/io_manager/test/TARGETS
new file mode 100644
index 00000000000..6db0a7c590b
--- /dev/null
+++ b/extension/llm/runner/io_manager/test/TARGETS
@@ -0,0 +1,24 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
+
+runtime.cxx_test(
+    name = "test_io_manager",
+    srcs = ["test_io_manager.cpp"],
+    deps = [
+        "//executorch/extension/llm/runner/io_manager:io_manager",
+        "//executorch/extension/module:module",
+        "//executorch/extension/tensor:tensor",
+        "//executorch/runtime/executor:program",
+        "//executorch/kernels/portable:generated_lib",
+    ],
+    env = {
+        "KVCACHE_CACHE_POS": "$(location fbcode//executorch/test/models:exported_programs[ModuleKVCacheCachePos.pte])",
+    }
+)
diff --git a/extension/llm/runner/io_manager/test/targets.bzl b/extension/llm/runner/io_manager/test/targets.bzl
new file mode 100644
index 00000000000..6e3ae5311b9
--- /dev/null
+++ b/extension/llm/runner/io_manager/test/targets.bzl
@@ -0,0 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + pass diff --git a/extension/llm/runner/io_manager/test/test_io_manager.cpp b/extension/llm/runner/io_manager/test/test_io_manager.cpp new file mode 100644 index 00000000000..bc265e8d083 --- /dev/null +++ b/extension/llm/runner/io_manager/test/test_io_manager.cpp @@ -0,0 +1,230 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include + +using namespace ::testing; +using executorch::extension::Module; +using executorch::extension::llm::IOManager; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::Method; +using executorch::runtime::Program; +using executorch::runtime::Result; + +// Test fixture for IOManager tests +class IOManagerTest : public Test { + protected: + void SetUp() override { + executorch::runtime::runtime_init(); + + module_ = std::make_unique(std::getenv("KVCACHE_CACHE_POS")); + io_manager_ = std::make_unique(); + auto err = module_->load_method("forward"); + EXPECT_EQ(err, Error::Ok); + } + + protected: + std::unique_ptr module_; + + std::unique_ptr io_manager_; +}; + +// Test that load() returns Error::Ok (no-op) +TEST_F(IOManagerTest, LoadReturnsOk) { + auto* program = module_->program().get(); + auto* prefill_method = module_->method("forward").get(); + auto* decode_method = module_->method("forward").get(); + + auto result = io_manager_->load(*program, *prefill_method, *decode_method); + + EXPECT_EQ(result, Error::Ok); +} + +// Test that reset() returns Error::Ok (no-op) +TEST_F(IOManagerTest, ResetReturnsOk) { + auto* prefill_method = module_->method("forward").get(); + auto* decode_method = module_->method("forward").get(); + + auto result = 
io_manager_->reset(*prefill_method, *decode_method); + + EXPECT_EQ(result, Error::Ok); +} + +// Test that prepare_prefill() returns the input tensors when method has 2 +// inputs +TEST_F(IOManagerTest, PreparePrefillReturnsInputsWhenValidInputCount) { + auto* prefill_method = module_->method("forward").get(); + + // Create test tensors + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f}; + std::vector start_pos_data = {0}; + auto input_ptr = executorch::extension::make_tensor_ptr({1, 4}, input_data); + auto start_pos_ptr = + executorch::extension::make_tensor_ptr({1}, start_pos_data); + + auto result = + io_manager_->prepare_prefill(input_ptr, start_pos_ptr, *prefill_method); + + EXPECT_EQ(result.error(), Error::Ok); + auto outputs = result.get(); + EXPECT_EQ(outputs.size(), 2); + + // Verify that the returned EValues contain the same tensors we passed in + EXPECT_TRUE(outputs[0].isTensor()); + EXPECT_TRUE(outputs[1].isTensor()); +} + +// Test that prepare_decode() returns the input tensors when method has 2 inputs +TEST_F(IOManagerTest, PrepareDecodeReturnsInputsWhenValidInputCount) { + auto* decode_method = module_->method("forward").get(); + + // Create test tensors + std::vector input_data = {5.0f, 6.0f, 7.0f, 8.0f}; + std::vector start_pos_data = {10}; + auto input_ptr = executorch::extension::make_tensor_ptr({1, 4}, input_data); + auto start_pos_ptr = + executorch::extension::make_tensor_ptr({1}, start_pos_data); + + auto result = + io_manager_->prepare_decode(input_ptr, start_pos_ptr, *decode_method); + + EXPECT_EQ(result.error(), Error::Ok); + auto outputs = result.get(); + EXPECT_EQ(outputs.size(), 2); + + // Verify that the returned EValues contain the same tensors we passed in + EXPECT_TRUE(outputs[0].isTensor()); + EXPECT_TRUE(outputs[1].isTensor()); +} + +// Test that update_prefill() returns Error::Ok (no-op) +TEST_F(IOManagerTest, UpdatePrefillReturnsOk) { + auto* prefill_method = module_->method("forward").get(); + + // Create dummy model outputs + 
std::vector model_outputs; + std::vector output_data = {0.1f, 0.2f, 0.3f}; + auto output_tensor = + executorch::extension::make_tensor_ptr({1, 3}, output_data); + model_outputs.emplace_back(*output_tensor); + + auto result = io_manager_->update_prefill(*prefill_method, model_outputs); + + EXPECT_EQ(result, Error::Ok); +} + +// Test that update_decode() returns Error::Ok (no-op) +TEST_F(IOManagerTest, UpdateDecodeReturnsOk) { + auto* decode_method = module_->method("forward").get(); + + // Create dummy model outputs + std::vector model_outputs; + std::vector output_data = {0.4f, 0.5f, 0.6f}; + auto output_tensor = + executorch::extension::make_tensor_ptr({1, 3}, output_data); + model_outputs.emplace_back(*output_tensor); + + auto result = io_manager_->update_decode(*decode_method, model_outputs); + + EXPECT_EQ(result, Error::Ok); +} + +// Test that prepare_prefill() correctly passes through different tensor shapes +TEST_F(IOManagerTest, PreparePrefillPassesThroughDifferentTensorShapes) { + auto* prefill_method = module_->method("forward").get(); + + // Create test tensors with different shapes + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + std::vector start_pos_data = {5, 10}; + auto input_ptr = executorch::extension::make_tensor_ptr({2, 3}, input_data); + auto start_pos_ptr = + executorch::extension::make_tensor_ptr({2}, start_pos_data); + + auto result = + io_manager_->prepare_prefill(input_ptr, start_pos_ptr, *prefill_method); + + EXPECT_EQ(result.error(), Error::Ok); + auto outputs = result.get(); + EXPECT_EQ(outputs.size(), 2); + + // Verify that the returned EValues contain tensors + EXPECT_TRUE(outputs[0].isTensor()); + EXPECT_TRUE(outputs[1].isTensor()); +} + +// Test that prepare_decode() correctly passes through different tensor shapes +TEST_F(IOManagerTest, PrepareDecodePassesThroughDifferentTensorShapes) { + auto* decode_method = module_->method("forward").get(); + + // Create test tensors with different shapes + std::vector 
input_data = { + 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f}; + std::vector start_pos_data = {15, 20, 25}; + auto input_ptr = executorch::extension::make_tensor_ptr({2, 4}, input_data); + auto start_pos_ptr = + executorch::extension::make_tensor_ptr({3}, start_pos_data); + + auto result = + io_manager_->prepare_decode(input_ptr, start_pos_ptr, *decode_method); + + EXPECT_EQ(result.error(), Error::Ok); + auto outputs = result.get(); + EXPECT_EQ(outputs.size(), 2); + + // Verify that the returned EValues contain tensors + EXPECT_TRUE(outputs[0].isTensor()); + EXPECT_TRUE(outputs[1].isTensor()); +} + +// Test that update methods handle empty model outputs +TEST_F(IOManagerTest, UpdateMethodsHandleEmptyModelOutputs) { + auto* prefill_method = module_->method("forward").get(); + auto* decode_method = module_->method("forward").get(); + + // Create empty model outputs + std::vector empty_outputs; + + auto prefill_result = + io_manager_->update_prefill(*prefill_method, empty_outputs); + auto decode_result = + io_manager_->update_decode(*decode_method, empty_outputs); + + EXPECT_EQ(prefill_result, Error::Ok); + EXPECT_EQ(decode_result, Error::Ok); +} + +// Test that update methods handle multiple model outputs +TEST_F(IOManagerTest, UpdateMethodsHandleMultipleModelOutputs) { + auto* prefill_method = module_->method("forward").get(); + auto* decode_method = module_->method("forward").get(); + + // Create multiple model outputs + std::vector model_outputs; + std::vector output1_data = {0.1f, 0.2f}; + std::vector output2_data = {0.3f, 0.4f, 0.5f}; + auto output1_tensor = + executorch::extension::make_tensor_ptr({1, 2}, output1_data); + auto output2_tensor = + executorch::extension::make_tensor_ptr({1, 3}, output2_data); + model_outputs.emplace_back(*output1_tensor); + model_outputs.emplace_back(*output2_tensor); + + auto prefill_result = + io_manager_->update_prefill(*prefill_method, model_outputs); + auto decode_result = + io_manager_->update_decode(*decode_method, 
model_outputs); + + EXPECT_EQ(prefill_result, Error::Ok); + EXPECT_EQ(decode_result, Error::Ok); +} diff --git a/tools/cmake/cmake_deps.toml b/tools/cmake/cmake_deps.toml index a033fba4929..e9f06b7574e 100644 --- a/tools/cmake/cmake_deps.toml +++ b/tools/cmake/cmake_deps.toml @@ -255,6 +255,7 @@ filters = [ [targets.extension_llm_runner] buck_targets = [ "//extension/llm/runner:runner_lib", + "//extension/llm/runner/io_manager:io_manager", ] filters = [ ".cpp$",