From 9f8dac0649317c11744ada3cb497f098619ad216 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Wed, 16 Jul 2025 16:54:37 -0700 Subject: [PATCH] IOManager Interface (#10418) Summary: Hopefully this is sufficient for the contract. Going to do 2 follow up tests. Add a basic cpu implementation add a static attention implementation. Reviewed By: larryliu0820 Differential Revision: D73450877 --- extension/llm/runner/io_manager/TARGETS | 8 + extension/llm/runner/io_manager/io_manager.h | 158 ++++++++++++ extension/llm/runner/io_manager/targets.bzl | 22 ++ extension/llm/runner/io_manager/test/TARGETS | 25 ++ .../llm/runner/io_manager/test/targets.bzl | 10 + .../io_manager/test/test_io_manager.cpp | 230 ++++++++++++++++++ tools/cmake/cmake_deps.toml | 1 + 7 files changed, 454 insertions(+) create mode 100644 extension/llm/runner/io_manager/TARGETS create mode 100644 extension/llm/runner/io_manager/io_manager.h create mode 100644 extension/llm/runner/io_manager/targets.bzl create mode 100644 extension/llm/runner/io_manager/test/TARGETS create mode 100644 extension/llm/runner/io_manager/test/targets.bzl create mode 100644 extension/llm/runner/io_manager/test/test_io_manager.cpp diff --git a/extension/llm/runner/io_manager/TARGETS b/extension/llm/runner/io_manager/TARGETS new file mode 100644 index 00000000000..2341af9282f --- /dev/null +++ b/extension/llm/runner/io_manager/TARGETS @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. + +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/extension/llm/runner/io_manager/io_manager.h b/extension/llm/runner/io_manager/io_manager.h new file mode 100644 index 00000000000..ce158c23b6e --- /dev/null +++ b/extension/llm/runner/io_manager/io_manager.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace executorch { +namespace extension { +namespace llm { + +/** + * @brief Base class for managing input/output operations for LLM inference. + * + * IOManager provides an interface for handling the input preparation and + * output processing for both prefill and decode phases of LLM inference. + * Derived classes must implement the virtual methods to provide specific IO + * management functionality. + */ +class ET_EXPERIMENTAL IOManager { + public: + /** + * @brief Virtual destructor to allow proper cleanup in derived classes. + */ + virtual ~IOManager() = default; + + /** + * @brief Load the IO manager with method metadata for prefill and + * decode operations. + * + * @param program The program prefill and decode methods are loaded from. + * @param prefill_method The prefill method to initialize with. + * @param decode_method The decode method to initialize with. + */ + ET_NODISCARD virtual runtime::Error load( + const executorch::ET_RUNTIME_NAMESPACE::Program& program, + executorch::ET_RUNTIME_NAMESPACE::Method& prefill_method, + executorch::ET_RUNTIME_NAMESPACE::Method& decode_method) { + (void)program; + (void)prefill_method; + (void)decode_method; + return runtime::Error::Ok; + } + + /** + * @brief Reset the IO manager state. + * + * @param prefill_method The prefill method to reset with. + * @param decode_method The decode method to reset with. + */ + ET_NODISCARD virtual runtime::Error reset( + executorch::ET_RUNTIME_NAMESPACE::Method& prefill_method, + executorch::ET_RUNTIME_NAMESPACE::Method& decode_method) { + (void)prefill_method; + (void)decode_method; + return runtime::Error::Ok; + } + + /** + * @brief Prepare inputs for the prefill phase of LLM inference. + * + * @param input The input tensor containing token IDs. 
 * @param start_pos The tensor containing the starting position of the current
+   * input within the context.
+   * @param prefill_method The prefill method to prepare inputs for.
+   * @return std::vector<executorch::runtime::EValue> Vector of prepared inputs
+   * for the prefill method.
+   */
+  virtual runtime::Result<std::vector<executorch::runtime::EValue>>
+  prepare_prefill(
+      const executorch::extension::TensorPtr& input,
+      const executorch::extension::TensorPtr& start_pos,
+      executorch::ET_RUNTIME_NAMESPACE::Method& prefill_method) {
+    if (prefill_method.inputs_size() != 2) {
+      ET_LOG(
+          Error,
+          "Expected 2 inputs for prefill method, got %zu. Likely the model takes the caches or mask as an argument which this IOManager does not support.",
+          prefill_method.inputs_size());
+      return runtime::Error::InvalidState;
+    }
+    // Cpu IO Manager supports dynamic shapes for prefill, so no work to be done
+    // here.
+    return std::vector<executorch::runtime::EValue>{input, start_pos};
+  }
+
+  /**
+   * @brief Prepare inputs for the decode phase of LLM inference.
+   *
+   * @param input The input tensor containing token IDs.
+   * @param start_pos The tensor containing the starting position of the current
+   * input within the context.
+   * @param decode_method The decode method to prepare inputs for.
+   * @return std::vector<executorch::runtime::EValue> Vector of prepared inputs
+   * for the decode method.
+   */
+  virtual runtime::Result<std::vector<executorch::runtime::EValue>>
+  prepare_decode(
+      const executorch::extension::TensorPtr& input,
+      const executorch::extension::TensorPtr& start_pos,
+      executorch::ET_RUNTIME_NAMESPACE::Method& decode_method) {
+    if (decode_method.inputs_size() != 2) {
+      ET_LOG(
+          Error,
+          "Expected 2 inputs for decode method, got %zu. Likely the model takes the caches or mask as an argument which this IOManager does not support.",
+          decode_method.inputs_size());
+      return runtime::Error::InvalidState;
+    }
+    // Cpu IO Manager supports dynamic shapes for decode, so no work to be done
+    // here.
+ return std::vector{input, start_pos}; + } + + /** + * @brief Process and update internal state with outputs from the prefill + * phase. + * + * @param prefill_method The prefill method to update with outputs. + * @param model_outputs Vector of outputs from the prefill method execution. + */ + ET_NODISCARD virtual runtime::Error update_prefill( + executorch::ET_RUNTIME_NAMESPACE::Method& prefill_method, + const std::vector& model_outputs) { + (void)prefill_method; + (void)model_outputs; + // No post inference work to do. + return runtime::Error::Ok; + } + + /** + * @brief Process and update internal state with outputs from the decode + * phase. + * + * @param decode_method The decode method to update with outputs. + * @param model_outputs Vector of outputs from the decode method execution. + */ + ET_NODISCARD virtual runtime::Error update_decode( + const executorch::ET_RUNTIME_NAMESPACE::Method& decode_method, + const std::vector& model_outputs) { + (void)decode_method; + (void)model_outputs; + // No post inference work to do. + return runtime::Error::Ok; + } +}; + +} // namespace llm +} // namespace extension +} // namespace executorch diff --git a/extension/llm/runner/io_manager/targets.bzl b/extension/llm/runner/io_manager/targets.bzl new file mode 100644 index 00000000000..e538572c51b --- /dev/null +++ b/extension/llm/runner/io_manager/targets.bzl @@ -0,0 +1,22 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + + for aten in (True, False): + aten_suffix = "_aten" if aten else "" + + # Interface for IOManager. No concrete impl from this dep. 
+    runtime.cxx_library(
+        name = "io_manager" + aten_suffix,
+        exported_headers = [
+            "io_manager.h",
+        ],
+        deps = [
+            "//executorch/extension/tensor:tensor" + aten_suffix,
+            "//executorch/runtime/core/exec_aten:lib" + aten_suffix,
+            "//executorch/runtime/executor:program_no_prim_ops" + aten_suffix,
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
diff --git a/extension/llm/runner/io_manager/test/TARGETS b/extension/llm/runner/io_manager/test/TARGETS
new file mode 100644
index 00000000000..6db0a7c590b
--- /dev/null
+++ b/extension/llm/runner/io_manager/test/TARGETS
@@ -0,0 +1,24 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
+
+runtime.cxx_test(
+    name = "test_io_manager",
+    srcs = ["test_io_manager.cpp"],
+    deps = [
+        "//executorch/extension/llm/runner/io_manager:io_manager",
+        "//executorch/extension/module:module",
+        "//executorch/extension/tensor:tensor",
+        "//executorch/runtime/executor:program",
+        "//executorch/kernels/portable:generated_lib",
+    ],
+    env = {
+        "KVCACHE_CACHE_POS": "$(location fbcode//executorch/test/models:exported_programs[ModuleKVCacheCachePos.pte])",
+    }
+)
diff --git a/extension/llm/runner/io_manager/test/targets.bzl b/extension/llm/runner/io_manager/test/targets.bzl
new file mode 100644
index 00000000000..6e3ae5311b9
--- /dev/null
+++ b/extension/llm/runner/io_manager/test/targets.bzl
@@ -0,0 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + pass diff --git a/extension/llm/runner/io_manager/test/test_io_manager.cpp b/extension/llm/runner/io_manager/test/test_io_manager.cpp new file mode 100644 index 00000000000..bc265e8d083 --- /dev/null +++ b/extension/llm/runner/io_manager/test/test_io_manager.cpp @@ -0,0 +1,230 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include + +using namespace ::testing; +using executorch::extension::Module; +using executorch::extension::llm::IOManager; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::Method; +using executorch::runtime::Program; +using executorch::runtime::Result; + +// Test fixture for IOManager tests +class IOManagerTest : public Test { + protected: + void SetUp() override { + executorch::runtime::runtime_init(); + + module_ = std::make_unique(std::getenv("KVCACHE_CACHE_POS")); + io_manager_ = std::make_unique(); + auto err = module_->load_method("forward"); + EXPECT_EQ(err, Error::Ok); + } + + protected: + std::unique_ptr module_; + + std::unique_ptr io_manager_; +}; + +// Test that load() returns Error::Ok (no-op) +TEST_F(IOManagerTest, LoadReturnsOk) { + auto* program = module_->program().get(); + auto* prefill_method = module_->method("forward").get(); + auto* decode_method = module_->method("forward").get(); + + auto result = io_manager_->load(*program, *prefill_method, *decode_method); + + EXPECT_EQ(result, Error::Ok); +} + +// Test that reset() returns Error::Ok (no-op) +TEST_F(IOManagerTest, ResetReturnsOk) { + auto* prefill_method = module_->method("forward").get(); + auto* decode_method = module_->method("forward").get(); + + auto result = 
io_manager_->reset(*prefill_method, *decode_method); + + EXPECT_EQ(result, Error::Ok); +} + +// Test that prepare_prefill() returns the input tensors when method has 2 +// inputs +TEST_F(IOManagerTest, PreparePrefillReturnsInputsWhenValidInputCount) { + auto* prefill_method = module_->method("forward").get(); + + // Create test tensors + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f}; + std::vector start_pos_data = {0}; + auto input_ptr = executorch::extension::make_tensor_ptr({1, 4}, input_data); + auto start_pos_ptr = + executorch::extension::make_tensor_ptr({1}, start_pos_data); + + auto result = + io_manager_->prepare_prefill(input_ptr, start_pos_ptr, *prefill_method); + + EXPECT_EQ(result.error(), Error::Ok); + auto outputs = result.get(); + EXPECT_EQ(outputs.size(), 2); + + // Verify that the returned EValues contain the same tensors we passed in + EXPECT_TRUE(outputs[0].isTensor()); + EXPECT_TRUE(outputs[1].isTensor()); +} + +// Test that prepare_decode() returns the input tensors when method has 2 inputs +TEST_F(IOManagerTest, PrepareDecodeReturnsInputsWhenValidInputCount) { + auto* decode_method = module_->method("forward").get(); + + // Create test tensors + std::vector input_data = {5.0f, 6.0f, 7.0f, 8.0f}; + std::vector start_pos_data = {10}; + auto input_ptr = executorch::extension::make_tensor_ptr({1, 4}, input_data); + auto start_pos_ptr = + executorch::extension::make_tensor_ptr({1}, start_pos_data); + + auto result = + io_manager_->prepare_decode(input_ptr, start_pos_ptr, *decode_method); + + EXPECT_EQ(result.error(), Error::Ok); + auto outputs = result.get(); + EXPECT_EQ(outputs.size(), 2); + + // Verify that the returned EValues contain the same tensors we passed in + EXPECT_TRUE(outputs[0].isTensor()); + EXPECT_TRUE(outputs[1].isTensor()); +} + +// Test that update_prefill() returns Error::Ok (no-op) +TEST_F(IOManagerTest, UpdatePrefillReturnsOk) { + auto* prefill_method = module_->method("forward").get(); + + // Create dummy model outputs + 
std::vector model_outputs; + std::vector output_data = {0.1f, 0.2f, 0.3f}; + auto output_tensor = + executorch::extension::make_tensor_ptr({1, 3}, output_data); + model_outputs.emplace_back(*output_tensor); + + auto result = io_manager_->update_prefill(*prefill_method, model_outputs); + + EXPECT_EQ(result, Error::Ok); +} + +// Test that update_decode() returns Error::Ok (no-op) +TEST_F(IOManagerTest, UpdateDecodeReturnsOk) { + auto* decode_method = module_->method("forward").get(); + + // Create dummy model outputs + std::vector model_outputs; + std::vector output_data = {0.4f, 0.5f, 0.6f}; + auto output_tensor = + executorch::extension::make_tensor_ptr({1, 3}, output_data); + model_outputs.emplace_back(*output_tensor); + + auto result = io_manager_->update_decode(*decode_method, model_outputs); + + EXPECT_EQ(result, Error::Ok); +} + +// Test that prepare_prefill() correctly passes through different tensor shapes +TEST_F(IOManagerTest, PreparePrefillPassesThroughDifferentTensorShapes) { + auto* prefill_method = module_->method("forward").get(); + + // Create test tensors with different shapes + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + std::vector start_pos_data = {5, 10}; + auto input_ptr = executorch::extension::make_tensor_ptr({2, 3}, input_data); + auto start_pos_ptr = + executorch::extension::make_tensor_ptr({2}, start_pos_data); + + auto result = + io_manager_->prepare_prefill(input_ptr, start_pos_ptr, *prefill_method); + + EXPECT_EQ(result.error(), Error::Ok); + auto outputs = result.get(); + EXPECT_EQ(outputs.size(), 2); + + // Verify that the returned EValues contain tensors + EXPECT_TRUE(outputs[0].isTensor()); + EXPECT_TRUE(outputs[1].isTensor()); +} + +// Test that prepare_decode() correctly passes through different tensor shapes +TEST_F(IOManagerTest, PrepareDecodePassesThroughDifferentTensorShapes) { + auto* decode_method = module_->method("forward").get(); + + // Create test tensors with different shapes + std::vector 
input_data = { + 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f}; + std::vector start_pos_data = {15, 20, 25}; + auto input_ptr = executorch::extension::make_tensor_ptr({2, 4}, input_data); + auto start_pos_ptr = + executorch::extension::make_tensor_ptr({3}, start_pos_data); + + auto result = + io_manager_->prepare_decode(input_ptr, start_pos_ptr, *decode_method); + + EXPECT_EQ(result.error(), Error::Ok); + auto outputs = result.get(); + EXPECT_EQ(outputs.size(), 2); + + // Verify that the returned EValues contain tensors + EXPECT_TRUE(outputs[0].isTensor()); + EXPECT_TRUE(outputs[1].isTensor()); +} + +// Test that update methods handle empty model outputs +TEST_F(IOManagerTest, UpdateMethodsHandleEmptyModelOutputs) { + auto* prefill_method = module_->method("forward").get(); + auto* decode_method = module_->method("forward").get(); + + // Create empty model outputs + std::vector empty_outputs; + + auto prefill_result = + io_manager_->update_prefill(*prefill_method, empty_outputs); + auto decode_result = + io_manager_->update_decode(*decode_method, empty_outputs); + + EXPECT_EQ(prefill_result, Error::Ok); + EXPECT_EQ(decode_result, Error::Ok); +} + +// Test that update methods handle multiple model outputs +TEST_F(IOManagerTest, UpdateMethodsHandleMultipleModelOutputs) { + auto* prefill_method = module_->method("forward").get(); + auto* decode_method = module_->method("forward").get(); + + // Create multiple model outputs + std::vector model_outputs; + std::vector output1_data = {0.1f, 0.2f}; + std::vector output2_data = {0.3f, 0.4f, 0.5f}; + auto output1_tensor = + executorch::extension::make_tensor_ptr({1, 2}, output1_data); + auto output2_tensor = + executorch::extension::make_tensor_ptr({1, 3}, output2_data); + model_outputs.emplace_back(*output1_tensor); + model_outputs.emplace_back(*output2_tensor); + + auto prefill_result = + io_manager_->update_prefill(*prefill_method, model_outputs); + auto decode_result = + io_manager_->update_decode(*decode_method, 
model_outputs); + + EXPECT_EQ(prefill_result, Error::Ok); + EXPECT_EQ(decode_result, Error::Ok); +} diff --git a/tools/cmake/cmake_deps.toml b/tools/cmake/cmake_deps.toml index a033fba4929..e9f06b7574e 100644 --- a/tools/cmake/cmake_deps.toml +++ b/tools/cmake/cmake_deps.toml @@ -255,6 +255,7 @@ filters = [ [targets.extension_llm_runner] buck_targets = [ "//extension/llm/runner:runner_lib", + "//extension/llm/runner/io_manager:io_manager", ] filters = [ ".cpp$",