5 changes: 2 additions & 3 deletions examples/models/llava/runner/llava_runner.cpp
@@ -15,9 +15,7 @@
 #include <executorch/examples/models/llava/runner/llava_text_decoder_runner.h>
 #include <pytorch/tokenizers/llama2c_tokenizer.h>
 
-#include <ctime>
 #include <memory>
-#include <sstream>
 #include <vector>
 
 namespace llm = ::executorch::extension::llm;
@@ -49,7 +47,8 @@ Error LlavaRunner::load() {
   // Load the text decoder runner
   text_decoder_runner_ =
       // @lint-ignore CLANGTIDY facebook-hte-Deprecated
-      std::make_unique<LlavaTextDecoderRunner>(module_.get());
+      std::make_unique<LlavaTextDecoderRunner>(
+          module_.get(), io_manager_.get());
   // @lint-ignore CLANGTIDY facebook-hte-Deprecated
   text_decoder_runner_->load();
6 changes: 4 additions & 2 deletions examples/models/llava/runner/llava_text_decoder_runner.h
@@ -18,8 +18,10 @@ namespace example {
 class ET_EXPERIMENTAL LlavaTextDecoderRunner
     : public executorch::extension::llm::TextDecoderRunner {
  public:
-  explicit LlavaTextDecoderRunner(executorch::extension::Module* module)
-      : TextDecoderRunner(module) {}
+  explicit LlavaTextDecoderRunner(
+      executorch::extension::Module* module,
+      executorch::extension::llm::IOManager* io_manager)
+      : TextDecoderRunner(module, io_manager) {}
 
   inline executorch::runtime::Result<executorch::aten::Tensor> step(
       executorch::extension::TensorPtr& tokens,
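The call-site impact of this signature change is mechanical but worth spelling out: the runner now borrows two objects instead of one, and owns neither. Below is a minimal wiring sketch using only the headers visible in this diff; the model path is a placeholder and the `main` wrapper is illustrative (in-tree, `LlavaRunner` holds both members, as the first file shows).

```cpp
// Sketch only: wiring the new two-argument constructor.
#include <executorch/examples/models/llava/runner/llava_text_decoder_runner.h>
#include <executorch/extension/llm/runner/io_manager/io_manager.h>
#include <executorch/extension/module/module.h>

#include <memory>

int main() {
  // Placeholder path; the owner keeps both objects alive.
  auto module = std::make_unique<executorch::extension::Module>(
      "/path/to/llava.pte");
  auto io_manager =
      std::make_unique<executorch::extension::llm::IOManager>();
  // The decoder runner borrows raw pointers; module and io_manager
  // must outlive it.
  auto decoder = std::make_unique<example::LlavaTextDecoderRunner>(
      module.get(), io_manager.get());
  return decoder->load() == executorch::runtime::Error::Ok ? 0 : 1;
}
```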
5 changes: 3 additions & 2 deletions extension/llm/runner/multimodal_runner.h
@@ -16,11 +16,10 @@
 #include <functional>
 #include <memory>
 #include <string>
-#include <type_traits>
 #include <unordered_map>
 
 #include <executorch/extension/llm/runner/image.h>
 #include <executorch/extension/llm/runner/image_prefiller.h>
+#include <executorch/extension/llm/runner/io_manager/io_manager.h>
 #include <executorch/extension/llm/runner/stats.h>
 #include <executorch/extension/llm/runner/text_decoder_runner.h>
 #include <executorch/extension/llm/runner/text_prefiller.h>
@@ -41,6 +40,7 @@ class ET_EXPERIMENTAL MultimodalRunner {
       const float temperature = 0.8f)
       : temperature_(temperature),
         module_(std::make_unique<Module>(model_path, Module::LoadMode::File)),
+        io_manager_(std::make_unique<IOManager>()),
         tokenizer_path_(tokenizer_path) {
     ET_LOG(
         Info,
@@ -127,6 +127,7 @@ class ET_EXPERIMENTAL MultimodalRunner {
   std::unique_ptr<TextDecoderRunner> text_decoder_runner_;
   std::unique_ptr<TextPrefiller> text_prefiller_;
   std::unique_ptr<ImagePrefiller> image_prefiller_;
+  std::unique_ptr<IOManager> io_manager_;
   std::unique_ptr<TextTokenGenerator> text_token_generator_;
   std::string tokenizer_path_;
   std::unique_ptr<::tokenizers::Tokenizer> tokenizer_;
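One lifetime subtlety follows from the raw-pointer handoff (see the ownership note in `text_decoder_runner.h` further down in this diff): C++ destroys data members in reverse declaration order, so `io_manager_`, declared after the runner members it is lent to, is destroyed before them, and their destructors must therefore never dereference the borrowed pointer. A self-contained illustration of that rule, with hypothetical stand-in types:

```cpp
#include <cstdio>
#include <memory>

// Hypothetical stand-ins for IOManager and TextDecoderRunner.
struct IOManagerLike {
  ~IOManagerLike() { std::puts("io_manager_ destroyed"); }
};

struct DecoderLike {
  explicit DecoderLike(IOManagerLike* io) : io_(io) {}
  // With the declaration order below, io_ already dangles here,
  // so this destructor must not dereference it.
  ~DecoderLike() { std::puts("decoder_ destroyed"); }
  IOManagerLike* io_;  // borrowed, not owned
};

struct RunnerLike {
  RunnerLike() : io_manager_(std::make_unique<IOManagerLike>()) {}
  void load() { decoder_ = std::make_unique<DecoderLike>(io_manager_.get()); }
  // Same relative order as MultimodalRunner: decoder_ declared first,
  // io_manager_ second, so io_manager_ is destroyed first.
  std::unique_ptr<DecoderLike> decoder_;
  std::unique_ptr<IOManagerLike> io_manager_;
};

int main() {
  RunnerLike runner;
  runner.load();
}  // prints "io_manager_ destroyed" before "decoder_ destroyed"
```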
2 changes: 2 additions & 0 deletions extension/llm/runner/targets.bzl
@@ -36,6 +36,7 @@ def define_common_targets():
             ":stats",
             "//executorch/kernels/portable/cpu/util:arange_util" + aten_suffix,
             "//executorch/extension/llm/sampler:sampler" + aten_suffix,
+            "//executorch/extension/llm/runner/io_manager:io_manager" + aten_suffix,
             "//executorch/extension/module:module" + aten_suffix,
             "//executorch/extension/tensor:tensor" + aten_suffix,
         ],
@@ -102,6 +103,7 @@ def define_common_targets():
             ":text_decoder_runner" + aten_suffix,
             ":text_prefiller" + aten_suffix,
             ":text_token_generator" + aten_suffix,
+            "//executorch/extension/llm/runner/io_manager:io_manager" + aten_suffix,
             "//pytorch/tokenizers:hf_tokenizer",
             "//pytorch/tokenizers:llama2c_tokenizer",
             "//pytorch/tokenizers:sentencepiece",
1 change: 1 addition & 0 deletions extension/llm/runner/test/TARGETS
@@ -18,6 +18,7 @@ runtime.cxx_test(
     srcs = ["test_text_decoder_runner.cpp"],
     deps = [
         "//executorch/extension/llm/runner:runner_lib",
+        "//executorch/extension/llm/runner/io_manager:io_manager",
         "//executorch/kernels/portable:generated_lib",
         "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
     ],
13 changes: 10 additions & 3 deletions extension/llm/runner/test/test_text_decoder_runner.cpp
@@ -7,6 +7,7 @@
  * @lint-ignore-every CLANGTIDY facebook-hte-Deprecated
  */
 
+#include <executorch/extension/llm/runner/io_manager/io_manager.h>
 #include <executorch/extension/llm/runner/text_decoder_runner.h>
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/tensor/tensor.h>
@@ -18,6 +19,7 @@
 using namespace ::testing;
 using executorch::extension::Module;
 using executorch::extension::TensorPtr;
+using executorch::extension::llm::IOManager;
 using executorch::extension::llm::TextDecoderRunner;
 using executorch::runtime::Error;
 using executorch::runtime::EValue;
@@ -34,11 +36,14 @@ class TextDecoderRunnerTest : public Test {
  protected:
   void SetUp() override {
     mock_module_ = std::make_unique<MockModule>();
-    runner_ = std::make_unique<TextDecoderRunner>(mock_module_.get());
+    io_manager_ = std::make_unique<executorch::extension::llm::IOManager>();
+    runner_ = std::make_unique<TextDecoderRunner>(
+        mock_module_.get(), io_manager_.get());
   }
 
   std::unique_ptr<MockModule> mock_module_;
   std::unique_ptr<TextDecoderRunner> runner_;
+  std::unique_ptr<IOManager> io_manager_;
 };
 
 // Test logits_to_token() method with Float tensor
@@ -150,15 +155,17 @@ TEST_F(TextDecoderRunnerTest, StepWithAllModels) {
     // Load the model
     auto module = std::make_unique<Module>(model_path);
 
     auto load_result = module->load();
     if (load_result != Error::Ok) {
       ADD_FAILURE() << "Failed to load model " << model_name << " from "
                     << model_path << " with error: " << (int)load_result;
       continue;
     }
 
+    std::unique_ptr<executorch::extension::llm::IOManager> io_manager =
+        std::make_unique<executorch::extension::llm::IOManager>();
     // Create TextDecoderRunner
-    TextDecoderRunner runner(module.get());
+    TextDecoderRunner runner(module.get(), io_manager.get());
     auto runner_load_result = runner.load();
     ASSERT_EQ(runner_load_result, Error::Ok)
         << "Failed to load runner for " << model_name;
7 changes: 6 additions & 1 deletion extension/llm/runner/test/test_text_llm_runner.cpp
@@ -7,6 +7,7 @@
  * @lint-ignore-every CLANGTIDY facebook-hte-Deprecated
  */
 
+#include <executorch/extension/llm/runner/io_manager/io_manager.h>
 #include <executorch/extension/llm/runner/irunner.h>
 #include <executorch/extension/llm/runner/text_llm_runner.h>
 #include <executorch/extension/llm/runner/text_prefiller.h>
@@ -63,7 +64,7 @@ class MockModule : public ::executorch::extension::Module {
 
 class MockTextDecoderRunner : public TextDecoderRunner {
  public:
-  MockTextDecoderRunner() : TextDecoderRunner(nullptr) {}
+  MockTextDecoderRunner() : TextDecoderRunner(nullptr, nullptr) {}
   MOCK_METHOD(
       Result<executorch::aten::Tensor>,
       step,
@@ -219,6 +220,7 @@ TEST_F(RunnerTest, GenerateCallsCallbackExactlyMaxNewTokensTimes) {
       std::move(text_decoder_runner),
       std::unique_ptr<::executorch::extension::llm::TextPrefiller>(
           text_prefiller.release()),
+      std::make_unique<executorch::extension::llm::IOManager>(),
       std::move(text_token_generator),
       std::move(stats));
 
@@ -278,6 +280,7 @@ TEST_F(RunnerTest, WarmupCallsGenerateWithWarmingFlag) {
       std::move(text_decoder_runner),
       std::unique_ptr<::executorch::extension::llm::TextPrefiller>(
           text_prefiller.release()),
+      std::make_unique<executorch::extension::llm::IOManager>(),
       std::move(text_token_generator),
       std::move(stats));
 
@@ -312,6 +315,7 @@ TEST_F(RunnerTest, IsLoadedReturnsTrueWhenComponentsInitialized) {
       std::move(text_decoder_runner),
       std::unique_ptr<::executorch::extension::llm::TextPrefiller>(
           text_prefiller.release()),
+      std::make_unique<executorch::extension::llm::IOManager>(),
       std::move(text_token_generator),
       std::move(stats));
 
@@ -356,6 +360,7 @@ TEST_F(RunnerTest, GenerateFromPosErrorsWithNegativeMaxNewTokens) {
       std::move(text_decoder_runner),
       std::unique_ptr<::executorch::extension::llm::TextPrefiller>(
           text_prefiller.release()),
+      std::make_unique<executorch::extension::llm::IOManager>(),
       std::move(text_token_generator),
       std::move(stats));
2 changes: 1 addition & 1 deletion extension/llm/runner/test/test_text_prefiller.cpp
@@ -24,7 +24,7 @@ using executorch::runtime::testing::TensorFactory;
 // Mock class for TextDecoderRunner
 class MockTextDecoderRunner : public TextDecoderRunner {
  public:
-  MockTextDecoderRunner() : TextDecoderRunner(nullptr) {}
+  MockTextDecoderRunner() : TextDecoderRunner(nullptr, nullptr) {}
   MOCK_METHOD(
       Result<executorch::aten::Tensor>,
       step,
19 changes: 17 additions & 2 deletions extension/llm/runner/text_decoder_runner.cpp
@@ -22,7 +22,8 @@ namespace llm {
 // NOTE: we observed ~2x loading performance increase on iPhone 15
 // and a ~5% improvement on Galaxy S22 by switching to
 // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
-TextDecoderRunner::TextDecoderRunner(Module* module) : module_(module) {}
+TextDecoderRunner::TextDecoderRunner(Module* module, IOManager* io_manager)
+    : module_(module), io_manager_(io_manager) {}
 
 // This function is functional, meaning it shouldn't modify any state of the
 // input. It should be safe to call multiple times with the same inputs. The
@@ -66,8 +67,22 @@ ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
     start_pos_tensor = from_blob(
         &start_pos, sizes_vec, ::executorch::aten::ScalarType::Long);
   }
-  auto outputs_res = module_->forward({tokens, start_pos_tensor});
+
+  std::vector<runtime::EValue> inputs;
+  auto method_err = module_->method("forward");
+  ET_CHECK_OK_OR_RETURN_ERROR(method_err.error());
+  auto& method = *(method_err.get());
+
+  auto inputs_res =
+      io_manager_->prepare_decode(tokens, start_pos_tensor, method);
+  ET_CHECK_OK_OR_RETURN_ERROR(inputs_res.error());
+  inputs = inputs_res.get();
+  auto outputs_res = module_->forward(inputs);
   ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
+
+  auto update_err = io_manager_->update_decode(method, outputs_res.get());
+  ET_CHECK_OK_OR_RETURN_ERROR(update_err);
+
   ET_CHECK_MSG(
       outputs_res.get().size() == 1,
       "More then one output returned from executing LLM.");
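The new step() flow is: resolve the `forward` Method, let the IOManager turn (tokens, start_pos) into the full input list, execute, then give the IOManager a chance to post-process the outputs. For this to be a pure refactor, the base-class behavior must reduce to the old `{tokens, start_pos_tensor}` call. What follows is a hypothetical reconstruction of the interface inferred from these call sites alone; the real class lives in extension/llm/runner/io_manager/io_manager.h and may differ in detail.

```cpp
// Sketch: the IOManager API shape implied by TextDecoderRunner::step()
// and TextLLMRunner::load(). Not the real header.
#include <executorch/extension/tensor/tensor.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/core/result.h>
#include <executorch/runtime/executor/method.h>
#include <executorch/runtime/executor/program.h>

#include <vector>

namespace executorch::extension::llm {

class IOManager {
 public:
  virtual ~IOManager() = default;

  // Called once at load time with the program and the prefill/decode
  // methods (the same `forward` method twice for a text-only LLM).
  virtual runtime::Error load(
      runtime::Program& program,
      runtime::Method& prefill_method,
      runtime::Method& decode_method) {
    (void)program, (void)prefill_method, (void)decode_method;
    return runtime::Error::Ok;
  }

  // Assemble the argument list for the decode call. The default shown
  // here preserves the old {tokens, start_pos} path; a backend-specific
  // subclass could splice in externally managed KV-cache buffers.
  virtual runtime::Result<std::vector<runtime::EValue>> prepare_decode(
      const TensorPtr& tokens,
      const TensorPtr& start_pos,
      runtime::Method& decode_method) {
    (void)decode_method;
    return std::vector<runtime::EValue>{*tokens, *start_pos};
  }

  // Inspect or copy back outputs after execution; a no-op suffices when
  // the model manages no external I/O.
  virtual runtime::Error update_decode(
      const runtime::Method& decode_method,
      const std::vector<runtime::EValue>& outputs) {
    (void)decode_method, (void)outputs;
    return runtime::Error::Ok;
  }
};

} // namespace executorch::extension::llm
```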
10 changes: 6 additions & 4 deletions extension/llm/runner/text_decoder_runner.h
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include <executorch/extension/llm/runner/io_manager/io_manager.h>
 #include <executorch/extension/llm/sampler/sampler.h>
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/tensor/tensor.h>
@@ -21,7 +22,7 @@ namespace llm {
 
 class ET_EXPERIMENTAL TextDecoderRunner {
  public:
-  explicit TextDecoderRunner(Module* module);
+  explicit TextDecoderRunner(Module* module, IOManager* io_manager);
 
   virtual ~TextDecoderRunner() = default;
 
@@ -94,13 +95,14 @@ class ET_EXPERIMENTAL TextDecoderRunner {
 
  protected:
   /**
-   * Note: TextDecoderRunner does not own the Module instance. It is expected
-   * that the outer class (likely Runner) manages the lifecycle of the Module.
-   * This means that the responsibility for creating, maintaining, and
+   * Note: TextDecoderRunner does not own the Module or IOManager instance. It
+   * is expected that the outer class (likely Runner) manages the lifecycle of
+   * them. This means that the responsibility for creating, maintaining, and
    * destroying the Module lies outside of TextDecoderRunner. Ensure that the
    * Module remains valid for the duration of TextDecoderRunner's usage.
    */
   Module* module_;
+  IOManager* io_manager_;
   bool should_stop_{false};
 };
 
18 changes: 17 additions & 1 deletion extension/llm/runner/text_llm_runner.cpp
@@ -10,6 +10,7 @@
 // A simple llama2 runner that includes preprocessing and post processing logic.
 // The module takes in a string as input and emits a string as output.
 
+#include <executorch/extension/llm/runner/io_manager/io_manager.h>
 #include <executorch/extension/llm/runner/text_llm_runner.h>
 #include <executorch/extension/llm/runner/util.h>
 #include <executorch/runtime/platform/runtime.h>
@@ -39,6 +40,7 @@ TextLLMRunner::TextLLMRunner(
     std::unique_ptr<::executorch::extension::Module> module,
     std::unique_ptr<TextDecoderRunner> text_decoder_runner,
     std::unique_ptr<TextPrefiller> text_prefiller,
+    std::unique_ptr<IOManager> io_manager,
    std::unique_ptr<TextTokenGenerator> text_token_generator,
    std::unique_ptr<Stats> stats,
    float temperature)
@@ -47,6 +49,7 @@ TextLLMRunner::TextLLMRunner(
       module_(std::move(module)),
       text_decoder_runner_(std::move(text_decoder_runner)),
       text_prefiller_(std::move(text_prefiller)),
+      io_manager_(std::move(io_manager)),
       text_token_generator_(std::move(text_token_generator)),
       stats_(std::move(stats)),
       temperature_(temperature) {
@@ -63,6 +66,14 @@ Error TextLLMRunner::load() {
     return Error::Ok;
   }
   ET_CHECK_OK_OR_RETURN_ERROR(text_prefiller_->load());
+  ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward"));
+  auto method_res = module_->method("forward");
+
+  Program& program = *module_->program();
+
+  ET_CHECK_OK_OR_RETURN_ERROR(method_res.error());
+  auto& forward = *(method_res.get());
+  ET_CHECK_OK_OR_RETURN_ERROR(io_manager_->load(program, forward, forward));
   ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load());
   return Error::Ok;
 }
@@ -393,9 +404,13 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
   auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>(
       llm::get_eos_ids(tokenizer.get(), module.get()));
 
+  // Create IOManager
+  std::unique_ptr<IOManager> io_manager = std::make_unique<IOManager>();
+
   // Create text_decoder_runner. Use a shared_ptr so that it can be shared with
   // TextPrefiller and TextTokenGenerator
-  auto text_decoder_runner = std::make_unique<TextDecoderRunner>(module.get());
+  auto text_decoder_runner =
+      std::make_unique<TextDecoderRunner>(module.get(), io_manager.get());
 
   // Create text_prefiller
   auto text_prefiller = std::make_unique<TextPrefiller>(
@@ -420,6 +435,7 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
       std::move(module),
       std::move(text_decoder_runner),
       std::move(text_prefiller),
+      std::move(io_manager),
       std::move(text_token_generator),
       std::move(stats),
       temperature);
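Note that io_manager_->load() receives the same `forward` method for both the prefill and decode slots, since a text-only model exposes a single method. From the caller's perspective nothing changes: the factory creates and threads the IOManager invisibly. A usage sketch, assuming load_tokenizer and create_text_llm_runner keep the signatures this file suggests (both paths and the prompt are placeholders):

```cpp
// Caller-side sketch under the assumptions stated above.
#include <executorch/extension/llm/runner/text_llm_runner.h>

#include <iostream>
#include <string>
#include <utility>

int main() {
  using namespace executorch::extension::llm;

  // Placeholder paths.
  auto tokenizer = load_tokenizer("/path/to/tokenizer.model");
  auto runner =
      create_text_llm_runner("/path/to/model.pte", std::move(tokenizer));
  if (!runner || runner->load() != executorch::runtime::Error::Ok) {
    return 1;
  }

  GenerationConfig config;
  config.max_new_tokens = 32;
  // The IOManager created inside create_text_llm_runner is already wired
  // into the TextDecoderRunner; callers never touch it directly.
  auto err = runner->generate(
      "The capital of France is",
      config,
      [](const std::string& piece) { std::cout << piece << std::flush; });
  return err == executorch::runtime::Error::Ok ? 0 : 1;
}
```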
2 changes: 2 additions & 0 deletions extension/llm/runner/text_llm_runner.h
@@ -55,6 +55,7 @@ class ET_EXPERIMENTAL TextLLMRunner : public IRunner {
       std::unique_ptr<::executorch::extension::Module> module,
       std::unique_ptr<TextDecoderRunner> text_decoder_runner,
       std::unique_ptr<TextPrefiller> text_prefiller,
+      std::unique_ptr<IOManager> io_manager,
       std::unique_ptr<TextTokenGenerator> text_token_generator,
       std::unique_ptr<Stats> stats,
       float temperature = -1.0f);
@@ -155,6 +156,7 @@ class ET_EXPERIMENTAL TextLLMRunner : public IRunner {
   //                     sure it outlives text_prefiller_ &
   //                     text_token_generator_.
   std::unique_ptr<TextPrefiller> text_prefiller_;
+  std::unique_ptr<IOManager> io_manager_;
   std::unique_ptr<TextTokenGenerator> text_token_generator_;
 
   // Stats
1 change: 1 addition & 0 deletions extension/module/module.cpp
@@ -230,6 +230,7 @@ runtime::Error Module::load_method(
 
 ET_NODISCARD runtime::Result<Method*> Module::method(
     const std::string& method_name) {
+  ET_CHECK_OK_OR_RETURN_ERROR(load_method(method_name));
   ET_CHECK_OR_RETURN_ERROR(
       methods_.count(method_name) > 0,
       InvalidArgument,
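This one-line change makes Module::method() self-loading: it ensures the method is loaded before looking it up rather than failing, which is what lets the new code in TextDecoderRunner::step() call module_->method("forward") without a prior load_method(). A small sketch of the resulting contract (the model path is a placeholder):

```cpp
// Sketch: method() now loads on demand.
#include <executorch/extension/module/module.h>

#include <cstdio>

int main() {
  executorch::extension::Module module("/path/to/model.pte");
  // No explicit load() / load_method("forward") call is needed any more:
  // method() performs the load on first access and returns the cached
  // Method pointer on subsequent calls.
  auto method = module.method("forward");
  if (method.ok()) {
    std::printf("forward expects %zu inputs\n", method.get()->inputs_size());
  }
  return method.ok() ? 0 : 1;
}
```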