Merged
1 change: 1 addition & 0 deletions extension/llm/runner/targets.bzl
@@ -34,6 +34,7 @@ def define_common_targets():
],
exported_deps = [
":stats",
"//executorch/kernels/portable/cpu/util:arange_util",
"//executorch/extension/llm/sampler:sampler" + aten_suffix,
"//executorch/extension/module:module" + aten_suffix,
"//executorch/extension/tensor:tensor" + aten_suffix,
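For context: the new arange_util dependency backs the arange_out_impl call added in text_decoder_runner.cpp below, which fills the cache-positions tensor for models exported with a cache_positions input.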
2 changes: 1 addition & 1 deletion extension/llm/runner/test/CMakeLists.txt
@@ -18,7 +18,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..)
include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)

set(_test_srcs test_generation_config.cpp test_text_llm_runner.cpp
- test_text_prefiller.cpp
+ test_text_prefiller.cpp test_text_decoder_runner.cpp
)

et_cxx_test(
15 changes: 15 additions & 0 deletions extension/llm/runner/test/targets.bzl
@@ -36,3 +36,18 @@ def define_common_targets():
"//executorch/runtime/core/exec_aten/testing_util:tensor_util",
],
)

runtime.cxx_test(
name = "test_text_decoder_runner",
srcs = ["test_text_decoder_runner.cpp"],
deps = [
"//executorch/extension/llm/runner:runner_lib",
"//executorch/kernels/portable:generated_lib",
"//executorch/runtime/core/exec_aten/testing_util:tensor_util",
],
env = {
"KVCACHE_CACHE_POS": "$(location fbcode//executorch/test/models:exported_programs[ModuleKVCacheCachePos.pte])",
"KVCACHE_INPUT_POS": "$(location fbcode//executorch/test/models:exported_programs[ModuleKVCacheInputPos.pte])",
"NO_KVCACHE": "$(location fbcode//executorch/test/models:exported_programs[ModuleNoKVCache.pte])",
},
)
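Each $(location ...) entry is a Buck macro that expands at build time to the on-disk path of the named exported-program artifact, so the test binary receives concrete .pte paths through its environment. When the variables are absent (for example, when the binary is executed outside of Buck), the test below detects this via std::getenv and skips instead of failing.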
199 changes: 199 additions & 0 deletions extension/llm/runner/test/test_text_decoder_runner.cpp
@@ -0,0 +1,199 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
* @lint-ignore-every CLANGTIDY facebook-hte-Deprecated
*/

#include <executorch/extension/llm/runner/text_decoder_runner.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>
#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include <cstdlib>

using namespace ::testing;
using executorch::extension::Module;
using executorch::extension::TensorPtr;
using executorch::extension::llm::TextDecoderRunner;
using executorch::runtime::Error;
using executorch::runtime::EValue;
using executorch::runtime::Result;
using executorch::runtime::testing::TensorFactory;

// Mock Module class for testing
class MockModule : public Module {
public:
MockModule() : Module("") {}
};

class TextDecoderRunnerTest : public Test {
protected:
void SetUp() override {
mock_module_ = std::make_unique<MockModule>();
runner_ = std::make_unique<TextDecoderRunner>(mock_module_.get());
}

std::unique_ptr<MockModule> mock_module_;
std::unique_ptr<TextDecoderRunner> runner_;
};

// Test logits_to_token() method with Float tensor
TEST_F(TextDecoderRunnerTest, LogitsToTokenFloat) {
TensorFactory<executorch::aten::ScalarType::Float> tf_float;
auto logits = tf_float.make({1, 4}, {0.1f, 0.2f, 0.8f, 0.4f});

// Call logits_to_token with temperature 0 (deterministic)
int32_t token = runner_->logits_to_token(logits, 0.0f);

// With temperature 0, should return the argmax (index 2)
EXPECT_EQ(token, 2);
}

// Test logits_to_token() method with 3D tensor (batch, seq_length, vocab_size)
TEST_F(TextDecoderRunnerTest, LogitsToToken3D) {
TensorFactory<executorch::aten::ScalarType::Float> tf_float;
// Shape: [1, 2, 4] - batch=1, seq_length=2, vocab_size=4
auto logits = tf_float.make(
{1, 2, 4},
{
0.1f,
0.2f,
0.3f,
0.4f, // First sequence position
0.5f,
0.6f,
0.9f,
0.8f // Second sequence position (last)
});

// Call logits_to_token with temperature 0 (deterministic)
int32_t token = runner_->logits_to_token(logits, 0.0f);

// Should use the last sequence position and return argmax (index 2)
EXPECT_EQ(token, 2);
}

// Test logits_to_token() method with Half tensor
TEST_F(TextDecoderRunnerTest, LogitsToTokenHalf) {
TensorFactory<executorch::aten::ScalarType::Half> tf_half;
auto logits = tf_half.make({1, 4}, {0.1f, 0.2f, 0.8f, 0.4f});

// Call logits_to_token with temperature 0 (deterministic)
int32_t token = runner_->logits_to_token(logits, 0.0f);

// With temperature 0, should return the argmax (index 2)
EXPECT_EQ(token, 2);
}

// Test logits_to_token() method with BFloat16 tensor
TEST_F(TextDecoderRunnerTest, LogitsToTokenBFloat16) {
TensorFactory<executorch::aten::ScalarType::BFloat16> tf_bfloat16;
auto logits = tf_bfloat16.make({1, 4}, {0.1f, 0.2f, 0.8f, 0.4f});

// Call logits_to_token with temperature 0 (deterministic)
int32_t token = runner_->logits_to_token(logits, 0.0f);

// With temperature 0, should return the argmax (index 2)
EXPECT_EQ(token, 2);
}

// Test logits_to_token() method with non-zero temperature
TEST_F(TextDecoderRunnerTest, LogitsToTokenWithTemperature) {
TensorFactory<executorch::aten::ScalarType::Float> tf_float;
auto logits = tf_float.make({1, 4}, {0.1f, 0.2f, 0.8f, 0.4f});

// Call logits_to_token with temperature > 0 (stochastic)
int32_t token = runner_->logits_to_token(logits, 1.0f);

// With temperature > 0, result should be within valid range
EXPECT_GE(token, 0);
EXPECT_LT(token, 4);
}

// Test step() method with all available PTE models
TEST_F(TextDecoderRunnerTest, StepWithAllModels) {
// List of all environment variables for PTE models
std::vector<std::pair<std::string, const char*>> env_vars = {
{"KVCACHE_CACHE_POS", "KVCACHE_CACHE_POS"},
{"KVCACHE_INPUT_POS", "KVCACHE_INPUT_POS"},
{"NO_KVCACHE", "NO_KVCACHE"}};

// Check if any environment variables are set up front
bool any_env_set = false;
for (const auto& [model_name, env_var] : env_vars) {
if (std::getenv(env_var)) {
any_env_set = true;
break;
}
}

// Skip test if no environment variables are set
if (!any_env_set) {
GTEST_SKIP() << "No PTE model environment variables were set";
}

bool any_model_tested = false;

// Loop through all available models
for (const auto& [model_name, env_var] : env_vars) {
const char* model_path = std::getenv(env_var);
if (!model_path) {
continue; // Skip if environment variable not set
}

SCOPED_TRACE(
"Testing model: " + model_name + " from " + std::string(model_path));

// Load the model
auto module = std::make_unique<Module>(model_path);
auto load_result = module->load();
if (load_result != Error::Ok) {
ADD_FAILURE() << "Failed to load model " << model_name << " from "
<< model_path << " with error: " << (int)load_result;
continue;
}

// Create TextDecoderRunner
TextDecoderRunner runner(module.get());
auto runner_load_result = runner.load();
ASSERT_EQ(runner_load_result, Error::Ok)
<< "Failed to load runner for " << model_name;

// Verify method is loaded
EXPECT_TRUE(runner.is_method_loaded())
<< "Method not loaded for " << model_name;

// Create input tensor pointer

TensorFactory<executorch::aten::ScalarType::Long> tf_long;
auto input_tokens_ =
tf_long.make({1, 3}, {50, 7, 11}); // Three-token input

auto input_ptr = std::make_shared<executorch::aten::Tensor>(input_tokens_);
int64_t start_pos = 0;

// Call step() and verify result is ok
auto result = runner.step(input_ptr, start_pos);
ASSERT_TRUE(result.ok()) << "step() failed for " << model_name
<< " with error: " << (int)result.error();

// Verify output tensor is valid
auto output_tensor = result.get();
EXPECT_GT(output_tensor.numel(), 0)
<< "Output tensor empty for " << model_name;

// Test logits_to_token works
int32_t token = runner.logits_to_token(output_tensor, 0.0f);
EXPECT_GE(token, 0) << "Invalid token for " << model_name;

any_model_tested = true;
}

// This should not happen since we checked environment variables up front
ASSERT_TRUE(any_model_tested)
<< "No models were tested despite environment variables being set";
}
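For context, a minimal sketch of the sampling contract these tests rely on (an illustration, not the actual implementation behind logits_to_token in //executorch/extension/llm/sampler): temperature 0 reduces to argmax, which is why the expected token is deterministic above, while temperature > 0 samples from a softmax over scaled logits, so only the range of the result can be asserted.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <random>
#include <vector>

// Hypothetical stand-in: argmax at temperature 0, otherwise sample from
// softmax(logits / temperature).
int32_t sample_logits(const std::vector<float>& logits, float temperature) {
  if (temperature == 0.0f) {
    return static_cast<int32_t>(std::distance(
        logits.begin(), std::max_element(logits.begin(), logits.end())));
  }
  // Subtract the max logit for numerical stability before exponentiating.
  float max_logit = *std::max_element(logits.begin(), logits.end());
  std::vector<float> weights(logits.size());
  float sum = 0.0f;
  for (size_t i = 0; i < logits.size(); ++i) {
    weights[i] = std::exp((logits[i] - max_logit) / temperature);
    sum += weights[i];
  }
  // Draw from the unnormalized weights.
  static std::mt19937 gen{std::random_device{}()};
  std::uniform_real_distribution<float> dist(0.0f, sum);
  float r = dist(gen);
  for (size_t i = 0; i < weights.size(); ++i) {
    r -= weights[i];
    if (r <= 0.0f) {
      return static_cast<int32_t>(i);
    }
  }
  return static_cast<int32_t>(weights.size()) - 1;
}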
7 changes: 3 additions & 4 deletions extension/llm/runner/test/test_text_llm_runner.cpp
@@ -63,11 +63,11 @@ class MockModule : public ::executorch::extension::Module {

class MockTextDecoderRunner : public TextDecoderRunner {
public:
- MockTextDecoderRunner() : TextDecoderRunner(nullptr, false) {}
+ MockTextDecoderRunner() : TextDecoderRunner(nullptr) {}
MOCK_METHOD(
Result<executorch::aten::Tensor>,
step,
- (executorch::extension::TensorPtr&, executorch::extension::TensorPtr&),
+ (executorch::extension::TensorPtr&, int64_t),
());
MOCK_METHOD(bool, is_method_loaded, (), ());
MOCK_METHOD(Result<uint64_t>, prefill, (std::vector<uint64_t>&, int64_t), ());
@@ -134,8 +134,7 @@ class RunnerTest : public Test {
std::unique_ptr<MockTextDecoderRunner> createMockTextDecoderRunner() {
auto text_decoder_runner = std::make_unique<MockTextDecoderRunner>();
ON_CALL(*text_decoder_runner, step)
- .WillByDefault([&](executorch::extension::TensorPtr&,
- executorch::extension::TensorPtr&) {
+ .WillByDefault([&](executorch::extension::TensorPtr&, int64_t) {
return Result<executorch::aten::Tensor>(tensor);
});
ON_CALL(*text_decoder_runner, is_method_loaded())
46 changes: 41 additions & 5 deletions extension/llm/runner/text_decoder_runner.cpp
@@ -9,6 +9,7 @@
// Given inputs, run a text decoder and return logits.

#include <executorch/extension/llm/runner/text_decoder_runner.h>
+ #include <executorch/kernels/portable/cpu/util/arange_util.h>

#include <ctime>

@@ -21,18 +21,53 @@ namespace llm {
// NOTE: we observed ~2x loading performance increase on iPhone 15
// and a ~5% improvement on Galaxy S22 by switching to
// FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
- TextDecoderRunner::TextDecoderRunner(Module* module, bool use_kv_cache)
- : module_(module), use_kv_cache_(use_kv_cache) {}
+ TextDecoderRunner::TextDecoderRunner(Module* module) : module_(module) {}

// This function is functional, meaning it shouldn't modify any state of the
// input. It should be safe to call multiple times with the same inputs. The
// outer loop (call site) is responsible for managing state.
::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
TensorPtr& tokens,
- TensorPtr& start_pos) {
+ int64_t start_pos) {
// ET_LOG(Info, "Input token %" PRIu64, input_token);
- if (use_kv_cache_) {
- auto outputs_res = module_->forward({tokens, start_pos});
+ auto method_meta = ET_UNWRAP(module_->method_meta("forward"));
+ // If there is only 1 input, the model is not using a kv cache.
+ bool use_kv_cache = method_meta.num_inputs() > 1;
+
+ if (use_kv_cache) {
+ // Size of the second argument. This could be either input_pos or
+ // cache_positions.
+
+ // Check if we are using cache positions instead of input pos.
+ auto second_input_info = ET_UNWRAP(method_meta.input_tensor_meta(1));
+ // For input_pos, numel is 1; for cache_positions, numel is max_seq_len.
+ auto sizes = second_input_info.sizes();
+ // Assuming a 1D tensor.
+ ET_CHECK_OR_RETURN_ERROR(
+ sizes.size() == 1,
+ InvalidProgram,
+ "The second input tensor is not a 1D tensor. Got dimension (%zu)",
+ sizes.size());
+ auto numel = sizes[0];
+ std::vector<::executorch::aten::SizesType> sizes_vec = {numel};
+
+ // Assuming the last dimension is the one with the variable token length,
+ // for example [1, S] or [1, 1, S].
+ sizes_vec[sizes_vec.size() - 1] = numel;
+ TensorPtr start_pos_tensor;
+ if (numel > 1) {
+ // Assuming the model is exported with cache_positions, create a tensor
+ // with the same size as cache_positions.
+ start_pos_tensor = empty(sizes_vec, ::executorch::aten::ScalarType::Long);
+ torch::executor::native::arange_out_impl(
+ start_pos, start_pos + numel, 1.0, *start_pos_tensor);
+ } else {
+ // Assuming the model is exported with input_pos, create a tensor of size 1.
+ start_pos_tensor = from_blob(
+ &start_pos, sizes_vec, ::executorch::aten::ScalarType::Long);
+ }
+ ET_LOG(Info, "Start pos tensor numel: %zu", start_pos_tensor->numel());
+ auto outputs_res = module_->forward({tokens, start_pos_tensor});
ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
ET_CHECK_MSG(
outputs_res.get().size() == 1,
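For context, the behavior of the rewritten step() under the two export styles, assuming a second input of length 4 and start_pos = 5: a cache_positions model receives the arange {5, 6, 7, 8}, while an input_pos model receives the single-element tensor {5}. A minimal standalone sketch of that selection (a hypothetical helper using std::vector, not the actual tensor code above):

#include <cstdint>
#include <numeric>
#include <vector>

// numel == 1 -> model exported with input_pos: pass {start_pos}.
// numel > 1  -> model exported with cache_positions: pass
//               {start_pos, start_pos + 1, ..., start_pos + numel - 1}.
std::vector<int64_t> make_position_arg(int64_t start_pos, int64_t numel) {
  std::vector<int64_t> positions(static_cast<size_t>(numel));
  std::iota(positions.begin(), positions.end(), start_pos);
  return positions;
}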
5 changes: 2 additions & 3 deletions extension/llm/runner/text_decoder_runner.h
@@ -21,7 +21,7 @@ namespace llm {

class ET_EXPERIMENTAL TextDecoderRunner {
public:
- TextDecoderRunner(Module* module, bool use_kv_cache);
+ TextDecoderRunner(Module* module);

virtual ~TextDecoderRunner() = default;

@@ -34,7 +34,7 @@ class ET_EXPERIMENTAL TextDecoderRunner {
*/
virtual ::executorch::runtime::Result<executorch::aten::Tensor> step(
TensorPtr& input,
- TensorPtr& start_pos);
+ int64_t start_pos);

/**
* Load the Module for text decode purpose.
@@ -101,7 +101,6 @@ class ET_EXPERIMENTAL TextDecoderRunner {
* Module remains valid for the duration of TextDecoderRunner's usage.
*/
Module* module_;
- bool use_kv_cache_;
bool should_stop_{false};
};

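For context, a minimal caller-side sketch of the revised interface (illustrative values; assumes a Module loaded from a real .pte, as in the tests above): the caller now passes a plain integer position and never builds a position tensor itself.

#include <executorch/extension/llm/runner/text_decoder_runner.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>

using executorch::extension::Module;
using executorch::extension::from_blob;
using executorch::extension::llm::TextDecoderRunner;

// Decode a single step; the token value and shapes are placeholders.
void decode_once(Module* module) {
  TextDecoderRunner runner(module); // no use_kv_cache flag anymore
  if (runner.load() != executorch::runtime::Error::Ok) {
    return;
  }
  int64_t token = 42;
  auto tokens = from_blob(&token, {1, 1}, executorch::aten::ScalarType::Long);
  int64_t start_pos = 0; // plain integer; step() builds any tensor it needs
  auto logits = runner.step(tokens, start_pos);
  if (logits.ok()) {
    int32_t next = runner.logits_to_token(logits.get(), 0.0f);
    (void)next; // would be fed back in as the next input token
  }
}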
3 changes: 1 addition & 2 deletions extension/llm/runner/text_llm_runner.cpp
@@ -393,8 +393,7 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(

// Create text_decoder_runner. Use a shared_ptr so that it can be shared with
// TextPrefiller and TextTokenGenerator
- auto text_decoder_runner = std::make_unique<TextDecoderRunner>(
- module.get(), metadata.at(kUseKVCache));
+ auto text_decoder_runner = std::make_unique<TextDecoderRunner>(module.get());

// Create text_prefiller
auto text_prefiller = std::make_unique<TextPrefiller>(
13 changes: 3 additions & 10 deletions extension/llm/runner/text_prefiller.cpp
@@ -86,10 +86,7 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(
{1, num_prompt_tokens},
executorch::aten::ScalarType::Long);

- auto start_pos_tensor =
- from_blob(&start_pos, {1}, executorch::aten::ScalarType::Long);
-
- auto outputs_res = text_decoder_runner_->step(tokens, start_pos_tensor);
+ auto outputs_res = text_decoder_runner_->step(tokens, start_pos);

ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
ET_LOG(
@@ -106,13 +103,10 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(
auto tokens =
from_blob(&cur_token, {1, 1}, executorch::aten::ScalarType::Long);

- auto start_pos_tensor =
- from_blob(&start_pos, {1}, executorch::aten::ScalarType::Long);
-
// run the first token and get back logits tensor. Assuming the first token
// is bos so don't callback.
auto logits_tensor =
- ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos_tensor));
+ ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos));

pos += 1; // start the loop from index 1
start_pos += 1;
@@ -122,8 +116,7 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(
// NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds)
cur_token = prompt_tokens[pos];

- logits_tensor =
- ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos_tensor));
+ logits_tensor = ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos));

pos++;
start_pos++;
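Taken together, the prefiller no longer materializes a start_pos tensor at each call site: position bookkeeping is plain int64_t arithmetic, and TextDecoderRunner::step() alone decides, based on the exported program's metadata, whether to expand the integer into a cache_positions arange or wrap it as a single-element input_pos tensor. The kv-cache handling that callers previously had to know about is now centralized in one place.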