Commit 58e2792
[llm] Support different shape of input_pos
For Hugging Face models, `forward()` takes `tokens` as well as `cache_positions`, a list of cache indices. This differs from the .pte files that `export_llama` produces, which take `tokens` and `input_pos`, where `input_pos` is a scalar tensor. This PR adds support in `text_decoder_runner.cpp` for handling both shapes of `input_pos`/`cache_positions`. To keep the logic generic without relying on extra metadata, the runner inspects the method meta and input tensor info to decide whether to feed in `input_pos` or `cache_positions`.

Differential Revision: [D77203700](https://our.internmc.facebook.com/intern/diff/D77203700/)

[ghstack-poisoned]
1 parent daf808e commit 58e2792
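
In short, `step()` now inspects the method metadata at run time instead of taking a `use_kv_cache` flag from the caller. A condensed sketch of the decision, lifted from the `text_decoder_runner.cpp` hunk below (error handling trimmed):

auto method_meta = ET_UNWRAP(module_->method_meta("forward"));
// A single input means the model was exported without a KV cache.
bool use_kv_cache = method_meta.num_inputs() > 1;
if (use_kv_cache) {
  // The second input is input_pos (numel == 1) or cache_positions
  // (numel == max_seq_len).
  auto second_input_info = ET_UNWRAP(method_meta.input_tensor_meta(1));
  int64_t numel = 1;
  std::vector<::executorch::aten::SizesType> sizes_vec;
  for (const auto& size : second_input_info.sizes()) {
    sizes_vec.emplace_back(size);
    numel *= size;
  }
  // The last dimension carries the variable token count.
  sizes_vec[sizes_vec.size() - 1] = -1;
  TensorPtr start_pos_tensor;
  if (numel > 1) {
    // cache_positions: the 1-D range [start_pos, start_pos + n_tokens).
    start_pos_tensor = arange(
        start_pos,
        start_pos + tokens->numel(),
        1,
        sizes_vec,
        ::executorch::aten::ScalarType::Long);
  } else {
    // input_pos: a single-element tensor holding the scalar position.
    start_pos_tensor =
        from_blob(&start_pos, {1}, ::executorch::aten::ScalarType::Long);
  }
  auto outputs_res = module_->forward({tokens, start_pos_tensor});
}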

File tree: 12 files changed, +317 −30 lines


extension/llm/runner/test/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..)
 include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
 
 set(_test_srcs test_generation_config.cpp test_text_llm_runner.cpp
-    test_text_prefiller.cpp
+    test_text_prefiller.cpp test_text_decoder_runner.cpp
 )
 
 et_cxx_test(

extension/llm/runner/test/targets.bzl

Lines changed: 15 additions & 0 deletions
@@ -36,3 +36,18 @@ def define_common_targets():
            "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
        ],
    )
+
+    runtime.cxx_test(
+        name = "test_text_decoder_runner",
+        srcs = ["test_text_decoder_runner.cpp"],
+        deps = [
+            "//executorch/extension/llm/runner:runner_lib",
+            "//executorch/kernels/portable:generated_lib",
+            "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
+        ],
+        env = {
+            "KVCACHE_CACHE_POS": "$(location fbcode//executorch/test/models:exported_programs[ModuleKVCacheCachePos.pte])",
+            "KVCACHE_INPUT_POS": "$(location fbcode//executorch/test/models:exported_programs[ModuleKVCacheInputPos.pte])",
+            "NO_KVCACHE": "$(location fbcode//executorch/test/models:exported_programs[ModuleNoKVCache.pte])",
+        },
+    )
extension/llm/runner/test/test_text_decoder_runner.cpp

Lines changed: 199 additions & 0 deletions

@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ * @lint-ignore-every CLANGTIDY facebook-hte-Deprecated
+ */
+
+#include <executorch/extension/llm/runner/text_decoder_runner.h>
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include <cstdlib>
+
+using namespace ::testing;
+using executorch::extension::Module;
+using executorch::extension::TensorPtr;
+using executorch::extension::llm::TextDecoderRunner;
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::Result;
+using executorch::runtime::testing::TensorFactory;
+
+// Mock Module class for testing
+class MockModule : public Module {
+ public:
+  MockModule() : Module("") {}
+};
+
+class TextDecoderRunnerTest : public Test {
+ protected:
+  void SetUp() override {
+    mock_module_ = std::make_unique<MockModule>();
+    runner_ = std::make_unique<TextDecoderRunner>(mock_module_.get());
+  }
+
+  std::unique_ptr<MockModule> mock_module_;
+  std::unique_ptr<TextDecoderRunner> runner_;
+};
+
+// Test logits_to_token() method with Float tensor
+TEST_F(TextDecoderRunnerTest, LogitsToTokenFloat) {
+  TensorFactory<executorch::aten::ScalarType::Float> tf_float;
+  auto logits = tf_float.make({1, 4}, {0.1f, 0.2f, 0.8f, 0.4f});
+
+  // Call logits_to_token with temperature 0 (deterministic)
+  int32_t token = runner_->logits_to_token(logits, 0.0f);
+
+  // With temperature 0, should return the argmax (index 2)
+  EXPECT_EQ(token, 2);
+}
+
+// Test logits_to_token() method with 3D tensor (batch, seq_length, vocab_size)
+TEST_F(TextDecoderRunnerTest, LogitsToToken3D) {
+  TensorFactory<executorch::aten::ScalarType::Float> tf_float;
+  // Shape: [1, 2, 4] - batch=1, seq_length=2, vocab_size=4
+  auto logits = tf_float.make(
+      {1, 2, 4},
+      {
+          0.1f, 0.2f, 0.3f, 0.4f, // First sequence position
+          0.5f, 0.6f, 0.9f, 0.8f // Second sequence position (last)
+      });
+
+  // Call logits_to_token with temperature 0 (deterministic)
+  int32_t token = runner_->logits_to_token(logits, 0.0f);
+
+  // Should use the last sequence position and return argmax (index 2)
+  EXPECT_EQ(token, 2);
+}
+
+// Test logits_to_token() method with Half tensor
+TEST_F(TextDecoderRunnerTest, LogitsToTokenHalf) {
+  TensorFactory<executorch::aten::ScalarType::Half> tf_half;
+  auto logits = tf_half.make({1, 4}, {0.1f, 0.2f, 0.8f, 0.4f});
+
+  // Call logits_to_token with temperature 0 (deterministic)
+  int32_t token = runner_->logits_to_token(logits, 0.0f);
+
+  // With temperature 0, should return the argmax (index 2)
+  EXPECT_EQ(token, 2);
+}
+
+// Test logits_to_token() method with BFloat16 tensor
+TEST_F(TextDecoderRunnerTest, LogitsToTokenBFloat16) {
+  TensorFactory<executorch::aten::ScalarType::BFloat16> tf_bfloat16;
+  auto logits = tf_bfloat16.make({1, 4}, {0.1f, 0.2f, 0.8f, 0.4f});
+
+  // Call logits_to_token with temperature 0 (deterministic)
+  int32_t token = runner_->logits_to_token(logits, 0.0f);
+
+  // With temperature 0, should return the argmax (index 2)
+  EXPECT_EQ(token, 2);
+}
+
+// Test logits_to_token() method with non-zero temperature
+TEST_F(TextDecoderRunnerTest, LogitsToTokenWithTemperature) {
+  TensorFactory<executorch::aten::ScalarType::Float> tf_float;
+  auto logits = tf_float.make({1, 4}, {0.1f, 0.2f, 0.8f, 0.4f});
+
+  // Call logits_to_token with temperature > 0 (stochastic)
+  int32_t token = runner_->logits_to_token(logits, 1.0f);
+
+  // With temperature > 0, result should be within valid range
+  EXPECT_GE(token, 0);
+  EXPECT_LT(token, 4);
+}
+
+// Test step() method with all available PTE models
+TEST_F(TextDecoderRunnerTest, StepWithAllModels) {
+  // List of all environment variables for PTE models
+  std::vector<std::pair<std::string, const char*>> env_vars = {
+      {"KVCACHE_CACHE_POS", "KVCACHE_CACHE_POS"},
+      {"KVCACHE_INPUT_POS", "KVCACHE_INPUT_POS"},
+      {"NO_KVCACHE", "NO_KVCACHE"}};
+
+  // Check if any environment variables are set up front
+  bool any_env_set = false;
+  for (const auto& [model_name, env_var] : env_vars) {
+    if (std::getenv(env_var)) {
+      any_env_set = true;
+      break;
+    }
+  }
+
+  // Skip test if no environment variables are set
+  if (!any_env_set) {
+    GTEST_SKIP() << "No PTE model environment variables were set";
+  }
+
+  bool any_model_tested = false;
+
+  // Loop through all available models
+  for (const auto& [model_name, env_var] : env_vars) {
+    const char* model_path = std::getenv(env_var);
+    if (!model_path) {
+      continue; // Skip if environment variable not set
+    }
+
+    SCOPED_TRACE(
+        "Testing model: " + model_name + " from " + std::string(model_path));
+
+    // Load the model
+    auto module = std::make_unique<Module>(model_path);
+    auto load_result = module->load();
+    if (load_result != Error::Ok) {
+      ADD_FAILURE() << "Failed to load model " << model_name << " from "
+                    << model_path << " with error: " << (int)load_result;
+      continue;
+    }
+
+    // Create TextDecoderRunner
+    TextDecoderRunner runner(module.get());
+    auto runner_load_result = runner.load();
+    ASSERT_EQ(runner_load_result, Error::Ok)
+        << "Failed to load runner for " << model_name;
+
+    // Verify method is loaded
+    EXPECT_TRUE(runner.is_method_loaded())
+        << "Method not loaded for " << model_name;
+
+    // Create input tensor pointer
+    TensorFactory<executorch::aten::ScalarType::Long> tf_long;
+    auto input_tokens_ =
+        tf_long.make({1, 3}, {50, 7, 11}); // Three-token input
+    auto input_ptr = std::make_shared<executorch::aten::Tensor>(input_tokens_);
+    int64_t start_pos = 0;
+
+    // Call step() and verify result is ok
+    auto result = runner.step(input_ptr, start_pos);
+    ASSERT_TRUE(result.ok()) << "step() failed for " << model_name
+                             << " with error: " << (int)result.error();
+
+    // Verify output tensor is valid
+    auto output_tensor = result.get();
+    EXPECT_GT(output_tensor.numel(), 0)
+        << "Output tensor empty for " << model_name;
+
+    // Test logits_to_token works
+    int32_t token = runner.logits_to_token(output_tensor, 0.0f);
+    EXPECT_GE(token, 0) << "Invalid token for " << model_name;
+
+    any_model_tested = true;
+  }
+
+  // This should not happen since we checked environment variables up front
+  ASSERT_TRUE(any_model_tested)
+      << "No models were tested despite environment variables being set";
+}

extension/llm/runner/test/test_text_llm_runner.cpp

Lines changed: 3 additions & 4 deletions
@@ -63,11 +63,11 @@ class MockModule : public ::executorch::extension::Module {
 
 class MockTextDecoderRunner : public TextDecoderRunner {
  public:
-  MockTextDecoderRunner() : TextDecoderRunner(nullptr, false) {}
+  MockTextDecoderRunner() : TextDecoderRunner(nullptr) {}
   MOCK_METHOD(
       Result<executorch::aten::Tensor>,
       step,
-      (executorch::extension::TensorPtr&, executorch::extension::TensorPtr&),
+      (executorch::extension::TensorPtr&, int64_t),
       ());
   MOCK_METHOD(bool, is_method_loaded, (), ());
   MOCK_METHOD(Result<uint64_t>, prefill, (std::vector<uint64_t>&, int64_t), ());

@@ -134,8 +134,7 @@ class RunnerTest : public Test {
   std::unique_ptr<MockTextDecoderRunner> createMockTextDecoderRunner() {
     auto text_decoder_runner = std::make_unique<MockTextDecoderRunner>();
     ON_CALL(*text_decoder_runner, step)
-        .WillByDefault([&](executorch::extension::TensorPtr&,
-                           executorch::extension::TensorPtr&) {
+        .WillByDefault([&](executorch::extension::TensorPtr&, int64_t) {
           return Result<executorch::aten::Tensor>(tensor);
         });
     ON_CALL(*text_decoder_runner, is_method_loaded())

extension/llm/runner/text_decoder_runner.cpp

Lines changed: 39 additions & 5 deletions
@@ -21,18 +21,52 @@ namespace llm {
 // NOTE: we observed ~2x loading performance increase on iPhone 15
 // and a ~5% improvement on Galaxy S22 by switching to
 // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
-TextDecoderRunner::TextDecoderRunner(Module* module, bool use_kv_cache)
-    : module_(module), use_kv_cache_(use_kv_cache) {}
+TextDecoderRunner::TextDecoderRunner(Module* module) : module_(module) {}
 
 // This function is functional, meaning it shouldn't modify any state of the
 // input. It should be safe to call multiple times with the same inputs. The
 // outer loop (call site) is responsible for managing state.
 ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
     TensorPtr& tokens,
-    TensorPtr& start_pos) {
+    int64_t start_pos) {
   // ET_LOG(Info, "Input token %" PRIu64, input_token);
-  if (use_kv_cache_) {
-    auto outputs_res = module_->forward({tokens, start_pos});
+  auto method_meta = ET_UNWRAP(module_->method_meta("forward"));
+  // If only 1 input, we are not using kv cache
+  bool use_kv_cache = method_meta.num_inputs() > 1;
+
+  if (use_kv_cache) {
+    // Size of the second argument. This could be either input_pos or
+    // cache_positions
+
+    // Check if we are using cache positions instead of input pos.
+    auto second_input_info = ET_UNWRAP(method_meta.input_tensor_meta(1));
+    // For input_pos, numel is 1; for cache_positions, numel is max_seq_len
+    auto sizes = second_input_info.sizes();
+    auto numel = 1;
+    std::vector<::executorch::aten::SizesType> sizes_vec;
+    for (const auto& size : sizes) {
+      sizes_vec.emplace_back(size);
+      numel *= size;
+    }
+    // Assuming the last dimension is the one with the variable token length
+    sizes_vec[sizes_vec.size() - 1] = -1;
+    TensorPtr start_pos_tensor;
+    if (numel > 1) {
+      // Assuming the model is exported with cache_positions, create a tensor
+      // with the same size as cache_positions
+      start_pos_tensor = arange(
+          start_pos,
+          start_pos + tokens->numel(),
+          1,
+          sizes_vec,
+          ::executorch::aten::ScalarType::Long);
+    } else {
+      // Assuming the model is exported with input_pos, create a tensor of
+      // size 1
+      start_pos_tensor =
+          from_blob(&start_pos, {1}, ::executorch::aten::ScalarType::Long);
+    }
+    ET_LOG(Info, "Start pos tensor numel: %zu", start_pos_tensor->numel());
+    auto outputs_res = module_->forward({tokens, start_pos_tensor});
     ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
     ET_CHECK_MSG(
         outputs_res.get().size() == 1,
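
For callers, the second argument of `step()` is now a plain `int64_t`, and the runner materializes whichever position tensor the model expects. A minimal usage sketch, assuming an exported model at the hypothetical path "model.pte" (the token ids are also illustrative, not from this diff):

#include <executorch/extension/llm/runner/text_decoder_runner.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>

using executorch::extension::Module;
using executorch::extension::from_blob;
using executorch::extension::llm::TextDecoderRunner;

int main() {
  // "model.pte" is a placeholder path.
  Module module("model.pte");
  TextDecoderRunner runner(&module);
  if (runner.load() != executorch::runtime::Error::Ok) {
    return 1;
  }

  int64_t token_ids[] = {50, 7, 11};
  auto tokens =
      from_blob(token_ids, {1, 3}, ::executorch::aten::ScalarType::Long);
  int64_t start_pos = 0; // plain integer; no position tensor built by hand
  auto logits = runner.step(tokens, start_pos);
  if (logits.ok()) {
    // Greedy decode of the next token (temperature 0).
    int32_t next = runner.logits_to_token(logits.get(), 0.0f);
    (void)next;
  }
  return 0;
}

The same simplification shows up in `text_prefiller.cpp` below, where the hand-built `start_pos_tensor` disappears from both call sites.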

extension/llm/runner/text_decoder_runner.h

Lines changed: 2 additions & 3 deletions
@@ -21,7 +21,7 @@ namespace llm {
 
 class ET_EXPERIMENTAL TextDecoderRunner {
  public:
-  TextDecoderRunner(Module* module, bool use_kv_cache);
+  TextDecoderRunner(Module* module);
 
   virtual ~TextDecoderRunner() = default;
 
@@ -34,7 +34,7 @@ class ET_EXPERIMENTAL TextDecoderRunner {
   */
  virtual ::executorch::runtime::Result<executorch::aten::Tensor> step(
      TensorPtr& input,
-      TensorPtr& start_pos);
+      int64_t start_pos);
 
  /**
   * Load the Module for text decode purpose.
@@ -101,7 +101,6 @@ class ET_EXPERIMENTAL TextDecoderRunner {
   * Module remains valid for the duration of TextDecoderRunner's usage.
   */
  Module* module_;
-  bool use_kv_cache_;
  bool should_stop_{false};
 };
 
extension/llm/runner/text_llm_runner.cpp

Lines changed: 1 addition & 2 deletions
@@ -393,8 +393,7 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
 
   // Create text_decoder_runner. Use a shared_ptr so that it can be shared with
   // TextPrefiller and TextTokenGenerator
-  auto text_decoder_runner = std::make_unique<TextDecoderRunner>(
-      module.get(), metadata.at(kUseKVCache));
+  auto text_decoder_runner = std::make_unique<TextDecoderRunner>(module.get());
 
   // Create text_prefiller
   auto text_prefiller = std::make_unique<TextPrefiller>(

extension/llm/runner/text_prefiller.cpp

Lines changed: 3 additions & 10 deletions
@@ -86,10 +86,7 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(
       {1, num_prompt_tokens},
       executorch::aten::ScalarType::Long);
 
-  auto start_pos_tensor =
-      from_blob(&start_pos, {1}, executorch::aten::ScalarType::Long);
-
-  auto outputs_res = text_decoder_runner_->step(tokens, start_pos_tensor);
+  auto outputs_res = text_decoder_runner_->step(tokens, start_pos);
 
   ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
   ET_LOG(

@@ -106,13 +103,10 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(
   auto tokens =
       from_blob(&cur_token, {1, 1}, executorch::aten::ScalarType::Long);
 
-  auto start_pos_tensor =
-      from_blob(&start_pos, {1}, executorch::aten::ScalarType::Long);
-
   // run the first token and get back logits tensor. Assuming the first token
   // is bos so don't callback.
   auto logits_tensor =
-      ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos_tensor));
+      ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos));
 
   pos += 1; // start the loop from index 1
   start_pos += 1;

@@ -122,8 +116,7 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(
   // NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds)
   cur_token = prompt_tokens[pos];
 
-  logits_tensor =
-      ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos_tensor));
+  logits_tensor = ET_UNWRAP(text_decoder_runner_->step(tokens, start_pos));
 
   pos++;
   start_pos++;
