Update on "[llm] Support different shape of input_pos"

larryliu0820 · larryliu0820 · commit 9481d79b4cb3 · 2025-06-25T01:30:33.000-07:00
For Hugging Face models, `forward()` takes `tokens` as well as `cache_positions`, which is a list of cache indices. This differs from the .pte files that `export_llama` produces, which take `tokens` and `input_pos`, where `input_pos` is a scalar tensor. This PR adds support inside `text_decoder_runner.cpp` for handling both shapes of `input_pos`/`cache_positions`. To keep the logic generic without relying on extra metadata, this change inspects the method meta and the input tensor info to decide whether to feed in `input_pos` or `cache_position`. Differential Revision: [D77203700](https://our.internmc.facebook.com/intern/diff/D77203700/) [ghstack-poisoned]
diff --git a/extension/llm/runner/test/TARGETS b/extension/llm/runner/test/TARGETS
@@ -8,7 +8,22 @@
 # targets.bzl. This file can contain fbcode-only targets.
 
 load(":targets.bzl", "define_common_targets")
-
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 oncall("executorch")
 
 define_common_targets()
+
+runtime.cxx_test(
+    name = "test_text_decoder_runner",
+    srcs = ["test_text_decoder_runner.cpp"],
+    deps = [
+        "//executorch/extension/llm/runner:runner_lib",
+        "//executorch/kernels/portable:generated_lib",
+        "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
+    ],
+    env = {
+        "KVCACHE_CACHE_POS": "$(location fbcode//executorch/test/models:exported_programs[ModuleKVCacheCachePos.pte])",
+        "KVCACHE_INPUT_POS": "$(location fbcode//executorch/test/models:exported_programs[ModuleKVCacheInputPos.pte])",
+        "NO_KVCACHE": "$(location fbcode//executorch/test/models:exported_programs[ModuleNoKVCache.pte])",
+    }
+)
diff --git a/extension/llm/runner/test/targets.bzl b/extension/llm/runner/test/targets.bzl
@@ -36,18 +36,3 @@ def define_common_targets():
             "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
         ],
     )
-
-    runtime.cxx_test(
-        name = "test_text_decoder_runner",
-        srcs = ["test_text_decoder_runner.cpp"],
-        deps = [
-            "//executorch/extension/llm/runner:runner_lib",
-            "//executorch/kernels/portable:generated_lib",
-            "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
-        ],
-        env = {
-            "KVCACHE_CACHE_POS": "$(location fbcode//executorch/test/models:exported_programs[ModuleKVCacheCachePos.pte])",
-            "KVCACHE_INPUT_POS": "$(location fbcode//executorch/test/models:exported_programs[ModuleKVCacheInputPos.pte])",
-            "NO_KVCACHE": "$(location fbcode//executorch/test/models:exported_programs[ModuleNoKVCache.pte])",
-        }
-    )
diff --git a/extension/llm/runner/test/test_text_prefiller.cpp b/extension/llm/runner/test/test_text_prefiller.cpp
@@ -24,11 +24,11 @@ using executorch::runtime::testing::TensorFactory;
 // Mock class for TextDecoderRunner
 class MockTextDecoderRunner : public TextDecoderRunner {
  public:
-  MockTextDecoderRunner() : TextDecoderRunner(nullptr, false) {}
+  MockTextDecoderRunner() : TextDecoderRunner(nullptr) {}
   MOCK_METHOD(
       Result<executorch::aten::Tensor>,
       step,
-      (executorch::extension::TensorPtr&, executorch::extension::TensorPtr&),
+      (executorch::extension::TensorPtr&, int64_t),
       ());
   MOCK_METHOD(bool, is_method_loaded, (), ());
   MOCK_METHOD(Result<uint64_t>, prefill, (std::vector<uint64_t>&, int64_t), ());
@@ -44,8 +44,7 @@ class TextPrefillerTest : public Test {
     ON_CALL(text_decoder_runner_, is_method_loaded())
         .WillByDefault(Return(true));
     ON_CALL(text_decoder_runner_, step)
-        .WillByDefault([&](executorch::extension::TensorPtr&,
-                           executorch::extension::TensorPtr&) {
+        .WillByDefault([&](executorch::extension::TensorPtr&, int64_t) {
           return Result<executorch::aten::Tensor>(tensor);
         });
   }