From f041be8bbbaa8a1493b79977af8bbd08782f8648 Mon Sep 17 00:00:00 2001
From: pytorchbot
Date: Tue, 15 Jul 2025 02:21:50 -0400
Subject: [PATCH] Fix cache_positions tensor size in TextLLMRunner (#12478)

This PR was created by the merge bot to help merge the original PR into
the main branch.
ghstack PR number: https://github.com/pytorch/executorch/pull/12476 by @larryliu0820
^ Please use this as the source of truth for the PR details, comments, and reviews
ghstack PR base: https://github.com/pytorch/executorch/tree/gh/larryliu0820/68/base
ghstack PR head: https://github.com/pytorch/executorch/tree/gh/larryliu0820/68/head
Merge bot PR base: https://github.com/pytorch/executorch/tree/main
Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/larryliu0820/68/orig
@diff-train-skip-merge

Co-authored-by: Mengwei Liu
(cherry picked from commit aa44c06b7bc499099ff31d4898ea4dd6347e933f)
---
 extension/llm/runner/text_decoder_runner.cpp | 17 ++++++++++-------
 extension/llm/runner/text_prefiller.h        |  2 +-
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp
index e60a07bc50a..4293b2a08d8 100644
--- a/extension/llm/runner/text_decoder_runner.cpp
+++ b/extension/llm/runner/text_decoder_runner.cpp
@@ -52,22 +52,25 @@ ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
     auto numel = sizes[0];
     std::vector<::executorch::aten::SizesType> sizes_vec = {numel};
 
-    // Assuming the last dimension is the one with the variable token length,
-    // for example [1, S] or [1, 1, S]
-    sizes_vec[sizes_vec.size() - 1] = numel;
     TensorPtr start_pos_tensor;
     if (numel > 1) {
-      // Assuming model is exported with cache_positions, create a tensor with
-      // the same size as cache_positions
+      // If we are here, model is exported with cache_positions, create a tensor
+      // with the same length as input_ids. Assuming the last dimension is the
+      // one with the variable token length, for example [1, S] or [1, 1, S]
+      sizes_vec[sizes_vec.size() - 1] = tokens->numel();
       start_pos_tensor = empty(sizes_vec, ::executorch::aten::ScalarType::Long);
       torch::executor::native::arange_out_impl(
-          start_pos, start_pos + numel, 1.0, *start_pos_tensor);
+          start_pos, start_pos + tokens->numel(), 1.0, *start_pos_tensor);
     } else {
       // Assuming model is exported with input_pos, create a tensor with size 1
       start_pos_tensor = from_blob(
           &start_pos, sizes_vec, ::executorch::aten::ScalarType::Long);
     }
-    ET_LOG(Info, "Start pos tensor numel: %zu", start_pos_tensor->numel());
+    ET_LOG(
+        Info,
+        "Start pos tensor numel: %zu, tokens numel: %zu",
+        start_pos_tensor->numel(),
+        tokens->numel());
     auto outputs_res = module_->forward({tokens, start_pos_tensor});
     ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
     ET_CHECK_MSG(
diff --git a/extension/llm/runner/text_prefiller.h b/extension/llm/runner/text_prefiller.h
index ce12506a05c..a02cd3d1bf4 100644
--- a/extension/llm/runner/text_prefiller.h
+++ b/extension/llm/runner/text_prefiller.h
@@ -21,7 +21,7 @@ class ET_EXPERIMENTAL TextPrefiller {
  public:
   TextPrefiller(
       TextDecoderRunner* text_decoder_runner,
-      bool use_kv_cache_,
+      bool use_kv_cache,
       bool enable_parallel_prefill,
       int64_t max_seq_len = 128);
 
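
Note: a minimal standalone sketch of the sizing rule this patch fixes, using
plain C++ standard-library types. make_start_positions() and its parameter
names are illustrative, not the ExecuTorch API; the runner itself builds a
TensorPtr with empty() and arange_out_impl().

#include <cstdint>
#include <numeric>
#include <vector>

// Positions fed as the model's second input. Before this patch, the
// cache_positions branch sized the tensor to the exported input's static
// numel (max_seq_len) instead of the actual number of input tokens.
std::vector<int64_t> make_start_positions(
    int64_t second_input_numel, // static numel of the exported second input
    int64_t num_tokens, // tokens->numel(): actual number of input tokens
    int64_t start_pos) {
  if (second_input_numel > 1) {
    // cache_positions export: one position per input token,
    // i.e. arange(start_pos, start_pos + num_tokens).
    std::vector<int64_t> positions(num_tokens);
    std::iota(positions.begin(), positions.end(), start_pos);
    return positions;
  }
  // input_pos export: a single scalar position.
  return {start_pos};
}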