From 7418d2ea68b8eb04c454bfe084fe2499f4903369 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Mon, 14 Jul 2025 20:54:23 -0700
Subject: [PATCH] Phi3 runner uses TextLLMRunner

As titled, this PR switches phi-3-mini to use `TextLLMRunner`. The eager model
comes from Hugging Face; it does not use the KV cache custom op, because that
is only supported in the optimum-executorch repo, so performance may not be
the best.
---
 .ci/scripts/test_phi_3_mini.sh               |  25 +---
 examples/models/phi-3-mini/CMakeLists.txt    |  32 ++---
 examples/models/phi-3-mini/README.md         |  32 ++---
 .../models/phi-3-mini/export_phi-3-mini.py   | 114 +++++++++++++-----
 examples/models/phi-3-mini/main.cpp          |  19 ++-
 examples/models/phi-3-mini/phi_3_mini.py     |   4 +-
 examples/models/phi-3-mini/runner.cpp        | 104 ----------------
 examples/models/phi-3-mini/runner.h          |  50 --------
 extension/llm/runner/text_decoder_runner.cpp |  17 +--
 extension/llm/runner/text_llm_runner.cpp     |  26 ++++
 extension/llm/runner/text_prefiller.cpp      |   2 +
 extension/llm/runner/text_prefiller.h        |   2 +-
 tools/cmake/executorch-config.cmake          |  14 +++
 13 files changed, 175 insertions(+), 266 deletions(-)
 delete mode 100644 examples/models/phi-3-mini/runner.cpp
 delete mode 100644 examples/models/phi-3-mini/runner.h

diff --git a/.ci/scripts/test_phi_3_mini.sh b/.ci/scripts/test_phi_3_mini.sh
index 2b41e5b308d..7f01995ae99 100644
--- a/.ci/scripts/test_phi_3_mini.sh
+++ b/.ci/scripts/test_phi_3_mini.sh
@@ -22,31 +22,14 @@ NPROC=8
 if hash nproc &> /dev/null; then NPROC=$(nproc); fi
 
 cmake_install_executorch_libraries() {
-    cmake -DPYTHON_EXECUTABLE=python \
-        -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
-        -DEXECUTORCH_ENABLE_LOGGING=1 \
-        -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-        -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-        -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
-        -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-        -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-        -DEXECUTORCH_BUILD_XNNPACK=ON \
-        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-        -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-        -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-        -B${BUILD_DIR} .
-
-    cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE}
+    cmake --preset llm -DCMAKE_INSTALL_PREFIX=cmake-out -DCMAKE_BUILD_TYPE=${BUILD_TYPE}
+
+    cmake --build cmake-out -j16 --target install --config ${BUILD_TYPE}
 }
 
 cmake_build_phi_3_mini() {
-    cmake -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-        -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
+    cmake -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
         -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-        -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-        -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-        -DEXECUTORCH_BUILD_XNNPACK=ON \
-        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
         -B${BUILD_DIR}/${MODEL_DIR} \
         ${MODEL_DIR}
 
diff --git a/examples/models/phi-3-mini/CMakeLists.txt b/examples/models/phi-3-mini/CMakeLists.txt
index 9f7790cb8ab..38da3066117 100644
--- a/examples/models/phi-3-mini/CMakeLists.txt
+++ b/examples/models/phi-3-mini/CMakeLists.txt
@@ -20,17 +20,14 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 set(CMAKE_BUILD_TYPE Release)
 
-# Set options for executorch build.
-option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON)
-option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON)
-option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR "" ON)
-option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON)
-option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON)
-option(EXECUTORCH_BUILD_XNNPACK "" ON)
-
-add_subdirectory(
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../.. ${CMAKE_BINARY_DIR}/../../..
-)
+set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
+find_package(executorch CONFIG REQUIRED)
+
+target_link_options_shared_lib(executorch)
+
+set(BUILD_TESTING OFF)
+add_subdirectory(${EXECUTORCH_ROOT}/extension/llm/runner ${CMAKE_BINARY_DIR}/../../../extension/llm/runner)
+
 if(NOT TARGET gflags)
   add_subdirectory(
     ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/gflags
@@ -40,16 +37,9 @@ endif()
 
 add_executable(
   phi_3_mini_runner
-  main.cpp runner.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/sampler/sampler.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizers/src/llama2c_tokenizer.cpp
-)
-target_include_directories(
-  phi_3_mini_runner
-  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/gflags/src
-         ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizers/include
+  main.cpp
 )
+
 target_link_libraries(
-  phi_3_mini_runner PRIVATE executorch extension_module_static extension_tensor
-                            optimized_native_cpu_ops_lib xnnpack_backend gflags
+  phi_3_mini_runner PUBLIC executorch optimized_native_cpu_ops_lib xnnpack_backend gflags extension_llm_runner
 )
diff --git a/examples/models/phi-3-mini/README.md b/examples/models/phi-3-mini/README.md
index 5571637e021..a51599a6906 100644
--- a/examples/models/phi-3-mini/README.md
+++ b/examples/models/phi-3-mini/README.md
@@ -21,33 +21,17 @@ python -m examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-m
 ```
 3. Build and run the model.
 - Build executorch with optimized CPU performance as follows. Build options available [here](https://github.com/pytorch/executorch/blob/main/CMakeLists.txt#L59).
-    ```
-    cmake -DPYTHON_EXECUTABLE=python \
-        -DCMAKE_INSTALL_PREFIX=cmake-out \
-        -DEXECUTORCH_ENABLE_LOGGING=1 \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-        -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-        -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-        -DEXECUTORCH_BUILD_XNNPACK=ON \
-        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-        -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-        -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-        -Bcmake-out .
+```
+cmake --preset llm -DCMAKE_INSTALL_PREFIX=cmake-out
 
-    cmake --build cmake-out -j16 --target install --config Release
-    ```
+cmake --build cmake-out -j16 --target install --config Release
+```
 - Build Phi-3-mini runner.
 ```
-cmake -DPYTHON_EXECUTABLE=python \
-    -DCMAKE_INSTALL_PREFIX=cmake-out \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -Bcmake-out/examples/models/phi-3-mini \
-    examples/models/phi-3-mini
+cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
+      -DCMAKE_BUILD_TYPE=Release \
+      -Bcmake-out/examples/models/phi-3-mini \
+      examples/models/phi-3-mini
 
 cmake --build cmake-out/examples/models/phi-3-mini -j16 --config Release
 ```
diff --git a/examples/models/phi-3-mini/export_phi-3-mini.py b/examples/models/phi-3-mini/export_phi-3-mini.py
index 246b3ccd6c6..d1239d9769d 100644
--- a/examples/models/phi-3-mini/export_phi-3-mini.py
+++ b/examples/models/phi-3-mini/export_phi-3-mini.py
@@ -19,13 +19,42 @@
     XNNPACKQuantizer,
 )
 from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config
-from executorch.exir import to_edge
+from executorch.exir import to_edge_transform_and_lower
+from executorch.exir.capture._config import ExecutorchBackendConfig
+from executorch.exir.passes import MemoryPlanningPass
+from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
 
 from torch.export import export_for_training
+from torch.nn.attention import SDPBackend
 from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
 from transformers import Phi3ForCausalLM
+from transformers.cache_utils import StaticCacheConfig
 
-from .phi_3_mini import Phi3Mini
+from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM
+
+
+def _prepare_export_inputs(max_seq_len: int, sliding_window: int):
+    """
+    Prepare example inputs and configurations for export.
+
+    Returns:
+        example_input_ids (torch.Tensor): Example input IDs tensor.
+        example_cache_position (torch.Tensor): Example cache position tensor.
+        dynamic_shapes (dict or None): Dynamic shape specifications for export.
+        strict (bool): Whether to use strict export mode.
+    """
+    # Prepare inputs with dynamic shapes
+    seq_length = 3  # Sequence length > 1 to avoid specialization issues
+    example_input_ids = torch.zeros((1, seq_length), dtype=torch.long)
+    example_cache_position = torch.arange(seq_length, dtype=torch.long)
+    max_dim = min(max_seq_len, sliding_window) - 1
+    seq_len_dim = torch.export.Dim("seq_length_dim", max=max_dim)
+    dynamic_shapes = {
+        "input_ids": {1: seq_len_dim},
+        "cache_position": {0: seq_len_dim},
+    }
+
+    return example_input_ids, example_cache_position, dynamic_shapes
 
 
 def export(args) -> None:
@@ -40,23 +69,34 @@ def export(args) -> None:
             f"Invalid context length {args.context_length}. Should be either 4k or 128k"
         )
 
-    with torch.no_grad():
-        model = Phi3Mini(
-            # pyre-ignore: Undefined attribute [16]: Module `transformers` has no attribute `Phi3ForCausalLM`
-            model=Phi3ForCausalLM.from_pretrained(model_name),
+    with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
+        model = Phi3ForCausalLM.from_pretrained(model_name)
+        model.generation_config.cache_implementation = "static"
+        model.generation_config.cache_config = StaticCacheConfig(
+            batch_size=1, max_cache_len=model.config.max_position_embeddings
+        )
+
+        exportable_module = TorchExportableModuleForDecoderOnlyLM(
+            model,
             max_batch_size=1,
-            max_seq_len=args.seq_len,
+            max_cache_len=model.config.max_position_embeddings,
         )
-        example_inputs = (
-            torch.tensor(
-                [[1048, 263, 931, 746]], dtype=torch.long, requires_grad=False
-            ),
+        input_ids, cache_position, dynamic_shapes = _prepare_export_inputs(
+            model.config.max_position_embeddings, model.config.sliding_window
+        )
+        example_inputs = (input_ids, cache_position)
+        exported_program = exportable_module.export(
+            input_ids, cache_position, dynamic_shapes, strict=False
+        )
+        # Apply RemoveTransposes pass to remove
+        # any back-to-back transpose ops that are not needed
+        # e.g. output of update_cache is transposed and
+        # input to custom_sdpa is transposed.
+        from executorch.extension.llm.export.export_passes import (
+            RemoveRedundantTransposes,
         )
-        dynamic_shapes = {
-            "input_ids": {
-                1: torch.export.Dim("sequence_length", min=1, max=args.seq_len)
-            }
-        }
+
+        mutated_gm = RemoveRedundantTransposes()(exported_program.module())[0]
 
         xnnpack_quant_config = get_symmetric_quantization_config(
             is_per_channel=True, is_dynamic=True
@@ -64,27 +104,35 @@
         xnnpack_quantizer = XNNPACKQuantizer()
         xnnpack_quantizer.set_global(xnnpack_quant_config)
 
-        model = export_for_training(
-            model, example_inputs, dynamic_shapes=dynamic_shapes, strict=True
-        ).module()
-        model = prepare_pt2e(model, xnnpack_quantizer)  # pyre-fixme[6]
-        model(*example_inputs)
-        model = convert_pt2e(model)
-        DuplicateDynamicQuantChainPass()(model)
-        # TODO(lunwenh): update it to use export once
-        # https://github.com/pytorch/pytorch/issues/128394 is resolved.
-        model = torch.export._trace._export(
-            model,
-            example_inputs,
-            dynamic_shapes=dynamic_shapes,
-            strict=False,
-            pre_dispatch=False,
+        gm = prepare_pt2e(mutated_gm, xnnpack_quantizer)  # pyre-fixme[6]
+        gm(*example_inputs)
+        gm = convert_pt2e(gm)
+        DuplicateDynamicQuantChainPass()(gm)
+        exported_program = export_for_training(
+            gm, example_inputs, dynamic_shapes=dynamic_shapes, strict=False
         )
 
         edge_config = get_xnnpack_edge_compile_config()
-        edge_manager = to_edge(model, compile_config=edge_config)
+        edge_manager = to_edge_transform_and_lower(
+            exported_program,
+            partitioner=[XnnpackPartitioner()],
+            compile_config=edge_config,
+            constant_methods={
+                "get_eos_ids": [32000],
+                "use_kv_cache": True,
+                "enable_dynamic_shape": True,
+                "get_max_seq_len": model.config.max_position_embeddings - 1,
+            },
+        )
         edge_manager = edge_manager.to_backend(XnnpackPartitioner())
-        et_program = edge_manager.to_executorch()
+        et_program = edge_manager.to_executorch(
+            ExecutorchBackendConfig(
+                extract_delegate_segments=True,
+                do_quant_fusion_and_const_prop=True,
+                memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
+                sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(),
+            )
+        )
 
         with open(args.output_name, "wb") as file:
             file.write(et_program.buffer)
diff --git a/examples/models/phi-3-mini/main.cpp b/examples/models/phi-3-mini/main.cpp
index 86446a8bde3..ca3224c11b2 100644
--- a/examples/models/phi-3-mini/main.cpp
+++ b/examples/models/phi-3-mini/main.cpp
@@ -6,9 +6,12 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <iostream>
 #include <gflags/gflags.h>
+#include <executorch/extension/llm/runner/text_llm_runner.h>
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
 
-#include <executorch/examples/models/phi-3-mini/runner.h>
+using executorch::extension::llm::TextLLMRunner;
 
 DEFINE_string(
     model_path,
@@ -42,9 +45,17 @@ int main(int32_t argc, char** argv) {
 
   int32_t seq_len = FLAGS_seq_len;
 
-  example::Runner runner(model_path, tokenizer_path, temperature);
-
-  runner.generate(prompt, seq_len);
+  std::unique_ptr<tokenizers::Tokenizer> tokenizer =
+      std::make_unique<tokenizers::Llama2cTokenizer>();
+  tokenizer->load(tokenizer_path);
+  std::cout << "Tokenizer loaded, eos_id = " << tokenizer->eos_tok()
+            << std::endl;
+  auto runner = executorch::extension::llm::create_text_llm_runner(
+      model_path, std::move(tokenizer));
+
+  runner->generate(
+      prompt,
+      {.seq_len = seq_len, .temperature = static_cast<float>(temperature)});
 
   return 0;
 }
diff --git a/examples/models/phi-3-mini/phi_3_mini.py b/examples/models/phi-3-mini/phi_3_mini.py
index b8cd5ef3840..eec25fc7490 100644
--- a/examples/models/phi-3-mini/phi_3_mini.py
+++ b/examples/models/phi-3-mini/phi_3_mini.py
@@ -30,11 +30,13 @@ def __init__(self, model: Phi3ForCausalLM, max_batch_size: int, max_seq_len: int
     def forward(
         self,
         # pyre-fixme[9]: input_ids has type `LongTensor`; used as `None`.
-        input_ids: torch.LongTensor = None,
+        input_ids: torch.LongTensor,
+        cache_positions: torch.Tensor,
     ) -> torch.FloatTensor:
         # pyre-fixme[16]: `Phi3ForCausalLM` has no attribute `forward`.
         return self.model.forward(
             input_ids=input_ids,
+            cache_positions=cache_positions,
             use_cache=True,
             return_dict=True,
             past_key_values=self.cache,
diff --git a/examples/models/phi-3-mini/runner.cpp b/examples/models/phi-3-mini/runner.cpp
deleted file mode 100644
index 15f76e9522c..00000000000
--- a/examples/models/phi-3-mini/runner.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include
-
-#include
-#include
-
-#include
-#include
-#include
-
-using executorch::aten::ScalarType;
-using executorch::extension::Module;
-using executorch::extension::llm::Sampler;
-using executorch::runtime::Error;
-using tokenizers::Llama2cTokenizer;
-
-namespace example {
-
-#define SAMPLER_TOP 0.9f
-#define ENDOFTEXT_TOKEN 32000
-#define VOCABULARY_SIZE 32064
-
-Runner::Runner(
-    const std::string& model_path,
-    const std::string& tokenizer_path,
-    const float temperature)
-    : module_(std::make_unique(model_path, Module::LoadMode::File)),
-      tokenizer_(std::make_unique()),
-      sampler_(std::make_unique(
-          VOCABULARY_SIZE,
-          temperature,
-          SAMPLER_TOP,
-          static_cast(std::time(nullptr)))) {
-  ET_CHECK_MSG(
-      tokenizer_->load(tokenizer_path) == tokenizers::Error::Ok,
-      "Failed to load tokenizer at %s",
-      tokenizer_path.c_str());
-  ET_LOG(
-      Info,
-      "Created Phi-3-mini runner: model_path=%s, tokenizer_path=%s",
-      model_path.c_str(),
-      tokenizer_path.c_str());
-}
-
-void Runner::generate(const std::string& prompt, std::size_t max_seq_len) {
-  auto encode_res = tokenizer_->encode(prompt, 0, 0);
-  ET_CHECK_MSG(
-      encode_res.error() == tokenizers::Error::Ok,
-      "Failed to encode %s",
-      prompt.c_str());
-  auto input_tokens = encode_res.get();
-  auto prev_token = input_tokens.back();
-  auto current_token = prefill(input_tokens);
-  std::cout << tokenizer_->decode(prev_token, current_token).get();
-  std::cout.flush();
-
-  std::size_t seq_len = input_tokens.size() + 1;
-
-  while (current_token != ENDOFTEXT_TOKEN && seq_len < max_seq_len) {
-    prev_token = current_token;
-    current_token = run_model_step(current_token);
-    std::cout << tokenizer_->decode(prev_token, current_token).get();
-    std::cout.flush();
-
-    ++seq_len;
-  }
-
-  std::cout << std::endl;
-}
-
-uint64_t Runner::logits_to_token(
-    const executorch::aten::Tensor& logits_tensor) {
-  return sampler_->sample(logits_tensor.data_ptr());
-}
-
-uint64_t Runner::prefill(std::vector& tokens) {
-  auto result = module_->forward(executorch::extension::from_blob(
-      tokens.data(),
-      {1, static_cast(tokens.size())},
-      ScalarType::Long));
-  ET_CHECK_MSG(result.error() == Error::Ok, "Failed to prefill tokens");
-
-  return logits_to_token(result.get()[0].toTensor());
-}
-
-uint64_t Runner::run_model_step(uint64_t token) {
-  auto result = module_->forward(
-      executorch::extension::from_blob(&token, {1, 1}, ScalarType::Long));
-  ET_CHECK_MSG(
-      result.error() == Error::Ok,
-      "Failed to run forward() for token %" PRIu64,
-      token);
-
-  return logits_to_token(result.get()[0].toTensor());
-}
-
-} // namespace example
diff --git a/examples/models/phi-3-mini/runner.h b/examples/models/phi-3-mini/runner.h
deleted file mode 100644
index 2f0042a57ea..00000000000
--- a/examples/models/phi-3-mini/runner.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// A simple phi-3-mini runner that includes preprocessing and post processing
-// logic. The module takes in a string as input and emits a string as output.
-
-#pragma once
-
-#include
-#include
-
-#include
-#include
-#include
-#include
-
-namespace example {
-
-class Runner {
- public:
-  explicit Runner(
-      const std::string& model_path,
-      const std::string& tokenizer_path,
-      const float temperature = 0.8f);
-
-  /**
-   * Generates response for a given prompt.
-   *
-   * @param[in] prompt The prompt to generate a response for.
-   * @param[in] max_seq_len The maximum length of the sequence to generate,
-   * including prompt.
-   */
-  void generate(const std::string& prompt, std::size_t max_seq_len);
-
- private:
-  uint64_t logits_to_token(const executorch::aten::Tensor& logits_tensor);
-  uint64_t prefill(std::vector& tokens);
-  uint64_t run_model_step(uint64_t token);
-
-  std::unique_ptr module_;
-  std::unique_ptr tokenizer_;
-  std::unique_ptr sampler_;
-};
-
-} // namespace example
diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp
index e60a07bc50a..4293b2a08d8 100644
--- a/extension/llm/runner/text_decoder_runner.cpp
+++ b/extension/llm/runner/text_decoder_runner.cpp
@@ -52,22 +52,25 @@ ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
   auto numel = sizes[0];
   std::vector<::executorch::aten::SizesType> sizes_vec = {numel};
 
-  // Assuming the last dimension is the one with the variable token length,
-  // for example [1, S] or [1, 1, S]
-  sizes_vec[sizes_vec.size() - 1] = numel;
   TensorPtr start_pos_tensor;
   if (numel > 1) {
-    // Assuming model is exported with cache_positions, create a tensor with
-    // the same size as cache_positions
+    // If we are here, model is exported with cache_positions, create a tensor
+    // with the same length as input_ids. Assuming the last dimension is the
+    // one with the variable token length, for example [1, S] or [1, 1, S]
+    sizes_vec[sizes_vec.size() - 1] = tokens->numel();
     start_pos_tensor = empty(sizes_vec, ::executorch::aten::ScalarType::Long);
     torch::executor::native::arange_out_impl(
-        start_pos, start_pos + numel, 1.0, *start_pos_tensor);
+        start_pos, start_pos + tokens->numel(), 1.0, *start_pos_tensor);
   } else {
     // Assuming model is exported with input_pos, create a tensor with size 1
     start_pos_tensor = from_blob(
         &start_pos, sizes_vec, ::executorch::aten::ScalarType::Long);
   }
-  ET_LOG(Info, "Start pos tensor numel: %zu", start_pos_tensor->numel());
+  ET_LOG(
+      Info,
+      "Start pos tensor numel: %zu, tokens numel: %zu",
+      start_pos_tensor->numel(),
+      tokens->numel());
   auto outputs_res = module_->forward({tokens, start_pos_tensor});
   ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
   ET_CHECK_MSG(
diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp
index cf55d98224a..1842472aada 100644
--- a/extension/llm/runner/text_llm_runner.cpp
+++ b/extension/llm/runner/text_llm_runner.cpp
@@ -32,6 +32,7 @@ static constexpr auto kMaxContextLen = "get_max_context_len";
 static constexpr auto kVocabSize = "get_vocab_size";
 static constexpr auto kUseKVCache = "use_kv_cache";
 static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
+static constexpr auto kUseCachePositions = "use_cache_positions";
 
 TextLLMRunner::TextLLMRunner(
     std::unordered_map<std::string, int64_t> metadata,
@@ -306,6 +307,7 @@ std::unordered_map<std::string, int64_t> get_llm_metadata(
       {llm::kMaxContextLen, 128},
       {llm::kUseKVCache, true},
       {llm::kUseSDPAWithKVCache, false},
+      {llm::kUseCachePositions, false},
   });
 
   // Read metadata from the model
@@ -335,6 +337,29 @@ std::unordered_map<std::string, int64_t> get_llm_metadata(
   // Set tokenizer-related metadata
   metadata[llm::kBosId] = tokenizer->bos_tok();
   metadata[llm::kVocabSize] = tokenizer->vocab_size();
+
+  // Override metadata using the module's method_meta
+  auto method_meta_result = module->method_meta("forward");
+  if (method_meta_result.error() != Error::Ok) {
+    ET_LOG(Error, "Failed reading method meta");
+    return metadata;
+  }
+  auto method_meta = method_meta_result.get();
+  // If only 1 input, we are not using kv cache
+  metadata[llm::kUseKVCache] = method_meta.num_inputs() > 1;
+
+  if (method_meta.num_inputs() == 1) {
+    return metadata;
+  }
+  // Check if we are using cache positions instead of input pos.
+  auto second_input_info = method_meta.input_tensor_meta(1).get();
+  // For input_pos, numel is 1, for cache_positions, numel is max_seq_len
+  auto sizes = second_input_info.sizes();
+  int64_t total_size = 1;
+  for (const auto& size : sizes) {
+    total_size *= size;
+  }
+  metadata[llm::kUseCachePositions] = total_size > 1;
   return metadata;
 }
 
@@ -401,6 +426,7 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
   auto text_prefiller = std::make_unique<TextPrefiller>(
       text_decoder_runner.get(),
       metadata.at(kUseKVCache),
+      metadata.at(kUseCachePositions),
      metadata.at(kEnableDynamicShape),
       metadata.at(kMaxSeqLen));
 
diff --git a/extension/llm/runner/text_prefiller.cpp b/extension/llm/runner/text_prefiller.cpp
index de092b6b05d..86e89c416b5 100644
--- a/extension/llm/runner/text_prefiller.cpp
+++ b/extension/llm/runner/text_prefiller.cpp
@@ -19,10 +19,12 @@ namespace llm {
 TextPrefiller::TextPrefiller(
     TextDecoderRunner* text_decoder_runner,
     bool use_kv_cache,
+    bool use_cache_positions,
     bool enable_parallel_prefill,
     int64_t max_seq_len)
     : text_decoder_runner_(text_decoder_runner),
       use_kv_cache_(use_kv_cache),
+      use_cache_positions_(use_cache_positions),
       enable_parallel_prefill_(enable_parallel_prefill),
       max_seq_len_(max_seq_len > 0 ? max_seq_len : 128) {}
 
diff --git a/extension/llm/runner/text_prefiller.h b/extension/llm/runner/text_prefiller.h
index ce12506a05c..a02cd3d1bf4 100644
--- a/extension/llm/runner/text_prefiller.h
+++ b/extension/llm/runner/text_prefiller.h
@@ -21,7 +21,7 @@ class ET_EXPERIMENTAL TextPrefiller {
  public:
   TextPrefiller(
       TextDecoderRunner* text_decoder_runner,
-      bool use_kv_cache_,
+      bool use_kv_cache,
       bool enable_parallel_prefill,
       int64_t max_seq_len = 128);
 
diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake
index e73104b0f1e..62746a4a713 100644
--- a/tools/cmake/executorch-config.cmake
+++ b/tools/cmake/executorch-config.cmake
@@ -26,6 +26,8 @@ cmake_minimum_required(VERSION 3.19)
 
 include("${CMAKE_CURRENT_LIST_DIR}/Utils.cmake")
 
+include(${CMAKE_CURRENT_LIST_DIR}/Utils.cmake)
+
 set(_root "${CMAKE_CURRENT_LIST_DIR}/../../..")
 set(required_lib_list executorch executorch_core portable_kernels)
 set(EXECUTORCH_LIBRARIES)
@@ -186,3 +188,15 @@ foreach(lib ${shared_lib_list})
     target_link_options_shared_lib(${lib})
   endif()
 endforeach()
+
+if(TARGET xnnpack_backend)
+  if(TARGET kleidiai)
+    set(_deps "XNNPACK;xnnpack-microkernels-prod;kleidiai")
+  else()
+    set(_deps "XNNPACK;xnnpack-microkernels-prod")
+  endif()
+  set_target_properties(
+    xnnpack_backend PROPERTIES INTERFACE_LINK_LIBRARIES "${_deps}"
+  )
+  target_link_options_shared_lib(xnnpack_backend)
+endif()
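
For reference, a minimal end-to-end run with this change might look like the sketch below. The export command is the one from the README hunk above; apart from `model_path`, the runner flag names are inferred from the `FLAGS_*` variables used in main.cpp, and the tokenizer path is illustrative only.

```
# Export the XNNPACK-delegated phi-3-mini model (same command as in the README).
python -m examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-mini.pte

# Run the rebuilt TextLLMRunner-based example binary (flag names inferred from main.cpp).
cmake-out/examples/models/phi-3-mini/phi_3_mini_runner \
  --model_path phi-3-mini.pte \
  --tokenizer_path tokenizer.model \
  --prompt "Tell me a story" \
  --seq_len 128
```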