diff --git a/.ci/scripts/test_ane_static_llama.sh b/.ci/scripts/test_ane_static_llama.sh
index 73a9c4ca54b..f04b716fd97 100644
--- a/.ci/scripts/test_ane_static_llama.sh
+++ b/.ci/scripts/test_ane_static_llama.sh
@@ -28,13 +28,78 @@ pushd $EXECUTORCH_ROOT/examples/apple/coreml/llama
 # Download stories llama110m artifacts
 download_stories_model_artifacts
 
-# Test static ANE llama model
+# Test static ANE llama model export
+echo "Exporting static ANE llama model..."
 python export_static_llm_coreml.py --checkpoint stories110M.pt --params params.json --output model.pte
 
-# The ANE cannot run in github CI
-# python run_static_llm.py --model model.pte --params params.json --tokenizer tokenizer.model --prompt "Once upon a time," --lookahead
+# The ANE is not accessible in GitHub CI, so we also export a CPU-only model to test the runner
+echo "Exporting CPU-only model for CI testing..."
+python export_static_llm_coreml.py --checkpoint stories110M.pt --params params.json --output model_cpu.pte --cpu_only
+
+popd
+
+# Build the C++ runner
+echo "Building C++ runner..."
+BUILD_DIR="${EXECUTORCH_ROOT}/cmake-out"
+
+# Clean build directory completely to avoid stale artifacts and generator conflicts
+rm -rf "${BUILD_DIR}"
+
+cmake -S "${EXECUTORCH_ROOT}" -B "${BUILD_DIR}" \
+  -DCMAKE_INSTALL_PREFIX="${BUILD_DIR}" \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DEXECUTORCH_ENABLE_LOGGING=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+  -DEXECUTORCH_BUILD_COREML=ON \
+  -G Ninja
+
+cmake --build "${BUILD_DIR}" -j --target run_static_llm_coreml --config Release
+
+# TODO: enable runner once CoreML bug with caching is fixed
+# # Run the C++ runner with the CPU model
+# echo "Running C++ runner with CPU model..."
+# RUNNER="${BUILD_DIR}/examples/apple/coreml/llama/runner/run_static_llm_coreml"
+# MODEL_DIR="${EXECUTORCH_ROOT}/examples/apple/coreml/llama"
+
+# # Run the model and capture full output for debugging
+# FULL_OUTPUT=$("${RUNNER}" \
+#   --model "${MODEL_DIR}/model.pte" \
+#   --params "${MODEL_DIR}/params.json" \
+#   --tokenizer "${MODEL_DIR}/tokenizer.model" \
+#   --prompt "Once upon a time," \
+#   --max_new_tokens 50 2>&1)
+
+# echo "Full output:"
+# echo "${FULL_OUTPUT}"
+
+# # Check that the model produced meaningful output
+# # The output should contain: the prompt "Once upon a time," and the continuation including "there was"
+# # Due to log interleaving, we check for individual key parts separately
+# if [[ "${FULL_OUTPUT}" == *"Once upon a time,"* ]] && [[ "${FULL_OUTPUT}" == *"there"* ]] && [[ "${FULL_OUTPUT}" == *"was"* ]]; then
+#   echo "Output contains expected prompt and generated text"
+#   echo "C++ runner test passed!"
+# else
+#   echo "ERROR: Output does not contain expected text"
+#   echo "Expected: 'Once upon a time,' followed by 'there' and 'was'"
+#   exit 1
+# fi
+
+# TODO: enable runner once CoreML bug with caching is fixed
+# # Run lookahead decoding test (currently produces tokens on stories, but works with llama)
+# echo "Running C++ runner with lookahead decoding..."
+# "${RUNNER}" \ +# --model "${MODEL_DIR}/model.pte" \ +# --params "${MODEL_DIR}/params.json" \ +# --tokenizer "${MODEL_DIR}/tokenizer.model" \ +# --prompt "Once upon a time," \ +# --max_new_tokens 50 \ +# --lookahead # Test export of deprecated model +pushd $EXECUTORCH_ROOT/examples/apple/coreml/llama python export.py -n model.pte -p params.json -c stories110M.pt --seq_length 32 --max_seq_length 64 --dtype fp16 --coreml-quantize c4w --embedding-quantize 4,32 - popd diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 2cd284b059b..2a883051e9f 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -489,7 +489,7 @@ jobs: name: test-static-llama-ane uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: - runner: macos-m1-stable + runner: macos-15-xlarge python-version: '3.11' submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} @@ -839,7 +839,7 @@ jobs: qwen3-0.6b|xnnpack|--quantize, qwen3-1.7b|xnnpack|--quantize, gemma3-1b|xnnpack|--quantize, - # phi4-mini|xnnpack|--quantize, transformers v5.0.0rc0 introduces a data-dependent branching in transformers/modeling_rope_utils.py:61 + # phi4-mini|xnnpack|--quantize, transformers v5.0.0rc0 introduces a data-dependent branching in transformers/modeling_rope_utils.py:61 smollm2-135m|xnnpack|--quantize, smollm3-3b|xnnpack|--quantize ] diff --git a/CMakeLists.txt b/CMakeLists.txt index 30cee4afe53..f02a216c191 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -936,12 +936,20 @@ if(EXECUTORCH_BUILD_EXTENSION_TRAINING) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/training) list(APPEND _executorch_extensions extension_training) endif() - if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner) list(APPEND _executorch_extensions extension_llm_runner) endif() +# Static LLM CoreML runner for Apple platforms +if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER + AND EXECUTORCH_BUILD_COREML + AND APPLE +) + add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/examples/apple/coreml/llama/runner + ) +endif() if(EXECUTORCH_BUILD_EXTENSION_ASR_RUNNER) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/asr/runner) list(APPEND _executorch_extensions extension_asr_runner) diff --git a/examples/apple/coreml/llama/export_static_llm_coreml.py b/examples/apple/coreml/llama/export_static_llm_coreml.py index a3fd8201414..155c24264a4 100644 --- a/examples/apple/coreml/llama/export_static_llm_coreml.py +++ b/examples/apple/coreml/llama/export_static_llm_coreml.py @@ -98,18 +98,19 @@ def remove_graph_break_(edge_manager): edge_manager.exported_program().graph_module.graph.eliminate_dead_code() -def load_model(checkpoint_path: str, params_path: str, max_context_len: int): +def load_model( + checkpoint_path: str, + params_path: str, + max_context_len: int, + generate_full_logits: bool = True, +): """Load the model from checkpoint with static_mha attention type.""" with open(params_path, "r") as f: params = json.loads(f.read()) - # TODO: to support lookahead decoding, the static model outputs - # full logits, but if we are not using lookahead decoding, we can have a - # more efficient model by setting generate_full_logits=False and supplying the last - # valid token args = ModelArgs( max_context_len=max_context_len, - generate_full_logits=True, + generate_full_logits=generate_full_logits, **params, ) args.attention_type = "static_mha" @@ -320,15 +321,39 @@ def main(): help="Disable graph breaks 
     )
 
+    # Output options
+    parser.add_argument(
+        "--no_generate_full_logits",
+        action="store_true",
+        help="Only generate logits for the last token position (more efficient, but no lookahead support).",
+    )
+
+    # Compute options
+    parser.add_argument(
+        "--cpu_only",
+        action="store_true",
+        help="Use CPU only (no ANE). Useful for CI testing where the ANE is not accessible.",
+    )
+
     args = parser.parse_args()
 
     # Compute cache length
+    generate_full_logits = not args.no_generate_full_logits
+
     print("Quantization and datatype:")
     print(f"\tEmbedding quantize: {args.embedding_quantize}")
     print(f"\tLinear quantize: {args.linear_quantize}")
     print(f"\tDtype: {args.dtype}")
 
+    print("\nOutput configuration:")
+    print(f"\tGenerate full logits: {generate_full_logits}")
+    if not generate_full_logits:
+        print("\t(Lookahead decoding will NOT be supported)")
+
+    print("\nCompute configuration:")
+    print(f"\tCPU only: {args.cpu_only}")
+
     cache_len = args.max_context_len - args.input_len
     print("\nGeneration configuration:")
     print(f"\tMax context length: {args.max_context_len}")
@@ -345,6 +370,7 @@ def main():
         args.checkpoint,
         args.params,
         args.max_context_len,
+        generate_full_logits,
     )
     print(f"Model loaded: {model_args.n_layers} layers, {model_args.dim} dim")
 
@@ -453,13 +479,16 @@ def main():
 
     # Setup CoreML partitioner
     print("\nSetting up CoreML partitioner...")
+    compute_unit = (
+        ct.ComputeUnit.CPU_ONLY if args.cpu_only else ct.ComputeUnit.CPU_AND_NE
+    )
     compile_specs = CoreMLBackend.generate_compile_specs(
         minimum_deployment_target=ct.target.iOS18,
         compute_precision={
             torch.float16: ct.precision.FLOAT16,
             torch.float32: ct.precision.FLOAT32,
         }[float_dtype],
-        compute_unit=ct.ComputeUnit.CPU_AND_NE,
+        compute_unit=compute_unit,
         model_type=CoreMLBackend.MODEL_TYPE.MODEL,
     )
     partitioner = CoreMLPartitioner(
diff --git a/examples/apple/coreml/llama/readme.md b/examples/apple/coreml/llama/readme.md
index 46e9043a5fc..7ceb5265e5c 100644
--- a/examples/apple/coreml/llama/readme.md
+++ b/examples/apple/coreml/llama/readme.md
@@ -29,7 +29,62 @@ The static model has several ANE optimizations, including:
 * Re-writing SDPA to avoid 5-D tensors to imporve performance.  This also fixes an accuracy bug that was introduced in iOS 26 (addresses this: https://github.com/pytorch/executorch/issues/15833)
 
-We are working on adding a C++ runner as well.
+## C++ Runner
+
+A C++ runner is also available for running static attention LLM models. The runner extends `TextDecoderRunner` from the ExecuTorch LLM extension and manages KV cache I/O with smart_mask-style cache updates.
+
+### Building on macOS
+
+The easiest way to build is using the provided build script:
+
+```bash
+cd examples/apple/coreml/llama/runner
+./build_and_run.sh --help   # Show options
+./build_and_run.sh          # Build and run with defaults
+```
+
+Or build manually from the executorch root directory using the macos preset:
+
+```bash
+cmake -S . -B cmake-out --preset macos
+cmake --build cmake-out --config Release --target run_static_llm_coreml -j$(sysctl -n hw.ncpu)
+```
+
+The executable will be at: `cmake-out/examples/apple/coreml/llama/runner/Release/run_static_llm_coreml`
+
+### Running
+
+```bash
+./cmake-out/examples/apple/coreml/llama/runner/Release/run_static_llm_coreml \
+  --model static_llm_coreml_model.pte \
+  --params /path/to/params.json \
+  --tokenizer /path/to/tokenizer.model \
+  --prompt "Once upon a time," \
+  --max_new_tokens 100 \
+  --temperature 0.0
+```
+
+Input sequence length and KV cache length are detected automatically from the exported model's metadata, so they do not need to be passed on the command line.
+
+### Command-line Options
+
+| Option | Description | Default |
+|--------|-------------|---------|
+| `--model` | Path to the .pte model file | (required) |
+| `--params` | Path to params.json | (required) |
+| `--tokenizer` | Path to tokenizer file | (required) |
+| `--prompt` | Input prompt | (required) |
+| `--max_new_tokens` | Maximum tokens to generate | 100 |
+| `--temperature` | Sampling temperature (0 = greedy) | 0.0 |
+
+### Features
+
+The C++ runner:
+- Extends `TextDecoderRunner` from `executorch/extension/llm/runner/`
+- Manages KV cache I/O with smart_mask-style cache updates
+- Supports multiple tokenizer formats (HuggingFace JSON, TikToken, SentencePiece, BPE)
+- Computes RoPE frequencies internally (Llama 3 style with base=500000)
+- Reads `rope_theta` from params.json; other configuration is auto-detected from the model's metadata
 
 # Deprecated (export.py, run.py, and run_lookahead.py)
diff --git a/examples/apple/coreml/llama/runner/CMakeLists.txt b/examples/apple/coreml/llama/runner/CMakeLists.txt
new file mode 100644
index 00000000000..7e0cd4dd4d1
--- /dev/null
+++ b/examples/apple/coreml/llama/runner/CMakeLists.txt
@@ -0,0 +1,172 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+#
+# CMake build system for static LLM runner for CoreML.
+#
+# ### Building ###
+#
+# From the executorch root directory:
+# ~~~
+# mkdir cmake-out && cd cmake-out
+# cmake .. \
+#   -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
+#   -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
+#   -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+#   -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+#   -DEXECUTORCH_BUILD_COREML=ON
+# cmake --build . -j --target run_static_llm_coreml
+# ~~~
+#
+# ### Running ###
+#
+# ~~~
+# ./examples/apple/coreml/llama/runner/run_static_llm_coreml \
+#   --model /path/to/model.pte \
+#   --params /path/to/params.json \
+#   --tokenizer /path/to/tokenizer.model \
+#   --prompt "Once upon a time," \
+#   --max_new_tokens 100
+# ~~~
+
+cmake_minimum_required(VERSION 3.19)
+project(static_llm_coreml_runner)
+
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..)
+endif()
+
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+
+# Source files
+set(static_llm_runner_srcs static_llm_runner.cpp)
+
+set(static_llm_main_srcs main.cpp)
+
+# Create the runner library
+if(CMAKE_TOOLCHAIN_IOS
+   OR ANDROID
+   OR APPLE
+)
+  add_library(static_llm_runner STATIC ${static_llm_runner_srcs})
+else()
+  add_library(static_llm_runner SHARED ${static_llm_runner_srcs})
+endif()
+
+# Check for required targets
+if(NOT TARGET extension_llm_runner)
+  message(
+    FATAL_ERROR
+      "ExecuTorch must be built with EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER enabled."
+ ) +endif() + +if(NOT TARGET extension_module) + message( + FATAL_ERROR + "ExecuTorch must be built with EXECUTORCH_BUILD_EXTENSION_MODULE enabled." + ) +endif() + +# Dependencies for the runner library +set(static_llm_runner_deps + executorch_core + extension_data_loader + extension_module + extension_tensor + extension_flat_tensor + extension_llm_runner + executorch_backends +) + +# Add CoreML delegate if available (required for running CoreML models) Note: +# coremldelegate is linked transitively through executorch build system when +# EXECUTORCH_BUILD_COREML is ON. We don't need to link it again here to avoid +# duplicate symbol errors. if(TARGET coremldelegate) list(APPEND +# static_llm_runner_deps coremldelegate) endif() + +target_link_libraries(static_llm_runner PUBLIC ${static_llm_runner_deps}) +target_link_libraries(static_llm_runner PUBLIC tokenizers::tokenizers) + +# Find nlohmann_json for params parsing - executorch already includes it in +# third-party +if(TARGET nlohmann_json) + target_link_libraries(static_llm_runner PUBLIC nlohmann_json::nlohmann_json) +elseif(TARGET nlohmann_json::nlohmann_json) + target_link_libraries(static_llm_runner PUBLIC nlohmann_json::nlohmann_json) +else() + find_package(nlohmann_json QUIET CONFIG) + if(nlohmann_json_FOUND) + target_link_libraries(static_llm_runner PUBLIC nlohmann_json::nlohmann_json) + else() + # The executorch third-party directory includes nlohmann_json + target_include_directories( + static_llm_runner PUBLIC ${EXECUTORCH_ROOT}/third-party/json/include + ) + endif() +endif() + +target_include_directories( + static_llm_runner + PUBLIC ${EXECUTORCH_ROOT} ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include + ${EXECUTORCH_ROOT}/.. +) + +# Create the executable +add_executable(run_static_llm_coreml ${static_llm_main_srcs}) + +target_link_libraries(run_static_llm_coreml PRIVATE static_llm_runner) + +# Find and link gflags - executorch builds gflags as a third-party dependency +if(TARGET gflags) + target_link_libraries(run_static_llm_coreml PRIVATE gflags) +elseif(TARGET gflags_nothreads_static) + target_link_libraries(run_static_llm_coreml PRIVATE gflags_nothreads_static) +else() + find_package(gflags QUIET) + if(gflags_FOUND) + target_link_libraries(run_static_llm_coreml PRIVATE gflags::gflags) + else() + # Try to find gflags via pkg-config + find_package(PkgConfig QUIET) + if(PkgConfig_FOUND) + pkg_check_modules(GFLAGS QUIET gflags) + if(GFLAGS_FOUND) + target_include_directories( + run_static_llm_coreml PRIVATE ${GFLAGS_INCLUDE_DIRS} + ) + target_link_libraries(run_static_llm_coreml PRIVATE ${GFLAGS_LIBRARIES}) + else() + message( + FATAL_ERROR + "gflags not found. Please install gflags or set GFLAGS_ROOT." + ) + endif() + else() + message( + FATAL_ERROR + "gflags not found. Please install gflags or set GFLAGS_ROOT." + ) + endif() + endif() +endif() + +target_include_directories( + run_static_llm_coreml PRIVATE ${EXECUTORCH_ROOT} ${EXECUTORCH_ROOT}/.. 
+) + +# Set C++ standard +set_target_properties( + static_llm_runner run_static_llm_coreml PROPERTIES CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON +) + +# Install targets +install(TARGETS run_static_llm_coreml DESTINATION bin) +install(TARGETS static_llm_runner DESTINATION lib) +install(FILES static_llm_runner.h + DESTINATION include/executorch/examples/apple/coreml/llama/runner +) diff --git a/examples/apple/coreml/llama/runner/build_and_run.sh b/examples/apple/coreml/llama/runner/build_and_run.sh new file mode 100755 index 00000000000..ca2c4588caa --- /dev/null +++ b/examples/apple/coreml/llama/runner/build_and_run.sh @@ -0,0 +1,248 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Build and run the static LLM C++ runner for CoreML +# +# Usage: +# ./build_and_run.sh [--rebuild] [--run-only] [--help] +# +# Arguments: +# --rebuild Force a clean rebuild +# --run-only Skip build, just run the executable +# --help Show this help message +# +# Environment variables (override defaults): +# MODEL_PATH Path to the .pte model file +# PARAMS_PATH Path to params.json +# TOKENIZER_PATH Path to tokenizer.model +# PROMPT Input prompt +# MAX_NEW_TOKENS Maximum tokens to generate +# INPUT_LEN Input sequence length +# CACHE_LEN KV cache length + +set -e + +# Get the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +EXECUTORCH_ROOT="$(cd "${SCRIPT_DIR}/../../../../.." && pwd)" +BUILD_DIR="${EXECUTORCH_ROOT}/cmake-out" + +# Default values (can be overridden via environment variables) +MODEL_PATH="${MODEL_PATH:-$HOME/Desktop/static_llama1b_coreml_model.pte}" +PARAMS_PATH="${PARAMS_PATH:-$HOME/models/llama1b/params.json}" +TOKENIZER_PATH="${TOKENIZER_PATH:-$HOME/models/llama1b/tokenizer.model}" +PROMPT="${PROMPT:-Once upon a time,}" +MAX_NEW_TOKENS="${MAX_NEW_TOKENS:-100}" +INPUT_LEN="${INPUT_LEN:-32}" +CACHE_LEN="${CACHE_LEN:-992}" +TEMPERATURE="${TEMPERATURE:-0.0}" + +# Lookahead decoding options +LOOKAHEAD="${LOOKAHEAD:-false}" +NGRAM_SIZE="${NGRAM_SIZE:-4}" +WINDOW_SIZE="${WINDOW_SIZE:-5}" +N_VERIFICATIONS="${N_VERIFICATIONS:-3}" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +print_usage() { + echo "Usage: $0 [--rebuild] [--run-only] [--help]" + echo "" + echo "Options:" + echo " --rebuild Force a clean rebuild" + echo " --run-only Skip build, just run the executable" + echo " --help Show this help message" + echo "" + echo "Environment variables:" + echo " MODEL_PATH Path to the .pte model file (default: \$HOME/Desktop/static_llama1b_coreml_model.pte)" + echo " PARAMS_PATH Path to params.json (default: \$HOME/models/llama1b/params.json)" + echo " TOKENIZER_PATH Path to tokenizer.model (default: \$HOME/models/llama1b/tokenizer.model)" + echo " PROMPT Input prompt (default: 'Once upon a time,')" + echo " MAX_NEW_TOKENS Maximum tokens to generate (default: 100)" + echo " INPUT_LEN Input sequence length (default: 32)" + echo " CACHE_LEN KV cache length (default: 992)" + echo " TEMPERATURE Sampling temperature (default: 0.0)" +} + +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Parse arguments +REBUILD=false +RUN_ONLY=false + +while [[ $# -gt 0 ]]; do + case $1 in + --rebuild) + REBUILD=true + shift + ;; 
+ --run-only) + RUN_ONLY=true + shift + ;; + --help) + print_usage + exit 0 + ;; + *) + log_error "Unknown option: $1" + print_usage + exit 1 + ;; + esac +done + +# Validate required files exist +validate_files() { + local missing=false + + if [[ ! -f "${MODEL_PATH}" ]]; then + log_error "Model file not found: ${MODEL_PATH}" + missing=true + fi + + if [[ ! -f "${PARAMS_PATH}" ]]; then + log_error "Params file not found: ${PARAMS_PATH}" + missing=true + fi + + if [[ ! -f "${TOKENIZER_PATH}" ]]; then + log_error "Tokenizer file not found: ${TOKENIZER_PATH}" + missing=true + fi + + if [[ "$missing" == "true" ]]; then + echo "" + log_info "You can set paths via environment variables:" + echo " export MODEL_PATH=/path/to/model.pte" + echo " export PARAMS_PATH=/path/to/params.json" + echo " export TOKENIZER_PATH=/path/to/tokenizer.model" + exit 1 + fi +} + +build_project() { + log_info "ExecutorTorch root: ${EXECUTORCH_ROOT}" + log_info "Build directory: ${BUILD_DIR}" + + # Clean build if requested + if [[ "$REBUILD" == "true" ]] && [[ -d "${BUILD_DIR}" ]]; then + log_info "Cleaning build directory..." + rm -rf "${BUILD_DIR}" + fi + + cd "${EXECUTORCH_ROOT}" + + # Configure CMake using macos preset (includes all necessary LLM extensions) + log_info "Configuring CMake with macos preset..." + cmake -S "${EXECUTORCH_ROOT}" \ + -B "${BUILD_DIR}" \ + -DCMAKE_BUILD_TYPE=Release \ + --preset macos + + # Build the target + log_info "Building run_static_llm_coreml..." + cmake --build "${BUILD_DIR}" \ + -j$(sysctl -n hw.ncpu) \ + --config Release \ + --target run_static_llm_coreml + + log_info "Build complete!" +} + +run_model() { + local executable="${BUILD_DIR}/examples/apple/coreml/llama/runner/Release/run_static_llm_coreml" + + # Also check non-Release location + if [[ ! -f "${executable}" ]]; then + executable="${BUILD_DIR}/examples/apple/coreml/llama/runner/run_static_llm_coreml" + fi + + # Check Debug location + if [[ ! -f "${executable}" ]]; then + executable="${BUILD_DIR}/examples/apple/coreml/llama/runner/Debug/run_static_llm_coreml" + fi + + if [[ ! -f "${executable}" ]]; then + log_error "Executable not found: ${executable}" + log_info "Run without --run-only to build first" + exit 1 + fi + + log_info "Running model..." + echo "" + echo "Configuration:" + echo " Model: ${MODEL_PATH}" + echo " Params: ${PARAMS_PATH}" + echo " Tokenizer: ${TOKENIZER_PATH}" + echo " Prompt: ${PROMPT}" + echo " Max tokens: ${MAX_NEW_TOKENS}" + echo " Input length: ${INPUT_LEN}" + echo " Cache length: ${CACHE_LEN}" + echo " Temperature: ${TEMPERATURE}" + if [[ "${LOOKAHEAD}" == "true" ]]; then + echo " Lookahead: enabled" + echo " ngram_size: ${NGRAM_SIZE}" + echo " window_size: ${WINDOW_SIZE}" + echo " n_verifications: ${N_VERIFICATIONS}" + fi + echo "" + echo "==========================================" + + # Build command with optional lookahead flags + local cmd=("${executable}" + --model "${MODEL_PATH}" + --params "${PARAMS_PATH}" + --tokenizer "${TOKENIZER_PATH}" + --prompt "${PROMPT}" + --max_new_tokens "${MAX_NEW_TOKENS}" + --temperature "${TEMPERATURE}") + + if [[ "${LOOKAHEAD}" == "true" ]]; then + cmd+=(--lookahead + --ngram_size "${NGRAM_SIZE}" + --window_size "${WINDOW_SIZE}" + --n_verifications "${N_VERIFICATIONS}") + fi + + "${cmd[@]}" + + echo "==========================================" + log_info "Done!" 
+} + +# Main execution +main() { + echo "" + log_info "Static LLM CoreML Runner - Build & Test Script" + echo "" + + # Validate files before running + validate_files + + if [[ "$RUN_ONLY" == "false" ]]; then + build_project + fi + + run_model +} + +main diff --git a/examples/apple/coreml/llama/runner/main.cpp b/examples/apple/coreml/llama/runner/main.cpp new file mode 100644 index 00000000000..0a96f0b4fc5 --- /dev/null +++ b/examples/apple/coreml/llama/runner/main.cpp @@ -0,0 +1,127 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Main executable for running static attention LLM models. +// +// Usage: +// ./run_static_llm \ +// --model /path/to/model.pte \ +// --params /path/to/params.json \ +// --tokenizer /path/to/tokenizer.model \ +// --prompt "Once upon a time," \ +// --max_new_tokens 100 +// +// With lookahead decoding: +// ./run_static_llm \ +// --model /path/to/model.pte \ +// --params /path/to/params.json \ +// --tokenizer /path/to/tokenizer.model \ +// --prompt "Once upon a time," \ +// --max_new_tokens 100 \ +// --lookahead \ +// --ngram_size 4 \ +// --window_size 5 \ +// --n_verifications 3 + +#include +#include + +#include +#include +#include + +#include + +DEFINE_string(model, "", "Path to the .pte model file (required)"); +DEFINE_string(params, "", "Path to params.json file (optional, for rope_theta)"); +DEFINE_string(tokenizer, "", "Path to tokenizer model file (required)"); +DEFINE_string(prompt, "Once upon a time,", "Input prompt"); +DEFINE_int32(max_new_tokens, 100, "Maximum number of tokens to generate"); +DEFINE_double(temperature, 0.0, "Sampling temperature (0 = greedy)"); + +// Lookahead decoding options +DEFINE_bool(lookahead, false, "Enable lookahead (speculative) decoding"); +DEFINE_int32(ngram_size, 4, "N-gram size for lookahead decoding"); +DEFINE_int32(window_size, 5, "Window size for lookahead decoding"); +DEFINE_int32(n_verifications, 3, "Number of verification branches for lookahead decoding"); + +int main(int argc, char* argv[]) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + + // Validate required arguments + if (FLAGS_model.empty()) { + std::cerr << "Error: --model is required" << std::endl; + return 1; + } + if (FLAGS_params.empty()) { + std::cerr << "Error: --params is required" << std::endl; + return 1; + } + if (FLAGS_tokenizer.empty()) { + std::cerr << "Error: --tokenizer is required" << std::endl; + return 1; + } + + // Initialize runtime + executorch::runtime::runtime_init(); + + // Create runner (config is auto-detected from model metadata) + auto runner = example::create_static_llm_runner( + FLAGS_model, + FLAGS_tokenizer, + FLAGS_params); + + if (!runner) { + std::cerr << "Error: Failed to create runner" << std::endl; + return 1; + } + + // Load model + auto load_err = runner->load(); + if (load_err != executorch::runtime::Error::Ok) { + std::cerr << "Error: Failed to load model" << std::endl; + return 1; + } + + // Print prompt + std::cout << "\n" << FLAGS_prompt << std::flush; + + // Generate + executorch::runtime::Error gen_err; + + if (FLAGS_lookahead) { + // Use lookahead decoding + example::LookaheadConfig lookahead_config; + lookahead_config.enabled = true; + lookahead_config.ngram_size = static_cast(FLAGS_ngram_size); + lookahead_config.window_size = static_cast(FLAGS_window_size); + lookahead_config.n_verifications = 
static_cast(FLAGS_n_verifications); + + gen_err = runner->generate_with_lookahead( + FLAGS_prompt, + FLAGS_max_new_tokens, + lookahead_config, + [](const std::string& token) { std::cout << token << std::flush; }); + } else { + // Use standard decoding + gen_err = runner->generate( + FLAGS_prompt, + FLAGS_max_new_tokens, + static_cast(FLAGS_temperature), + [](const std::string& token) { std::cout << token << std::flush; }); + } + + if (gen_err != executorch::runtime::Error::Ok) { + std::cerr << "\nError: Generation failed" << std::endl; + return 1; + } + + std::cout << "\n" << std::endl; + + return 0; +} diff --git a/examples/apple/coreml/llama/runner/static_llm_runner.cpp b/examples/apple/coreml/llama/runner/static_llm_runner.cpp new file mode 100644 index 00000000000..d02c3529617 --- /dev/null +++ b/examples/apple/coreml/llama/runner/static_llm_runner.cpp @@ -0,0 +1,844 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace example { + +using namespace ::executorch::extension; +using namespace ::executorch::runtime; + +namespace { + +std::unique_ptr<::tokenizers::Tokenizer> load_tokenizer( + const std::string& tokenizer_path) { + auto hf_tokenizer = std::make_unique<::tokenizers::HFTokenizer>(); + if (hf_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded HuggingFace tokenizer"); + return hf_tokenizer; + } + + auto tiktoken = std::make_unique<::tokenizers::Tiktoken>(); + if (tiktoken->load(tokenizer_path) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded TikToken tokenizer"); + return tiktoken; + } + + auto sp_tokenizer = std::make_unique<::tokenizers::SPTokenizer>(); + if (sp_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded SentencePiece tokenizer"); + return sp_tokenizer; + } + + auto bpe_tokenizer = std::make_unique<::tokenizers::Llama2cTokenizer>(); + if (bpe_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded BPE tokenizer"); + return bpe_tokenizer; + } + + ET_LOG(Error, "Failed to load tokenizer from %s", tokenizer_path.c_str()); + return nullptr; +} + +} // namespace + +// ============================================================================ +// StaticLLMIOManager implementation +// ============================================================================ + +StaticLLMIOManager::StaticLLMIOManager( + Module& module, + const StaticLLMConfig& config) + : IOManager(module), module_(module), config_(config) { + compute_rope_frequencies(); + + input_buffer_.resize(config_.input_len, 0); + + size_t logits_size = config_.input_len * config_.vocab_size; + logits_output_.resize(logits_size, static_cast(0)); + + size_t update_size = + config_.n_kv_heads * config_.input_len * config_.head_dim; + k_update_buffers_.resize(config_.n_layers); + v_update_buffers_.resize(config_.n_layers); + for (size_t i = 0; i < config_.n_layers; i++) { + k_update_buffers_[i].resize(update_size, static_cast(0)); + v_update_buffers_[i].resize(update_size, static_cast(0)); + } +} + +void StaticLLMIOManager::compute_rope_frequencies() { + size_t max_seq_len = config_.cache_len + config_.input_len; + size_t rope_dim = config_.head_dim / 2; + + freqs_cos_.resize(max_seq_len * rope_dim); + freqs_sin_.resize(max_seq_len 
* rope_dim); + + for (size_t pos = 0; pos < max_seq_len; pos++) { + for (size_t i = 0; i < rope_dim; i++) { + float freq = + 1.0f / + std::pow( + config_.rope_base, static_cast(2 * i) / config_.head_dim); + float angle = static_cast(pos) * freq; + freqs_cos_[pos * rope_dim + i] = static_cast(std::cos(angle)); + freqs_sin_[pos * rope_dim + i] = static_cast(std::sin(angle)); + } + } +} + +Error StaticLLMIOManager::load( + const std::string& prefill_method, + const std::string& decode_method) { + (void)prefill_method; + (void)decode_method; + + // Build input/output indices for StaticAttentionIOManager + std::vector k_cache_input_indices(config_.n_layers); + std::vector k_cache_output_indices(config_.n_layers); + std::vector v_cache_input_indices(config_.n_layers); + std::vector v_cache_output_indices(config_.n_layers); + + for (size_t i = 0; i < config_.n_layers; i++) { + k_cache_input_indices[i] = 4 + i; + k_cache_output_indices[i] = 1 + i; + v_cache_input_indices[i] = 4 + config_.n_layers + i; + v_cache_output_indices[i] = 1 + config_.n_layers + i; + } + + typename StaticAttentionIOManager::StaticAttentionIOConfig + io_config; + io_config.n_caches = config_.n_layers; + io_config.cache_lengths = + std::vector(config_.n_layers, config_.cache_len); + io_config.head_dim = config_.head_dim; + io_config.max_input_len = config_.input_len; + io_config.n_heads_per_cache = config_.n_kv_heads; + io_config.cache_len_to_mask_idx = {{config_.cache_len, 1}}; + io_config.rope_freqs_cos_input_index = 2; + io_config.rope_freqs_sin_input_index = 3; + io_config.k_cache_input_indices = k_cache_input_indices; + io_config.k_cache_output_indices = k_cache_output_indices; + io_config.v_cache_input_indices = v_cache_input_indices; + io_config.v_cache_output_indices = v_cache_output_indices; + io_config.max_context_len = config_.cache_len + config_.input_len; + io_config.rope_freqs_cos = freqs_cos_.data(); + io_config.rope_freqs_sin = freqs_sin_.data(); + io_config.style = StaticAttentionUpdateStyle::SMART_MASK; + io_config.generate_full_logits = config_.generate_full_logits; + io_config.last_valid_token_pos_index = std::nullopt; + + static_io_manager_ = + std::make_unique>( + std::move(io_config)); + + MaskT zero_val = static_cast(0.0f); + MaskT mask_val = static_cast(-65504.0f); + static_io_manager_->add_mask(config_.input_len, zero_val, mask_val); + + return Error::Ok; +} + +Error StaticLLMIOManager::reset( + const std::string& prefill_method, + const std::string& decode_method) { + (void)prefill_method; + (void)decode_method; + if (static_io_manager_) { + static_io_manager_->reset(); + } + output_buffers_set_ = false; + return Error::Ok; +} + +void StaticLLMIOManager::setup_output_buffers(Method& method) { + if (output_buffers_set_) { + return; + } + + auto method_meta = method.method_meta(); + + auto logits_meta = method_meta.output_tensor_meta(0); + ET_CHECK_MSG(logits_meta.ok(), "Failed to get logits output meta"); + ET_CHECK( + method.set_output_data_ptr( + logits_output_.data(), logits_meta->nbytes(), 0) == Error::Ok); + + for (size_t i = 0; i < config_.n_layers; i++) { + auto k_out_meta = method_meta.output_tensor_meta(1 + i); + ET_CHECK_MSG( + k_out_meta.ok(), "Failed to get k_cache output meta for layer %zu", i); + ET_CHECK( + method.set_output_data_ptr( + k_update_buffers_[i].data(), k_out_meta->nbytes(), 1 + i) == + Error::Ok); + } + + for (size_t i = 0; i < config_.n_layers; i++) { + auto v_out_meta = method_meta.output_tensor_meta(1 + config_.n_layers + i); + ET_CHECK_MSG( + v_out_meta.ok(), "Failed 
to get v_cache output meta for layer %zu", i); + ET_CHECK( + method.set_output_data_ptr( + v_update_buffers_[i].data(), + v_out_meta->nbytes(), + 1 + config_.n_layers + i) == Error::Ok); + } + + output_buffers_set_ = true; +} + +Result> StaticLLMIOManager::prepare_prefill( + const TensorPtr& input, + const TensorPtr& start_pos, + const std::string& prefill_method) { + (void)start_pos; + (void)prefill_method; + + // Copy tokens to input buffer + const int64_t* input_data = input->const_data_ptr(); + actual_input_len_ = input->numel(); + for (size_t i = 0; i < config_.input_len; i++) { + input_buffer_[i] = (i < actual_input_len_) + ? static_cast(input_data[i]) + : 0; + } + + // Return empty - inputs are set via Method::set_input by StaticAttentionIOManager + return std::vector{}; +} + +Result> StaticLLMIOManager::prepare_decode( + const TensorPtr& input, + const TensorPtr& start_pos, + const std::string& decode_method) { + (void)start_pos; + (void)decode_method; + + const int64_t* input_data = input->const_data_ptr(); + actual_input_len_ = 1; + input_buffer_[0] = static_cast(input_data[0]); + for (size_t i = 1; i < config_.input_len; i++) { + input_buffer_[i] = 0; + } + + return std::vector{}; +} + +Error StaticLLMIOManager::update_prefill( + const std::vector& model_outputs, + const std::string& prefill_method) { + (void)model_outputs; + (void)prefill_method; + // KV cache update is handled by StaticAttentionIOManager + return Error::Ok; +} + +Error StaticLLMIOManager::update_decode( + const std::vector& model_outputs, + const std::string& decode_method) { + (void)model_outputs; + (void)decode_method; + return Error::Ok; +} + +// ============================================================================ +// StaticLLMTextDecoderRunner implementation +// ============================================================================ + +StaticLLMTextDecoderRunner::StaticLLMTextDecoderRunner( + Module* module, + StaticLLMIOManager* io_manager) + : TextDecoderRunner(module, io_manager), static_io_manager_(io_manager) {} + +Error StaticLLMTextDecoderRunner::load() { + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward")); + ET_CHECK_OK_OR_RETURN_ERROR(io_manager_->load()); + return Error::Ok; +} + +Result StaticLLMTextDecoderRunner::step( + TensorPtr& input, + int64_t start_pos) { + (void)start_pos; + + auto method_result = module_->method("forward"); + if (!method_result.ok()) { + return method_result.error(); + } + Method* method = method_result.get(); + + // Set up output buffers for CoreML + static_io_manager_->setup_output_buffers(*method); + + // Get the underlying StaticAttentionIOManager + auto* static_io = static_io_manager_->get_static_io_manager(); + + // Set token input + auto input_meta = method->method_meta().input_tensor_meta(0); + ET_CHECK_MSG(input_meta.ok(), "Failed to get input tensor meta"); + + const int64_t* input_data = input->const_data_ptr(); + std::vector tokens( + static_io_manager_->config().input_len, 0); + tokens[0] = static_cast(input_data[0]); + + auto input_impl = ::executorch::runtime::etensor::TensorImpl( + input_meta->scalar_type(), + input_meta->sizes().size(), + const_cast( + input_meta->sizes().data()), + tokens.data(), + const_cast( + input_meta->dim_order().data())); + executorch::runtime::etensor::Tensor input_tensor(&input_impl); + ET_CHECK(method->set_input(input_tensor, 0) == Error::Ok); + + // Set up mask and RoPE via StaticAttentionIOManager + auto& masks = static_io->get_mask(static_io_manager_->config().input_len); + for (auto& pair : masks) 
{ + auto& mask = *pair.second; + mask.set_causal_mask(); + + auto mask_meta = method->method_meta().input_tensor_meta(1); + ET_CHECK_MSG(mask_meta.ok(), "Failed to get mask tensor meta"); + auto mask_impl = ::executorch::runtime::etensor::TensorImpl( + mask_meta->scalar_type(), + mask_meta->sizes().size(), + const_cast( + mask_meta->sizes().data()), + mask.get(), + const_cast( + mask_meta->dim_order().data())); + executorch::runtime::etensor::Tensor mask_tensor(&mask_impl); + ET_CHECK(method->set_input(mask_tensor, 1) == Error::Ok); + } + + static_io->prepare(*method); + + auto exec_result = method->execute(); + if (exec_result != Error::Ok) { + return exec_result; + } + + // Update KV caches + const auto& config = static_io_manager_->config(); + std::vector k_out_indices(config.n_layers); + std::vector v_out_indices(config.n_layers); + for (size_t i = 0; i < config.n_layers; i++) { + k_out_indices[i] = 1 + i; + v_out_indices[i] = 1 + config.n_layers + i; + } + static_io->update(*method, k_out_indices, v_out_indices, 1); + + return method->get_output(0).toTensor(); +} + +// ============================================================================ +// StaticLLMRunner implementation +// ============================================================================ + +StaticLLMRunner::StaticLLMRunner( + const std::string& model_path, + const std::string& tokenizer_path, + const StaticLLMConfig& config) + : model_path_(model_path), + tokenizer_path_(tokenizer_path), + config_(config) { + runtime_init(); +} + +Error StaticLLMRunner::load() { + if (is_loaded()) { + return Error::Ok; + } + + stats_.model_load_start_ms = llm::time_in_ms(); + + ET_LOG(Info, "Loading model from %s", model_path_.c_str()); + module_ = std::make_unique(model_path_, Module::LoadMode::File); + + ET_LOG(Info, "Loading tokenizer from %s", tokenizer_path_.c_str()); + tokenizer_ = load_tokenizer(tokenizer_path_); + if (!tokenizer_) { + return Error::InvalidArgument; + } + + eos_ids_.insert(tokenizer_->eos_tok()); + eos_ids_.insert(128001); + eos_ids_.insert(128009); + + io_manager_ = std::make_unique(*module_, config_); + decoder_runner_ = + std::make_unique(module_.get(), io_manager_.get()); + + ET_CHECK_OK_OR_RETURN_ERROR(decoder_runner_->load()); + + stats_.model_load_end_ms = llm::time_in_ms(); + ET_LOG( + Info, + "Model loaded in %.2f seconds", + (stats_.model_load_end_ms - stats_.model_load_start_ms) / 1000.0); + + return Error::Ok; +} + +bool StaticLLMRunner::is_loaded() const { + return decoder_runner_ && decoder_runner_->is_method_loaded(); +} + +void StaticLLMRunner::reset() { + if (io_manager_) { + io_manager_->reset("forward", "forward"); + } + stats_.reset(); +} + +StaticLLMRunner::TokenT StaticLLMRunner::sample_token( + Method& method, + size_t pos) { + auto logits_tensor = method.get_output(0).toTensor(); + size_t vocab_size = logits_tensor.size(logits_tensor.dim() - 1); + const LogitT* logits_data = logits_tensor.const_data_ptr(); + + size_t offset = pos * vocab_size; + + LogitT max_val = logits_data[offset]; + TokenT max_idx = 0; + for (size_t i = 1; i < vocab_size; i++) { + if (logits_data[offset + i] > max_val) { + max_val = logits_data[offset + i]; + max_idx = static_cast(i); + } + } + return max_idx; +} + +std::vector StaticLLMRunner::sample_all_tokens( + Method& method) { + auto logits_tensor = method.get_output(0).toTensor(); + size_t vocab_size = logits_tensor.size(logits_tensor.dim() - 1); + size_t seq_len = logits_tensor.size(logits_tensor.dim() - 2); + const LogitT* logits_data = 
logits_tensor.const_data_ptr(); + + std::vector tokens(seq_len); + for (size_t pos = 0; pos < seq_len; pos++) { + size_t offset = pos * vocab_size; + LogitT max_val = logits_data[offset]; + TokenT max_idx = 0; + for (size_t i = 1; i < vocab_size; i++) { + if (logits_data[offset + i] > max_val) { + max_val = logits_data[offset + i]; + max_idx = static_cast(i); + } + } + tokens[pos] = max_idx; + } + return tokens; +} + +Error StaticLLMRunner::generate( + const std::string& prompt, + int32_t max_new_tokens, + float temperature, + std::function token_callback) { + (void)temperature; + + if (!is_loaded()) { + ET_CHECK_OK_OR_RETURN_ERROR(load()); + } + + reset(); + stats_.inference_start_ms = llm::time_in_ms(); + + auto encode_result = tokenizer_->encode(prompt, 1, 0); + if (!encode_result.ok()) { + return Error::InvalidArgument; + } + + std::vector prompt_tokens_u64 = encode_result.get(); + std::vector prompt_tokens( + prompt_tokens_u64.begin(), prompt_tokens_u64.end()); + size_t num_prompt_tokens = prompt_tokens.size(); + + ET_LOG(Info, "Prompt: %s", prompt.c_str()); + ET_LOG(Info, "Prompt tokens: %zu", num_prompt_tokens); + + auto method_result = module_->method("forward"); + if (!method_result.ok()) { + return method_result.error(); + } + Method* method = method_result.get(); + + io_manager_->setup_output_buffers(*method); + + auto* static_io = io_manager_->get_static_io_manager(); + std::vector input_buffer(config_.input_len, 0); + + Span prompt_span(prompt_tokens.data(), prompt_tokens.size()); + Span input_span(input_buffer.data(), input_buffer.size()); + + auto input_meta = method->method_meta().input_tensor_meta(0); + ET_CHECK_MSG(input_meta.ok(), "Failed to get input tensor meta"); + auto input_impl = ::executorch::runtime::etensor::TensorImpl( + input_meta->scalar_type(), + input_meta->sizes().size(), + const_cast( + input_meta->sizes().data()), + input_buffer.data(), + const_cast( + input_meta->dim_order().data())); + executorch::runtime::etensor::Tensor input_tensor(&input_impl); + ET_CHECK(method->set_input(input_tensor, 0) == Error::Ok); + + size_t last_logit_pos = static_io->prefill( + prompt_span, input_span, *method, nullptr); + + TokenT cur_token = sample_token(*method, last_logit_pos); + + stats_.first_token_ms = llm::time_in_ms(); + stats_.prompt_eval_end_ms = llm::time_in_ms(); + + auto decode_result = tokenizer_->decode(cur_token, cur_token); + if (decode_result.ok() && token_callback) { + token_callback(*decode_result); + } + + TokenT prev_token = cur_token; + std::function sample_fn = [this](Method& m) -> TokenT { + return sample_token(m, 0); + }; + + int32_t num_generated = 1; + std::function token_cb = [&](TokenT tok) -> bool { + num_generated++; + if (num_generated > max_new_tokens) { + return false; + } + if (eos_ids_.find(tok) != eos_ids_.end()) { + return false; + } + auto decode = tokenizer_->decode(prev_token, tok); + if (decode.ok() && token_callback) { + token_callback(*decode); + } + prev_token = tok; + return true; + }; + + static_io->decode(prev_token, input_span, *method, sample_fn, token_cb); + + stats_.inference_end_ms = llm::time_in_ms(); + stats_.num_prompt_tokens = num_prompt_tokens; + stats_.num_generated_tokens = num_generated; + + double prefill_time_s = + (stats_.first_token_ms - stats_.inference_start_ms) / 1000.0; + double decode_time_s = + (stats_.inference_end_ms - stats_.first_token_ms) / 1000.0; + double tokens_per_sec = + decode_time_s > 0 ? 
(num_generated - 1) / decode_time_s : 0; + + ET_LOG( + Info, + "\nPrefill: %zu tokens in %.2f s", + num_prompt_tokens, + prefill_time_s); + ET_LOG( + Info, + "Decode: %d tokens in %.2f s (%.2f tok/s)", + num_generated, + decode_time_s, + tokens_per_sec); + + return Error::Ok; +} + +Error StaticLLMRunner::generate_with_lookahead( + const std::string& prompt, + int32_t max_new_tokens, + const LookaheadConfig& lookahead_config, + std::function token_callback) { + if (!config_.generate_full_logits) { + ET_LOG( + Error, + "Lookahead decoding requires generate_full_logits=true, but model " + "outputs only last token logits"); + return Error::InvalidArgument; + } + + if (!is_loaded()) { + ET_CHECK_OK_OR_RETURN_ERROR(load()); + } + + reset(); + stats_.inference_start_ms = llm::time_in_ms(); + + size_t ngram_size = lookahead_config.ngram_size; + size_t window_size = lookahead_config.window_size; + size_t n_verifications = lookahead_config.n_verifications; + + ET_LOG( + Info, + "Using lookahead decoding: ngram=%zu, window=%zu, verifications=%zu", + ngram_size, + window_size, + n_verifications); + + auto encode_result = tokenizer_->encode(prompt, 1, 0); + if (!encode_result.ok()) { + return Error::InvalidArgument; + } + + std::vector prompt_tokens_u64 = encode_result.get(); + std::vector prompt_tokens( + prompt_tokens_u64.begin(), prompt_tokens_u64.end()); + size_t num_prompt_tokens = prompt_tokens.size(); + + ET_LOG(Info, "Prompt: %s", prompt.c_str()); + ET_LOG(Info, "Prompt tokens: %zu", num_prompt_tokens); + + auto method_result = module_->method("forward"); + if (!method_result.ok()) { + return method_result.error(); + } + Method* method = method_result.get(); + + io_manager_->setup_output_buffers(*method); + + auto* static_io = io_manager_->get_static_io_manager(); + std::vector input_buffer(config_.input_len, 0); + + Span prompt_span(prompt_tokens.data(), prompt_tokens.size()); + Span input_span(input_buffer.data(), input_buffer.size()); + + auto input_meta = method->method_meta().input_tensor_meta(0); + ET_CHECK_MSG(input_meta.ok(), "Failed to get input tensor meta"); + auto input_impl = ::executorch::runtime::etensor::TensorImpl( + input_meta->scalar_type(), + input_meta->sizes().size(), + const_cast( + input_meta->sizes().data()), + input_buffer.data(), + const_cast( + input_meta->dim_order().data())); + executorch::runtime::etensor::Tensor input_tensor(&input_impl); + ET_CHECK(method->set_input(input_tensor, 0) == Error::Ok); + + size_t last_logit_pos = static_io->prefill( + prompt_span, input_span, *method, nullptr); + + TokenT cur_token = sample_token(*method, last_logit_pos); + + stats_.first_token_ms = llm::time_in_ms(); + stats_.prompt_eval_end_ms = llm::time_in_ms(); + + auto decode_result = tokenizer_->decode(cur_token, cur_token); + if (decode_result.ok() && token_callback) { + token_callback(*decode_result); + } + + std::unordered_map> suffix_caches; + SuffixCache::seed_suffix_caches( + suffix_caches, prompt_span, ngram_size, n_verifications); + + TokenT prev_token = cur_token; + std::function(Method&)> sample_all_fn = + [this](Method& m) -> std::vector { return sample_all_tokens(m); }; + + int32_t num_generated = 1; + std::function token_cb = [&](TokenT tok) -> bool { + num_generated++; + if (num_generated > max_new_tokens) { + return false; + } + if (eos_ids_.find(tok) != eos_ids_.end()) { + return false; + } + auto decode = tokenizer_->decode(prev_token, tok); + if (decode.ok() && token_callback) { + token_callback(*decode); + } + prev_token = tok; + return true; + }; + + 
static_io->lookahead_decode( + prev_token, + input_span, + *method, + sample_all_fn, + token_cb, + ngram_size, + window_size, + n_verifications, + std::move(suffix_caches)); + + stats_.inference_end_ms = llm::time_in_ms(); + stats_.num_prompt_tokens = num_prompt_tokens; + stats_.num_generated_tokens = num_generated; + + double prefill_time_s = + (stats_.first_token_ms - stats_.inference_start_ms) / 1000.0; + double decode_time_s = + (stats_.inference_end_ms - stats_.first_token_ms) / 1000.0; + double tokens_per_sec = + decode_time_s > 0 ? (num_generated - 1) / decode_time_s : 0; + + ET_LOG( + Info, + "\nPrefill: %zu tokens in %.2f s", + num_prompt_tokens, + prefill_time_s); + ET_LOG( + Info, + "Decode: %d tokens in %.2f s (%.2f tok/s)", + num_generated, + decode_time_s, + tokens_per_sec); + + return Error::Ok; +} + +std::unique_ptr create_static_llm_runner( + const std::string& model_path, + const std::string& tokenizer_path, + const std::string& params_path) { + ET_LOG(Info, "Creating static LLM runner with model: %s", model_path.c_str()); + + // Load model to extract metadata + Module module(model_path, Module::LoadMode::File); + ET_LOG(Info, "Module created, loading method 'forward'..."); + + auto load_result = module.load_method("forward"); + if (load_result != Error::Ok) { + ET_LOG( + Error, + "Failed to load model method 'forward' from %s (error code: %d)", + model_path.c_str(), + static_cast(load_result)); + return nullptr; + } + ET_LOG(Info, "Method 'forward' loaded successfully"); + + auto method_meta = module.method_meta("forward"); + if (!method_meta.ok()) { + ET_LOG(Error, "Failed to get method metadata"); + return nullptr; + } + + // Extract input_len and cache_len from mask tensor shape + // Mask shape is (1, input_len, cache_len + input_len) + auto mask_meta = method_meta->input_tensor_meta(1); + if (!mask_meta.ok()) { + ET_LOG(Error, "Failed to get mask tensor metadata"); + return nullptr; + } + + auto mask_sizes = mask_meta->sizes(); + if (mask_sizes.size() != 3) { + ET_LOG( + Error, + "Expected mask tensor to have 3 dimensions, got %zu", + mask_sizes.size()); + return nullptr; + } + + size_t input_len = mask_sizes[1]; + size_t total_len = mask_sizes[2]; + size_t cache_len = total_len - input_len; + + // Extract n_layers from number of k_cache inputs + // Inputs: tokens(0), mask(1), freqs_cos(2), freqs_sin(3), k_caches..., v_caches... 
+ size_t num_inputs = method_meta->num_inputs(); + size_t n_layers = (num_inputs - 4) / 2; + + // Extract n_kv_heads and head_dim from k_cache shape + // k_cache shape is (1, n_kv_heads, cache_len, head_dim) + auto k_cache_meta = method_meta->input_tensor_meta(4); + if (!k_cache_meta.ok()) { + ET_LOG(Error, "Failed to get k_cache tensor metadata"); + return nullptr; + } + + auto k_cache_sizes = k_cache_meta->sizes(); + if (k_cache_sizes.size() != 4) { + ET_LOG( + Error, + "Expected k_cache tensor to have 4 dimensions, got %zu", + k_cache_sizes.size()); + return nullptr; + } + + size_t n_kv_heads = k_cache_sizes[1]; + size_t head_dim = k_cache_sizes[3]; + + // Extract vocab_size and generate_full_logits from logits output shape + // Full logits shape: (1, input_len, vocab_size) + // Last token only: (1, 1, vocab_size) + auto logits_meta = method_meta->output_tensor_meta(0); + if (!logits_meta.ok()) { + ET_LOG(Error, "Failed to get logits tensor metadata"); + return nullptr; + } + + auto logits_sizes = logits_meta->sizes(); + size_t vocab_size = logits_sizes[logits_sizes.size() - 1]; + size_t logits_seq_len = logits_sizes[logits_sizes.size() - 2]; + bool generate_full_logits = (logits_seq_len == input_len); + + // Read params.json for any additional config (rope_base, etc.) + float rope_base = 500000.0f; + std::ifstream params_file(params_path); + if (params_file.is_open()) { + try { + nlohmann::json params; + params_file >> params; + rope_base = params.value("rope_theta", 500000.0f); + } catch (const std::exception& e) { + ET_LOG(Info, "Could not parse params.json, using defaults: %s", e.what()); + } + } + + StaticLLMConfig config; + config.n_layers = n_layers; + config.n_kv_heads = n_kv_heads; + config.head_dim = head_dim; + config.vocab_size = vocab_size; + config.input_len = input_len; + config.cache_len = cache_len; + config.generate_full_logits = generate_full_logits; + config.rope_base = rope_base; + + ET_LOG( + Info, + "Config from model metadata: n_layers=%zu, n_kv_heads=%zu, head_dim=%zu, " + "input_len=%zu, cache_len=%zu, vocab_size=%zu, full_logits=%s", + config.n_layers, + config.n_kv_heads, + config.head_dim, + config.input_len, + config.cache_len, + config.vocab_size, + config.generate_full_logits ? "true" : "false"); + + return std::make_unique(model_path, tokenizer_path, config); +} + +} // namespace example diff --git a/examples/apple/coreml/llama/runner/static_llm_runner.h b/examples/apple/coreml/llama/runner/static_llm_runner.h new file mode 100644 index 00000000000..e3c9064e47d --- /dev/null +++ b/examples/apple/coreml/llama/runner/static_llm_runner.h @@ -0,0 +1,236 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// C++ runner for static attention LLM models exported with +// export_static_llm_coreml.py. Subclasses TextDecoderRunner to maintain +// the standard LLM runner interface. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace example { + +/** + * Configuration for the static LLM model. 
+ */ +struct StaticLLMConfig { + size_t n_layers = 16; + size_t n_kv_heads = 8; + size_t head_dim = 64; + size_t input_len = 32; + size_t cache_len = 992; + size_t vocab_size = 128256; + bool generate_full_logits = true; + float rope_base = 500000.0f; +}; + +/** + * Configuration for lookahead (speculative) decoding. + */ +struct LookaheadConfig { + bool enabled = false; + size_t ngram_size = 4; + size_t window_size = 5; + size_t n_verifications = 3; +}; + +/** + * IOManager adapter that wraps StaticAttentionIOManager to implement + * the executorch::extension::llm::IOManager interface. + * + * This bridges the gap between TextDecoderRunner's expected interface + * and the static attention model's I/O requirements. + */ +class StaticLLMIOManager : public ::executorch::extension::llm::IOManager { + public: + using CacheT = __fp16; + using MaskT = __fp16; + using RopeT = __fp16; + using LogitT = __fp16; + using TokenT = int32_t; + + StaticLLMIOManager( + ::executorch::extension::Module& module, + const StaticLLMConfig& config); + + ~StaticLLMIOManager() override = default; + + ::executorch::runtime::Error load( + const std::string& prefill_method, + const std::string& decode_method) override; + + ::executorch::runtime::Error reset( + const std::string& prefill_method, + const std::string& decode_method) override; + + ::executorch::runtime::Result> + prepare_prefill( + const ::executorch::extension::TensorPtr& input, + const ::executorch::extension::TensorPtr& start_pos, + const std::string& prefill_method) override; + + ::executorch::runtime::Result> + prepare_decode( + const ::executorch::extension::TensorPtr& input, + const ::executorch::extension::TensorPtr& start_pos, + const std::string& decode_method) override; + + ::executorch::runtime::Error update_prefill( + const std::vector<::executorch::runtime::EValue>& model_outputs, + const std::string& prefill_method) override; + + ::executorch::runtime::Error update_decode( + const std::vector<::executorch::runtime::EValue>& model_outputs, + const std::string& decode_method) override; + + /** + * Get the underlying StaticAttentionIOManager for advanced operations. + */ + StaticAttentionIOManager* get_static_io_manager() { + return static_io_manager_.get(); + } + + const StaticLLMConfig& config() const { + return config_; + } + + /** + * Set up CoreML output buffers on the Method. + * Must be called before first inference. + */ + void setup_output_buffers(::executorch::runtime::Method& method); + + private: + void compute_rope_frequencies(); + + ::executorch::extension::Module& module_; + StaticLLMConfig config_; + + std::vector freqs_cos_; + std::vector freqs_sin_; + std::vector input_buffer_; + std::vector logits_output_; + std::vector> k_update_buffers_; + std::vector> v_update_buffers_; + + std::unique_ptr> + static_io_manager_; + + size_t actual_input_len_ = 0; + bool output_buffers_set_ = false; +}; + +/** + * TextDecoderRunner subclass for static attention LLM models. + * + * Overrides step() to use StaticAttentionIOManager's prepare/execute flow. 
+ */ +class StaticLLMTextDecoderRunner + : public ::executorch::extension::llm::TextDecoderRunner { + public: + StaticLLMTextDecoderRunner( + ::executorch::extension::Module* module, + StaticLLMIOManager* io_manager); + + ~StaticLLMTextDecoderRunner() override = default; + + ::executorch::runtime::Result<::executorch::aten::Tensor> step( + ::executorch::extension::TensorPtr& input, + int64_t start_pos) override; + + ::executorch::runtime::Error load() override; + + private: + StaticLLMIOManager* static_io_manager_; +}; + +/** + * Main runner class that orchestrates text generation. + */ +class StaticLLMRunner { + public: + StaticLLMRunner( + const std::string& model_path, + const std::string& tokenizer_path, + const StaticLLMConfig& config); + + ~StaticLLMRunner() = default; + + ::executorch::runtime::Error load(); + bool is_loaded() const; + void reset(); + + ::executorch::runtime::Error generate( + const std::string& prompt, + int32_t max_new_tokens, + float temperature = 0.0f, + std::function token_callback = nullptr); + + ::executorch::runtime::Error generate_with_lookahead( + const std::string& prompt, + int32_t max_new_tokens, + const LookaheadConfig& lookahead_config, + std::function token_callback = nullptr); + + const ::executorch::extension::llm::Stats& stats() const { + return stats_; + } + + /** + * Get the TextDecoderRunner for external use. + */ + StaticLLMTextDecoderRunner* decoder_runner() { + return decoder_runner_.get(); + } + + private: + using TokenT = int32_t; + using LogitT = __fp16; + + TokenT sample_token(::executorch::runtime::Method& method, size_t pos = 0); + std::vector sample_all_tokens(::executorch::runtime::Method& method); + + std::string model_path_; + std::string tokenizer_path_; + StaticLLMConfig config_; + + std::unique_ptr<::executorch::extension::Module> module_; + std::unique_ptr<::tokenizers::Tokenizer> tokenizer_; + std::unique_ptr io_manager_; + std::unique_ptr decoder_runner_; + + std::unordered_set eos_ids_; + ::executorch::extension::llm::Stats stats_; +}; + +/** + * Create a StaticLLMRunner with configuration auto-detected from model metadata. + * Reads input_len, cache_len, n_layers, n_kv_heads, head_dim, vocab_size from + * the model's method metadata. Only rope_base is read from params.json. + */ +std::unique_ptr create_static_llm_runner( + const std::string& model_path, + const std::string& tokenizer_path, + const std::string& params_path); + +} // namespace example
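For reference, a minimal standalone sketch (not part of the patch) of the shape arithmetic that `create_static_llm_runner` uses to recover `StaticLLMConfig` from the exported model's tensor metadata. The concrete shapes below are illustrative assumptions taken from the `StaticLLMConfig` defaults (input_len=32, cache_len=992, n_layers=16, n_kv_heads=8, head_dim=64); a real model's metadata may differ.

```cpp
#include <cassert>
#include <cstddef>

int main() {
  // The mask input has shape (1, input_len, cache_len + input_len).
  const std::size_t mask_sizes[3] = {1, 32, 1024};
  const std::size_t input_len = mask_sizes[1];              // 32
  const std::size_t cache_len = mask_sizes[2] - input_len;  // 992

  // Inputs are ordered: tokens(0), mask(1), freqs_cos(2), freqs_sin(3),
  // then n_layers k caches followed by n_layers v caches.
  const std::size_t num_inputs = 4 + 2 * 16;  // 36 for an assumed 16-layer model
  const std::size_t n_layers = (num_inputs - 4) / 2;

  // Each k cache has shape (1, n_kv_heads, cache_len, head_dim).
  const std::size_t k_cache_sizes[4] = {1, 8, cache_len, 64};
  const std::size_t n_kv_heads = k_cache_sizes[1];
  const std::size_t head_dim = k_cache_sizes[3];

  // Full-logits models emit (1, input_len, vocab_size); last-token-only
  // models emit (1, 1, vocab_size).
  const std::size_t logits_seq_len = 32;
  const bool generate_full_logits = (logits_seq_len == input_len);

  assert(input_len == 32 && cache_len == 992);
  assert(n_layers == 16 && n_kv_heads == 8 && head_dim == 64);
  assert(generate_full_logits);
  return 0;
}
```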