diff --git a/.ci/scripts/test_ane_static_llama.sh b/.ci/scripts/test_ane_static_llama.sh
index 73a9c4ca54b..f04b716fd97 100644
--- a/.ci/scripts/test_ane_static_llama.sh
+++ b/.ci/scripts/test_ane_static_llama.sh
@@ -28,13 +28,78 @@ pushd $EXECUTORCH_ROOT/examples/apple/coreml/llama
 # Download stories llama110m artifacts
 download_stories_model_artifacts
 
-# Test static ANE llama model
+# Test static ANE llama model export
+echo "Exporting static ANE llama model..."
 python export_static_llm_coreml.py --checkpoint stories110M.pt --params params.json --output model.pte
 
-# The ANE cannot run in github CI
-# python run_static_llm.py --model model.pte --params params.json --tokenizer tokenizer.model --prompt "Once upon a time," --lookahead
+# The ANE is not accessible in GitHub CI, so we also export a CPU-only model to test the runner
+echo "Exporting CPU-only model for CI testing..."
+python export_static_llm_coreml.py --checkpoint stories110M.pt --params params.json --output model_cpu.pte --cpu_only
+
+popd
+
+# Build the C++ runner
+echo "Building C++ runner..."
+BUILD_DIR="${EXECUTORCH_ROOT}/cmake-out"
+
+# Clean build directory completely to avoid stale artifacts and generator conflicts
+rm -rf "${BUILD_DIR}"
+
+cmake -S "${EXECUTORCH_ROOT}" -B "${BUILD_DIR}" \
+  -DCMAKE_INSTALL_PREFIX="${BUILD_DIR}" \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DEXECUTORCH_ENABLE_LOGGING=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+  -DEXECUTORCH_BUILD_COREML=ON \
+  -G Ninja
+
+cmake --build "${BUILD_DIR}" -j --target run_static_llm_coreml --config Release
+
+# TODO: enable runner once CoreML bug with caching is fixed
+# # Run the C++ runner with the CPU model
+# echo "Running C++ runner with CPU model..."
+# RUNNER="${BUILD_DIR}/examples/apple/coreml/llama/runner/run_static_llm_coreml"
+# MODEL_DIR="${EXECUTORCH_ROOT}/examples/apple/coreml/llama"
+
+# # Run the model and capture full output for debugging
+# FULL_OUTPUT=$("${RUNNER}" \
+#   --model "${MODEL_DIR}/model.pte" \
+#   --params "${MODEL_DIR}/params.json" \
+#   --tokenizer "${MODEL_DIR}/tokenizer.model" \
+#   --prompt "Once upon a time," \
+#   --max_new_tokens 50 2>&1)
+
+# echo "Full output:"
+# echo "${FULL_OUTPUT}"
+
+# # Check that the model produced meaningful output
+# # The output should contain: the prompt "Once upon a time," and the continuation including "there was"
+# # Due to log interleaving, we check for individual key parts separately
+# if [[ "${FULL_OUTPUT}" == *"Once upon a time,"* ]] && [[ "${FULL_OUTPUT}" == *"there"* ]] && [[ "${FULL_OUTPUT}" == *"was"* ]]; then
+#   echo "Output contains expected prompt and generated text"
+#   echo "C++ runner test passed!"
+# else
+#   echo "ERROR: Output does not contain expected text"
+#   echo "Expected: 'Once upon a time,' followed by 'there' and 'was'"
+#   exit 1
+# fi
+
+# TODO: enable runner once CoreML bug with caching is fixed
+# # Run lookahead decoding test (currently produces tokens on stories, but works with llama)
+# echo "Running C++ runner with lookahead decoding..."
+# "${RUNNER}" \ +# --model "${MODEL_DIR}/model.pte" \ +# --params "${MODEL_DIR}/params.json" \ +# --tokenizer "${MODEL_DIR}/tokenizer.model" \ +# --prompt "Once upon a time," \ +# --max_new_tokens 50 \ +# --lookahead # Test export of deprecated model +pushd $EXECUTORCH_ROOT/examples/apple/coreml/llama python export.py -n model.pte -p params.json -c stories110M.pt --seq_length 32 --max_seq_length 64 --dtype fp16 --coreml-quantize c4w --embedding-quantize 4,32 - popd diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 2cd284b059b..2a883051e9f 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -489,7 +489,7 @@ jobs: name: test-static-llama-ane uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: - runner: macos-m1-stable + runner: macos-15-xlarge python-version: '3.11' submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} @@ -839,7 +839,7 @@ jobs: qwen3-0.6b|xnnpack|--quantize, qwen3-1.7b|xnnpack|--quantize, gemma3-1b|xnnpack|--quantize, - # phi4-mini|xnnpack|--quantize, transformers v5.0.0rc0 introduces a data-dependent branching in transformers/modeling_rope_utils.py:61 + # phi4-mini|xnnpack|--quantize, transformers v5.0.0rc0 introduces a data-dependent branching in transformers/modeling_rope_utils.py:61 smollm2-135m|xnnpack|--quantize, smollm3-3b|xnnpack|--quantize ] diff --git a/CMakeLists.txt b/CMakeLists.txt index 30cee4afe53..f02a216c191 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -936,12 +936,20 @@ if(EXECUTORCH_BUILD_EXTENSION_TRAINING) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/training) list(APPEND _executorch_extensions extension_training) endif() - if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner) list(APPEND _executorch_extensions extension_llm_runner) endif() +# Static LLM CoreML runner for Apple platforms +if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER + AND EXECUTORCH_BUILD_COREML + AND APPLE +) + add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/examples/apple/coreml/llama/runner + ) +endif() if(EXECUTORCH_BUILD_EXTENSION_ASR_RUNNER) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/asr/runner) list(APPEND _executorch_extensions extension_asr_runner) diff --git a/examples/apple/coreml/llama/export_static_llm_coreml.py b/examples/apple/coreml/llama/export_static_llm_coreml.py index a3fd8201414..155c24264a4 100644 --- a/examples/apple/coreml/llama/export_static_llm_coreml.py +++ b/examples/apple/coreml/llama/export_static_llm_coreml.py @@ -98,18 +98,19 @@ def remove_graph_break_(edge_manager): edge_manager.exported_program().graph_module.graph.eliminate_dead_code() -def load_model(checkpoint_path: str, params_path: str, max_context_len: int): +def load_model( + checkpoint_path: str, + params_path: str, + max_context_len: int, + generate_full_logits: bool = True, +): """Load the model from checkpoint with static_mha attention type.""" with open(params_path, "r") as f: params = json.loads(f.read()) - # TODO: to support lookahead decoding, the static model outputs - # full logits, but if we are not using lookahead decoding, we can have a - # more efficient model by setting generate_full_logits=False and supplying the last - # valid token args = ModelArgs( max_context_len=max_context_len, - generate_full_logits=True, + generate_full_logits=generate_full_logits, **params, ) args.attention_type = "static_mha" @@ -320,15 +321,39 @@ def main(): help="Disable graph breaks 
     )
 
+    # Output options
+    parser.add_argument(
+        "--no_generate_full_logits",
+        action="store_true",
+        help="Only generate logits for the last token position (more efficient, but no lookahead support).",
+    )
+
+    # Compute options
+    parser.add_argument(
+        "--cpu_only",
+        action="store_true",
+        help="Use CPU only (no ANE). Useful for CI testing where the ANE is not accessible.",
+    )
+
     args = parser.parse_args()
 
     # Compute cache length
+    generate_full_logits = not args.no_generate_full_logits
+
     print("Quantization and datatype:")
     print(f"\tEmbedding quantize: {args.embedding_quantize}")
     print(f"\tLinear quantize: {args.linear_quantize}")
     print(f"\tDtype: {args.dtype}")
 
+    print("\nOutput configuration:")
+    print(f"\tGenerate full logits: {generate_full_logits}")
+    if not generate_full_logits:
+        print("\t(Lookahead decoding will NOT be supported)")
+
+    print("\nCompute configuration:")
+    print(f"\tCPU only: {args.cpu_only}")
+
     cache_len = args.max_context_len - args.input_len
     print("\nGeneration configuration:")
     print(f"\tMax context length: {args.max_context_len}")
@@ -345,6 +370,7 @@ def main():
         args.checkpoint,
         args.params,
         args.max_context_len,
+        generate_full_logits,
     )
     print(f"Model loaded: {model_args.n_layers} layers, {model_args.dim} dim")
 
@@ -453,13 +479,16 @@ def main():
 
     # Setup CoreML partitioner
     print("\nSetting up CoreML partitioner...")
+    compute_unit = (
+        ct.ComputeUnit.CPU_ONLY if args.cpu_only else ct.ComputeUnit.CPU_AND_NE
+    )
     compile_specs = CoreMLBackend.generate_compile_specs(
         minimum_deployment_target=ct.target.iOS18,
         compute_precision={
             torch.float16: ct.precision.FLOAT16,
             torch.float32: ct.precision.FLOAT32,
         }[float_dtype],
-        compute_unit=ct.ComputeUnit.CPU_AND_NE,
+        compute_unit=compute_unit,
         model_type=CoreMLBackend.MODEL_TYPE.MODEL,
     )
     partitioner = CoreMLPartitioner(
diff --git a/examples/apple/coreml/llama/readme.md b/examples/apple/coreml/llama/readme.md
index 46e9043a5fc..7ceb5265e5c 100644
--- a/examples/apple/coreml/llama/readme.md
+++ b/examples/apple/coreml/llama/readme.md
@@ -29,7 +29,62 @@ The static model has several ANE optimizations, including:
 * Re-writing SDPA to avoid 5-D tensors to imporve performance.  This also fixes an accuracy bug that was introduced in iOS 26 (addresses this: https://github.com/pytorch/executorch/issues/15833)
 
-We are working on adding a C++ runner as well.
+## C++ Runner
+
+A C++ runner is also available for running static attention LLM models. The runner extends `TextDecoderRunner` from the ExecuTorch LLM extension and manages KV cache I/O with smart_mask-style cache updates.
+
+### Building on macOS
+
+The easiest way to build is using the provided build script:
+
+```bash
+cd examples/apple/coreml/llama/runner
+./build_and_run.sh --help   # Show options
+./build_and_run.sh          # Build and run with defaults
+```
+
+Or build manually from the executorch root directory using the macos preset:
+
+```bash
+cmake -S . -B cmake-out --preset macos
+cmake --build cmake-out --config Release --target run_static_llm_coreml -j$(sysctl -n hw.ncpu)
+```
+
+The executable will be at: `cmake-out/examples/apple/coreml/llama/runner/Release/run_static_llm_coreml`
+
+### Running
+
+```bash
+./cmake-out/examples/apple/coreml/llama/runner/Release/run_static_llm_coreml \
+  --model static_llm_coreml_model.pte \
+  --params /path/to/params.json \
+  --tokenizer /path/to/tokenizer.model \
+  --prompt "Once upon a time," \
+  --max_new_tokens 100 \
+  --temperature 0.0
+```
+
+Input sequence length and KV cache length are detected automatically from the exported model's metadata, so they do not need to be passed on the command line.
+
+### Command-line Options
+
+| Option | Description | Default |
+|--------|-------------|---------|
+| `--model` | Path to the .pte model file | (required) |
+| `--params` | Path to params.json | (required) |
+| `--tokenizer` | Path to tokenizer file | (required) |
+| `--prompt` | Input prompt | (required) |
+| `--max_new_tokens` | Maximum tokens to generate | 100 |
+| `--temperature` | Sampling temperature (0 = greedy) | 0.0 |
+
+### Features
+
+The C++ runner:
+- Extends `TextDecoderRunner` from `executorch/extension/llm/runner/`
+- Manages KV cache I/O with smart_mask-style cache updates
+- Supports multiple tokenizer formats (HuggingFace JSON, TikToken, SentencePiece, BPE)
+- Computes RoPE frequencies internally (Llama 3 style with base=500000)
+- Reads `rope_theta` from params.json; other configuration is auto-detected from the model's metadata
 
 # Deprecated (export.py, run.py, and run_lookahead.py)
diff --git a/examples/apple/coreml/llama/runner/CMakeLists.txt b/examples/apple/coreml/llama/runner/CMakeLists.txt
new file mode 100644
index 00000000000..7e0cd4dd4d1
--- /dev/null
+++ b/examples/apple/coreml/llama/runner/CMakeLists.txt
@@ -0,0 +1,172 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+#
+# CMake build system for static LLM runner for CoreML.
+#
+# ### Building ###
+#
+# From the executorch root directory:
+# ~~~
+# mkdir cmake-out && cd cmake-out
+# cmake .. \
+#   -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
+#   -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
+#   -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+#   -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+#   -DEXECUTORCH_BUILD_COREML=ON
+# cmake --build . -j --target run_static_llm_coreml
+# ~~~
+#
+# ### Running ###
+#
+# ~~~
+# ./examples/apple/coreml/llama/runner/run_static_llm_coreml \
+#   --model /path/to/model.pte \
+#   --params /path/to/params.json \
+#   --tokenizer /path/to/tokenizer.model \
+#   --prompt "Once upon a time," \
+#   --max_new_tokens 100
+# ~~~
+
+cmake_minimum_required(VERSION 3.19)
+project(static_llm_coreml_runner)
+
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..)
+endif()
+
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+
+# Source files
+set(static_llm_runner_srcs static_llm_runner.cpp)
+
+set(static_llm_main_srcs main.cpp)
+
+# Create the runner library
+if(CMAKE_TOOLCHAIN_IOS
+   OR ANDROID
+   OR APPLE
+)
+  add_library(static_llm_runner STATIC ${static_llm_runner_srcs})
+else()
+  add_library(static_llm_runner SHARED ${static_llm_runner_srcs})
+endif()
+
+# Check for required targets
+if(NOT TARGET extension_llm_runner)
+  message(
+    FATAL_ERROR
+      "ExecuTorch must be built with EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER enabled."
+ ) +endif() + +if(NOT TARGET extension_module) + message( + FATAL_ERROR + "ExecuTorch must be built with EXECUTORCH_BUILD_EXTENSION_MODULE enabled." + ) +endif() + +# Dependencies for the runner library +set(static_llm_runner_deps + executorch_core + extension_data_loader + extension_module + extension_tensor + extension_flat_tensor + extension_llm_runner + executorch_backends +) + +# Add CoreML delegate if available (required for running CoreML models) Note: +# coremldelegate is linked transitively through executorch build system when +# EXECUTORCH_BUILD_COREML is ON. We don't need to link it again here to avoid +# duplicate symbol errors. if(TARGET coremldelegate) list(APPEND +# static_llm_runner_deps coremldelegate) endif() + +target_link_libraries(static_llm_runner PUBLIC ${static_llm_runner_deps}) +target_link_libraries(static_llm_runner PUBLIC tokenizers::tokenizers) + +# Find nlohmann_json for params parsing - executorch already includes it in +# third-party +if(TARGET nlohmann_json) + target_link_libraries(static_llm_runner PUBLIC nlohmann_json::nlohmann_json) +elseif(TARGET nlohmann_json::nlohmann_json) + target_link_libraries(static_llm_runner PUBLIC nlohmann_json::nlohmann_json) +else() + find_package(nlohmann_json QUIET CONFIG) + if(nlohmann_json_FOUND) + target_link_libraries(static_llm_runner PUBLIC nlohmann_json::nlohmann_json) + else() + # The executorch third-party directory includes nlohmann_json + target_include_directories( + static_llm_runner PUBLIC ${EXECUTORCH_ROOT}/third-party/json/include + ) + endif() +endif() + +target_include_directories( + static_llm_runner + PUBLIC ${EXECUTORCH_ROOT} ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include + ${EXECUTORCH_ROOT}/.. +) + +# Create the executable +add_executable(run_static_llm_coreml ${static_llm_main_srcs}) + +target_link_libraries(run_static_llm_coreml PRIVATE static_llm_runner) + +# Find and link gflags - executorch builds gflags as a third-party dependency +if(TARGET gflags) + target_link_libraries(run_static_llm_coreml PRIVATE gflags) +elseif(TARGET gflags_nothreads_static) + target_link_libraries(run_static_llm_coreml PRIVATE gflags_nothreads_static) +else() + find_package(gflags QUIET) + if(gflags_FOUND) + target_link_libraries(run_static_llm_coreml PRIVATE gflags::gflags) + else() + # Try to find gflags via pkg-config + find_package(PkgConfig QUIET) + if(PkgConfig_FOUND) + pkg_check_modules(GFLAGS QUIET gflags) + if(GFLAGS_FOUND) + target_include_directories( + run_static_llm_coreml PRIVATE ${GFLAGS_INCLUDE_DIRS} + ) + target_link_libraries(run_static_llm_coreml PRIVATE ${GFLAGS_LIBRARIES}) + else() + message( + FATAL_ERROR + "gflags not found. Please install gflags or set GFLAGS_ROOT." + ) + endif() + else() + message( + FATAL_ERROR + "gflags not found. Please install gflags or set GFLAGS_ROOT." + ) + endif() + endif() +endif() + +target_include_directories( + run_static_llm_coreml PRIVATE ${EXECUTORCH_ROOT} ${EXECUTORCH_ROOT}/.. 
+) + +# Set C++ standard +set_target_properties( + static_llm_runner run_static_llm_coreml PROPERTIES CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON +) + +# Install targets +install(TARGETS run_static_llm_coreml DESTINATION bin) +install(TARGETS static_llm_runner DESTINATION lib) +install(FILES static_llm_runner.h + DESTINATION include/executorch/examples/apple/coreml/llama/runner +) diff --git a/examples/apple/coreml/llama/runner/build_and_run.sh b/examples/apple/coreml/llama/runner/build_and_run.sh new file mode 100755 index 00000000000..ca2c4588caa --- /dev/null +++ b/examples/apple/coreml/llama/runner/build_and_run.sh @@ -0,0 +1,248 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Build and run the static LLM C++ runner for CoreML +# +# Usage: +# ./build_and_run.sh [--rebuild] [--run-only] [--help] +# +# Arguments: +# --rebuild Force a clean rebuild +# --run-only Skip build, just run the executable +# --help Show this help message +# +# Environment variables (override defaults): +# MODEL_PATH Path to the .pte model file +# PARAMS_PATH Path to params.json +# TOKENIZER_PATH Path to tokenizer.model +# PROMPT Input prompt +# MAX_NEW_TOKENS Maximum tokens to generate +# INPUT_LEN Input sequence length +# CACHE_LEN KV cache length + +set -e + +# Get the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +EXECUTORCH_ROOT="$(cd "${SCRIPT_DIR}/../../../../.." && pwd)" +BUILD_DIR="${EXECUTORCH_ROOT}/cmake-out" + +# Default values (can be overridden via environment variables) +MODEL_PATH="${MODEL_PATH:-$HOME/Desktop/static_llama1b_coreml_model.pte}" +PARAMS_PATH="${PARAMS_PATH:-$HOME/models/llama1b/params.json}" +TOKENIZER_PATH="${TOKENIZER_PATH:-$HOME/models/llama1b/tokenizer.model}" +PROMPT="${PROMPT:-Once upon a time,}" +MAX_NEW_TOKENS="${MAX_NEW_TOKENS:-100}" +INPUT_LEN="${INPUT_LEN:-32}" +CACHE_LEN="${CACHE_LEN:-992}" +TEMPERATURE="${TEMPERATURE:-0.0}" + +# Lookahead decoding options +LOOKAHEAD="${LOOKAHEAD:-false}" +NGRAM_SIZE="${NGRAM_SIZE:-4}" +WINDOW_SIZE="${WINDOW_SIZE:-5}" +N_VERIFICATIONS="${N_VERIFICATIONS:-3}" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +print_usage() { + echo "Usage: $0 [--rebuild] [--run-only] [--help]" + echo "" + echo "Options:" + echo " --rebuild Force a clean rebuild" + echo " --run-only Skip build, just run the executable" + echo " --help Show this help message" + echo "" + echo "Environment variables:" + echo " MODEL_PATH Path to the .pte model file (default: \$HOME/Desktop/static_llama1b_coreml_model.pte)" + echo " PARAMS_PATH Path to params.json (default: \$HOME/models/llama1b/params.json)" + echo " TOKENIZER_PATH Path to tokenizer.model (default: \$HOME/models/llama1b/tokenizer.model)" + echo " PROMPT Input prompt (default: 'Once upon a time,')" + echo " MAX_NEW_TOKENS Maximum tokens to generate (default: 100)" + echo " INPUT_LEN Input sequence length (default: 32)" + echo " CACHE_LEN KV cache length (default: 992)" + echo " TEMPERATURE Sampling temperature (default: 0.0)" +} + +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Parse arguments +REBUILD=false +RUN_ONLY=false + +while [[ $# -gt 0 ]]; do + case $1 in + --rebuild) + REBUILD=true + shift + ;; 
+ --run-only) + RUN_ONLY=true + shift + ;; + --help) + print_usage + exit 0 + ;; + *) + log_error "Unknown option: $1" + print_usage + exit 1 + ;; + esac +done + +# Validate required files exist +validate_files() { + local missing=false + + if [[ ! -f "${MODEL_PATH}" ]]; then + log_error "Model file not found: ${MODEL_PATH}" + missing=true + fi + + if [[ ! -f "${PARAMS_PATH}" ]]; then + log_error "Params file not found: ${PARAMS_PATH}" + missing=true + fi + + if [[ ! -f "${TOKENIZER_PATH}" ]]; then + log_error "Tokenizer file not found: ${TOKENIZER_PATH}" + missing=true + fi + + if [[ "$missing" == "true" ]]; then + echo "" + log_info "You can set paths via environment variables:" + echo " export MODEL_PATH=/path/to/model.pte" + echo " export PARAMS_PATH=/path/to/params.json" + echo " export TOKENIZER_PATH=/path/to/tokenizer.model" + exit 1 + fi +} + +build_project() { + log_info "ExecutorTorch root: ${EXECUTORCH_ROOT}" + log_info "Build directory: ${BUILD_DIR}" + + # Clean build if requested + if [[ "$REBUILD" == "true" ]] && [[ -d "${BUILD_DIR}" ]]; then + log_info "Cleaning build directory..." + rm -rf "${BUILD_DIR}" + fi + + cd "${EXECUTORCH_ROOT}" + + # Configure CMake using macos preset (includes all necessary LLM extensions) + log_info "Configuring CMake with macos preset..." + cmake -S "${EXECUTORCH_ROOT}" \ + -B "${BUILD_DIR}" \ + -DCMAKE_BUILD_TYPE=Release \ + --preset macos + + # Build the target + log_info "Building run_static_llm_coreml..." + cmake --build "${BUILD_DIR}" \ + -j$(sysctl -n hw.ncpu) \ + --config Release \ + --target run_static_llm_coreml + + log_info "Build complete!" +} + +run_model() { + local executable="${BUILD_DIR}/examples/apple/coreml/llama/runner/Release/run_static_llm_coreml" + + # Also check non-Release location + if [[ ! -f "${executable}" ]]; then + executable="${BUILD_DIR}/examples/apple/coreml/llama/runner/run_static_llm_coreml" + fi + + # Check Debug location + if [[ ! -f "${executable}" ]]; then + executable="${BUILD_DIR}/examples/apple/coreml/llama/runner/Debug/run_static_llm_coreml" + fi + + if [[ ! -f "${executable}" ]]; then + log_error "Executable not found: ${executable}" + log_info "Run without --run-only to build first" + exit 1 + fi + + log_info "Running model..." + echo "" + echo "Configuration:" + echo " Model: ${MODEL_PATH}" + echo " Params: ${PARAMS_PATH}" + echo " Tokenizer: ${TOKENIZER_PATH}" + echo " Prompt: ${PROMPT}" + echo " Max tokens: ${MAX_NEW_TOKENS}" + echo " Input length: ${INPUT_LEN}" + echo " Cache length: ${CACHE_LEN}" + echo " Temperature: ${TEMPERATURE}" + if [[ "${LOOKAHEAD}" == "true" ]]; then + echo " Lookahead: enabled" + echo " ngram_size: ${NGRAM_SIZE}" + echo " window_size: ${WINDOW_SIZE}" + echo " n_verifications: ${N_VERIFICATIONS}" + fi + echo "" + echo "==========================================" + + # Build command with optional lookahead flags + local cmd=("${executable}" + --model "${MODEL_PATH}" + --params "${PARAMS_PATH}" + --tokenizer "${TOKENIZER_PATH}" + --prompt "${PROMPT}" + --max_new_tokens "${MAX_NEW_TOKENS}" + --temperature "${TEMPERATURE}") + + if [[ "${LOOKAHEAD}" == "true" ]]; then + cmd+=(--lookahead + --ngram_size "${NGRAM_SIZE}" + --window_size "${WINDOW_SIZE}" + --n_verifications "${N_VERIFICATIONS}") + fi + + "${cmd[@]}" + + echo "==========================================" + log_info "Done!" 
+} + +# Main execution +main() { + echo "" + log_info "Static LLM CoreML Runner - Build & Test Script" + echo "" + + # Validate files before running + validate_files + + if [[ "$RUN_ONLY" == "false" ]]; then + build_project + fi + + run_model +} + +main diff --git a/examples/apple/coreml/llama/runner/main.cpp b/examples/apple/coreml/llama/runner/main.cpp new file mode 100644 index 00000000000..0a96f0b4fc5 --- /dev/null +++ b/examples/apple/coreml/llama/runner/main.cpp @@ -0,0 +1,127 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Main executable for running static attention LLM models. +// +// Usage: +// ./run_static_llm \ +// --model /path/to/model.pte \ +// --params /path/to/params.json \ +// --tokenizer /path/to/tokenizer.model \ +// --prompt "Once upon a time," \ +// --max_new_tokens 100 +// +// With lookahead decoding: +// ./run_static_llm \ +// --model /path/to/model.pte \ +// --params /path/to/params.json \ +// --tokenizer /path/to/tokenizer.model \ +// --prompt "Once upon a time," \ +// --max_new_tokens 100 \ +// --lookahead \ +// --ngram_size 4 \ +// --window_size 5 \ +// --n_verifications 3 + +#include +#include + +#include +#include +#include + +#include + +DEFINE_string(model, "", "Path to the .pte model file (required)"); +DEFINE_string(params, "", "Path to params.json file (optional, for rope_theta)"); +DEFINE_string(tokenizer, "", "Path to tokenizer model file (required)"); +DEFINE_string(prompt, "Once upon a time,", "Input prompt"); +DEFINE_int32(max_new_tokens, 100, "Maximum number of tokens to generate"); +DEFINE_double(temperature, 0.0, "Sampling temperature (0 = greedy)"); + +// Lookahead decoding options +DEFINE_bool(lookahead, false, "Enable lookahead (speculative) decoding"); +DEFINE_int32(ngram_size, 4, "N-gram size for lookahead decoding"); +DEFINE_int32(window_size, 5, "Window size for lookahead decoding"); +DEFINE_int32(n_verifications, 3, "Number of verification branches for lookahead decoding"); + +int main(int argc, char* argv[]) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + + // Validate required arguments + if (FLAGS_model.empty()) { + std::cerr << "Error: --model is required" << std::endl; + return 1; + } + if (FLAGS_params.empty()) { + std::cerr << "Error: --params is required" << std::endl; + return 1; + } + if (FLAGS_tokenizer.empty()) { + std::cerr << "Error: --tokenizer is required" << std::endl; + return 1; + } + + // Initialize runtime + executorch::runtime::runtime_init(); + + // Create runner (config is auto-detected from model metadata) + auto runner = example::create_static_llm_runner( + FLAGS_model, + FLAGS_tokenizer, + FLAGS_params); + + if (!runner) { + std::cerr << "Error: Failed to create runner" << std::endl; + return 1; + } + + // Load model + auto load_err = runner->load(); + if (load_err != executorch::runtime::Error::Ok) { + std::cerr << "Error: Failed to load model" << std::endl; + return 1; + } + + // Print prompt + std::cout << "\n" << FLAGS_prompt << std::flush; + + // Generate + executorch::runtime::Error gen_err; + + if (FLAGS_lookahead) { + // Use lookahead decoding + example::LookaheadConfig lookahead_config; + lookahead_config.enabled = true; + lookahead_config.ngram_size = static_cast(FLAGS_ngram_size); + lookahead_config.window_size = static_cast(FLAGS_window_size); + lookahead_config.n_verifications = 
static_cast(FLAGS_n_verifications); + + gen_err = runner->generate_with_lookahead( + FLAGS_prompt, + FLAGS_max_new_tokens, + lookahead_config, + [](const std::string& token) { std::cout << token << std::flush; }); + } else { + // Use standard decoding + gen_err = runner->generate( + FLAGS_prompt, + FLAGS_max_new_tokens, + static_cast(FLAGS_temperature), + [](const std::string& token) { std::cout << token << std::flush; }); + } + + if (gen_err != executorch::runtime::Error::Ok) { + std::cerr << "\nError: Generation failed" << std::endl; + return 1; + } + + std::cout << "\n" << std::endl; + + return 0; +} diff --git a/examples/apple/coreml/llama/runner/static_llm_runner.cpp b/examples/apple/coreml/llama/runner/static_llm_runner.cpp new file mode 100644 index 00000000000..d02c3529617 --- /dev/null +++ b/examples/apple/coreml/llama/runner/static_llm_runner.cpp @@ -0,0 +1,844 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace example { + +using namespace ::executorch::extension; +using namespace ::executorch::runtime; + +namespace { + +std::unique_ptr<::tokenizers::Tokenizer> load_tokenizer( + const std::string& tokenizer_path) { + auto hf_tokenizer = std::make_unique<::tokenizers::HFTokenizer>(); + if (hf_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded HuggingFace tokenizer"); + return hf_tokenizer; + } + + auto tiktoken = std::make_unique<::tokenizers::Tiktoken>(); + if (tiktoken->load(tokenizer_path) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded TikToken tokenizer"); + return tiktoken; + } + + auto sp_tokenizer = std::make_unique<::tokenizers::SPTokenizer>(); + if (sp_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded SentencePiece tokenizer"); + return sp_tokenizer; + } + + auto bpe_tokenizer = std::make_unique<::tokenizers::Llama2cTokenizer>(); + if (bpe_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded BPE tokenizer"); + return bpe_tokenizer; + } + + ET_LOG(Error, "Failed to load tokenizer from %s", tokenizer_path.c_str()); + return nullptr; +} + +} // namespace + +// ============================================================================ +// StaticLLMIOManager implementation +// ============================================================================ + +StaticLLMIOManager::StaticLLMIOManager( + Module& module, + const StaticLLMConfig& config) + : IOManager(module), module_(module), config_(config) { + compute_rope_frequencies(); + + input_buffer_.resize(config_.input_len, 0); + + size_t logits_size = config_.input_len * config_.vocab_size; + logits_output_.resize(logits_size, static_cast(0)); + + size_t update_size = + config_.n_kv_heads * config_.input_len * config_.head_dim; + k_update_buffers_.resize(config_.n_layers); + v_update_buffers_.resize(config_.n_layers); + for (size_t i = 0; i < config_.n_layers; i++) { + k_update_buffers_[i].resize(update_size, static_cast(0)); + v_update_buffers_[i].resize(update_size, static_cast(0)); + } +} + +void StaticLLMIOManager::compute_rope_frequencies() { + size_t max_seq_len = config_.cache_len + config_.input_len; + size_t rope_dim = config_.head_dim / 2; + + freqs_cos_.resize(max_seq_len * rope_dim); + freqs_sin_.resize(max_seq_len 
* rope_dim); + + for (size_t pos = 0; pos < max_seq_len; pos++) { + for (size_t i = 0; i < rope_dim; i++) { + float freq = + 1.0f / + std::pow( + config_.rope_base, static_cast(2 * i) / config_.head_dim); + float angle = static_cast(pos) * freq; + freqs_cos_[pos * rope_dim + i] = static_cast(std::cos(angle)); + freqs_sin_[pos * rope_dim + i] = static_cast(std::sin(angle)); + } + } +} + +Error StaticLLMIOManager::load( + const std::string& prefill_method, + const std::string& decode_method) { + (void)prefill_method; + (void)decode_method; + + // Build input/output indices for StaticAttentionIOManager + std::vector k_cache_input_indices(config_.n_layers); + std::vector k_cache_output_indices(config_.n_layers); + std::vector v_cache_input_indices(config_.n_layers); + std::vector v_cache_output_indices(config_.n_layers); + + for (size_t i = 0; i < config_.n_layers; i++) { + k_cache_input_indices[i] = 4 + i; + k_cache_output_indices[i] = 1 + i; + v_cache_input_indices[i] = 4 + config_.n_layers + i; + v_cache_output_indices[i] = 1 + config_.n_layers + i; + } + + typename StaticAttentionIOManager::StaticAttentionIOConfig + io_config; + io_config.n_caches = config_.n_layers; + io_config.cache_lengths = + std::vector(config_.n_layers, config_.cache_len); + io_config.head_dim = config_.head_dim; + io_config.max_input_len = config_.input_len; + io_config.n_heads_per_cache = config_.n_kv_heads; + io_config.cache_len_to_mask_idx = {{config_.cache_len, 1}}; + io_config.rope_freqs_cos_input_index = 2; + io_config.rope_freqs_sin_input_index = 3; + io_config.k_cache_input_indices = k_cache_input_indices; + io_config.k_cache_output_indices = k_cache_output_indices; + io_config.v_cache_input_indices = v_cache_input_indices; + io_config.v_cache_output_indices = v_cache_output_indices; + io_config.max_context_len = config_.cache_len + config_.input_len; + io_config.rope_freqs_cos = freqs_cos_.data(); + io_config.rope_freqs_sin = freqs_sin_.data(); + io_config.style = StaticAttentionUpdateStyle::SMART_MASK; + io_config.generate_full_logits = config_.generate_full_logits; + io_config.last_valid_token_pos_index = std::nullopt; + + static_io_manager_ = + std::make_unique>( + std::move(io_config)); + + MaskT zero_val = static_cast(0.0f); + MaskT mask_val = static_cast(-65504.0f); + static_io_manager_->add_mask(config_.input_len, zero_val, mask_val); + + return Error::Ok; +} + +Error StaticLLMIOManager::reset( + const std::string& prefill_method, + const std::string& decode_method) { + (void)prefill_method; + (void)decode_method; + if (static_io_manager_) { + static_io_manager_->reset(); + } + output_buffers_set_ = false; + return Error::Ok; +} + +void StaticLLMIOManager::setup_output_buffers(Method& method) { + if (output_buffers_set_) { + return; + } + + auto method_meta = method.method_meta(); + + auto logits_meta = method_meta.output_tensor_meta(0); + ET_CHECK_MSG(logits_meta.ok(), "Failed to get logits output meta"); + ET_CHECK( + method.set_output_data_ptr( + logits_output_.data(), logits_meta->nbytes(), 0) == Error::Ok); + + for (size_t i = 0; i < config_.n_layers; i++) { + auto k_out_meta = method_meta.output_tensor_meta(1 + i); + ET_CHECK_MSG( + k_out_meta.ok(), "Failed to get k_cache output meta for layer %zu", i); + ET_CHECK( + method.set_output_data_ptr( + k_update_buffers_[i].data(), k_out_meta->nbytes(), 1 + i) == + Error::Ok); + } + + for (size_t i = 0; i < config_.n_layers; i++) { + auto v_out_meta = method_meta.output_tensor_meta(1 + config_.n_layers + i); + ET_CHECK_MSG( + v_out_meta.ok(), "Failed 
to get v_cache output meta for layer %zu", i); + ET_CHECK( + method.set_output_data_ptr( + v_update_buffers_[i].data(), + v_out_meta->nbytes(), + 1 + config_.n_layers + i) == Error::Ok); + } + + output_buffers_set_ = true; +} + +Result> StaticLLMIOManager::prepare_prefill( + const TensorPtr& input, + const TensorPtr& start_pos, + const std::string& prefill_method) { + (void)start_pos; + (void)prefill_method; + + // Copy tokens to input buffer + const int64_t* input_data = input->const_data_ptr(); + actual_input_len_ = input->numel(); + for (size_t i = 0; i < config_.input_len; i++) { + input_buffer_[i] = (i < actual_input_len_) + ? static_cast(input_data[i]) + : 0; + } + + // Return empty - inputs are set via Method::set_input by StaticAttentionIOManager + return std::vector{}; +} + +Result> StaticLLMIOManager::prepare_decode( + const TensorPtr& input, + const TensorPtr& start_pos, + const std::string& decode_method) { + (void)start_pos; + (void)decode_method; + + const int64_t* input_data = input->const_data_ptr(); + actual_input_len_ = 1; + input_buffer_[0] = static_cast(input_data[0]); + for (size_t i = 1; i < config_.input_len; i++) { + input_buffer_[i] = 0; + } + + return std::vector{}; +} + +Error StaticLLMIOManager::update_prefill( + const std::vector& model_outputs, + const std::string& prefill_method) { + (void)model_outputs; + (void)prefill_method; + // KV cache update is handled by StaticAttentionIOManager + return Error::Ok; +} + +Error StaticLLMIOManager::update_decode( + const std::vector& model_outputs, + const std::string& decode_method) { + (void)model_outputs; + (void)decode_method; + return Error::Ok; +} + +// ============================================================================ +// StaticLLMTextDecoderRunner implementation +// ============================================================================ + +StaticLLMTextDecoderRunner::StaticLLMTextDecoderRunner( + Module* module, + StaticLLMIOManager* io_manager) + : TextDecoderRunner(module, io_manager), static_io_manager_(io_manager) {} + +Error StaticLLMTextDecoderRunner::load() { + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward")); + ET_CHECK_OK_OR_RETURN_ERROR(io_manager_->load()); + return Error::Ok; +} + +Result StaticLLMTextDecoderRunner::step( + TensorPtr& input, + int64_t start_pos) { + (void)start_pos; + + auto method_result = module_->method("forward"); + if (!method_result.ok()) { + return method_result.error(); + } + Method* method = method_result.get(); + + // Set up output buffers for CoreML + static_io_manager_->setup_output_buffers(*method); + + // Get the underlying StaticAttentionIOManager + auto* static_io = static_io_manager_->get_static_io_manager(); + + // Set token input + auto input_meta = method->method_meta().input_tensor_meta(0); + ET_CHECK_MSG(input_meta.ok(), "Failed to get input tensor meta"); + + const int64_t* input_data = input->const_data_ptr(); + std::vector tokens( + static_io_manager_->config().input_len, 0); + tokens[0] = static_cast(input_data[0]); + + auto input_impl = ::executorch::runtime::etensor::TensorImpl( + input_meta->scalar_type(), + input_meta->sizes().size(), + const_cast( + input_meta->sizes().data()), + tokens.data(), + const_cast( + input_meta->dim_order().data())); + executorch::runtime::etensor::Tensor input_tensor(&input_impl); + ET_CHECK(method->set_input(input_tensor, 0) == Error::Ok); + + // Set up mask and RoPE via StaticAttentionIOManager + auto& masks = static_io->get_mask(static_io_manager_->config().input_len); + for (auto& pair : masks) 
{ + auto& mask = *pair.second; + mask.set_causal_mask(); + + auto mask_meta = method->method_meta().input_tensor_meta(1); + ET_CHECK_MSG(mask_meta.ok(), "Failed to get mask tensor meta"); + auto mask_impl = ::executorch::runtime::etensor::TensorImpl( + mask_meta->scalar_type(), + mask_meta->sizes().size(), + const_cast( + mask_meta->sizes().data()), + mask.get(), + const_cast( + mask_meta->dim_order().data())); + executorch::runtime::etensor::Tensor mask_tensor(&mask_impl); + ET_CHECK(method->set_input(mask_tensor, 1) == Error::Ok); + } + + static_io->prepare(*method); + + auto exec_result = method->execute(); + if (exec_result != Error::Ok) { + return exec_result; + } + + // Update KV caches + const auto& config = static_io_manager_->config(); + std::vector k_out_indices(config.n_layers); + std::vector v_out_indices(config.n_layers); + for (size_t i = 0; i < config.n_layers; i++) { + k_out_indices[i] = 1 + i; + v_out_indices[i] = 1 + config.n_layers + i; + } + static_io->update(*method, k_out_indices, v_out_indices, 1); + + return method->get_output(0).toTensor(); +} + +// ============================================================================ +// StaticLLMRunner implementation +// ============================================================================ + +StaticLLMRunner::StaticLLMRunner( + const std::string& model_path, + const std::string& tokenizer_path, + const StaticLLMConfig& config) + : model_path_(model_path), + tokenizer_path_(tokenizer_path), + config_(config) { + runtime_init(); +} + +Error StaticLLMRunner::load() { + if (is_loaded()) { + return Error::Ok; + } + + stats_.model_load_start_ms = llm::time_in_ms(); + + ET_LOG(Info, "Loading model from %s", model_path_.c_str()); + module_ = std::make_unique(model_path_, Module::LoadMode::File); + + ET_LOG(Info, "Loading tokenizer from %s", tokenizer_path_.c_str()); + tokenizer_ = load_tokenizer(tokenizer_path_); + if (!tokenizer_) { + return Error::InvalidArgument; + } + + eos_ids_.insert(tokenizer_->eos_tok()); + eos_ids_.insert(128001); + eos_ids_.insert(128009); + + io_manager_ = std::make_unique(*module_, config_); + decoder_runner_ = + std::make_unique(module_.get(), io_manager_.get()); + + ET_CHECK_OK_OR_RETURN_ERROR(decoder_runner_->load()); + + stats_.model_load_end_ms = llm::time_in_ms(); + ET_LOG( + Info, + "Model loaded in %.2f seconds", + (stats_.model_load_end_ms - stats_.model_load_start_ms) / 1000.0); + + return Error::Ok; +} + +bool StaticLLMRunner::is_loaded() const { + return decoder_runner_ && decoder_runner_->is_method_loaded(); +} + +void StaticLLMRunner::reset() { + if (io_manager_) { + io_manager_->reset("forward", "forward"); + } + stats_.reset(); +} + +StaticLLMRunner::TokenT StaticLLMRunner::sample_token( + Method& method, + size_t pos) { + auto logits_tensor = method.get_output(0).toTensor(); + size_t vocab_size = logits_tensor.size(logits_tensor.dim() - 1); + const LogitT* logits_data = logits_tensor.const_data_ptr(); + + size_t offset = pos * vocab_size; + + LogitT max_val = logits_data[offset]; + TokenT max_idx = 0; + for (size_t i = 1; i < vocab_size; i++) { + if (logits_data[offset + i] > max_val) { + max_val = logits_data[offset + i]; + max_idx = static_cast(i); + } + } + return max_idx; +} + +std::vector StaticLLMRunner::sample_all_tokens( + Method& method) { + auto logits_tensor = method.get_output(0).toTensor(); + size_t vocab_size = logits_tensor.size(logits_tensor.dim() - 1); + size_t seq_len = logits_tensor.size(logits_tensor.dim() - 2); + const LogitT* logits_data = 
logits_tensor.const_data_ptr(); + + std::vector tokens(seq_len); + for (size_t pos = 0; pos < seq_len; pos++) { + size_t offset = pos * vocab_size; + LogitT max_val = logits_data[offset]; + TokenT max_idx = 0; + for (size_t i = 1; i < vocab_size; i++) { + if (logits_data[offset + i] > max_val) { + max_val = logits_data[offset + i]; + max_idx = static_cast(i); + } + } + tokens[pos] = max_idx; + } + return tokens; +} + +Error StaticLLMRunner::generate( + const std::string& prompt, + int32_t max_new_tokens, + float temperature, + std::function token_callback) { + (void)temperature; + + if (!is_loaded()) { + ET_CHECK_OK_OR_RETURN_ERROR(load()); + } + + reset(); + stats_.inference_start_ms = llm::time_in_ms(); + + auto encode_result = tokenizer_->encode(prompt, 1, 0); + if (!encode_result.ok()) { + return Error::InvalidArgument; + } + + std::vector prompt_tokens_u64 = encode_result.get(); + std::vector prompt_tokens( + prompt_tokens_u64.begin(), prompt_tokens_u64.end()); + size_t num_prompt_tokens = prompt_tokens.size(); + + ET_LOG(Info, "Prompt: %s", prompt.c_str()); + ET_LOG(Info, "Prompt tokens: %zu", num_prompt_tokens); + + auto method_result = module_->method("forward"); + if (!method_result.ok()) { + return method_result.error(); + } + Method* method = method_result.get(); + + io_manager_->setup_output_buffers(*method); + + auto* static_io = io_manager_->get_static_io_manager(); + std::vector input_buffer(config_.input_len, 0); + + Span prompt_span(prompt_tokens.data(), prompt_tokens.size()); + Span input_span(input_buffer.data(), input_buffer.size()); + + auto input_meta = method->method_meta().input_tensor_meta(0); + ET_CHECK_MSG(input_meta.ok(), "Failed to get input tensor meta"); + auto input_impl = ::executorch::runtime::etensor::TensorImpl( + input_meta->scalar_type(), + input_meta->sizes().size(), + const_cast( + input_meta->sizes().data()), + input_buffer.data(), + const_cast( + input_meta->dim_order().data())); + executorch::runtime::etensor::Tensor input_tensor(&input_impl); + ET_CHECK(method->set_input(input_tensor, 0) == Error::Ok); + + size_t last_logit_pos = static_io->prefill( + prompt_span, input_span, *method, nullptr); + + TokenT cur_token = sample_token(*method, last_logit_pos); + + stats_.first_token_ms = llm::time_in_ms(); + stats_.prompt_eval_end_ms = llm::time_in_ms(); + + auto decode_result = tokenizer_->decode(cur_token, cur_token); + if (decode_result.ok() && token_callback) { + token_callback(*decode_result); + } + + TokenT prev_token = cur_token; + std::function sample_fn = [this](Method& m) -> TokenT { + return sample_token(m, 0); + }; + + int32_t num_generated = 1; + std::function token_cb = [&](TokenT tok) -> bool { + num_generated++; + if (num_generated > max_new_tokens) { + return false; + } + if (eos_ids_.find(tok) != eos_ids_.end()) { + return false; + } + auto decode = tokenizer_->decode(prev_token, tok); + if (decode.ok() && token_callback) { + token_callback(*decode); + } + prev_token = tok; + return true; + }; + + static_io->decode(prev_token, input_span, *method, sample_fn, token_cb); + + stats_.inference_end_ms = llm::time_in_ms(); + stats_.num_prompt_tokens = num_prompt_tokens; + stats_.num_generated_tokens = num_generated; + + double prefill_time_s = + (stats_.first_token_ms - stats_.inference_start_ms) / 1000.0; + double decode_time_s = + (stats_.inference_end_ms - stats_.first_token_ms) / 1000.0; + double tokens_per_sec = + decode_time_s > 0 ? 
(num_generated - 1) / decode_time_s : 0; + + ET_LOG( + Info, + "\nPrefill: %zu tokens in %.2f s", + num_prompt_tokens, + prefill_time_s); + ET_LOG( + Info, + "Decode: %d tokens in %.2f s (%.2f tok/s)", + num_generated, + decode_time_s, + tokens_per_sec); + + return Error::Ok; +} + +Error StaticLLMRunner::generate_with_lookahead( + const std::string& prompt, + int32_t max_new_tokens, + const LookaheadConfig& lookahead_config, + std::function token_callback) { + if (!config_.generate_full_logits) { + ET_LOG( + Error, + "Lookahead decoding requires generate_full_logits=true, but model " + "outputs only last token logits"); + return Error::InvalidArgument; + } + + if (!is_loaded()) { + ET_CHECK_OK_OR_RETURN_ERROR(load()); + } + + reset(); + stats_.inference_start_ms = llm::time_in_ms(); + + size_t ngram_size = lookahead_config.ngram_size; + size_t window_size = lookahead_config.window_size; + size_t n_verifications = lookahead_config.n_verifications; + + ET_LOG( + Info, + "Using lookahead decoding: ngram=%zu, window=%zu, verifications=%zu", + ngram_size, + window_size, + n_verifications); + + auto encode_result = tokenizer_->encode(prompt, 1, 0); + if (!encode_result.ok()) { + return Error::InvalidArgument; + } + + std::vector prompt_tokens_u64 = encode_result.get(); + std::vector prompt_tokens( + prompt_tokens_u64.begin(), prompt_tokens_u64.end()); + size_t num_prompt_tokens = prompt_tokens.size(); + + ET_LOG(Info, "Prompt: %s", prompt.c_str()); + ET_LOG(Info, "Prompt tokens: %zu", num_prompt_tokens); + + auto method_result = module_->method("forward"); + if (!method_result.ok()) { + return method_result.error(); + } + Method* method = method_result.get(); + + io_manager_->setup_output_buffers(*method); + + auto* static_io = io_manager_->get_static_io_manager(); + std::vector input_buffer(config_.input_len, 0); + + Span prompt_span(prompt_tokens.data(), prompt_tokens.size()); + Span input_span(input_buffer.data(), input_buffer.size()); + + auto input_meta = method->method_meta().input_tensor_meta(0); + ET_CHECK_MSG(input_meta.ok(), "Failed to get input tensor meta"); + auto input_impl = ::executorch::runtime::etensor::TensorImpl( + input_meta->scalar_type(), + input_meta->sizes().size(), + const_cast( + input_meta->sizes().data()), + input_buffer.data(), + const_cast( + input_meta->dim_order().data())); + executorch::runtime::etensor::Tensor input_tensor(&input_impl); + ET_CHECK(method->set_input(input_tensor, 0) == Error::Ok); + + size_t last_logit_pos = static_io->prefill( + prompt_span, input_span, *method, nullptr); + + TokenT cur_token = sample_token(*method, last_logit_pos); + + stats_.first_token_ms = llm::time_in_ms(); + stats_.prompt_eval_end_ms = llm::time_in_ms(); + + auto decode_result = tokenizer_->decode(cur_token, cur_token); + if (decode_result.ok() && token_callback) { + token_callback(*decode_result); + } + + std::unordered_map> suffix_caches; + SuffixCache::seed_suffix_caches( + suffix_caches, prompt_span, ngram_size, n_verifications); + + TokenT prev_token = cur_token; + std::function(Method&)> sample_all_fn = + [this](Method& m) -> std::vector { return sample_all_tokens(m); }; + + int32_t num_generated = 1; + std::function token_cb = [&](TokenT tok) -> bool { + num_generated++; + if (num_generated > max_new_tokens) { + return false; + } + if (eos_ids_.find(tok) != eos_ids_.end()) { + return false; + } + auto decode = tokenizer_->decode(prev_token, tok); + if (decode.ok() && token_callback) { + token_callback(*decode); + } + prev_token = tok; + return true; + }; + + 
static_io->lookahead_decode( + prev_token, + input_span, + *method, + sample_all_fn, + token_cb, + ngram_size, + window_size, + n_verifications, + std::move(suffix_caches)); + + stats_.inference_end_ms = llm::time_in_ms(); + stats_.num_prompt_tokens = num_prompt_tokens; + stats_.num_generated_tokens = num_generated; + + double prefill_time_s = + (stats_.first_token_ms - stats_.inference_start_ms) / 1000.0; + double decode_time_s = + (stats_.inference_end_ms - stats_.first_token_ms) / 1000.0; + double tokens_per_sec = + decode_time_s > 0 ? (num_generated - 1) / decode_time_s : 0; + + ET_LOG( + Info, + "\nPrefill: %zu tokens in %.2f s", + num_prompt_tokens, + prefill_time_s); + ET_LOG( + Info, + "Decode: %d tokens in %.2f s (%.2f tok/s)", + num_generated, + decode_time_s, + tokens_per_sec); + + return Error::Ok; +} + +std::unique_ptr create_static_llm_runner( + const std::string& model_path, + const std::string& tokenizer_path, + const std::string& params_path) { + ET_LOG(Info, "Creating static LLM runner with model: %s", model_path.c_str()); + + // Load model to extract metadata + Module module(model_path, Module::LoadMode::File); + ET_LOG(Info, "Module created, loading method 'forward'..."); + + auto load_result = module.load_method("forward"); + if (load_result != Error::Ok) { + ET_LOG( + Error, + "Failed to load model method 'forward' from %s (error code: %d)", + model_path.c_str(), + static_cast(load_result)); + return nullptr; + } + ET_LOG(Info, "Method 'forward' loaded successfully"); + + auto method_meta = module.method_meta("forward"); + if (!method_meta.ok()) { + ET_LOG(Error, "Failed to get method metadata"); + return nullptr; + } + + // Extract input_len and cache_len from mask tensor shape + // Mask shape is (1, input_len, cache_len + input_len) + auto mask_meta = method_meta->input_tensor_meta(1); + if (!mask_meta.ok()) { + ET_LOG(Error, "Failed to get mask tensor metadata"); + return nullptr; + } + + auto mask_sizes = mask_meta->sizes(); + if (mask_sizes.size() != 3) { + ET_LOG( + Error, + "Expected mask tensor to have 3 dimensions, got %zu", + mask_sizes.size()); + return nullptr; + } + + size_t input_len = mask_sizes[1]; + size_t total_len = mask_sizes[2]; + size_t cache_len = total_len - input_len; + + // Extract n_layers from number of k_cache inputs + // Inputs: tokens(0), mask(1), freqs_cos(2), freqs_sin(3), k_caches..., v_caches... 
+ size_t num_inputs = method_meta->num_inputs(); + size_t n_layers = (num_inputs - 4) / 2; + + // Extract n_kv_heads and head_dim from k_cache shape + // k_cache shape is (1, n_kv_heads, cache_len, head_dim) + auto k_cache_meta = method_meta->input_tensor_meta(4); + if (!k_cache_meta.ok()) { + ET_LOG(Error, "Failed to get k_cache tensor metadata"); + return nullptr; + } + + auto k_cache_sizes = k_cache_meta->sizes(); + if (k_cache_sizes.size() != 4) { + ET_LOG( + Error, + "Expected k_cache tensor to have 4 dimensions, got %zu", + k_cache_sizes.size()); + return nullptr; + } + + size_t n_kv_heads = k_cache_sizes[1]; + size_t head_dim = k_cache_sizes[3]; + + // Extract vocab_size and generate_full_logits from logits output shape + // Full logits shape: (1, input_len, vocab_size) + // Last token only: (1, 1, vocab_size) + auto logits_meta = method_meta->output_tensor_meta(0); + if (!logits_meta.ok()) { + ET_LOG(Error, "Failed to get logits tensor metadata"); + return nullptr; + } + + auto logits_sizes = logits_meta->sizes(); + size_t vocab_size = logits_sizes[logits_sizes.size() - 1]; + size_t logits_seq_len = logits_sizes[logits_sizes.size() - 2]; + bool generate_full_logits = (logits_seq_len == input_len); + + // Read params.json for any additional config (rope_base, etc.) + float rope_base = 500000.0f; + std::ifstream params_file(params_path); + if (params_file.is_open()) { + try { + nlohmann::json params; + params_file >> params; + rope_base = params.value("rope_theta", 500000.0f); + } catch (const std::exception& e) { + ET_LOG(Info, "Could not parse params.json, using defaults: %s", e.what()); + } + } + + StaticLLMConfig config; + config.n_layers = n_layers; + config.n_kv_heads = n_kv_heads; + config.head_dim = head_dim; + config.vocab_size = vocab_size; + config.input_len = input_len; + config.cache_len = cache_len; + config.generate_full_logits = generate_full_logits; + config.rope_base = rope_base; + + ET_LOG( + Info, + "Config from model metadata: n_layers=%zu, n_kv_heads=%zu, head_dim=%zu, " + "input_len=%zu, cache_len=%zu, vocab_size=%zu, full_logits=%s", + config.n_layers, + config.n_kv_heads, + config.head_dim, + config.input_len, + config.cache_len, + config.vocab_size, + config.generate_full_logits ? "true" : "false"); + + return std::make_unique(model_path, tokenizer_path, config); +} + +} // namespace example diff --git a/examples/apple/coreml/llama/runner/static_llm_runner.h b/examples/apple/coreml/llama/runner/static_llm_runner.h new file mode 100644 index 00000000000..e3c9064e47d --- /dev/null +++ b/examples/apple/coreml/llama/runner/static_llm_runner.h @@ -0,0 +1,236 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// C++ runner for static attention LLM models exported with +// export_static_llm_coreml.py. Subclasses TextDecoderRunner to maintain +// the standard LLM runner interface. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace example { + +/** + * Configuration for the static LLM model. 
+ */ +struct StaticLLMConfig { + size_t n_layers = 16; + size_t n_kv_heads = 8; + size_t head_dim = 64; + size_t input_len = 32; + size_t cache_len = 992; + size_t vocab_size = 128256; + bool generate_full_logits = true; + float rope_base = 500000.0f; +}; + +/** + * Configuration for lookahead (speculative) decoding. + */ +struct LookaheadConfig { + bool enabled = false; + size_t ngram_size = 4; + size_t window_size = 5; + size_t n_verifications = 3; +}; + +/** + * IOManager adapter that wraps StaticAttentionIOManager to implement + * the executorch::extension::llm::IOManager interface. + * + * This bridges the gap between TextDecoderRunner's expected interface + * and the static attention model's I/O requirements. + */ +class StaticLLMIOManager : public ::executorch::extension::llm::IOManager { + public: + using CacheT = __fp16; + using MaskT = __fp16; + using RopeT = __fp16; + using LogitT = __fp16; + using TokenT = int32_t; + + StaticLLMIOManager( + ::executorch::extension::Module& module, + const StaticLLMConfig& config); + + ~StaticLLMIOManager() override = default; + + ::executorch::runtime::Error load( + const std::string& prefill_method, + const std::string& decode_method) override; + + ::executorch::runtime::Error reset( + const std::string& prefill_method, + const std::string& decode_method) override; + + ::executorch::runtime::Result> + prepare_prefill( + const ::executorch::extension::TensorPtr& input, + const ::executorch::extension::TensorPtr& start_pos, + const std::string& prefill_method) override; + + ::executorch::runtime::Result> + prepare_decode( + const ::executorch::extension::TensorPtr& input, + const ::executorch::extension::TensorPtr& start_pos, + const std::string& decode_method) override; + + ::executorch::runtime::Error update_prefill( + const std::vector<::executorch::runtime::EValue>& model_outputs, + const std::string& prefill_method) override; + + ::executorch::runtime::Error update_decode( + const std::vector<::executorch::runtime::EValue>& model_outputs, + const std::string& decode_method) override; + + /** + * Get the underlying StaticAttentionIOManager for advanced operations. + */ + StaticAttentionIOManager* get_static_io_manager() { + return static_io_manager_.get(); + } + + const StaticLLMConfig& config() const { + return config_; + } + + /** + * Set up CoreML output buffers on the Method. + * Must be called before first inference. + */ + void setup_output_buffers(::executorch::runtime::Method& method); + + private: + void compute_rope_frequencies(); + + ::executorch::extension::Module& module_; + StaticLLMConfig config_; + + std::vector freqs_cos_; + std::vector freqs_sin_; + std::vector input_buffer_; + std::vector logits_output_; + std::vector> k_update_buffers_; + std::vector> v_update_buffers_; + + std::unique_ptr> + static_io_manager_; + + size_t actual_input_len_ = 0; + bool output_buffers_set_ = false; +}; + +/** + * TextDecoderRunner subclass for static attention LLM models. + * + * Overrides step() to use StaticAttentionIOManager's prepare/execute flow. 
+ */ +class StaticLLMTextDecoderRunner + : public ::executorch::extension::llm::TextDecoderRunner { + public: + StaticLLMTextDecoderRunner( + ::executorch::extension::Module* module, + StaticLLMIOManager* io_manager); + + ~StaticLLMTextDecoderRunner() override = default; + + ::executorch::runtime::Result<::executorch::aten::Tensor> step( + ::executorch::extension::TensorPtr& input, + int64_t start_pos) override; + + ::executorch::runtime::Error load() override; + + private: + StaticLLMIOManager* static_io_manager_; +}; + +/** + * Main runner class that orchestrates text generation. + */ +class StaticLLMRunner { + public: + StaticLLMRunner( + const std::string& model_path, + const std::string& tokenizer_path, + const StaticLLMConfig& config); + + ~StaticLLMRunner() = default; + + ::executorch::runtime::Error load(); + bool is_loaded() const; + void reset(); + + ::executorch::runtime::Error generate( + const std::string& prompt, + int32_t max_new_tokens, + float temperature = 0.0f, + std::function token_callback = nullptr); + + ::executorch::runtime::Error generate_with_lookahead( + const std::string& prompt, + int32_t max_new_tokens, + const LookaheadConfig& lookahead_config, + std::function token_callback = nullptr); + + const ::executorch::extension::llm::Stats& stats() const { + return stats_; + } + + /** + * Get the TextDecoderRunner for external use. + */ + StaticLLMTextDecoderRunner* decoder_runner() { + return decoder_runner_.get(); + } + + private: + using TokenT = int32_t; + using LogitT = __fp16; + + TokenT sample_token(::executorch::runtime::Method& method, size_t pos = 0); + std::vector sample_all_tokens(::executorch::runtime::Method& method); + + std::string model_path_; + std::string tokenizer_path_; + StaticLLMConfig config_; + + std::unique_ptr<::executorch::extension::Module> module_; + std::unique_ptr<::tokenizers::Tokenizer> tokenizer_; + std::unique_ptr io_manager_; + std::unique_ptr decoder_runner_; + + std::unordered_set eos_ids_; + ::executorch::extension::llm::Stats stats_; +}; + +/** + * Create a StaticLLMRunner with configuration auto-detected from model metadata. + * Reads input_len, cache_len, n_layers, n_kv_heads, head_dim, vocab_size from + * the model's method metadata. Only rope_base is read from params.json. + */ +std::unique_ptr create_static_llm_runner( + const std::string& model_path, + const std::string& tokenizer_path, + const std::string& params_path); + +} // namespace example
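For reference, a minimal standalone sketch (not part of the patch) of the shape arithmetic that `create_static_llm_runner` uses to recover `StaticLLMConfig` from the exported model's tensor metadata. The concrete shapes below are illustrative assumptions taken from the `StaticLLMConfig` defaults (input_len=32, cache_len=992, n_layers=16, n_kv_heads=8, head_dim=64); a real model's metadata may differ.

```cpp
#include <cassert>
#include <cstddef>

int main() {
  // The mask input has shape (1, input_len, cache_len + input_len).
  const std::size_t mask_sizes[3] = {1, 32, 1024};
  const std::size_t input_len = mask_sizes[1];              // 32
  const std::size_t cache_len = mask_sizes[2] - input_len;  // 992

  // Inputs are ordered: tokens(0), mask(1), freqs_cos(2), freqs_sin(3),
  // then n_layers k caches followed by n_layers v caches.
  const std::size_t num_inputs = 4 + 2 * 16;  // 36 for an assumed 16-layer model
  const std::size_t n_layers = (num_inputs - 4) / 2;

  // Each k cache has shape (1, n_kv_heads, cache_len, head_dim).
  const std::size_t k_cache_sizes[4] = {1, 8, cache_len, 64};
  const std::size_t n_kv_heads = k_cache_sizes[1];
  const std::size_t head_dim = k_cache_sizes[3];

  // Full-logits models emit (1, input_len, vocab_size); last-token-only
  // models emit (1, 1, vocab_size).
  const std::size_t logits_seq_len = 32;
  const bool generate_full_logits = (logits_seq_len == input_len);

  assert(input_len == 32 && cache_len == 992);
  assert(n_layers == 16 && n_kv_heads == 8 && head_dim == 64);
  assert(generate_full_logits);
  return 0;
}
```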