From 7418d2ea68b8eb04c454bfe084fe2499f4903369 Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Mon, 14 Jul 2025 20:54:23 -0700
Subject: [PATCH] Phi3 runner uses TextLLMRunner

As titled, this PR switches phi-3-mini to use `TextLLMRunner`. The eager model
comes from Hugging Face; it does not use the KV cache custom op, because that
is only supported in the optimum-executorch repo, so performance may not be
the best.
---
 .ci/scripts/test_phi_3_mini.sh               |  25 +---
 examples/models/phi-3-mini/CMakeLists.txt    |  32 ++---
 examples/models/phi-3-mini/README.md         |  32 ++---
 .../models/phi-3-mini/export_phi-3-mini.py   | 114 +++++++++++++-----
 examples/models/phi-3-mini/main.cpp          |  19 ++-
 examples/models/phi-3-mini/phi_3_mini.py     |   4 +-
 examples/models/phi-3-mini/runner.cpp        | 104 ----------------
 examples/models/phi-3-mini/runner.h          |  50 --------
 extension/llm/runner/text_decoder_runner.cpp |  17 +--
 extension/llm/runner/text_llm_runner.cpp     |  26 ++++
 extension/llm/runner/text_prefiller.cpp      |   2 +
 extension/llm/runner/text_prefiller.h        |   2 +-
 tools/cmake/executorch-config.cmake          |  14 +++
 13 files changed, 175 insertions(+), 266 deletions(-)
 delete mode 100644 examples/models/phi-3-mini/runner.cpp
 delete mode 100644 examples/models/phi-3-mini/runner.h

diff --git a/.ci/scripts/test_phi_3_mini.sh b/.ci/scripts/test_phi_3_mini.sh
index 2b41e5b308d..7f01995ae99 100644
--- a/.ci/scripts/test_phi_3_mini.sh
+++ b/.ci/scripts/test_phi_3_mini.sh
@@ -22,31 +22,14 @@ NPROC=8
 if hash nproc &> /dev/null; then NPROC=$(nproc); fi
 
 cmake_install_executorch_libraries() {
-    cmake -DPYTHON_EXECUTABLE=python \
-        -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
-        -DEXECUTORCH_ENABLE_LOGGING=1 \
-        -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-        -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-        -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
-        -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-        -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-        -DEXECUTORCH_BUILD_XNNPACK=ON \
-        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-        -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-        -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-        -B${BUILD_DIR} .
-
-    cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE}
+    cmake --preset llm -DCMAKE_INSTALL_PREFIX=cmake-out -DCMAKE_BUILD_TYPE=${BUILD_TYPE}
+
+    cmake --build cmake-out -j16 --target install --config ${BUILD_TYPE}
 }
 
 cmake_build_phi_3_mini() {
-    cmake -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-        -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
+    cmake -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
         -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-        -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-        -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-        -DEXECUTORCH_BUILD_XNNPACK=ON \
-        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
         -B${BUILD_DIR}/${MODEL_DIR} \
         ${MODEL_DIR}
 
diff --git a/examples/models/phi-3-mini/CMakeLists.txt b/examples/models/phi-3-mini/CMakeLists.txt
index 9f7790cb8ab..38da3066117 100644
--- a/examples/models/phi-3-mini/CMakeLists.txt
+++ b/examples/models/phi-3-mini/CMakeLists.txt
@@ -20,17 +20,14 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 set(CMAKE_BUILD_TYPE Release)
 
-# Set options for executorch build.
-option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON)
-option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON)
-option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR "" ON)
-option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON)
-option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON)
-option(EXECUTORCH_BUILD_XNNPACK "" ON)
-
-add_subdirectory(
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../.. ${CMAKE_BINARY_DIR}/../../..
-)
+set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
+find_package(executorch CONFIG REQUIRED)
+
+target_link_options_shared_lib(executorch)
+
+set(BUILD_TESTING OFF)
+add_subdirectory(${EXECUTORCH_ROOT}/extension/llm/runner ${CMAKE_BINARY_DIR}/../../../extension/llm/runner)
+
 if(NOT TARGET gflags)
   add_subdirectory(
     ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/gflags
@@ -40,16 +37,9 @@ endif()
 
 add_executable(
   phi_3_mini_runner
-  main.cpp runner.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/sampler/sampler.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizers/src/llama2c_tokenizer.cpp
-)
-target_include_directories(
-  phi_3_mini_runner
-  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/gflags/src
-         ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizers/include
+  main.cpp
 )
+
 target_link_libraries(
-  phi_3_mini_runner PRIVATE executorch extension_module_static extension_tensor
-                            optimized_native_cpu_ops_lib xnnpack_backend gflags
+  phi_3_mini_runner PUBLIC executorch optimized_native_cpu_ops_lib xnnpack_backend gflags extension_llm_runner
 )
diff --git a/examples/models/phi-3-mini/README.md b/examples/models/phi-3-mini/README.md
index 5571637e021..a51599a6906 100644
--- a/examples/models/phi-3-mini/README.md
+++ b/examples/models/phi-3-mini/README.md
@@ -21,33 +21,17 @@ python -m examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-m
 ```
 3. Build and run the model.
 - Build executorch with optimized CPU performance as follows. Build options available [here](https://github.com/pytorch/executorch/blob/main/CMakeLists.txt#L59).
-    ```
-    cmake -DPYTHON_EXECUTABLE=python \
-        -DCMAKE_INSTALL_PREFIX=cmake-out \
-        -DEXECUTORCH_ENABLE_LOGGING=1 \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-        -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-        -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-        -DEXECUTORCH_BUILD_XNNPACK=ON \
-        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-        -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-        -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-        -Bcmake-out .
+```
+cmake --preset llm -DCMAKE_INSTALL_PREFIX=cmake-out
 
-    cmake --build cmake-out -j16 --target install --config Release
-    ```
+cmake --build cmake-out -j16 --target install --config Release
+```
 - Build Phi-3-mini runner.
 ```
-cmake -DPYTHON_EXECUTABLE=python \
-    -DCMAKE_INSTALL_PREFIX=cmake-out \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -Bcmake-out/examples/models/phi-3-mini \
-    examples/models/phi-3-mini
+cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
+      -DCMAKE_BUILD_TYPE=Release \
+      -Bcmake-out/examples/models/phi-3-mini \
+      examples/models/phi-3-mini
 
 cmake --build cmake-out/examples/models/phi-3-mini -j16 --config Release
 ```
diff --git a/examples/models/phi-3-mini/export_phi-3-mini.py b/examples/models/phi-3-mini/export_phi-3-mini.py
index 246b3ccd6c6..d1239d9769d 100644
--- a/examples/models/phi-3-mini/export_phi-3-mini.py
+++ b/examples/models/phi-3-mini/export_phi-3-mini.py
@@ -19,13 +19,42 @@
     XNNPACKQuantizer,
 )
 from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config
-from executorch.exir import to_edge
+from executorch.exir import to_edge_transform_and_lower
+from executorch.exir.capture._config import ExecutorchBackendConfig
+from executorch.exir.passes import MemoryPlanningPass
+from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
 
 from torch.export import export_for_training
+from torch.nn.attention import SDPBackend
 from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
 from transformers import Phi3ForCausalLM
+from transformers.cache_utils import StaticCacheConfig
 
-from .phi_3_mini import Phi3Mini
+from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM
+
+
+def _prepare_export_inputs(max_seq_len: int, sliding_window: int):
+    """
+    Prepare example inputs and configurations for export.
+
+    Returns:
+        example_input_ids (torch.Tensor): Example input IDs tensor.
+        example_cache_position (torch.Tensor): Example cache position tensor.
+        dynamic_shapes (dict or None): Dynamic shape specifications for export.
+        strict (bool): Whether to use strict export mode.
+    """
+    # Prepare inputs with dynamic shapes
+    seq_length = 3  # Sequence length > 1 to avoid specialization issues
+    example_input_ids = torch.zeros((1, seq_length), dtype=torch.long)
+    example_cache_position = torch.arange(seq_length, dtype=torch.long)
+    max_dim = min(max_seq_len, sliding_window) - 1
+    seq_len_dim = torch.export.Dim("seq_length_dim", max=max_dim)
+    dynamic_shapes = {
+        "input_ids": {1: seq_len_dim},
+        "cache_position": {0: seq_len_dim},
+    }
+
+    return example_input_ids, example_cache_position, dynamic_shapes
 
 
 def export(args) -> None:
@@ -40,23 +69,34 @@ def export(args) -> None:
             f"Invalid context length {args.context_length}. Should be either 4k or 128k"
         )
 
-    with torch.no_grad():
-        model = Phi3Mini(
-            # pyre-ignore: Undefined attribute [16]: Module `transformers` has no attribute `Phi3ForCausalLM`
-            model=Phi3ForCausalLM.from_pretrained(model_name),
+    with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
+        model = Phi3ForCausalLM.from_pretrained(model_name)
+        model.generation_config.cache_implementation = "static"
+        model.generation_config.cache_config = StaticCacheConfig(
+            batch_size=1, max_cache_len=model.config.max_position_embeddings
+        )
+
+        exportable_module = TorchExportableModuleForDecoderOnlyLM(
+            model,
             max_batch_size=1,
-            max_seq_len=args.seq_len,
+            max_cache_len=model.config.max_position_embeddings,
         )
-        example_inputs = (
-            torch.tensor(
-                [[1048, 263, 931, 746]], dtype=torch.long, requires_grad=False
-            ),
+        input_ids, cache_position, dynamic_shapes = _prepare_export_inputs(
+            model.config.max_position_embeddings, model.config.sliding_window
+        )
+        example_inputs = (input_ids, cache_position)
+        exported_program = exportable_module.export(
+            input_ids, cache_position, dynamic_shapes, strict=False
+        )
+        # Apply RemoveTransposes pass to remove
+        # any back-to-back transpose ops that are not needed
+        # e.g. output of update_cache is transposed and
+        # input to custom_sdpa is transposed.
+        from executorch.extension.llm.export.export_passes import (
+            RemoveRedundantTransposes,
         )
-        dynamic_shapes = {
-            "input_ids": {
-                1: torch.export.Dim("sequence_length", min=1, max=args.seq_len)
-            }
-        }
+
+        mutated_gm = RemoveRedundantTransposes()(exported_program.module())[0]
 
         xnnpack_quant_config = get_symmetric_quantization_config(
             is_per_channel=True, is_dynamic=True
@@ -64,27 +104,35 @@
         xnnpack_quantizer = XNNPACKQuantizer()
         xnnpack_quantizer.set_global(xnnpack_quant_config)
 
-        model = export_for_training(
-            model, example_inputs, dynamic_shapes=dynamic_shapes, strict=True
-        ).module()
-        model = prepare_pt2e(model, xnnpack_quantizer)  # pyre-fixme[6]
-        model(*example_inputs)
-        model = convert_pt2e(model)
-        DuplicateDynamicQuantChainPass()(model)
-        # TODO(lunwenh): update it to use export once
-        # https://github.com/pytorch/pytorch/issues/128394 is resolved.
-        model = torch.export._trace._export(
-            model,
-            example_inputs,
-            dynamic_shapes=dynamic_shapes,
-            strict=False,
-            pre_dispatch=False,
+        gm = prepare_pt2e(mutated_gm, xnnpack_quantizer)  # pyre-fixme[6]
+        gm(*example_inputs)
+        gm = convert_pt2e(gm)
+        DuplicateDynamicQuantChainPass()(gm)
+        exported_program = export_for_training(
+            gm, example_inputs, dynamic_shapes=dynamic_shapes, strict=False
         )
 
         edge_config = get_xnnpack_edge_compile_config()
-        edge_manager = to_edge(model, compile_config=edge_config)
+        edge_manager = to_edge_transform_and_lower(
+            exported_program,
+            partitioner=[XnnpackPartitioner()],
+            compile_config=edge_config,
+            constant_methods={
+                "get_eos_ids": [32000],
+                "use_kv_cache": True,
+                "enable_dynamic_shape": True,
+                "get_max_seq_len": model.config.max_position_embeddings - 1,
+            },
+        )
         edge_manager = edge_manager.to_backend(XnnpackPartitioner())
-        et_program = edge_manager.to_executorch()
+        et_program = edge_manager.to_executorch(
+            ExecutorchBackendConfig(
+                extract_delegate_segments=True,
+                do_quant_fusion_and_const_prop=True,
+                memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
+                sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(),
+            )
+        )
 
         with open(args.output_name, "wb") as file:
             file.write(et_program.buffer)
diff --git a/examples/models/phi-3-mini/main.cpp b/examples/models/phi-3-mini/main.cpp
index 86446a8bde3..ca3224c11b2 100644
--- a/examples/models/phi-3-mini/main.cpp
+++ b/examples/models/phi-3-mini/main.cpp
@@ -6,9 +6,12 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <iostream>
 #include <gflags/gflags.h>
+#include <executorch/extension/llm/runner/text_llm_runner.h>
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
 
-#include <executorch/examples/models/phi-3-mini/runner.h>
+using executorch::extension::llm::TextLLMRunner;
 
 DEFINE_string(
     model_path,
@@ -42,9 +45,17 @@ int main(int32_t argc, char** argv) {
 
   int32_t seq_len = FLAGS_seq_len;
 
-  example::Runner runner(model_path, tokenizer_path, temperature);
-
-  runner.generate(prompt, seq_len);
+  std::unique_ptr<tokenizers::Tokenizer> tokenizer =
+      std::make_unique<tokenizers::Llama2cTokenizer>();
+  tokenizer->load(tokenizer_path);
+  std::cout << "Tokenizer loaded, eos_id = " << tokenizer->eos_tok()
+            << std::endl;
+  auto runner = executorch::extension::llm::create_text_llm_runner(
+      model_path, std::move(tokenizer));
+
+  runner->generate(
+      prompt,
+      {.seq_len = seq_len, .temperature = static_cast<float>(temperature)});
 
   return 0;
 }
diff --git a/examples/models/phi-3-mini/phi_3_mini.py b/examples/models/phi-3-mini/phi_3_mini.py
index b8cd5ef3840..eec25fc7490 100644
--- a/examples/models/phi-3-mini/phi_3_mini.py
+++ b/examples/models/phi-3-mini/phi_3_mini.py
@@ -30,11 +30,13 @@ def __init__(self, model: Phi3ForCausalLM, max_batch_size: int, max_seq_len: int
     def forward(
         self,
         # pyre-fixme[9]: input_ids has type `LongTensor`; used as `None`.
-        input_ids: torch.LongTensor = None,
+        input_ids: torch.LongTensor,
+        cache_positions: torch.Tensor,
     ) -> torch.FloatTensor:
         # pyre-fixme[16]: `Phi3ForCausalLM` has no attribute `forward`.
         return self.model.forward(
             input_ids=input_ids,
+            cache_positions=cache_positions,
             use_cache=True,
             return_dict=True,
             past_key_values=self.cache,
diff --git a/examples/models/phi-3-mini/runner.cpp b/examples/models/phi-3-mini/runner.cpp
deleted file mode 100644
index 15f76e9522c..00000000000
--- a/examples/models/phi-3-mini/runner.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include
-
-#include
-#include
-
-#include
-#include
-#include
-
-using executorch::aten::ScalarType;
-using executorch::extension::Module;
-using executorch::extension::llm::Sampler;
-using executorch::runtime::Error;
-using tokenizers::Llama2cTokenizer;
-
-namespace example {
-
-#define SAMPLER_TOP 0.9f
-#define ENDOFTEXT_TOKEN 32000
-#define VOCABULARY_SIZE 32064
-
-Runner::Runner(
-    const std::string& model_path,
-    const std::string& tokenizer_path,
-    const float temperature)
-    : module_(std::make_unique(model_path, Module::LoadMode::File)),
-      tokenizer_(std::make_unique()),
-      sampler_(std::make_unique(
-          VOCABULARY_SIZE,
-          temperature,
-          SAMPLER_TOP,
-          static_cast(std::time(nullptr)))) {
-  ET_CHECK_MSG(
-      tokenizer_->load(tokenizer_path) == tokenizers::Error::Ok,
-      "Failed to load tokenizer at %s",
-      tokenizer_path.c_str());
-  ET_LOG(
-      Info,
-      "Created Phi-3-mini runner: model_path=%s, tokenizer_path=%s",
-      model_path.c_str(),
-      tokenizer_path.c_str());
-}
-
-void Runner::generate(const std::string& prompt, std::size_t max_seq_len) {
-  auto encode_res = tokenizer_->encode(prompt, 0, 0);
-  ET_CHECK_MSG(
-      encode_res.error() == tokenizers::Error::Ok,
-      "Failed to encode %s",
-      prompt.c_str());
-  auto input_tokens = encode_res.get();
-  auto prev_token = input_tokens.back();
-  auto current_token = prefill(input_tokens);
-  std::cout << tokenizer_->decode(prev_token, current_token).get();
-  std::cout.flush();
-
-  std::size_t seq_len = input_tokens.size() + 1;
-
-  while (current_token != ENDOFTEXT_TOKEN && seq_len < max_seq_len) {
-    prev_token = current_token;
-    current_token = run_model_step(current_token);
-    std::cout << tokenizer_->decode(prev_token, current_token).get();
-    std::cout.flush();
-
-    ++seq_len;
-  }
-
-  std::cout << std::endl;
-}
-
-uint64_t Runner::logits_to_token(
-    const executorch::aten::Tensor& logits_tensor) {
-  return sampler_->sample(logits_tensor.data_ptr());
-}
-
-uint64_t Runner::prefill(std::vector& tokens) {
-  auto result = module_->forward(executorch::extension::from_blob(
-      tokens.data(),
-      {1, static_cast(tokens.size())},
-      ScalarType::Long));
-  ET_CHECK_MSG(result.error() == Error::Ok, "Failed to prefill tokens");
-
-  return logits_to_token(result.get()[0].toTensor());
-}
-
-uint64_t Runner::run_model_step(uint64_t token) {
-  auto result = module_->forward(
-      executorch::extension::from_blob(&token, {1, 1}, ScalarType::Long));
-  ET_CHECK_MSG(
-      result.error() == Error::Ok,
-      "Failed to run forward() for token %" PRIu64,
-      token);
-
-  return logits_to_token(result.get()[0].toTensor());
-}
-
-} // namespace example
diff --git a/examples/models/phi-3-mini/runner.h b/examples/models/phi-3-mini/runner.h
deleted file mode 100644
index 2f0042a57ea..00000000000
--- a/examples/models/phi-3-mini/runner.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// A simple phi-3-mini runner that includes preprocessing and post processing
-// logic. The module takes in a string as input and emits a string as output.
-
-#pragma once
-
-#include
-#include
-
-#include
-#include
-#include
-#include
-
-namespace example {
-
-class Runner {
- public:
-  explicit Runner(
-      const std::string& model_path,
-      const std::string& tokenizer_path,
-      const float temperature = 0.8f);
-
-  /**
-   * Generates response for a given prompt.
-   *
-   * @param[in] prompt The prompt to generate a response for.
-   * @param[in] max_seq_len The maximum length of the sequence to generate,
-   * including prompt.
-   */
-  void generate(const std::string& prompt, std::size_t max_seq_len);
-
- private:
-  uint64_t logits_to_token(const executorch::aten::Tensor& logits_tensor);
-  uint64_t prefill(std::vector& tokens);
-  uint64_t run_model_step(uint64_t token);
-
-  std::unique_ptr module_;
-  std::unique_ptr tokenizer_;
-  std::unique_ptr sampler_;
-};
-
-} // namespace example
diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp
index e60a07bc50a..4293b2a08d8 100644
--- a/extension/llm/runner/text_decoder_runner.cpp
+++ b/extension/llm/runner/text_decoder_runner.cpp
@@ -52,22 +52,25 @@ ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
   auto numel = sizes[0];
   std::vector<::executorch::aten::SizesType> sizes_vec = {numel};
 
-  // Assuming the last dimension is the one with the variable token length,
-  // for example [1, S] or [1, 1, S]
-  sizes_vec[sizes_vec.size() - 1] = numel;
   TensorPtr start_pos_tensor;
   if (numel > 1) {
-    // Assuming model is exported with cache_positions, create a tensor with
-    // the same size as cache_positions
+    // If we are here, model is exported with cache_positions, create a tensor
+    // with the same length as input_ids. Assuming the last dimension is the
+    // one with the variable token length, for example [1, S] or [1, 1, S]
+    sizes_vec[sizes_vec.size() - 1] = tokens->numel();
     start_pos_tensor = empty(sizes_vec, ::executorch::aten::ScalarType::Long);
     torch::executor::native::arange_out_impl(
-        start_pos, start_pos + numel, 1.0, *start_pos_tensor);
+        start_pos, start_pos + tokens->numel(), 1.0, *start_pos_tensor);
   } else {
     // Assuming model is exported with input_pos, create a tensor with size 1
     start_pos_tensor = from_blob(
         &start_pos, sizes_vec, ::executorch::aten::ScalarType::Long);
   }
-  ET_LOG(Info, "Start pos tensor numel: %zu", start_pos_tensor->numel());
+  ET_LOG(
+      Info,
+      "Start pos tensor numel: %zu, tokens numel: %zu",
+      start_pos_tensor->numel(),
+      tokens->numel());
   auto outputs_res = module_->forward({tokens, start_pos_tensor});
   ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
   ET_CHECK_MSG(
diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp
index cf55d98224a..1842472aada 100644
--- a/extension/llm/runner/text_llm_runner.cpp
+++ b/extension/llm/runner/text_llm_runner.cpp
@@ -32,6 +32,7 @@ static constexpr auto kMaxContextLen = "get_max_context_len";
 static constexpr auto kVocabSize = "get_vocab_size";
 static constexpr auto kUseKVCache = "use_kv_cache";
 static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
+static constexpr auto kUseCachePositions = "use_cache_positions";
 
 TextLLMRunner::TextLLMRunner(
     std::unordered_map<std::string, int64_t> metadata,
@@ -306,6 +307,7 @@ std::unordered_map<std::string, int64_t> get_llm_metadata(
       {llm::kMaxContextLen, 128},
       {llm::kUseKVCache, true},
       {llm::kUseSDPAWithKVCache, false},
+      {llm::kUseCachePositions, false},
   });
 
   // Read metadata from the model
@@ -335,6 +337,29 @@ std::unordered_map<std::string, int64_t> get_llm_metadata(
   // Set tokenizer-related metadata
   metadata[llm::kBosId] = tokenizer->bos_tok();
   metadata[llm::kVocabSize] = tokenizer->vocab_size();
+
+  // Override metadata using the module's method_meta
+  auto method_meta_result = module->method_meta("forward");
+  if (method_meta_result.error() != Error::Ok) {
+    ET_LOG(Error, "Failed reading method meta");
+    return metadata;
+  }
+  auto method_meta = method_meta_result.get();
+  // If only 1 input, we are not using kv cache
+  metadata[llm::kUseKVCache] = method_meta.num_inputs() > 1;
+
+  if (method_meta.num_inputs() == 1) {
+    return metadata;
+  }
+  // Check if we are using cache positions instead of input pos.
+  auto second_input_info = method_meta.input_tensor_meta(1).get();
+  // For input_pos, numel is 1, for cache_positions, numel is max_seq_len
+  auto sizes = second_input_info.sizes();
+  int64_t total_size = 1;
+  for (const auto& size : sizes) {
+    total_size *= size;
+  }
+  metadata[llm::kUseCachePositions] = total_size > 1;
   return metadata;
 }
 
@@ -401,6 +426,7 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
   auto text_prefiller = std::make_unique<TextPrefiller>(
       text_decoder_runner.get(),
       metadata.at(kUseKVCache),
+      metadata.at(kUseCachePositions),
      metadata.at(kEnableDynamicShape),
       metadata.at(kMaxSeqLen));
 
diff --git a/extension/llm/runner/text_prefiller.cpp b/extension/llm/runner/text_prefiller.cpp
index de092b6b05d..86e89c416b5 100644
--- a/extension/llm/runner/text_prefiller.cpp
+++ b/extension/llm/runner/text_prefiller.cpp
@@ -19,10 +19,12 @@ namespace llm {
 TextPrefiller::TextPrefiller(
     TextDecoderRunner* text_decoder_runner,
     bool use_kv_cache,
+    bool use_cache_positions,
     bool enable_parallel_prefill,
     int64_t max_seq_len)
     : text_decoder_runner_(text_decoder_runner),
       use_kv_cache_(use_kv_cache),
+      use_cache_positions_(use_cache_positions),
       enable_parallel_prefill_(enable_parallel_prefill),
       max_seq_len_(max_seq_len > 0 ? max_seq_len : 128) {}
 
diff --git a/extension/llm/runner/text_prefiller.h b/extension/llm/runner/text_prefiller.h
index ce12506a05c..a02cd3d1bf4 100644
--- a/extension/llm/runner/text_prefiller.h
+++ b/extension/llm/runner/text_prefiller.h
@@ -21,7 +21,7 @@ class ET_EXPERIMENTAL TextPrefiller {
  public:
   TextPrefiller(
       TextDecoderRunner* text_decoder_runner,
-      bool use_kv_cache_,
+      bool use_kv_cache,
       bool enable_parallel_prefill,
       int64_t max_seq_len = 128);
 
diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake
index e73104b0f1e..62746a4a713 100644
--- a/tools/cmake/executorch-config.cmake
+++ b/tools/cmake/executorch-config.cmake
@@ -26,6 +26,8 @@ cmake_minimum_required(VERSION 3.19)
 
 include("${CMAKE_CURRENT_LIST_DIR}/Utils.cmake")
 
+include(${CMAKE_CURRENT_LIST_DIR}/Utils.cmake)
+
 set(_root "${CMAKE_CURRENT_LIST_DIR}/../../..")
 set(required_lib_list executorch executorch_core portable_kernels)
 set(EXECUTORCH_LIBRARIES)
@@ -186,3 +188,15 @@ foreach(lib ${shared_lib_list})
     target_link_options_shared_lib(${lib})
   endif()
 endforeach()
+
+if(TARGET xnnpack_backend)
+  if(TARGET kleidiai)
+    set(_deps "XNNPACK;xnnpack-microkernels-prod;kleidiai")
+  else()
+    set(_deps "XNNPACK;xnnpack-microkernels-prod")
+  endif()
+  set_target_properties(
+    xnnpack_backend PROPERTIES INTERFACE_LINK_LIBRARIES "${_deps}"
+  )
+  target_link_options_shared_lib(xnnpack_backend)
+endif()
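
For reference, a minimal end-to-end run with this change might look like the sketch below. The export command is the one from the README hunk above; apart from `model_path`, the runner flag names are inferred from the `FLAGS_*` variables used in main.cpp, and the tokenizer path is illustrative only.

```
# Export the XNNPACK-delegated phi-3-mini model (same command as in the README).
python -m examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-mini.pte

# Run the rebuilt TextLLMRunner-based example binary (flag names inferred from main.cpp).
cmake-out/examples/models/phi-3-mini/phi_3_mini_runner \
  --model_path phi-3-mini.pte \
  --tokenizer_path tokenizer.model \
  --prompt "Tell me a story" \
  --seq_len 128
```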