diff --git a/.ci/scripts/build_llama_android.sh b/.ci/scripts/build_llama_android.sh index ed0fa5d16bb..1b22051533d 100644 --- a/.ci/scripts/build_llama_android.sh +++ b/.ci/scripts/build_llama_android.sh @@ -42,6 +42,7 @@ build_llama_runner() { popd ANDROID_ABI=arm64-v8a cmake -DBUCK2="${BUCK2}" \ + -DBUILD_TESTING=OFF \ -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK"/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ -DCMAKE_INSTALL_PREFIX=cmake-android-out \ diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 4ed5ec308c5..88fedabba27 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -169,6 +169,7 @@ cmake_build_llama_runner() { popd dir="examples/models/llama" retry cmake \ + -DBUILD_TESTING=OFF \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \ -Bcmake-out/${dir} \ diff --git a/.ci/scripts/test_llama_torchao_lowbit.sh b/.ci/scripts/test_llama_torchao_lowbit.sh index ae9924c2a2b..3fe5fa0faea 100644 --- a/.ci/scripts/test_llama_torchao_lowbit.sh +++ b/.ci/scripts/test_llama_torchao_lowbit.sh @@ -40,6 +40,7 @@ cmake --build cmake-out -j16 --target install --config Release # Install llama runner with torchao cmake -DPYTHON_EXECUTABLE=python \ + -DBUILD_TESTING=OFF \ -DCMAKE_BUILD_TYPE=Release \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 9a0251c9a38..0d9f2b8b141 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -64,9 +64,10 @@ cmake_install_executorch_libraries_for_android() { LLAVA_COMMON_CMAKE_ARGS=" \ + -DBUILD_TESTING=OFF \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON" diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm index c2f01bf17b1..fc7f440d999 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm @@ -14,6 +14,7 @@ using executorch::extension::llm::GenerationConfig; using executorch::extension::llm::Image; +using executorch::extension::llm::TextLLMRunner; using executorch::runtime::Error; NSErrorDomain const LLaMARunnerErrorDomain = @"LLaMARunnerErrorDomain"; @@ -23,7 +24,7 @@ @interface LLaMARunner () @end @implementation LLaMARunner { - std::unique_ptr _runner; + std::unique_ptr _runner; } - (instancetype)initWithModelPath:(NSString*)modelPath @@ -31,7 +32,7 @@ - (instancetype)initWithModelPath:(NSString*)modelPath self = [super init]; if (self) { [ExecuTorchLog.sharedLog addSink:self]; - _runner = example::Runner::create( + _runner = example::create_llama_runner( modelPath.UTF8String, tokenizerPath.UTF8String); } return self; diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index 0f7648ff65e..8c27de20845 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -220,7 +220,6 @@ endif() target_include_directories( llama_main PUBLIC ${_common_include_directories} - ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include ) target_link_libraries(llama_main PUBLIC llama_runner ${link_libraries}) 
target_compile_options(llama_main PUBLIC ${_common_compile_options}) diff --git a/examples/models/llama/main.cpp b/examples/models/llama/main.cpp index 1c1b6f62dc1..38009dd59ec 100644 --- a/examples/models/llama/main.cpp +++ b/examples/models/llama/main.cpp @@ -81,8 +81,13 @@ int32_t main(int32_t argc, char** argv) { } #endif // create llama runner - std::unique_ptr runner = - example::Runner::create(model_path, tokenizer_path, data_path); + std::unique_ptr<::executorch::extension::llm::TextLLMRunner> runner = + example::create_llama_runner(model_path, tokenizer_path, data_path); + + if (runner == nullptr) { + ET_LOG(Error, "Failed to create llama runner"); + return 1; + } if (warmup) { runner->warmup(prompt, /*max_new_tokens=*/seq_len); diff --git a/examples/models/llama/runner/CMakeLists.txt b/examples/models/llama/runner/CMakeLists.txt index fefee61092d..a73990edd96 100644 --- a/examples/models/llama/runner/CMakeLists.txt +++ b/examples/models/llama/runner/CMakeLists.txt @@ -52,23 +52,20 @@ else() add_library(llama_runner SHARED ${_llama_runner__srcs}) endif() +# For extension_llm_runner +if(NOT TARGET extension_llm_runner) + add_subdirectory( + ${EXECUTORCH_ROOT}/extension/llm/runner + ${CMAKE_CURRENT_BINARY_DIR}/../../../../extension/llm/runner + ) +endif() + set(llama_runner_deps executorch_core extension_data_loader extension_module - extension_tensor extension_flat_tensor + extension_tensor extension_flat_tensor extension_llm_runner ) target_link_libraries(llama_runner PUBLIC ${llama_runner_deps}) -target_include_directories( - llama_runner - INTERFACE ${_common_include_directories} -) - -# Include tokenizers dependency -set(CMAKE_POSITION_INDEPENDENT_CODE ON) -add_subdirectory( - ${EXECUTORCH_ROOT}/extension/llm/tokenizers - ${CMAKE_CURRENT_BINARY_DIR}/tokenizers -) target_link_libraries( llama_runner PUBLIC tokenizers ) diff --git a/examples/models/llama/runner/runner.cpp b/examples/models/llama/runner/runner.cpp index 119eedc704e..2ba2fdf9941 100644 --- a/examples/models/llama/runner/runner.cpp +++ b/examples/models/llama/runner/runner.cpp @@ -11,8 +11,7 @@ // The module takes in a string as input and emits a string as output. 
#include - -#include +#include #include #include @@ -26,41 +25,14 @@ using ::executorch::runtime::Result; namespace llm = ::executorch::extension::llm; -namespace { -static constexpr auto kEnableDynamicShape = "enable_dynamic_shape"; -static constexpr auto kBosId = "get_bos_id"; -static constexpr auto kEosIds = "get_eos_ids"; -static constexpr auto kMaxSeqLen = "get_max_seq_len"; -static constexpr auto kMaxContextLen = "get_max_context_len"; -static constexpr auto kVocabSize = "get_vocab_size"; -static constexpr auto kUseKVCache = "use_kv_cache"; -static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache"; - -std::unique_ptr<::tokenizers::Tokenizer> load_tokenizer( - const std::string& tokenizer_path) { - auto json_tokenizer = std::make_unique(); - if (json_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { - ET_LOG(Info, "Loaded json tokenizer"); - return json_tokenizer; - } - - auto tiktoken_tokenizer = get_tiktoken_for_llama(); - if (tiktoken_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { - ET_LOG(Info, "Loaded TikToken tokenizer"); - return tiktoken_tokenizer; - } - - auto bpe_tokenizer = std::make_unique<::tokenizers::Llama2cTokenizer>(); - if (bpe_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { - ET_LOG(Info, "Loaded BPE tokenizer"); - return bpe_tokenizer; - } - - return nullptr; +std::unique_ptr<::tokenizers::Tokenizer> load_llama_tokenizer( + const std::string& tokenizer_path, + Version version) { + auto special_tokens = get_special_tokens(version); + return llm::load_tokenizer(tokenizer_path, std::move(special_tokens)); } -} // namespace -std::unique_ptr Runner::create( +std::unique_ptr create_llama_runner( const std::string& model_path, const std::string& tokenizer_path, std::optional data_path, @@ -71,29 +43,10 @@ std::unique_ptr Runner::create( model_path.c_str(), tokenizer_path.c_str()); - // Create the Module - std::unique_ptr module; - if (data_path.has_value()) { - module = std::make_unique( - model_path, data_path.value(), Module::LoadMode::File); - } else { - module = std::make_unique(model_path, Module::LoadMode::File); - } - - // Initialize metadata with default values - std::unordered_map metadata({ - {kEnableDynamicShape, false}, - {kMaxSeqLen, 128}, - {kMaxContextLen, 128}, - {kUseKVCache, true}, - {kUseSDPAWithKVCache, false}, - }); - // Create and load tokenizer std::unique_ptr<::tokenizers::Tokenizer> tokenizer = - load_tokenizer(tokenizer_path); + load_llama_tokenizer(tokenizer_path, Version::Default); - // Fallback to BPE tokenizer if tiktoken fails if (tokenizer == nullptr) { ET_LOG( Info, @@ -101,279 +54,8 @@ std::unique_ptr Runner::create( tokenizer_path.c_str()); return nullptr; } - - ET_LOG(Info, "Reading metadata from model"); - - // Set tokenizer-related metadata - metadata[kBosId] = tokenizer->bos_tok(); - auto eos_ids = std::make_unique>( - std::unordered_set{tokenizer->eos_tok()}); - metadata[kVocabSize] = tokenizer->vocab_size(); - - // Read metadata from the model - auto method_names_result = module->method_names(); - if (method_names_result.error() != Error::Ok) { - ET_LOG(Error, "Failed reading method names"); - return nullptr; - } - const auto method_names = method_names_result.get(); - - for (auto& pair : metadata) { - const auto& method_name = pair.first; - auto& value = pair.second; - - if (method_names.count(method_name)) { - auto get_result = module->get(method_name); - value = get_result.get().toScalar().to(); - } else { - ET_LOG( - Info, - "Method %s not found, using the default value %" 
PRId64, - method_name.c_str(), - value); - } - ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value); - } - - // Get EOS IDs if available - if (method_names.count(kEosIds)) { - eos_ids->clear(); - auto execute_result = module->execute(kEosIds); - if (execute_result.error() != Error::Ok) { - ET_LOG(Error, "Failed to execute %s", kEosIds); - return nullptr; - } - for (const auto& eos_id : execute_result.get()) { - auto value = eos_id.toScalar().to(); - eos_ids->emplace(value); - ET_LOG(Info, "eos_id = %" PRId64, value); - } - } - - // Create text_decoder_runner. Use a shared_ptr so that it can be shared with - // TextPrefiller and TextTokenGenerator - auto text_decoder_runner = std::make_unique( - module.get(), metadata.at(kUseKVCache)); - - // Create text_prefiller - auto text_prefiller = std::make_unique( - text_decoder_runner.get(), - metadata.at(kUseKVCache), - metadata.at(kEnableDynamicShape), - metadata.at(kMaxSeqLen)); - - // Create text_token_generator with stats - auto stats = std::make_unique(); - auto text_token_generator = std::make_unique( - tokenizer.get(), - text_decoder_runner.get(), - metadata.at(kUseKVCache), - std::move(eos_ids), - stats.get()); - - // Create and return the Runner instance - return std::make_unique( - std::move(metadata), - std::move(tokenizer), - std::move(module), - std::move(text_decoder_runner), - std::move(text_prefiller), - std::move(text_token_generator), - std::move(stats), - temperature); -} - -Runner::Runner( - std::unordered_map metadata, - std::unique_ptr<::tokenizers::Tokenizer> tokenizer, - std::unique_ptr<::executorch::extension::Module> module, - std::unique_ptr<::executorch::extension::llm::TextDecoderRunner> - text_decoder_runner, - std::unique_ptr<::executorch::extension::llm::TextPrefiller> text_prefiller, - std::unique_ptr<::executorch::extension::llm::TextTokenGenerator> - text_token_generator, - std::unique_ptr<::executorch::extension::llm::Stats> stats, - float temperature) - : tokenizer_(std::move(tokenizer)), - metadata_(std::move(metadata)), - module_(std::move(module)), - text_decoder_runner_(std::move(text_decoder_runner)), - text_prefiller_(std::move(text_prefiller)), - text_token_generator_(std::move(text_token_generator)), - stats_(std::move(stats)), - temperature_(temperature) { - // Note: This constructor assumes that text_prefiller and text_token_generator - // already have references to the Module and TextDecoderRunner they need -} - -bool Runner::is_loaded() const { - return text_prefiller_->is_loaded() && text_token_generator_->is_loaded(); -} - -Error Runner::load() { - if (is_loaded()) { - return Error::Ok; - } - ET_CHECK_OK_OR_RETURN_ERROR(text_prefiller_->load()); - ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load()); - return Error::Ok; -} - -// Don't print with the same priority during warmup -#define RUNNER_ET_LOG(warmup, format, ...) \ - if (warmup) { \ - ET_LOG(Debug, format, __VA_ARGS__); \ - } else { \ - ET_LOG(Info, format, __VA_ARGS__); \ - } - -Error Runner::generate( - const std::string& prompt, - const ::executorch::extension::llm::GenerationConfig& config, - std::function token_callback, - std::function stats_callback) { - // Prepare the inputs. - // Use ones-initialized inputs. 
- ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); - if (!is_loaded()) { - stats_->model_load_start_ms = llm::time_in_ms(); - ET_CHECK_OK_OR_RETURN_ERROR(load()); - stats_->model_load_end_ms = llm::time_in_ms(); - } - - if (config.warming) { - ET_LOG(Info, "Doing a warmup run..."); - } - - RUNNER_ET_LOG( - config.warming, - "RSS after loading model: %f MiB (0 if unsupported)", - llm::get_rss_bytes() / 1024.0 / 1024.0); - - // Wrap the token_callback with print function - std::function wrapped_callback = - [token_callback, config](const std::string& piece) { - if (!config.warming) { - llm::safe_printf(piece.c_str()); - fflush(stdout); - } - if (token_callback) { - token_callback(piece); - } - }; - // First token time only measures the time it takes to encode the prompt and - // return a response token. - - stats_->inference_start_ms = llm::time_in_ms(); - shouldStop_ = false; - - ::tokenizers::Result> encode_res = tokenizer_->encode( - prompt, - /* bos */ 0, - /* eos */ 0); - - ET_CHECK_TK_OK_OR_RETURN_ERROR( - encode_res.error(), "Failed to encode prompt %s", prompt.c_str()); - - // encode the (string) prompt into tokens sequence - std::vector prompt_tokens = encode_res.get(); - int num_prompt_tokens = prompt_tokens.size(); - - ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token"); - ET_CHECK_MSG( - num_prompt_tokens < metadata_.at(kMaxContextLen), - "num_prompt_tokens %d >= max_seq_len_ %" PRId64 - ", Max seq length exceeded - please increase max seq len value in your export script", - num_prompt_tokens, - metadata_.at(kMaxContextLen)); - - // Determine max_new_tokens using the GenerationConfig's resolve method - int max_new_tokens = config.resolve_max_new_tokens( - metadata_.at(kMaxContextLen), num_prompt_tokens); - - ET_LOG(Info, "Max new tokens resolved: %d", max_new_tokens); - - // Prefill first - // Here feed all tokens to the model and get the next predicted token - // after the prompt. After that we will enter generate loop. - - // print prompts - if (config.echo) { - wrapped_callback(prompt); - } - int64_t pos = 0; - auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos); - ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); - uint64_t cur_token = prefill_res.get(); - stats_->first_token_ms = llm::time_in_ms(); - stats_->prompt_eval_end_ms = llm::time_in_ms(); - - // print the first token from prefill. No prev_token so use cur_token for it. - wrapped_callback( - ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token))); - RUNNER_ET_LOG( - config.warming, - "RSS after prompt prefill: %f MiB (0 if unsupported)", - llm::get_rss_bytes() / 1024.0 / 1024.0); - - // start the main loop - prompt_tokens.push_back(cur_token); - - // Generate max_new_tokens - 1 because prefill already generated 1 token. - int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( - prompt_tokens, - num_prompt_tokens, - max_new_tokens - 1, - temperature_ == -1.0f ? 
config.temperature : temperature_, - wrapped_callback)); - - stats_->inference_end_ms = llm::time_in_ms(); - if (!config.warming) { - printf("\n"); - } - RUNNER_ET_LOG( - config.warming, - "RSS after finishing text generation: %f MiB (0 if unsupported)", - llm::get_rss_bytes() / 1024.0 / 1024.0); - - if (num_generated_tokens == max_new_tokens) { - RUNNER_ET_LOG(config.warming, "Max new tokens %i reached!", max_new_tokens); - } - - stats_->num_prompt_tokens = num_prompt_tokens; - stats_->num_generated_tokens = num_generated_tokens; - - if (config.warming) { - ET_LOG(Info, "Warmup run finished!"); - } else { - // Do not print report during warmup - ::executorch::llm::print_report(*stats_); - } - if (stats_callback) { - stats_callback(*stats_); - } - - return Error::Ok; -} - -Error Runner::warmup(const std::string& prompt, int32_t max_new_tokens) { - // Create a GenerationConfig for warmup - llm::GenerationConfig config{ - .echo = false, .max_new_tokens = max_new_tokens, .warming = true}; - - // Call generate with the warmup config - Error err = generate(prompt, config); - - // Reset stats after warmup, not resetting the std::unique_ptr! - stats_->reset(); - return err; + return llm::create_text_llm_runner( + model_path, std::move(tokenizer), data_path); } -void Runner::stop() { - if (is_loaded()) { - text_token_generator_->stop(); - } else { - ET_LOG(Error, "Token generator is not loaded, cannot stop"); - } -} } // namespace example diff --git a/examples/models/llama/runner/runner.h b/examples/models/llama/runner/runner.h index e4e91db37d5..09a166b0109 100644 --- a/examples/models/llama/runner/runner.h +++ b/examples/models/llama/runner/runner.h @@ -19,74 +19,20 @@ #include #include -#include -#include -#include -#include -#include +#include #include namespace example { -class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner { - public: - // Static factory method to create a Runner instance - static std::unique_ptr create( - const std::string& model_path, - const std::string& tokenizer_path, - std::optional data_path = std::nullopt, - float temperature = -1.0f); +namespace llm = ::executorch::extension::llm; - // Constructor with dependency injection - explicit Runner( - std::unordered_map metadata, - std::unique_ptr<::tokenizers::Tokenizer> tokenizer, - std::unique_ptr<::executorch::extension::Module> module, - std::unique_ptr<::executorch::extension::llm::TextDecoderRunner> - text_decoder_runner, - std::unique_ptr<::executorch::extension::llm::TextPrefiller> - text_prefiller, - std::unique_ptr<::executorch::extension::llm::TextTokenGenerator> - text_token_generator, - std::unique_ptr<::executorch::extension::llm::Stats> stats, - float temperature = -1.0f); +std::unique_ptr create_llama_runner( + const std::string& model_path, + const std::string& tokenizer_path, + std::optional data_path = std::nullopt, + float temperature = -1.0f); - bool is_loaded() const override; - ::executorch::runtime::Error load() override; - ::executorch::runtime::Error generate( - const std::string& prompt, - const ::executorch::extension::llm::GenerationConfig& config, - std::function token_callback = {}, - std::function - stats_callback = {}) override; - ::executorch::runtime::Error warmup( - const std::string& prompt, - int32_t max_new_tokens); - void stop() override; - - private: - bool shouldStop_{false}; - - // Components - std::unique_ptr<::tokenizers::Tokenizer> tokenizer_; - std::unordered_map metadata_; - std::unique_ptr<::executorch::extension::Module> - module_; // Manage module's 
lifecycle, make sure it outlives - // text_decoder_runner_. - std::unique_ptr<::executorch::extension::llm::TextDecoderRunner> - text_decoder_runner_; // Manage text_decoder_runner_'s lifecycle, make - // sure it outlives text_prefiller_ & - // text_token_generator_. - std::unique_ptr<::executorch::extension::llm::TextPrefiller> text_prefiller_; - std::unique_ptr<::executorch::extension::llm::TextTokenGenerator> - text_token_generator_; - - // Stats - std::unique_ptr<::executorch::extension::llm::Stats> stats_; - - // temperature. - // Deprecated, we should rely on the temperature in GenerationConfig instead. - float temperature_ = -1.0f; -}; +std::unique_ptr load_llama_tokenizer( + const std::string& tokenizer_path); } // namespace example diff --git a/examples/models/llama/runner/targets.bzl b/examples/models/llama/runner/targets.bzl index 158202cf55a..e0a96af85bb 100644 --- a/examples/models/llama/runner/targets.bzl +++ b/examples/models/llama/runner/targets.bzl @@ -34,16 +34,12 @@ def define_common_targets(): visibility = [ "@EXECUTORCH_CLIENTS", ], + compiler_flags = [ + "-Wno-missing-prototypes", + ], exported_deps = [ "//executorch/backends/xnnpack:xnnpack_backend", - "//executorch/extension/llm/runner:irunner", - "//executorch/extension/llm/runner:stats", - "//executorch/extension/llm/runner:text_decoder_runner" + aten_suffix, - "//executorch/extension/llm/runner:text_prefiller" + aten_suffix, - "//executorch/extension/llm/runner:text_token_generator" + aten_suffix, - "//executorch/extension/evalue_util:print_evalue" + aten_suffix, - "//executorch/extension/module:module" + aten_suffix, - "//executorch/extension/tensor:tensor" + aten_suffix, + "//executorch/extension/llm/runner:runner_lib" + aten_suffix, "//executorch/kernels/quantized:generated_lib" + aten_suffix, "//executorch/runtime/core/exec_aten:lib" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, diff --git a/examples/models/llama/runner/test/CMakeLists.txt b/examples/models/llama/runner/test/CMakeLists.txt deleted file mode 100644 index aa754b96da6..00000000000 --- a/examples/models/llama/runner/test/CMakeLists.txt +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# This file should be formatted with -# ~~~ -# cmake-format -i CMakeLists.txt -# ~~~ -# It should also be cmake-lint clean. -# - -cmake_minimum_required(VERSION 3.19) - -set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) - -include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) - -set(_test_srcs test_runner.cpp) - -et_cxx_test( - test_runner - SOURCES - ${_test_srcs} - EXTRA_LIBS - executorch -) diff --git a/examples/models/llama/runner/test/TARGETS b/examples/models/llama/runner/test/TARGETS deleted file mode 100644 index 97de7abe9b1..00000000000 --- a/examples/models/llama/runner/test/TARGETS +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. 
- -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/examples/models/llama/runner/test/targets.bzl b/examples/models/llama/runner/test/targets.bzl deleted file mode 100644 index 3b02360da08..00000000000 --- a/examples/models/llama/runner/test/targets.bzl +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -def define_common_targets(): - runtime.cxx_test( - name = "test_runner", - srcs = ["test_runner.cpp"], - deps = [ - "//executorch/examples/models/llama/runner:runner", - "//executorch/extension/llm/runner:irunner", - "//executorch/extension/llm/runner:stats", - "//executorch/extension/llm/runner:text_token_generator", - "//executorch/extension/llm/runner:text_decoder_runner", - "//executorch/extension/llm/runner:text_prefiller", - "//executorch/extension/module:module", - "//executorch/runtime/core:core", - "//executorch/runtime/platform:platform", - "//executorch/runtime/core/exec_aten/testing_util:tensor_util", - ], - ) diff --git a/examples/models/llama/tokenizer/llama_tiktoken.cpp b/examples/models/llama/tokenizer/llama_tiktoken.cpp index f595de3c4e7..7b98a6ca415 100644 --- a/examples/models/llama/tokenizer/llama_tiktoken.cpp +++ b/examples/models/llama/tokenizer/llama_tiktoken.cpp @@ -42,7 +42,9 @@ _get_default_special_tokens() { return special_tokens; } -std::unique_ptr> _get_special_tokens(Version version) { +} // namespace + +std::unique_ptr> get_special_tokens(Version version) { switch (version) { case Version::Multimodal: return get_multimodal_special_tokens(); @@ -51,11 +53,9 @@ std::unique_ptr> _get_special_tokens(Version version) { } } -} // namespace - std::unique_ptr get_tiktoken_for_llama(Version version) { return std::make_unique( - _get_special_tokens(version), kBOSTokenIndex, kEOSTokenIndex); + get_special_tokens(version), kBOSTokenIndex, kEOSTokenIndex); } std::unique_ptr> get_multimodal_special_tokens() { diff --git a/examples/models/llama/tokenizer/llama_tiktoken.h b/examples/models/llama/tokenizer/llama_tiktoken.h index a7f65eca29e..01d836ffbe6 100644 --- a/examples/models/llama/tokenizer/llama_tiktoken.h +++ b/examples/models/llama/tokenizer/llama_tiktoken.h @@ -20,6 +20,8 @@ enum class Version { std::unique_ptr<::tokenizers::Tiktoken> get_tiktoken_for_llama( Version version = Version::Default); +std::unique_ptr> get_special_tokens(Version version); + std::unique_ptr> get_multimodal_special_tokens(); } // namespace example diff --git a/examples/models/llava/runner/CMakeLists.txt b/examples/models/llava/runner/CMakeLists.txt index 1f9d6fa8e1d..016678e3c54 100644 --- a/examples/models/llava/runner/CMakeLists.txt +++ b/examples/models/llava/runner/CMakeLists.txt @@ -28,8 +28,6 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..) 
# build llava_runner library set(_llava_runner__srcs "${CMAKE_CURRENT_SOURCE_DIR}/llava_runner.cpp" - "${EXECUTORCH_ROOT}/extension/llm/sampler/sampler.cpp" - "${EXECUTORCH_ROOT}/extension/llm/tokenizers/src/llama2c_tokenizer.cpp" ) # extension llm runner lib @@ -45,8 +43,3 @@ set(llava_runner_deps executorch_core extension_data_loader extension_llm_runner ) target_link_libraries(llava_runner PUBLIC ${llava_runner_deps}) - -target_include_directories( - llava_runner INTERFACE ${_common_include_directories} - ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include -) diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 03e26f089db..ad1c77a92b9 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -168,7 +168,9 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { std::optional data_path_str = data_path ? std::optional{data_path->toStdString()} : std::nullopt; - runner_ = example::Runner::create( + // TODO(larryliu0820): Use the API in text_llm_runner.h to create the + // runner. + runner_ = example::create_llama_runner( model_path->toStdString(), tokenizer_path->toStdString(), data_path_str); diff --git a/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj b/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj index b0cddfa808c..f6fe811b4ab 100644 --- a/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj +++ b/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj @@ -33,6 +33,7 @@ 30AA4B642DC0766800B1BE50 /* std_regex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 30AA4B5E2DC0766800B1BE50 /* std_regex.cpp */; }; 30AA4B652DC0766800B1BE50 /* pre_tokenizer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 30AA4B5B2DC0766800B1BE50 /* pre_tokenizer.cpp */; }; 30AA4B662DC0766800B1BE50 /* re2_regex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 30AA4B5C2DC0766800B1BE50 /* re2_regex.cpp */; }; + F22E9E1A2DF2CBB900EC5425 /* text_llm_runner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F22E9E192DF2CBB900EC5425 /* text_llm_runner.cpp */; }; F292B01D2D88AF3500BE6839 /* bpe_tokenizer_base.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F292B0162D88AF3500BE6839 /* bpe_tokenizer_base.cpp */; }; F292B0202D88AF3500BE6839 /* llama2c_tokenizer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F292B0172D88AF3500BE6839 /* llama2c_tokenizer.cpp */; }; F292B0212D88AF3500BE6839 /* tiktoken.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F292B01A2D88AF3500BE6839 /* tiktoken.cpp */; }; @@ -94,6 +95,8 @@ 30AA4B5D2DC0766800B1BE50 /* regex.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = regex.cpp; path = src/regex.cpp; sourceTree = ""; }; 30AA4B5E2DC0766800B1BE50 /* std_regex.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = std_regex.cpp; path = src/std_regex.cpp; sourceTree = ""; }; 30AA4B5F2DC0766800B1BE50 /* token_decoder.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = token_decoder.cpp; path = src/token_decoder.cpp; sourceTree = ""; }; + F22E9E182DF2CBB900EC5425 /* text_llm_runner.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = text_llm_runner.h; sourceTree = ""; }; + F22E9E192DF2CBB900EC5425 /* text_llm_runner.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = text_llm_runner.cpp; sourceTree = ""; }; F292B0162D88AF3500BE6839 /* bpe_tokenizer_base.cpp */ = 
{isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = bpe_tokenizer_base.cpp; path = src/bpe_tokenizer_base.cpp; sourceTree = ""; }; F292B0172D88AF3500BE6839 /* llama2c_tokenizer.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = llama2c_tokenizer.cpp; path = src/llama2c_tokenizer.cpp; sourceTree = ""; }; F292B01A2D88AF3500BE6839 /* tiktoken.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = tiktoken.cpp; path = src/tiktoken.cpp; sourceTree = ""; }; @@ -146,6 +149,8 @@ 032A73E02CAFBB7800932D36 /* runner */ = { isa = PBXGroup; children = ( + F22E9E182DF2CBB900EC5425 /* text_llm_runner.h */, + F22E9E192DF2CBB900EC5425 /* text_llm_runner.cpp */, 032A73D42CAFBB7800932D36 /* image.h */, 032A73D52CAFBB7800932D36 /* image_prefiller.h */, 032A73D62CAFBB7800932D36 /* multimodal_runner.h */, @@ -409,6 +414,7 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + F22E9E1A2DF2CBB900EC5425 /* text_llm_runner.cpp in Sources */, 03B0118E2CAC567900054791 /* DynamicTestCase.m in Sources */, 032A74182CAFBB7800932D36 /* text_decoder_runner.cpp in Sources */, 032A741D2CAFBB7800932D36 /* text_prefiller.cpp in Sources */, diff --git a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm index e53a457939c..c56f054ae3b 100644 --- a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm +++ b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm @@ -74,7 +74,7 @@ @implementation LLaMATests NSString *tokenizerPath = resources[@"tokenizer"]; return @{ @"generate" : ^(XCTestCase *testCase){ - auto __block runner = example::Runner::create( + auto __block runner = example::create_llama_runner( modelPath.UTF8String, tokenizerPath.UTF8String); if (!runner) { XCTFail("Failed to create runner"); diff --git a/extension/llm/runner/CMakeLists.txt b/extension/llm/runner/CMakeLists.txt index 9696737f471..05f47e55c74 100644 --- a/extension/llm/runner/CMakeLists.txt +++ b/extension/llm/runner/CMakeLists.txt @@ -43,16 +43,15 @@ target_include_directories( add_library(extension_llm_runner STATIC ${_extension_llm_runner__srcs}) -set(runner_deps executorch_core extension_module extension_tensor) +# add tokenizers +add_subdirectory( + ${EXECUTORCH_ROOT}/extension/llm/tokenizers + ${CMAKE_CURRENT_BINARY_DIR}/../../../extension/llm/tokenizers +) -target_link_libraries(extension_llm_runner PUBLIC ${runner_deps}) +set(runner_deps executorch_core extension_module extension_tensor tokenizers) -target_include_directories( - extension_llm_runner - PUBLIC - ${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/llama.cpp-unicode/include - ${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/llama.cpp-unicode/src -) +target_link_libraries(extension_llm_runner PUBLIC ${runner_deps}) target_include_directories( extension_llm_runner INTERFACE ${_common_include_directories} diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl index 03b593cacf5..2e8231748ed 100644 --- a/extension/llm/runner/targets.bzl +++ b/extension/llm/runner/targets.bzl @@ -84,14 +84,26 @@ def define_common_targets(): name = "runner_lib" + aten_suffix, exported_headers = [ "multimodal_runner.h", + "text_llm_runner.h", + ], + srcs = [ + "text_llm_runner.cpp", ], visibility = [ "@EXECUTORCH_CLIENTS", ], + compiler_flags = [ + "-Wno-missing-prototypes", + ], exported_deps = [ ":image_prefiller" + aten_suffix, + ":irunner", ":text_decoder_runner" + aten_suffix, 
":text_prefiller" + aten_suffix, ":text_token_generator" + aten_suffix, + "//pytorch/tokenizers:hf_tokenizer", + "//pytorch/tokenizers:llama2c_tokenizer", + # "//pytorch/tokenizers:sentencepiece", # TODO(larryliu0820) Make sure this compiles in xplat. + "//pytorch/tokenizers:tiktoken", ], ) diff --git a/extension/llm/runner/test/CMakeLists.txt b/extension/llm/runner/test/CMakeLists.txt index b17a318a080..ac46f0021fb 100644 --- a/extension/llm/runner/test/CMakeLists.txt +++ b/extension/llm/runner/test/CMakeLists.txt @@ -17,12 +17,13 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) -set(_test_srcs generation_config_test.cpp) +set(_test_srcs test_generation_config.cpp test_text_llm_runner.cpp) et_cxx_test( - generation_config_test + test_runner SOURCES ${_test_srcs} EXTRA_LIBS executorch + extension_llm_runner ) diff --git a/extension/llm/runner/test/targets.bzl b/extension/llm/runner/test/targets.bzl index 9cdaad990bb..a5c8be7b6de 100644 --- a/extension/llm/runner/test/targets.bzl +++ b/extension/llm/runner/test/targets.bzl @@ -8,8 +8,8 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") def define_common_targets(): runtime.cxx_test( - name = "generation_config_test", - srcs = ["generation_config_test.cpp"], + name = "test_generation_config", + srcs = ["test_generation_config.cpp"], deps = [ "//executorch/extension/llm/runner:irunner", "//executorch/extension/llm/runner:stats", @@ -17,3 +17,13 @@ def define_common_targets(): "//executorch/runtime/platform:platform", ], ) + + runtime.cxx_test( + name = "test_text_llm_runner", + srcs = ["test_text_llm_runner.cpp"], + deps = [ + "//executorch/extension/llm/runner:irunner", + "//executorch/extension/llm/runner:runner_lib", + "//executorch/runtime/core/exec_aten/testing_util:tensor_util", + ], + ) diff --git a/extension/llm/runner/test/generation_config_test.cpp b/extension/llm/runner/test/test_generation_config.cpp similarity index 100% rename from extension/llm/runner/test/generation_config_test.cpp rename to extension/llm/runner/test/test_generation_config.cpp diff --git a/examples/models/llama/runner/test/test_runner.cpp b/extension/llm/runner/test/test_text_llm_runner.cpp similarity index 97% rename from examples/models/llama/runner/test/test_runner.cpp rename to extension/llm/runner/test/test_text_llm_runner.cpp index f158ca8515d..a9c2c680609 100644 --- a/examples/models/llama/runner/test/test_runner.cpp +++ b/extension/llm/runner/test/test_text_llm_runner.cpp @@ -7,18 +7,19 @@ * @lint-ignore-every CLANGTIDY facebook-hte-Deprecated */ -#include #include +#include +#include #include #include #include #include using namespace ::testing; -using namespace example; using executorch::extension::llm::GenerationConfig; using executorch::extension::llm::Stats; using executorch::extension::llm::TextDecoderRunner; +using executorch::extension::llm::TextLLMRunner; using executorch::extension::llm::TextPrefiller; using executorch::extension::llm::TextTokenGenerator; using executorch::runtime::Error; @@ -212,7 +213,7 @@ TEST_F(RunnerTest, GenerateCallsCallbackExactlyMaxNewTokensTimes) { tokenizer.get(), text_decoder_runner.get(), stats.get()); // Create a Runner with our mocked components - Runner runner( + TextLLMRunner runner( createDefaultMetadata(), std::unique_ptr<::tokenizers::Tokenizer>(tokenizer.release()), std::make_unique(), @@ -271,7 +272,7 @@ TEST_F(RunnerTest, WarmupCallsGenerateWithWarmingFlag) { tokenizer.get(), text_decoder_runner.get(), 
stats.get()); // Create a Runner with our mocked components - Runner runner( + TextLLMRunner runner( createDefaultMetadata(), std::move(tokenizer), std::make_unique(), @@ -305,7 +306,7 @@ TEST_F(RunnerTest, IsLoadedReturnsTrueWhenComponentsInitialized) { tokenizer.get(), text_decoder_runner.get(), stats.get()); // Create a Runner with our mocked components - Runner runner( + TextLLMRunner runner( createDefaultMetadata(), std::unique_ptr<::tokenizers::Tokenizer>(tokenizer.release()), std::make_unique(), diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp new file mode 100644 index 00000000000..9fa20d2646e --- /dev/null +++ b/extension/llm/runner/text_llm_runner.cpp @@ -0,0 +1,394 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + * @lint-ignore-every CLANGTIDY facebook-hte-Deprecated + */ + +// A simple llama2 runner that includes preprocessing and post processing logic. +// The module takes in a string as input and emits a string as output. + +#include +#include +#include +#include +#include + +namespace executorch::extension::llm { + +using ::executorch::extension::Module; +using ::executorch::runtime::Error; +using ::executorch::runtime::Result; + +static constexpr auto kEnableDynamicShape = "enable_dynamic_shape"; +static constexpr auto kBosId = "get_bos_id"; +static constexpr auto kEosIds = "get_eos_ids"; +static constexpr auto kMaxSeqLen = "get_max_seq_len"; +static constexpr auto kMaxContextLen = "get_max_context_len"; +static constexpr auto kVocabSize = "get_vocab_size"; +static constexpr auto kUseKVCache = "use_kv_cache"; +static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache"; + +TextLLMRunner::TextLLMRunner( + std::unordered_map metadata, + std::unique_ptr<::tokenizers::Tokenizer> tokenizer, + std::unique_ptr<::executorch::extension::Module> module, + std::unique_ptr text_decoder_runner, + std::unique_ptr text_prefiller, + std::unique_ptr text_token_generator, + std::unique_ptr stats, + float temperature) + : tokenizer_(std::move(tokenizer)), + metadata_(std::move(metadata)), + module_(std::move(module)), + text_decoder_runner_(std::move(text_decoder_runner)), + text_prefiller_(std::move(text_prefiller)), + text_token_generator_(std::move(text_token_generator)), + stats_(std::move(stats)), + temperature_(temperature) { + // Note: This constructor assumes that text_prefiller and text_token_generator + // already have references to the Module and TextDecoderRunner they need +} + +bool TextLLMRunner::is_loaded() const { + return text_prefiller_->is_loaded() && text_token_generator_->is_loaded(); +} + +Error TextLLMRunner::load() { + if (is_loaded()) { + return Error::Ok; + } + ET_CHECK_OK_OR_RETURN_ERROR(text_prefiller_->load()); + ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load()); + return Error::Ok; +} + +// Don't print with the same priority during warmup +#define RUNNER_ET_LOG(warmup, format, ...) \ + if (warmup) { \ + ET_LOG(Debug, format, __VA_ARGS__); \ + } else { \ + ET_LOG(Info, format, __VA_ARGS__); \ + } + +Error TextLLMRunner::generate( + const std::string& prompt, + const GenerationConfig& config, + std::function token_callback, + std::function stats_callback) { + // Prepare the inputs. + // Use ones-initialized inputs. 
+ ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); + if (!is_loaded()) { + stats_->model_load_start_ms = time_in_ms(); + ET_CHECK_OK_OR_RETURN_ERROR(load()); + stats_->model_load_end_ms = time_in_ms(); + } + + if (config.warming) { + ET_LOG(Info, "Doing a warmup run..."); + } + + RUNNER_ET_LOG( + config.warming, + "RSS after loading model: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + + // Wrap the token_callback with print function + std::function wrapped_callback = + [token_callback, config](const std::string& piece) { + if (!config.warming) { + llm::safe_printf(piece.c_str()); + fflush(stdout); + } + if (token_callback) { + token_callback(piece); + } + }; + // First token time only measures the time it takes to encode the prompt and + // return a response token. + + stats_->inference_start_ms = time_in_ms(); + shouldStop_ = false; + + ::tokenizers::Result> encode_res = tokenizer_->encode( + prompt, + /* bos */ 0, + /* eos */ 0); + + ET_CHECK_TK_OK_OR_RETURN_ERROR( + encode_res.error(), "Failed to encode prompt %s", prompt.c_str()); + + // encode the (string) prompt into tokens sequence + std::vector prompt_tokens = encode_res.get(); + int num_prompt_tokens = prompt_tokens.size(); + + ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token"); + ET_CHECK_MSG( + num_prompt_tokens < metadata_.at(kMaxContextLen), + "num_prompt_tokens %d >= max_seq_len_ %" PRId64 + ", Max seq length exceeded - please increase max seq len value in your export script", + num_prompt_tokens, + metadata_.at(kMaxContextLen)); + + // Determine max_new_tokens using the GenerationConfig's resolve method + int max_new_tokens = config.resolve_max_new_tokens( + metadata_.at(kMaxContextLen), num_prompt_tokens); + + ET_LOG(Info, "Max new tokens resolved: %d", max_new_tokens); + + // Prefill first + // Here feed all tokens to the model and get the next predicted token + // after the prompt. After that we will enter generate loop. + + // print prompts + if (config.echo) { + wrapped_callback(prompt); + } + int64_t pos = 0; + auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos); + ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); + uint64_t cur_token = prefill_res.get(); + stats_->first_token_ms = time_in_ms(); + stats_->prompt_eval_end_ms = time_in_ms(); + + // print the first token from prefill. No prev_token so use cur_token for it. + wrapped_callback( + ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token))); + RUNNER_ET_LOG( + config.warming, + "RSS after prompt prefill: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + + // start the main loop + prompt_tokens.push_back(cur_token); + + // Generate max_new_tokens - 1 because prefill already generated 1 token. + int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( + prompt_tokens, + num_prompt_tokens, + max_new_tokens - 1, + temperature_ == -1.0f ? 
config.temperature : temperature_, + wrapped_callback)); + + stats_->inference_end_ms = time_in_ms(); + if (!config.warming) { + printf("\n"); + } + RUNNER_ET_LOG( + config.warming, + "RSS after finishing text generation: %f MiB (0 if unsupported)", + get_rss_bytes() / 1024.0 / 1024.0); + + if (num_generated_tokens == max_new_tokens) { + RUNNER_ET_LOG(config.warming, "Max new tokens %i reached!", max_new_tokens); + } + + stats_->num_prompt_tokens = num_prompt_tokens; + stats_->num_generated_tokens = num_generated_tokens; + + if (config.warming) { + ET_LOG(Info, "Warmup run finished!"); + } else { + // Do not print report during warmup + print_report(*stats_); + } + if (stats_callback) { + stats_callback(*stats_); + } + + return Error::Ok; +} + +Error TextLLMRunner::warmup(const std::string& prompt, int32_t max_new_tokens) { + // Create a GenerationConfig for warmup + GenerationConfig config{ + .echo = false, .max_new_tokens = max_new_tokens, .warming = true}; + + // Call generate with the warmup config + Error err = generate(prompt, config); + + // Reset stats after warmup, not resetting the std::unique_ptr! + stats_->reset(); + return err; +} + +void TextLLMRunner::stop() { + if (is_loaded()) { + text_token_generator_->stop(); + } else { + ET_LOG(Error, "Token generator is not loaded, cannot stop"); + } +} + +std::unique_ptr load_tokenizer( + const std::string& tokenizer_path, + std::unique_ptr> special_tokens, + std::optional pattern, + size_t bos_token_index, + size_t eos_token_index) { + auto json_tokenizer = std::make_unique(); + if (json_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded json tokenizer"); + return json_tokenizer; + } + std::unique_ptr<::tokenizers::Tiktoken> tiktoken_tokenizer; + if (special_tokens != nullptr && !pattern.has_value()) { + tiktoken_tokenizer = std::make_unique<::tokenizers::Tiktoken>( + std::move(special_tokens), bos_token_index, eos_token_index); + } else if (special_tokens != nullptr && pattern.has_value()) { + tiktoken_tokenizer = std::make_unique<::tokenizers::Tiktoken>( + pattern.value(), + std::move(special_tokens), + bos_token_index, + eos_token_index); + } else { + tiktoken_tokenizer = std::make_unique<::tokenizers::Tiktoken>(); + } + if (tiktoken_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded TikToken tokenizer"); + return tiktoken_tokenizer; + } + + auto bpe_tokenizer = std::make_unique<::tokenizers::Llama2cTokenizer>(); + if (bpe_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded BPE tokenizer"); + return bpe_tokenizer; + } + + return nullptr; +} + +std::unordered_map get_llm_metadata( + tokenizers::Tokenizer* tokenizer, + Module* module) { + // Initialize metadata with default values + std::unordered_map metadata({ + {llm::kEnableDynamicShape, false}, + {llm::kMaxSeqLen, 128}, + {llm::kMaxContextLen, 128}, + {llm::kUseKVCache, true}, + {llm::kUseSDPAWithKVCache, false}, + }); + + // Read metadata from the model + auto method_names_result = module->method_names(); + if (method_names_result.error() != Error::Ok) { + ET_LOG(Error, "Failed reading method names"); + return metadata; + } + const auto method_names = method_names_result.get(); + + for (auto& pair : metadata) { + const auto& method_name = pair.first; + auto& value = pair.second; + + if (method_names.count(method_name)) { + auto get_result = module->get(method_name); + value = get_result.get().toScalar().to(); + } else { + ET_LOG( + Info, + "Method %s not found, using the default 
value %" PRId64, + method_name.c_str(), + value); + } + ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value); + } + // Set tokenizer-related metadata + metadata[llm::kBosId] = tokenizer->bos_tok(); + metadata[llm::kVocabSize] = tokenizer->vocab_size(); + return metadata; +} + +std::unordered_set get_eos_ids( + tokenizers::Tokenizer* tokenizer, + Module* module) { + std::unordered_set eos_ids = {tokenizer->eos_tok()}; + // Get EOS IDs if available + auto method_names_result = module->method_names(); + if (method_names_result.error() != Error::Ok) { + ET_LOG(Error, "Failed reading method names"); + return eos_ids; + } + const auto method_names = method_names_result.get(); + + if (method_names.count(llm::kEosIds)) { + eos_ids.clear(); + auto execute_result = module->execute(llm::kEosIds); + if (execute_result.error() != Error::Ok) { + ET_LOG(Error, "Failed to execute %s", llm::kEosIds); + return eos_ids; + } + for (const auto& eos_id : execute_result.get()) { + auto value = eos_id.toScalar().to(); + eos_ids.emplace(value); + ET_LOG(Info, "eos_id = %" PRId64, value); + } + } + return eos_ids; +} + +std::unique_ptr create_text_llm_runner( + const std::string& model_path, + std::unique_ptr<::tokenizers::Tokenizer> tokenizer, + std::optional data_path, + float temperature) { + // Sanity check tokenizer + if (!tokenizer || !tokenizer->is_loaded()) { + ET_LOG(Error, "Tokenizer is null or not loaded"); + return nullptr; + } + + // Create the Module + std::unique_ptr module; + if (data_path.has_value()) { + module = std::make_unique( + model_path, data_path.value(), Module::LoadMode::File); + } else { + module = std::make_unique(model_path, Module::LoadMode::File); + } + + // Get metadata from Module + ET_LOG(Info, "Reading metadata from model"); + auto metadata = llm::get_llm_metadata(tokenizer.get(), module.get()); + + auto eos_ids = std::make_unique>( + llm::get_eos_ids(tokenizer.get(), module.get())); + + // Create text_decoder_runner. Use a shared_ptr so that it can be shared with + // TextPrefiller and TextTokenGenerator + auto text_decoder_runner = std::make_unique( + module.get(), metadata.at(kUseKVCache)); + + // Create text_prefiller + auto text_prefiller = std::make_unique( + text_decoder_runner.get(), + metadata.at(kUseKVCache), + metadata.at(kEnableDynamicShape), + metadata.at(kMaxSeqLen)); + + // Create text_token_generator with stats + auto stats = std::make_unique(); + auto text_token_generator = std::make_unique( + tokenizer.get(), + text_decoder_runner.get(), + metadata.at(kUseKVCache), + std::move(eos_ids), + stats.get()); + + // Create and return the Runner instance + return std::make_unique( + std::move(metadata), + std::move(tokenizer), + std::move(module), + std::move(text_decoder_runner), + std::move(text_prefiller), + std::move(text_token_generator), + std::move(stats), + temperature); +} + +} // namespace executorch::extension::llm diff --git a/extension/llm/runner/text_llm_runner.h b/extension/llm/runner/text_llm_runner.h new file mode 100644 index 00000000000..715688ba82c --- /dev/null +++ b/extension/llm/runner/text_llm_runner.h @@ -0,0 +1,183 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// A simple llama2 runner that includes preprocessing and post processing logic. +// The module takes in a string as input and emits a string as output. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace executorch::extension::llm { + +class ET_EXPERIMENTAL TextLLMRunner : public IRunner { + public: + /** + * @brief Constructor for TextLLMRunner with dependency injection + * + * Creates a TextLLMRunner instance with all required components for text + * generation. + * + * @param metadata Key-value pairs containing model metadata (e.g., + * vocab_size, context_length) + * @param tokenizer Tokenizer for converting between text and token IDs + * @param module The underlying model module that performs inference + * @param text_decoder_runner Component responsible for running the decoder + * part of the model + * @param text_prefiller Component for handling the prefill phase of text + * generation + * @param text_token_generator Component for generating tokens during the + * decode phase + * @param stats Statistics tracking object for performance monitoring + * @param temperature Temperature parameter for controlling randomness in + * generation (deprecated). Please use GenerationConfig.temperature instead. + */ + explicit TextLLMRunner( + std::unordered_map metadata, + std::unique_ptr<::tokenizers::Tokenizer> tokenizer, + std::unique_ptr<::executorch::extension::Module> module, + std::unique_ptr text_decoder_runner, + std::unique_ptr text_prefiller, + std::unique_ptr text_token_generator, + std::unique_ptr stats, + float temperature = -1.0f); + + /** + * @brief Checks if the model is loaded and ready for inference + * + * @return bool True if the model is loaded, false otherwise + */ + bool is_loaded() const override; + /** + * @brief Loads the model and prepares it for inference + * + * This method initializes all components and prepares the model for text + * generation. + * + * @return ::executorch::runtime::Error Success or error status + */ + ::executorch::runtime::Error load() override; + /** + * @brief Generates text based on the provided prompt + * + * This method performs text generation using the loaded model. It processes + * the input prompt, runs the model in prefill and decode phases, and returns + * generated text through callbacks. + * + * @param prompt The input text to generate from + * @param config Configuration parameters for text generation (e.g., + * max_new_tokens, temperature) + * @param token_callback Function called for each generated token with the + * decoded text + * @param stats_callback Function called with performance statistics + * @return ::executorch::runtime::Error Success or error status + */ + ::executorch::runtime::Error generate( + const std::string& prompt, + const GenerationConfig& config, + std::function token_callback = {}, + std::function stats_callback = {}) override; + /** + * @brief Warms up the model with a sample prompt + * + * This method runs a complete generation cycle without returning results, + * which helps initialize internal caches and optimize subsequent inferences. + * + * @param prompt The sample prompt to use for warmup + * @param max_new_tokens Maximum number of tokens to generate during warmup + * @return ::executorch::runtime::Error Success or error status + */ + ::executorch::runtime::Error warmup( + const std::string& prompt, + int32_t max_new_tokens); + /** + * @brief Stops the ongoing text generation process + * + * This method signals the generator to stop producing new tokens and + * terminate the current generation process. 
+ */ + void stop() override; + + private: + bool shouldStop_{false}; + + // Components + std::unique_ptr<::tokenizers::Tokenizer> tokenizer_; + std::unordered_map metadata_; + std::unique_ptr<::executorch::extension::Module> + module_; // Manage module's lifecycle, make sure it outlives + // text_decoder_runner_. + std::unique_ptr + text_decoder_runner_; // Manage text_decoder_runner_'s lifecycle, make + // sure it outlives text_prefiller_ & + // text_token_generator_. + std::unique_ptr text_prefiller_; + std::unique_ptr text_token_generator_; + + // Stats + std::unique_ptr stats_; + + // temperature. + // Deprecated, we should rely on the temperature in GenerationConfig instead. + float temperature_ = -1.0f; +}; + +/** + * @brief Loads a tokenizer from the specified path + * + * This function creates and initializes a tokenizer from a file, with options + * to customize special tokens and regex patterns. + * + * @param tokenizer_path Path to the tokenizer file + * @param special_tokens Optional list of special tokens to add to the tokenizer + * @param pattern Optional regex pattern for tokenization + * @param bos_token_index Index of the beginning-of-sequence token + * @param eos_token_index Index of the end-of-sequence token + * @return std::unique_ptr Initialized tokenizer instance + */ +ET_EXPERIMENTAL std::unique_ptr load_tokenizer( + const std::string& tokenizer_path, + std::unique_ptr> special_tokens = nullptr, + std::optional pattern = std::nullopt, + size_t bos_token_index = 0, + size_t eos_token_index = 1); + +/** + * @brief Creates a TextLLMRunner instance with the specified model and + * tokenizer + * + * This factory function creates and initializes a TextLLMRunner with all + * necessary components for text generation using the specified model and + * tokenizer. + * + * @param model_path Path to the model file + * @param tokenizer Initialized tokenizer instance + * @param data_path Optional path to additional data required by the model + * @param temperature Optional temperature parameter for controlling randomness + * (deprecated) + * @return std::unique_ptr Initialized TextLLMRunner instance + */ +ET_EXPERIMENTAL std::unique_ptr create_text_llm_runner( + const std::string& model_path, + std::unique_ptr<::tokenizers::Tokenizer> tokenizer, + std::optional data_path = std::nullopt, + float temperature = -1.0f); + +} // namespace executorch::extension::llm diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh index 422cd579d04..32368661b19 100755 --- a/test/run_oss_cpp_tests.sh +++ b/test/run_oss_cpp_tests.sh @@ -32,6 +32,7 @@ build_executorch() { if [ -x "$(command -v glslc)" ]; then BUILD_VULKAN="ON" fi + # -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ TODO(larryliu0820): Fix the name collision between Abseil and XNNPACK and turn this on. cmake . 
\
     -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DEXECUTORCH_USE_CPP_CODE_COVERAGE=ON \
@@ -40,7 +41,6 @@ build_executorch() {
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
diff --git a/tools/cmake/cmake_deps.toml b/tools/cmake/cmake_deps.toml
index 6f12c9d4413..a033fba4929 100644
--- a/tools/cmake/cmake_deps.toml
+++ b/tools/cmake/cmake_deps.toml
@@ -241,6 +241,17 @@ deps = [
   "executorch_core",
 ]
 
+[targets.extension_tokenizers]
+buck_targets = [
+  "//extension/llm/tokenizers:sentencepiece",
+  "//extension/llm/tokenizers:tiktoken",
+  "//extension/llm/tokenizers:hf_tokenizer",
+  "//extension/llm/tokenizers:llama2c_tokenizer",
+]
+filters = [
+  ".cpp$",
+]
+
 [targets.extension_llm_runner]
 buck_targets = [
   "//extension/llm/runner:runner_lib",
@@ -257,6 +268,7 @@ deps = [
   "extension_flat_tensor",
   "extension_runner_util",
   "extension_tensor",
+  "extension_tokenizers",
 ]
 
 [targets.extension_tensor]
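
Usage sketch (reviewer note, not part of the patch): with this change, callers migrate from example::Runner::create() to the free function example::create_llama_runner(), which now returns the generic executorch::extension::llm::TextLLMRunner. The sketch below mirrors the updated examples/models/llama/main.cpp flow; the include path, model/tokenizer file names, prompt, and callback bodies are illustrative assumptions, GenerationConfig is populated only with fields that appear in this diff (echo, max_new_tokens), and the callback parameter types follow the existing llm::IRunner interface.

// Minimal driver sketch against the new factory API (paths are placeholders).
#include <cinttypes>
#include <cstdio>
#include <memory>
#include <string>

#include <executorch/examples/models/llama/runner/runner.h>  // assumed include path

using executorch::extension::llm::GenerationConfig;
using executorch::extension::llm::Stats;
using executorch::extension::llm::TextLLMRunner;

int main() {
  // create_llama_runner() loads the tokenizer and builds a TextLLMRunner
  // around the .pte module; it returns nullptr on failure.
  std::unique_ptr<TextLLMRunner> runner =
      example::create_llama_runner("llama3_2.pte", "tokenizer.model");
  if (runner == nullptr) {
    return 1;
  }

  GenerationConfig config;
  config.echo = true;           // echo the prompt before generated tokens
  config.max_new_tokens = 128;  // resolved against the model's context length metadata

  // generate() streams decoded pieces through the token callback and reports
  // timing/token counts through the stats callback once generation finishes.
  auto err = runner->generate(
      "Tell me a short story.",
      config,
      [](const std::string& piece) { /* pieces are also printed by the runner itself */ },
      [](const Stats& stats) {
        printf("generated %" PRId64 " tokens\n", stats.num_generated_tokens);
      });
  return err == executorch::runtime::Error::Ok ? 0 : 1;
}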
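
A second sketch for the lower-level API this patch adds in extension/llm/runner/text_llm_runner.h: llm::load_tokenizer() tries an HF JSON tokenizer, then Tiktoken, then the Llama2c BPE tokenizer, and llm::create_text_llm_runner() wires up Module, TextDecoderRunner, TextPrefiller and TextTokenGenerator from the resulting tokenizer plus the model's metadata methods. The helper function make_runner is hypothetical, as is relying on load_tokenizer()'s default special tokens, pattern, and BOS/EOS indices; llama callers would normally go through example::create_llama_runner(), which passes get_special_tokens(Version::Default) instead.

// Sketch: build a TextLLMRunner without the llama-specific helpers.
#include <memory>
#include <string>

#include <executorch/extension/llm/runner/text_llm_runner.h>  // assumed include path

namespace llm = executorch::extension::llm;

// Hypothetical helper: returns nullptr if the tokenizer or model cannot be loaded.
std::unique_ptr<llm::TextLLMRunner> make_runner(
    const std::string& model_path,
    const std::string& tokenizer_path) {
  // Default arguments assumed: no extra special tokens, no custom regex
  // pattern, BOS index 0 and EOS index 1 (see the load_tokenizer()
  // declaration in the header above).
  auto tokenizer = llm::load_tokenizer(tokenizer_path);
  if (tokenizer == nullptr) {
    return nullptr;
  }
  // create_text_llm_runner() reads the exported metadata methods
  // (use_kv_cache, get_max_context_len, get_eos_ids, ...) to configure
  // prefill and token generation.
  return llm::create_text_llm_runner(model_path, std::move(tokenizer));
}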