diff --git a/examples/models/llama/main.cpp b/examples/models/llama/main.cpp index 38009dd59ec..5d34bf932e7 100644 --- a/examples/models/llama/main.cpp +++ b/examples/models/llama/main.cpp @@ -42,6 +42,16 @@ DEFINE_int32( -1, "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device."); +DEFINE_int32( + num_bos, + 0, + "Number of BOS tokens to prepend to the prompt. Defaults to 0. If > 0, the prompt will be prepended with BOS tokens. This is useful for models that expect one or more BOS token at the start."); + +DEFINE_int32( + num_eos, + 0, + "Number of EOS tokens to append to the prompt. Defaults to 0. If > 0, the prompt will be appended with EOS tokens. This is useful for models that expect one or more EOS token at the end."); + DEFINE_bool(warmup, false, "Whether to run a warmup run."); int32_t main(int32_t argc, char** argv) { diff --git a/examples/qualcomm/CMakeLists.txt b/examples/qualcomm/CMakeLists.txt index 4b0e6b2d3a2..757c7518f0c 100644 --- a/examples/qualcomm/CMakeLists.txt +++ b/examples/qualcomm/CMakeLists.txt @@ -35,7 +35,10 @@ find_package(gflags REQUIRED) set(_common_compile_options -Wno-deprecated-declarations -fPIC) # Let files say "include ". -set(_common_include_directories ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/json/single_include) +set(_common_include_directories + ${EXECUTORCH_ROOT}/.. + ${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/json/single_include +) # # The `__srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}. @@ -72,20 +75,11 @@ target_include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/third-party/llama.cpp-unicode/src ) -# find RE2 for tokenizer -set(ABSL_ENABLE_INSTALL ON) -set(ABSL_PROPAGATE_CXX_STD ON) -set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) -add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/third-party/abseil-cpp - ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp -) +# add tokenizers add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/third-party/re2 - ${CMAKE_CURRENT_BINARY_DIR}/re2 + ${EXECUTORCH_ROOT}/extension/llm/tokenizers + ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/tokenizers ) -set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) # build qnn_executor_runner add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/executor_runner) diff --git a/examples/qualcomm/oss_scripts/llama/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt index 246a47fceba..dadf51bf298 100644 --- a/examples/qualcomm/oss_scripts/llama/CMakeLists.txt +++ b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt @@ -6,16 +6,16 @@ # model sharding with custom op set(CUSTOM_OP_SRCS_FILE - "${EXECUTORCH_SOURCE_DIR}/extension/llm/custom_ops/op_fallback.cpp" + "${EXECUTORCH_SOURCE_DIR}/extension/llm/custom_ops/op_fallback.cpp" ) +set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) + add_library(custom_ops ${CUSTOM_OP_SRCS_FILE}) target_include_directories(custom_ops PUBLIC "${_common_include_directories}") target_include_directories( custom_ops PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/../../include" ) -target_link_libraries( - custom_ops PUBLIC full_portable_ops_lib -) +target_link_libraries(custom_ops PUBLIC full_portable_ops_lib) target_link_options_shared_lib(custom_ops) # preprocess qnn runner src files for llama @@ -44,17 +44,15 @@ list( ${CMAKE_CURRENT_LIST_DIR}/runner/kv_manager.h ) -list( - APPEND - _llama_runner__srcs - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/src/tiktoken.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../../models/llama/tokenizer/llama_tiktoken.cpp -) +list(APPEND _llama_runner__srcs) # build qnn llama runner add_executable(qnn_llama_runner ${_llama_runner__srcs}) target_include_directories( - qnn_llama_runner PUBLIC ${_common_include_directories} ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/include + qnn_llama_runner + PUBLIC + ${_common_include_directories} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/include ) target_link_options_shared_lib(quantized_ops_lib) @@ -68,14 +66,12 @@ target_link_libraries( extension_module extension_tensor gflags - re2::re2 custom_ops quantized_ops_lib quantized_kernels + tokenizers ) -target_compile_options( - qnn_llama_runner PUBLIC ${_common_compile_options} -) +target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options}) set_target_properties( qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" ) diff --git a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt index 4e44a1599b1..2a13bbe861c 100644 --- a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt +++ b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt @@ -6,6 +6,8 @@ # preprocess qaihub runner src files for llama2,3 set(_qaihub_llama_runner__srcs ${_llama_runner__srcs}) +set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) + list(TRANSFORM _qaihub_llama_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") list(FILTER _qaihub_llama_runner__srcs EXCLUDE REGEX ".*(/runner/).*") list( @@ -26,13 +28,11 @@ list(PREPEND _qaihub_llama2_7b_runner__srcs # build qaihub llama2 7b runner add_executable(qaihub_llama2_7b_runner ${_qaihub_llama2_7b_runner__srcs}) + target_include_directories( - qaihub_llama2_7b_runner PUBLIC - ${_common_include_directories} - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/include - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/third-party/json/single_include - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/third-party/llama.cpp-unicode/include - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/third-party/llama.cpp-unicode/src + qaihub_llama2_7b_runner + PUBLIC ${_common_include_directories} + ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include ) target_link_libraries( qaihub_llama2_7b_runner @@ -43,7 +43,7 @@ target_link_libraries( extension_module extension_tensor gflags - re2::re2 + tokenizers ) target_compile_options( qaihub_llama2_7b_runner PUBLIC ${_common_compile_options} @@ -62,25 +62,13 @@ list(PREPEND _qaihub_llama3_8b_runner__srcs # Adding a compile option to differentiate llama2 with llama3 logic list(APPEND _common_compile_options -DQAIHUB_LLAMA3_RUNNER) -list( - APPEND _qaihub_llama3_8b_runner__srcs - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/src/tiktoken.cpp -) -list( - APPEND - _qaihub_llama3_8b_runner__srcs - ${CMAKE_CURRENT_SOURCE_DIR}/../../../models/llama/tokenizer/llama_tiktoken.cpp -) - # build qaihub llama3 8b runner add_executable(qaihub_llama3_8b_runner ${_qaihub_llama3_8b_runner__srcs}) target_include_directories( - qaihub_llama3_8b_runner PUBLIC - ${_common_include_directories} + qaihub_llama3_8b_runner + PUBLIC + ${_common_include_directories} ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/include - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/third-party/json/single_include - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/third-party/llama.cpp-unicode/include - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/third-party/llama.cpp-unicode/src ) target_link_libraries( @@ -92,7 +80,7 @@ target_link_libraries( extension_module extension_tensor gflags - re2::re2 + tokenizers ) target_compile_options( qaihub_llama3_8b_runner PUBLIC ${_common_compile_options} diff --git a/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj b/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj index 47a7af09dbd..00c1eb16079 100644 --- a/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj +++ b/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj @@ -38,6 +38,7 @@ F292B01D2D88AF3500BE6839 /* bpe_tokenizer_base.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F292B0162D88AF3500BE6839 /* bpe_tokenizer_base.cpp */; }; F292B0202D88AF3500BE6839 /* llama2c_tokenizer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F292B0172D88AF3500BE6839 /* llama2c_tokenizer.cpp */; }; F292B0212D88AF3500BE6839 /* tiktoken.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F292B01A2D88AF3500BE6839 /* tiktoken.cpp */; }; + F2E1B5172E03AC19002C9718 /* sentencepiece.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F2E1B5162E03AC19002C9718 /* sentencepiece.cpp */; }; /* End PBXBuildFile section */ /* Begin PBXContainerItemProxy section */ @@ -110,6 +111,7 @@ F292B0292D88AF4800BE6839 /* result.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = result.h; sourceTree = ""; }; F292B02B2D88AF4800BE6839 /* tiktoken.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = tiktoken.h; sourceTree = ""; }; F292B02D2D88AF4800BE6839 /* tokenizer.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = tokenizer.h; sourceTree = ""; }; + F2E1B5162E03AC19002C9718 /* sentencepiece.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = sentencepiece.cpp; path = src/sentencepiece.cpp; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -183,6 +185,7 @@ 032A74022CAFBB7800932D36 /* tokenizers */ = { isa = PBXGroup; children = ( + F2E1B5162E03AC19002C9718 /* sentencepiece.cpp */, 3C6ABD322DFA27DE0015DE55 /* regex_lookahead.cpp */, 30AA4B592DC0766800B1BE50 /* hf_tokenizer.cpp */, 30AA4B5A2DC0766800B1BE50 /* pcre2_regex.cpp */, @@ -426,6 +429,7 @@ F292B01D2D88AF3500BE6839 /* bpe_tokenizer_base.cpp in Sources */, F292B0202D88AF3500BE6839 /* llama2c_tokenizer.cpp in Sources */, F292B0212D88AF3500BE6839 /* tiktoken.cpp in Sources */, + F2E1B5172E03AC19002C9718 /* sentencepiece.cpp in Sources */, 03E7E6792CBDCAE900205E71 /* CoreMLTests.mm in Sources */, 032A74232CAFC1B300932D36 /* runner.cpp in Sources */, 03B2D37A2C8A515C0046936E /* GenericTests.mm in Sources */, diff --git a/extension/benchmark/apple/Benchmark/Tests/Tests.xcconfig b/extension/benchmark/apple/Benchmark/Tests/Tests.xcconfig index 0172f28b1bb..bf915abc25b 100644 --- a/extension/benchmark/apple/Benchmark/Tests/Tests.xcconfig +++ b/extension/benchmark/apple/Benchmark/Tests/Tests.xcconfig @@ -17,7 +17,9 @@ OTHER_LDFLAGS = $(inherited) \ HEADER_SEARCH_PATHS = $(inherited) \ $(SRCROOT)/../../../../.. \ $(TEMP_DIR)/cmake/include \ - $(SRCROOT)/../../../../extension/llm/tokenizers/include + $(SRCROOT)/../../../../extension/llm/tokenizers/include \ + $(SRCROOT)/../../../../extension/llm/tokenizers/third-party/sentencepiece \ + $(SRCROOT)/../../../../extension/llm/tokenizers/third-party/sentencepiece/src LIBRARY_SEARCH_PATHS = $(inherited) \ $(TEMP_DIR)/cmake/lib diff --git a/extension/llm/runner/irunner.h b/extension/llm/runner/irunner.h index 5564cd09441..4c2efc91203 100644 --- a/extension/llm/runner/irunner.h +++ b/extension/llm/runner/irunner.h @@ -49,6 +49,10 @@ struct GenerationConfig { // Temperature for sampling (higher = more random) float temperature = 0.8f; + // Number of eos and bos to add to the prompt + int32_t num_bos = 0; + int32_t num_eos = 0; + /** * Resolve the maximum number of new tokens to generate based on constraints. * diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl index 2e8231748ed..244515112ac 100644 --- a/extension/llm/runner/targets.bzl +++ b/extension/llm/runner/targets.bzl @@ -103,7 +103,7 @@ def define_common_targets(): ":text_token_generator" + aten_suffix, "//pytorch/tokenizers:hf_tokenizer", "//pytorch/tokenizers:llama2c_tokenizer", - # "//pytorch/tokenizers:sentencepiece", # TODO(larryliu0820) Make sure this compiles in xplat. + "//pytorch/tokenizers:sentencepiece", "//pytorch/tokenizers:tiktoken", ], ) diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp index 691073ef45c..6a0cfd45044 100644 --- a/extension/llm/runner/text_llm_runner.cpp +++ b/extension/llm/runner/text_llm_runner.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include namespace executorch::extension::llm { @@ -116,8 +117,8 @@ Error TextLLMRunner::generate_from_pos( ::tokenizers::Result> encode_res = tokenizer_->encode( prompt, - /* bos */ 0, - /* eos */ 0); + /*bos=*/config.num_bos, + /*eos=*/config.num_eos); ET_CHECK_TK_OK_OR_RETURN_ERROR( encode_res.error(), "Failed to encode prompt %s", prompt.c_str()); @@ -278,6 +279,12 @@ std::unique_ptr load_tokenizer( return tiktoken_tokenizer; } + auto sp_tokenizer = std::make_unique<::tokenizers::SPTokenizer>(); + if (sp_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { + ET_LOG(Info, "Loaded Sentencepiece tokenizer"); + return sp_tokenizer; + } + auto bpe_tokenizer = std::make_unique<::tokenizers::Llama2cTokenizer>(); if (bpe_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) { ET_LOG(Info, "Loaded BPE tokenizer"); diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers index fc320288580..ffd2973e887 160000 --- a/extension/llm/tokenizers +++ b/extension/llm/tokenizers @@ -1 +1 @@ -Subproject commit fc32028858020c4fcafe37aaaeaf5d1b480336a2 +Subproject commit ffd2973e8879f64c78f01a3f4aa0f77bdc5a1abe