
Commit f99e25f

[llama] Build the runner with tiktoken by default
Differential Revision: D61830302
Pull Request resolved: #4921
1 parent ff4a736 commit f99e25f
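
This change removes the compile-time tokenizer switch everywhere: the `EXECUTORCH_USE_TIKTOKEN` CMake option, the `ET_USE_TIKTOKEN` preprocessor flag, and the Buck `llama.use_tiktoken` config are deleted, and the runner now compiles and links both the BPE and Tiktoken tokenizers. Tokenizer selection moves to runtime: the runner tries to load the artifact as BPE first and falls back to Tiktoken, so a single binary serves both Llama2- and Llama3-style tokenizer files.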

File tree

16 files changed (+108 -91 lines)


.ci/scripts/build-qnn-sdk.sh

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@
 # LICENSE file in the root directory of this source tree.
 
 set -eux
+set -o xtrace
 
 build_qnn_backend() {
   echo "Start building qnn backend."

backends/qualcomm/scripts/build.sh

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 set -e
+set -o xtrace
 
 if [[ -z ${QNN_SDK_ROOT} ]]; then
   echo "Please export QNN_SDK_ROOT=/path/to/qnn_sdk"

build/build_android_llm_demo.sh

Lines changed: 0 additions & 7 deletions
@@ -20,11 +20,6 @@ build_android_native_library() {
   TOKENIZER="$2"
   ANDROID_NDK="${ANDROID_NDK:-/opt/ndk}"
   CMAKE_OUT="cmake-out-android-${ANDROID_ABI}"
-  if [[ $TOKENIZER == "tiktoken" ]]; then
-    EXECUTORCH_USE_TIKTOKEN=ON
-  else
-    EXECUTORCH_USE_TIKTOKEN=OFF
-  fi
 
   cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
     -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \
@@ -54,7 +49,6 @@ build_android_native_library() {
     -DANDROID_ABI="$ANDROID_ABI" \
     -DANDROID_PLATFORM=android-23 \
     -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
-    -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_XNNPACK=ON \
@@ -72,7 +66,6 @@
     -DEXECUTORCH_ENABLE_LOGGING=ON \
     -DEXECUTORCH_LOG_LEVEL=Info \
     -DEXECUTORCH_BUILD_LLAMA_JNI=ON \
-    -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \
     -DCMAKE_BUILD_TYPE=Release \
     -B"${CMAKE_OUT}"/extension/android

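Note that `TOKENIZER="$2"` is still read at the top of `build_android_native_library()`; it just no longer maps to a compile-time flag. With the `EXECUTORCH_USE_TIKTOKEN` plumbing gone, the tokenizer choice is deferred to runtime inside the runner.
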
examples/demo-apps/android/LlamaDemo/README.md

Lines changed: 2 additions & 10 deletions
@@ -64,22 +64,14 @@ Note: `<path_to_android_ndk>` is the root for the NDK, which is usually under
 `~/Library/Android/sdk/ndk/XX.Y.ZZZZZ` for macOS, and contains NOTICE and README.md.
 We use `<path_to_android_ndk>/build/cmake/android.toolchain.cmake` for CMake to cross-compile.
 
-3. (Optional) If you need to use tiktoken as the tokenizer (for LLaMA3), set
-`EXECUTORCH_USE_TIKTOKEN=ON` and later CMake will use it as the tokenizer.
-If you need to run other models like LLaMA2, skip this step.
-
-```bash
-export EXECUTORCH_USE_TIKTOKEN=ON # Only for LLaMA3
-```
-
-4. Build the Android Java extension code:
+3. Build the Android Java extension code:
 ```bash
 pushd extension/android
 ./gradlew build
 popd
 ```
 
-5. Run the following command to set up the required JNI library:
+4. Run the following command to set up the required JNI library:
 ```bash
 pushd examples/demo-apps/android/LlamaDemo
 ./gradlew :app:setup
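
With the tiktoken export step removed, the remaining steps renumber to 3 and 4, and the flow reduces to the commands this diff keeps:

```bash
# Step 3: build the Android Java extension code
pushd extension/android
./gradlew build
popd

# Step 4: set up the required JNI library
pushd examples/demo-apps/android/LlamaDemo
./gradlew :app:setup
```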

examples/demo-apps/android/LlamaDemo/setup.sh

Lines changed: 0 additions & 2 deletions
@@ -35,7 +35,6 @@ cmake examples/models/llama2 \
   -DANDROID_ABI="$ANDROID_ABI" \
   -DANDROID_PLATFORM=android-23 \
   -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
-  -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \
   -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
   -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
   -DEXECUTORCH_BUILD_XNNPACK=ON \
@@ -50,7 +49,6 @@ cmake extension/android \
   -DANDROID_PLATFORM=android-23 \
   -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \
   -DEXECUTORCH_BUILD_LLAMA_JNI=ON \
-  -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \
   -DCMAKE_BUILD_TYPE=Release \
   -B"${CMAKE_OUT}"/extension/android

examples/models/llama2/CMakeLists.txt

Lines changed: 0 additions & 19 deletions
@@ -21,8 +21,6 @@ project(llama_runner)
 # Duplicating options as root CMakeLists.txt
 option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" OFF)
 
-option(EXECUTORCH_USE_TIKTOKEN "Use Tiktoken as a tokenizer" OFF)
-
 include(CMakeDependentOption)
 #
 # pthreadpool: build pthreadpool library. Disable on unsupported platforms
@@ -94,23 +92,6 @@ endif()
 
 # llama_runner library
 add_subdirectory(runner)
-if(EXECUTORCH_USE_TIKTOKEN)
-  # find RE2 for tokenizer
-  set(ABSL_ENABLE_INSTALL ON)
-  set(ABSL_PROPAGATE_CXX_STD ON)
-  set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
-  set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-  add_subdirectory(
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/third-party/abseil-cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
-  )
-  add_subdirectory(
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/third-party/re2
-    ${CMAKE_CURRENT_BINARY_DIR}/re2
-  )
-  set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
-  target_link_libraries(llama_runner PUBLIC re2::re2)
-endif()
 
 set(link_libraries gflags)
 set(_srcs main.cpp)
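
The RE2/abseil setup deleted here does not disappear from the build: the same block is re-added unconditionally in `examples/models/llama2/runner/CMakeLists.txt` further below, so the tokenizer dependencies now live where the `llama_runner` target is actually defined.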

examples/models/llama2/README.md

Lines changed: 0 additions & 3 deletions
@@ -227,8 +227,6 @@ Note for Mac users: There's a known linking issue with Xcode 15.1. Refer to the
 cmake --build cmake-out/examples/models/llama2 -j16 --config Release
 ```
 
-For Llama3, add `-DEXECUTORCH_USE_TIKTOKEN=ON` option when building the llama runner.
-
 3. Run model. Run options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/main.cpp#L18-L40).
 ```
 cmake-out/examples/models/llama2/llama_main --model_path=<model pte file> --tokenizer_path=<tokenizer.bin> --prompt=<prompt>
@@ -283,7 +281,6 @@ cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
 
 cmake --build cmake-out-android/examples/models/llama2 -j16 --config Release
 ```
-For Llama3, add `-DEXECUTORCH_USE_TIKTOKEN=ON` option when building the llama runner.
 
 **2. Run on Android via adb shell**
 
examples/models/llama2/runner/CMakeLists.txt

Lines changed: 25 additions & 11 deletions
@@ -41,16 +41,13 @@ target_include_directories(
   extension_module INTERFACE ${_common_include_directories}
 )
 
-if(EXECUTORCH_USE_TIKTOKEN)
-  list(
-    APPEND _llama_runner__srcs
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp
-  )
-  list(APPEND _llama_runner__srcs
-    ${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/llama_tiktoken.cpp
-  )
-  set(_preprocessor_flag -DET_USE_TIKTOKEN)
-endif()
+list(
+  APPEND _llama_runner__srcs
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp
+)
+list(APPEND _llama_runner__srcs
+  ${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/llama_tiktoken.cpp
+)
 
 if(CMAKE_TOOLCHAIN_IOS
    OR ANDROID
@@ -63,7 +60,24 @@ else()
   add_library(llama_runner SHARED ${_llama_runner__srcs})
 endif()
 
-set(llama_runner_deps executorch extension_module extension_data_loader)
+# find RE2 for tokenizer, build tiktoken
+set(ABSL_ENABLE_INSTALL ON)
+set(ABSL_PROPAGATE_CXX_STD ON)
+set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+add_subdirectory(
+  ${EXECUTORCH_ROOT}/extension/llm/third-party/abseil-cpp
+  ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
+)
+add_subdirectory(
+  ${EXECUTORCH_ROOT}/extension/llm/third-party/re2
+  ${CMAKE_CURRENT_BINARY_DIR}/re2
+)
+set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
+
+set(llama_runner_deps executorch extension_module extension_data_loader
+    re2::re2
+)
 
 target_link_libraries(llama_runner PUBLIC ${llama_runner_deps})
 
examples/models/llama2/runner/runner.cpp

Lines changed: 13 additions & 12 deletions
@@ -16,11 +16,8 @@
 #include <executorch/extension/llm/runner/util.h>
 #include <executorch/extension/runner_util/managed_tensor.h>
 
-#if ET_USE_TIKTOKEN
 #include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
-#else /* BPE */
 #include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
-#endif /* ET_USE_TIKTOKEN */
 
 namespace torch::executor {
 namespace {
@@ -46,13 +43,6 @@ Runner::Runner(
     : temperature_(temperature),
       module_(std::make_unique<Module>(model_path, Module::LoadMode::File)),
       tokenizer_path_(tokenizer_path),
-      tokenizer_(
-#if ET_USE_TIKTOKEN
-          get_tiktoken_for_llama()
-#else
-          std::make_unique<BPETokenizer>()
-#endif
-          ),
       metadata_({
           {kAppendEosToPrompt, false},
           {kEnableDynamicShape, false},
@@ -79,8 +69,19 @@ Error Runner::load() {
     return Error::Ok;
   }
   ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward"));
-
-  tokenizer_->load(tokenizer_path_);
+  // load tokenizer
+  tokenizer_ = nullptr;
+  tokenizer_ = std::make_unique<BPETokenizer>();
+  Error err = tokenizer_->load(tokenizer_path_);
+  if (err == Error::InvalidArgument) {
+    ET_LOG(
+        Info,
+        "Failed to load %s as a BPETokenizer artifact, trying Tiktoken",
+        tokenizer_path_.c_str());
+    tokenizer_.reset();
+    tokenizer_ = get_tiktoken_for_llama();
+    tokenizer_->load(tokenizer_path_);
+  }
 
   ET_LOG(Info, "Reading metadata from model");
 
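The runner now selects the tokenizer at runtime: `load()` first tries `BPETokenizer`, and if the artifact is rejected with `Error::InvalidArgument` it retries with the Tiktoken tokenizer from `get_tiktoken_for_llama()`. The practical effect is that one `llama_main` binary handles both artifact styles; a hedged sketch using the run flags from the README above (model and tokenizer file names are placeholders):

```bash
# Llama2-style BPE artifact: loaded by BPETokenizer on the first attempt
cmake-out/examples/models/llama2/llama_main \
  --model_path=llama2.pte --tokenizer_path=tokenizer.bin --prompt="Hello"

# Llama3-style Tiktoken artifact: BPE load fails, runner falls back to Tiktoken
cmake-out/examples/models/llama2/llama_main \
  --model_path=llama3.pte --tokenizer_path=tokenizer.model --prompt="Hello"
```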
examples/models/llama2/runner/targets.bzl

Lines changed: 1 addition & 7 deletions
@@ -8,9 +8,6 @@ def _get_operator_lib(aten = False):
     else:
         return ["//executorch/configurations:optimized_native_cpu_ops", "//executorch/extension/llm/custom_ops:custom_ops"]
 
-def use_tiktoken():
-    return native.read_config("llama", "use_tiktoken", "0") == "1"
-
 def define_common_targets():
     for aten in (True, False):
         aten_suffix = "_aten" if aten else ""
@@ -26,7 +23,6 @@ def define_common_targets():
             preprocessor_flags = [
                 "-DUSE_ATEN_LIB",
             ] if aten else [],
-            exported_preprocessor_flags = ["-DET_USE_TIKTOKEN"] if use_tiktoken() else [],
             visibility = [
                 "@EXECUTORCH_CLIENTS",
             ],
@@ -43,11 +39,9 @@ def define_common_targets():
                 "//executorch/kernels/quantized:generated_lib" + aten_suffix,
                 "//executorch/runtime/core/exec_aten:lib" + aten_suffix,
                 "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix,
-            ] + ([
                 "//executorch/examples/models/llama2/tokenizer:tiktoken",
-            ] if use_tiktoken() else [
                 "//executorch/extension/llm/tokenizer:bpe_tokenizer",
-            ]) + (_get_operator_lib(aten)) + ([
+            ] + (_get_operator_lib(aten)) + ([
                 # Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE)
                 # Therefore enable it explicitly for now to avoid failing tests
                 "//executorch/backends/vulkan:vulkan_backend_lib",
