Skip to content

Commit 90b9c2b

Browse files
committed
tokenizer
1 parent c8311e6 commit 90b9c2b

File tree

10 files changed

+165
-71
lines changed

10 files changed

+165
-71
lines changed

CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,10 @@ option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR "Build the Flat Tensor extension"
186186
OFF
187187
)
188188

189+
option(EXECUTORCH_BUILD_EXTENSION_LLM "Build the LLM extension"
190+
OFF
191+
)
192+
189193
option(EXECUTORCH_BUILD_EXTENSION_MODULE "Build the Module extension" OFF)
190194

191195
option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL "Build the Runner Util extension"
@@ -717,6 +721,10 @@ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
717721
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor/serialize)
718722
endif()
719723

724+
if(EXECUTORCH_BUILD_EXTENSION_LLM)
725+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/tokenizer)
726+
endif()
727+
720728
if(EXECUTORCH_BUILD_EXTENSION_MODULE)
721729
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
722730
endif()

build/cmake_deps.toml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,22 @@ deps = [
402402
"xnnpack_backend",
403403
]
404404

405+
[targets.extension_llm_tokenizer]
406+
buck_targets = [
407+
"//extension/llm/tokenizer:bpe_tokenizer",
408+
"//extension/llm/tokenizer:tiktoken",
409+
]
410+
filters = [
411+
".cpp$",
412+
]
413+
excludes = [
414+
"^codegen",
415+
]
416+
deps = [
417+
"executorch",
418+
"executorch_core",
419+
]
420+
405421
[targets.llama_runner]
406422
buck_targets = [
407423
"//examples/models/llama/runner:runner",
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
# Please keep this file formatted by running:
8+
# ~~~
9+
# cmake-format -i CMakeLists.txt
10+
# ~~~
11+
12+
cmake_minimum_required(VERSION 3.19)
13+
14+
# Source root directory for executorch.
15+
if(NOT EXECUTORCH_ROOT)
16+
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
17+
endif()
18+
19+
set(ABSL_ENABLE_INSTALL ON)
20+
set(ABSL_PROPAGATE_CXX_STD ON)
21+
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
22+
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
23+
add_subdirectory(
24+
${CMAKE_CURRENT_SOURCE_DIR}/../third-party/abseil-cpp
25+
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
26+
)
27+
add_subdirectory(
28+
${CMAKE_CURRENT_SOURCE_DIR}/../third-party/re2
29+
${CMAKE_CURRENT_BINARY_DIR}/re2
30+
)
31+
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
32+
33+
list(TRANSFORM _extension_llm_tokenizer__srcs PREPEND "${EXECUTORCH_ROOT}/")
34+
add_library(extension_llm_tokenizer ${_extension_llm_tokenizer__srcs})
35+
target_include_directories(extension_llm_tokenizer PUBLIC
36+
${EXECUTORCH_ROOT}/..
37+
${_common_include_directories})
38+
target_link_libraries(extension_llm_tokenizer PUBLIC re2::re2)
39+
40+
target_compile_options(extension_llm_tokenizer PUBLIC ${_common_compile_options})
41+
42+
# Install libraries
43+
install(
44+
TARGETS extension_llm_tokenizer
45+
DESTINATION lib
46+
INCLUDES
47+
DESTINATION ${_common_include_directories}
48+
)
49+
50+
target_include_directories(
51+
extension_llm_tokenizer
52+
PRIVATE ${CMAKE_INSTALL_PREFIX}/include
53+
${CMAKE_CURRENT_SOURCE_DIR}/../third-party/abseil-cpp
54+
)
55+
56+
if(BUILD_TESTING)
57+
add_subdirectory(test)
58+
endif()

extension/llm/tokenizer/test/CMakeLists.txt

Lines changed: 34 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7+
# @generated by test/utils/generate_gtest_cmakelists.py
8+
#
79
# This file should be formatted with
810
# ~~~
911
# cmake-format -i CMakeLists.txt
@@ -12,39 +14,42 @@
1214
#
1315

1416
cmake_minimum_required(VERSION 3.19)
15-
project(tokenizer_test)
16-
17-
# Use C++17 for test.
18-
set(CMAKE_CXX_STANDARD 17)
1917

2018
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..)
2119

2220
include(${EXECUTORCH_ROOT}/build/Test.cmake)
2321

24-
set(_tokenizer_test_srcs
25-
test_tiktoken.cpp test_bpe_tokenizer.cpp
26-
${CMAKE_CURRENT_SOURCE_DIR}/../tiktoken.cpp
27-
${CMAKE_CURRENT_SOURCE_DIR}/../bpe_tokenizer.cpp
28-
)
29-
30-
set(ENV{RESOURCES_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/resources)
31-
set(ABSL_ENABLE_INSTALL ON)
32-
set(ABSL_PROPAGATE_CXX_STD ON)
33-
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
34-
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
35-
add_subdirectory(
36-
${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp
37-
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
38-
)
39-
add_subdirectory(
40-
${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/re2
41-
${CMAKE_CURRENT_BINARY_DIR}/re2
22+
set(test_env
23+
"TEST_BPE_TOKENIZER=${EXECUTORCH_ROOT}/extension/llm/tokenizer/test/resources/test_bpe_tokenizer.bin"
24+
"TEST_TIKTOKEN_INVALID_BASE64=${EXECUTORCH_ROOT}/extension/llm/tokenizer/test/resources/test_tiktoken_invalid_base64.model"
25+
"TEST_TIKTOKEN_INVALID_RANK=${EXECUTORCH_ROOT}/extension/llm/tokenizer/test/resources/test_tiktoken_invalid_rank.model"
26+
"TEST_TIKTOKEN_NO_SPACE=${EXECUTORCH_ROOT}/extension/llm/tokenizer/test/resources/test_tiktoken_no_space.model"
27+
"TEST_TIKTOKEN_TOKENIZER=${EXECUTORCH_ROOT}/extension/llm/tokenizer/test/resources/test_tiktoken_tokenizer.model"
4228
)
43-
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
4429

45-
et_cxx_test(tokenizer_test SOURCES ${_tokenizer_test_srcs} EXTRA_LIBS re2::re2)
46-
target_include_directories(
47-
tokenizer_test
48-
PRIVATE ${CMAKE_INSTALL_PREFIX}/include
49-
${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp
50-
)
30+
# set(ABSL_ENABLE_INSTALL ON)
31+
# set(ABSL_PROPAGATE_CXX_STD ON)
32+
# set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
33+
# set(CMAKE_POSITION_INDEPENDENT_CODE ON)
34+
# add_subdirectory(
35+
# ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp
36+
# ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
37+
# )
38+
# add_subdirectory(
39+
# ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/re2
40+
# ${CMAKE_CURRENT_BINARY_DIR}/re2
41+
# )
42+
# set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
43+
44+
set(_test_srcs test_bpe_tokenizer.cpp test_tiktoken.cpp)
45+
46+
et_cxx_test(extension_llm_tokenizer_test SOURCES ${_test_srcs} EXTRA_LIBS extension_llm_tokenizer)
47+
48+
set_property(TEST extension_llm_tokenizer_test PROPERTY ENVIRONMENT ${test_env})
49+
50+
# target_include_directories(extension_llm_tokenizer PUBLIC)
51+
# target_include_directories(
52+
# extension_llm_tokenizer_test
53+
# PRIVATE ${CMAKE_INSTALL_PREFIX}/include
54+
# ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp
55+
# )

extension/llm/tokenizer/test/targets.bzl

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ def define_common_targets():
77
TARGETS and BUCK files that call this function.
88
"""
99

10+
test_env = {
11+
"TEST_BPE_TOKENIZER": "$(location //executorch/extension/llm/tokenizer/test/resources/test_bpe_tokenizer.bin)",
12+
"TEST_TIKTOKEN_INVALID_BASE64": "$(location //executorch/extension/llm/tokenizer/test/resources/test_tiktoken_invalid_base64.model)",
13+
"TEST_TIKTOKEN_INVALID_RANK": "$(location //executorch/extension/llm/tokenizer/test/resources/test_tiktoken_invalid_rank.model)",
14+
"TEST_TIKTOKEN_NO_SPACE": "$(location //executorch/extension/llm/tokenizer/test/resources/test_tiktoken_no_space.model)",
15+
"TEST_TIKTOKEN_TOKENIZER": "$(location //executorch/extension/llm/tokenizer/test/resources/test_tiktoken_tokenizer.model)",
16+
}
17+
1018
runtime.python_test(
1119
name = "test_tokenizer_py",
1220
srcs = [
@@ -25,9 +33,7 @@ def define_common_targets():
2533
deps = [
2634
"//executorch/extension/llm/tokenizer:bpe_tokenizer",
2735
],
28-
env = {
29-
"RESOURCES_PATH": "$(location :resources)/resources",
30-
},
36+
env = test_env,
3137
)
3238

3339
runtime.cxx_test(
@@ -45,10 +51,3 @@ def define_common_targets():
4551
"re2",
4652
],
4753
)
48-
49-
runtime.filegroup(
50-
name = "resources",
51-
srcs = native.glob([
52-
"resources/**",
53-
]),
54-
)

extension/llm/tokenizer/test/test_bpe_tokenizer.cpp

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9-
#ifdef EXECUTORCH_FB_BUCK
10-
#include <TestResourceUtils/TestResourceUtils.h>
11-
#endif
9+
// #ifdef EXECUTORCH_FB_BUCK
10+
// #include <TestResourceUtils/TestResourceUtils.h>
11+
// #endif
1212
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
1313
#include <executorch/runtime/platform/runtime.h>
1414
#include <gtest/gtest.h>
@@ -26,13 +26,15 @@ class TokenizerExtensionTest : public Test {
2626
void SetUp() override {
2727
executorch::runtime::runtime_init();
2828
tokenizer_ = std::make_unique<BPETokenizer>();
29-
#ifdef EXECUTORCH_FB_BUCK
30-
modelPath_ = facebook::xplat::testing::getPathForTestResource(
31-
"resources/test_bpe_tokenizer.bin");
32-
#else
33-
modelPath_ =
34-
std::getenv("RESOURCES_PATH") + std::string("/test_bpe_tokenizer.bin");
35-
#endif
29+
modelPath_ = std::getenv("TEST_BPE_TOKENIZER");
30+
// #ifdef EXECUTORCH_FB_BUCK
31+
// modelPath_ = facebook::xplat::testing::getPathForTestResource(
32+
// "resources/test_bpe_tokenizer.bin");
33+
// #else
34+
// modelPath_ =
35+
// std::getenv("RESOURCES_PATH") +
36+
// std::string("/test_bpe_tokenizer.bin");
37+
// #endif
3638
}
3739

3840
std::unique_ptr<Tokenizer> tokenizer_;

extension/llm/tokenizer/test/test_tiktoken.cpp

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9-
#ifdef EXECUTORCH_FB_BUCK
10-
#include <TestResourceUtils/TestResourceUtils.h>
11-
#endif
9+
// #ifdef EXECUTORCH_FB_BUCK
10+
// #include <TestResourceUtils/TestResourceUtils.h>
11+
// #endif
1212
#include <executorch/extension/llm/tokenizer/tiktoken.h>
1313
#include <executorch/runtime/platform/runtime.h>
1414
#include <gmock/gmock.h>
@@ -50,13 +50,15 @@ static inline std::unique_ptr<std::vector<std::string>> _get_special_tokens() {
5050
return special_tokens;
5151
}
5252

53-
static inline std::string _get_resource_path(const std::string& name) {
54-
#ifdef EXECUTORCH_FB_BUCK
55-
return facebook::xplat::testing::getPathForTestResource("resources/" + name);
56-
#else
57-
return std::getenv("RESOURCES_PATH") + std::string("/") + name;
58-
#endif
59-
}
53+
// static inline std::string _get_resource_path(const std::string& name) {
54+
// return std::get_env(name);
55+
// #ifdef EXECUTORCH_FB_BUCK
56+
// return facebook::xplat::testing::getPathForTestResource("resources/" +
57+
// name);
58+
// #else
59+
// return std::getenv("RESOURCES_PATH") + std::string("/") + name;
60+
// #endif
61+
// }
6062

6163
} // namespace
6264

@@ -66,7 +68,7 @@ class TiktokenExtensionTest : public Test {
6668
executorch::runtime::runtime_init();
6769
tokenizer_ = std::make_unique<Tiktoken>(
6870
_get_special_tokens(), kBOSTokenIndex, kEOSTokenIndex);
69-
modelPath_ = _get_resource_path("test_tiktoken_tokenizer.model");
71+
modelPath_ = std::getenv("TEST_TIKTOKEN_TOKENIZER");
7072
}
7173

7274
std::unique_ptr<Tokenizer> tokenizer_;
@@ -160,30 +162,28 @@ TEST_F(TiktokenExtensionTest, LoadWithInvalidPath) {
160162
}
161163

162164
TEST_F(TiktokenExtensionTest, LoadTiktokenFileWithInvalidRank) {
163-
auto invalidModelPath =
164-
_get_resource_path("test_tiktoken_invalid_rank.model");
165+
std::string invalidModelPath = std::getenv("TEST_TIKTOKEN_INVALID_RANK");
165166
Error res = tokenizer_->load(invalidModelPath.c_str());
166167

167168
EXPECT_EQ(res, Error::InvalidArgument);
168169
}
169170

170171
TEST_F(TiktokenExtensionTest, LoadTiktokenFileWithInvalidBase64) {
171-
auto invalidModelPath =
172-
_get_resource_path("test_tiktoken_invalid_base64.model");
172+
std::string invalidModelPath = std::getenv("TEST_TIKTOKEN_INVALID_BASE64");
173173
Error res = tokenizer_->load(invalidModelPath.c_str());
174174

175175
EXPECT_EQ(res, Error::InvalidArgument);
176176
}
177177

178178
TEST_F(TiktokenExtensionTest, LoadTiktokenFileWithNoSpace) {
179-
auto invalidModelPath = _get_resource_path("test_tiktoken_no_space.model");
180-
Error res = tokenizer_->load(invalidModelPath.c_str());
179+
auto invalidModelPath = std::get_env("TEST_TIKTOKEN_NO_SPACE") Error res =
180+
tokenizer_->load(invalidModelPath.c_str());
181181

182182
EXPECT_EQ(res, Error::InvalidArgument);
183183
}
184184

185185
TEST_F(TiktokenExtensionTest, LoadTiktokenFileWithBPEFile) {
186-
auto invalidModelPath = _get_resource_path("test_bpe_tokenizer.bin");
186+
std::string invalidModelPath = std::getenv("TEST_BPE_TOKENIZER");
187187
Error res = tokenizer_->load(invalidModelPath.c_str());
188188

189189
EXPECT_EQ(res, Error::InvalidArgument);

runtime/executor/test/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ add_custom_command(
3232
"ModuleAdd,ModuleAddHalf,ModuleDynamicCatUnallocatedIO,ModuleIndex,ModuleLinear,ModuleMultipleEntry,ModuleSimpleTrain"
3333
--outdir "${CMAKE_BINARY_DIR}" 2> /dev/null
3434
COMMAND
35-
python3 -m test.models.export_program --modules
35+
python3 -m test.models.export_program --modules
3636
"ModuleLinear" --external-constants
3737
--outdir "${CMAKE_BINARY_DIR}" 2> /dev/null
3838
COMMAND

test/run_oss_cpp_tests.sh

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,18 +35,17 @@ build_executorch() {
3535
CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
3636
cmake . \
3737
-DCMAKE_INSTALL_PREFIX=cmake-out \
38-
-DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
38+
-DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
3939
-DEXECUTORCH_USE_CPP_CODE_COVERAGE=ON \
4040
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
4141
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
4242
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
4343
-DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
44+
-DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
4445
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
4546
-DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
4647
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
4748
-DEXECUTORCH_BUILD_DEVTOOLS=ON \
48-
-DEXECUTORCH_BUILD_VULKAN=$BUILD_VULKAN \
49-
-DEXECUTORCH_BUILD_XNNPACK=ON \
5049
-DEXECUTORCH_BUILD_TESTS=ON \
5150
-Bcmake-out
5251
cmake --build cmake-out -j9 --target install

test/utils/OSSTestConfig.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,13 @@
3131
"make_boxed_from_unboxed_functor_test.cpp"
3232
]
3333
},
34+
{
35+
"directory": "extension/llm/tokenizer/test",
36+
"sources": [
37+
"test_bpe_tokenizer.cpp",
38+
"test_tiktoken.cpp"
39+
]
40+
},
3441
{
3542
"directory": "extension/memory_allocator/test",
3643
"sources": [

0 commit comments

Comments
 (0)