
Commit 1ab0e48

Add faster tokenizers cpp package (#2139)
* Fix package name
* Add comment
* Add cpp lib install
* Remove pybind headers
* Add ErnieFasterTokenizer
* Add third party include headers
1 parent 8ee2eb9 commit 1ab0e48

File tree

16 files changed (+491, -29 lines)


faster_tokenizers/CMakeLists.txt

Lines changed: 42 additions & 1 deletion
@@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.10)
 project(tokenizers LANGUAGES CXX C VERSION 1.0)
 
 option(WITH_TESTING "Compile PaddleNLP tokenizers with unit testing" ON)
+option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 
@@ -53,8 +54,48 @@ include(generic)
 include(third_party)
 
 add_subdirectory(faster_tokenizers)
-add_subdirectory(python)
 
+if(WITH_PYTHON)
+
+add_subdirectory(python)
 add_custom_target(build_tokenizers_bdist_wheel ALL
     COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     DEPENDS copy_python_tokenizers)
+
+else(WITH_PYTHON)
+
+set(CPP_PACKAGE_DIR ${CMAKE_BINARY_DIR}/faster_tokenizers_cpp)
+add_custom_target(build_cpp_package_dir ALL
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${CPP_PACKAGE_DIR}/lib ${CPP_PACKAGE_DIR}/include ${CPP_PACKAGE_DIR}/third_party/include)
+
+# copy headers
+add_custom_target(copy_headers ALL
+    COMMAND ${CMAKE_COMMAND} -E copy_directory
+        ${CMAKE_SOURCE_DIR}/faster_tokenizers/include ${CPP_PACKAGE_DIR}/include
+    DEPENDS build_cpp_package_dir)
+
+add_custom_target(copy_third_party_headers ALL
+    COMMAND ${CMAKE_COMMAND} -E copy_directory
+        ${GFLAGS_INCLUDE_DIR} ${ICU_INCLUDE_DIR}
+        ${GLOG_INCLUDE_DIR} ${JSON_INCLUDE_DIR} ${RE2_INCLUDE_DIR}
+        ${CPP_PACKAGE_DIR}/third_party/include
+    DEPENDS build_cpp_package_dir)
+
+add_custom_target(copy_boost_headers ALL
+    COMMAND ${CMAKE_COMMAND} -E copy_directory
+        ${BOOST_INCLUDE_DIR}/boost ${CPP_PACKAGE_DIR}/third_party/include/boost
+    DEPENDS build_cpp_package_dir)
+
+add_custom_target(remove_pybind_headers ALL
+    COMMAND ${CMAKE_COMMAND} -E remove_directory ${CPP_PACKAGE_DIR}/include/pybind
+    DEPENDS copy_headers)
+
+
+# copy library
+add_custom_target(copy_shared_library ALL
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/faster_tokenizers/src/tokenizers/libcore_tokenizers.so ${CPP_PACKAGE_DIR}/lib
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/third_party/install/icu/lib/libicuuc.so.70 ${CPP_PACKAGE_DIR}/lib
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/third_party/install/icu/lib/libicudata.so.70 ${CPP_PACKAGE_DIR}/lib
+    DEPENDS build_cpp_package_dir core_tokenizers)
+
+endif(WITH_PYTHON)
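
With -DWITH_PYTHON=OFF, this branch assembles a self-contained C++ package under ${CMAKE_BINARY_DIR}/faster_tokenizers_cpp (lib/, include/, third_party/include/). A downstream project could consume it roughly as sketched below; the tokenizers_demo target and the package path are illustrative, not part of this commit:

# Hypothetical consumer CMakeLists.txt for the packaged C++ library.
cmake_minimum_required(VERSION 3.10)
project(tokenizers_demo CXX)

# Point this at the faster_tokenizers_cpp directory produced above.
set(FASTER_TOKENIZERS_CPP "/path/to/faster_tokenizers_cpp")

add_executable(tokenizers_demo main.cc)
target_include_directories(tokenizers_demo PRIVATE
    ${FASTER_TOKENIZERS_CPP}/include
    ${FASTER_TOKENIZERS_CPP}/third_party/include)
# libcore_tokenizers.so is linked with an $ORIGIN rpath (see
# src/CMakeLists.txt below), so the bundled ICU libraries are found
# next to it at runtime.
target_link_libraries(tokenizers_demo
    ${FASTER_TOKENIZERS_CPP}/lib/libcore_tokenizers.so)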

faster_tokenizers/cmake/dummy.c.in

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+// Generated by @dummy_GENERATOR@. DO NOT EDIT!!!
+
+const char *dummy = "@dummy_CONTENT@";
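
At configure time, configure_file() substitutes the @dummy_GENERATOR@ and @dummy_CONTENT@ placeholders (see generate_dummy_static_lib in cmake/generic.cmake below). For a library named mylib, the generated source would look roughly like this (illustrative values):

// Generated by generic.cmake:cc_library. DO NOT EDIT!!!

const char *dummy = "mylib_dummy.c for lib mylib";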

faster_tokenizers/cmake/external/python.cmake

Lines changed: 0 additions & 5 deletions
@@ -68,12 +68,7 @@ IF(PYTHONINTERP_FOUND)
   find_python_module(pip REQUIRED)
   find_python_module(numpy REQUIRED)
   find_python_module(wheel REQUIRED)
-  find_python_module(google.protobuf REQUIRED)
   FIND_PACKAGE(NumPy REQUIRED)
-  IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
-    MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
-            "please use pip to upgrade protobuf. pip install -U protobuf")
-  ENDIF()
 ENDIF(PYTHONINTERP_FOUND)
 INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
 INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})

faster_tokenizers/cmake/external/re2.cmake

Lines changed: 2 additions & 1 deletion
@@ -26,7 +26,8 @@ ELSE(WIN32)
   SET(RE2_LIBRARIES "${RE2_INSTALL_DIR}/lib64/libre2.a")
 ENDIF(WIN32)
 
-INCLUDE_DIRECTORIES(${RE2_INSTALL_DIR}/include)
+SET(RE2_INCLUDE_DIR ${RE2_INSTALL_DIR}/include)
+INCLUDE_DIRECTORIES(${RE2_INCLUDE_DIR})
 
 ExternalProject_Add(
   extern_re2

faster_tokenizers/cmake/generic.cmake

Lines changed: 38 additions & 4 deletions
@@ -44,7 +44,7 @@ function(cc_library TARGET_NAME)
       target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
     endif()
     # For C++ 17 filesystem
-    target_link_libraries(${TARGET_NAME} stdc++fs)
+    # target_link_libraries(${TARGET_NAME} stdc++fs)
 
     # cpplint code style
     foreach(source_file ${cc_library_SRCS})
@@ -57,9 +57,15 @@ function(cc_library TARGET_NAME)
   else(cc_library_SRCS)
     if(cc_library_DEPS)
       list(REMOVE_DUPLICATES cc_library_DEPS)
-
-      generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:cc_library")
-
+      set(dummy_FILE_PATH "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c")
+      configure_file(${PROJECT_SOURCE_DIR}/cmake/dummy.c.in ${dummy_FILE_PATH} @ONLY)
+      if(cc_library_SHARED OR cc_library_shared) # build *.so
+        add_library(${TARGET_NAME} SHARED ${dummy_FILE_PATH})
+      elseif(cc_library_INTERFACE OR cc_library_interface)
+        generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${dummy_FILE_PATH} GENERATOR "generic.cmake:cc_library")
+      else()
+        add_library(${TARGET_NAME} STATIC ${dummy_FILE_PATH})
+      endif()
       target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
     else()
       message(FATAL_ERROR "Please specify source files or libraries in cc_library(${TARGET_NAME} ...).")
@@ -119,3 +125,31 @@ function(cc_test TARGET_NAME)
     add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E echo CI skip ${TARGET_NAME}.)
   endif()
 endfunction(cc_test)
+
+# Create a dummy source file, then create a static library from it.
+# LIB_NAME should be the static lib name.
+# FILE_PATH should be the dummy source file path.
+# GENERATOR should be the name of the file that invokes this function.
+# CONTENT should be some helpful info.
+# example: generate_dummy_static_lib(LIB_NAME mylib FILE_PATH /path/to/dummy.c GENERATOR mylib.cmake CONTENT "helpful info")
+function(generate_dummy_static_lib)
+  set(options "")
+  set(oneValueArgs LIB_NAME FILE_PATH GENERATOR CONTENT)
+  set(multiValueArgs "")
+  cmake_parse_arguments(dummy "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  if(NOT dummy_LIB_NAME)
+    message(FATAL_ERROR "You must provide a static lib name.")
+  endif()
+  if(NOT dummy_FILE_PATH)
+    set(dummy_FILE_PATH "${CMAKE_CURRENT_BINARY_DIR}/${dummy_LIB_NAME}_dummy.c")
+  endif()
+  if(NOT dummy_GENERATOR)
+    message(FATAL_ERROR "You must provide a generator file name.")
+  endif()
+  if(NOT dummy_CONTENT)
+    set(dummy_CONTENT "${dummy_LIB_NAME}_dummy.c for lib ${dummy_LIB_NAME}")
+  endif()
+
+  configure_file(${PROJECT_SOURCE_DIR}/cmake/dummy.c.in ${dummy_FILE_PATH} @ONLY)
+  add_library(${dummy_LIB_NAME} STATIC ${dummy_FILE_PATH})
+endfunction()
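
Note what this fixes: the old SRCS-less branch passed ${target_SRCS}, a variable that is never set, as FILE_PATH. The new branch writes the dummy source itself and chooses SHARED, a helper-generated STATIC archive, or plain STATIC based on the call. A sketch of the two cc_library() call styles, assuming SRCS/DEPS keywords as the parsed cc_library_SRCS/cc_library_DEPS variables suggest (target and dependency names are made up):

# Source-backed library: compiled from its own sources.
cc_library(my_normalizers SRCS normalizer.cc DEPS glog)

# Dependency-only library: no SRCS, so cc_library generates a dummy
# .c file from cmake/dummy.c.in and links the listed deps through it.
cc_library(my_umbrella DEPS my_normalizers my_pretokenizers)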

faster_tokenizers/cmake/third_party.cmake

Lines changed: 4 additions & 2 deletions
@@ -17,12 +17,14 @@ include(ExternalProject)
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
     "A path setting third party libraries download & build directories.")
 
-include(external/pybind11)
 include(external/icu)
 include(external/gtest)
 include(external/gflags)
 include(external/glog)
 include(external/re2)
-include(external/python)
 include(external/boost)
 include(external/nlohmann_json)
+if (WITH_PYTHON)
+  include(external/python)
+  include(external/pybind11)
+endif()
Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include "core/tokenizer.h"
+
+namespace tokenizers {
+namespace tokenizers_impl {
+
+struct ErnieFasterTokenizer : public core::Tokenizer {
+  ErnieFasterTokenizer(const std::string& vocab_path,
+                       const std::string& unk_token = "[UNK]",
+                       const std::string& sep_token = "[SEP]",
+                       const std::string& cls_token = "[CLS]",
+                       const std::string& pad_token = "[PAD]",
+                       const std::string& mask_token = "[MASK]",
+                       bool clean_text = true,
+                       bool handle_chinese_chars = true,
+                       bool strip_accents = true,
+                       bool lowercase = true,
+                       const std::string& wordpieces_prefix = "##",
+                       uint max_sequence_len = 0);
+
+  ErnieFasterTokenizer(const core::Vocab& vocab,
+                       const std::string& unk_token = "[UNK]",
+                       const std::string& sep_token = "[SEP]",
+                       const std::string& cls_token = "[CLS]",
+                       const std::string& pad_token = "[PAD]",
+                       const std::string& mask_token = "[MASK]",
+                       bool clean_text = true,
+                       bool handle_chinese_chars = true,
+                       bool strip_accents = true,
+                       bool lowercase = true,
+                       const std::string& wordpieces_prefix = "##",
+                       uint max_sequence_len = 0);
+
+private:
+  void Init(const core::Vocab& vocab,
+            const std::string& unk_token = "[UNK]",
+            const std::string& sep_token = "[SEP]",
+            const std::string& cls_token = "[CLS]",
+            const std::string& pad_token = "[PAD]",
+            const std::string& mask_token = "[MASK]",
+            bool clean_text = true,
+            bool handle_chinese_chars = true,
+            bool strip_accents = true,
+            bool lowercase = true,
+            const std::string& wordpieces_prefix = "##",
+            uint max_sequence_len = 0);
+};
+
+}  // namespace tokenizers_impl
+}  // namespace tokenizers
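
Per the declarations above, only a vocab source is required; the special tokens and WordPiece settings default to BERT-style values. A minimal usage sketch follows; the include path is an assumption, and the encode entry points live on core::Tokenizer, which this diff does not show:

#include "tokenizers/ernie_faster_tokenizer.h"  // assumed install path

int main() {
  using tokenizers::tokenizers_impl::ErnieFasterTokenizer;
  // Builds the tokenizer from a line-per-token vocab file; special
  // tokens default to [UNK]/[SEP]/[CLS]/[PAD]/[MASK].
  ErnieFasterTokenizer tokenizer("ernie_vocab.txt");
  // Encoding methods are inherited from core::Tokenizer; see
  // core/tokenizer.h for the actual API.
  return 0;
}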
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <fstream>
+#include <string>
+#include <unordered_map>
+
+namespace tokenizers {
+namespace utils {
+
+inline void GetVocabFromFiles(const std::string& files,
+                              std::unordered_map<std::string, uint>* vocab) {
+  const static std::string WHITESPACE = " \n\r\t\f\v";
+  std::ifstream fin(files);
+  vocab->clear();
+  int i = 0;
+  constexpr int MAX_BUFFER_SIZE = 256;
+  char word[MAX_BUFFER_SIZE];
+  while (fin.getline(word, MAX_BUFFER_SIZE)) {
+    std::string word_str = word;
+    auto leading_spaces = word_str.find_first_not_of(WHITESPACE);
+    if (leading_spaces != std::string::npos) {
+      word_str = word_str.substr(leading_spaces);
+    }
+    auto trailing_spaces = word_str.find_last_not_of(WHITESPACE);
+    if (trailing_spaces != std::string::npos) {
+      word_str = word_str.substr(0, trailing_spaces + 1);
+    }
+    if (word_str != "") {
+      (*vocab)[word_str] = i++;
+    }
+  }
+}
+
+}  // namespace utils
+}  // namespace tokenizers
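
GetVocabFromFiles reads tokens line by line into a fixed 256-byte buffer, trims surrounding whitespace, skips empty lines, and assigns ids in file order. A small caller sketch; the include path and vocab file name are illustrative:

#include <sys/types.h>  // for uint, which the header above relies on
#include <iostream>
#include <string>
#include <unordered_map>
#include "utils/utils.h"  // assumed path of the header above

int main() {
  std::unordered_map<std::string, uint> vocab;
  // Each non-empty line of ernie_vocab.txt becomes one entry,
  // mapped to its 0-based line index.
  tokenizers::utils::GetVocabFromFiles("ernie_vocab.txt", &vocab);
  std::cout << "loaded " << vocab.size() << " tokens\n";
  return 0;
}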

faster_tokenizers/faster_tokenizers/src/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
@@ -4,4 +4,11 @@ add_subdirectory(normalizers)
 add_subdirectory(pretokenizers)
 add_subdirectory(postprocessors)
 add_subdirectory(core)
+# set the relative path of shared library
+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-rpath='$ORIGIN'")
+
+if (WITH_PYTHON)
 add_subdirectory(pybind)
+else(WITH_PYTHON)
+add_subdirectory(tokenizers)
+endif(WITH_PYTHON)

faster_tokenizers/faster_tokenizers/src/core/tokenizer.cc

Lines changed: 4 additions & 0 deletions
@@ -25,6 +25,10 @@ limitations under the License. */
 #include "postprocessors/postprocessors.h"
 #include "pretokenizers/pretokenizers.h"
 
+#ifdef WITH_OMP
+#include <omp.h>
+#endif
+
 namespace tokenizers {
 namespace core {
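
The guarded include suggests tokenizer.cc can use OpenMP (e.g., to parallelize batch encoding) when the build defines WITH_OMP; the parallel loops themselves are not in this hunk. As a generic, self-contained illustration of the idiom only, not the project's actual code:

#include <cstdio>
#include <string>
#include <vector>
#ifdef WITH_OMP
#include <omp.h>
#endif

int main() {
  std::vector<std::string> batch = {"ab", "cde", "f", "ghij"};
  std::vector<size_t> lengths(batch.size());
// Compile with -fopenmp -DWITH_OMP to run the loop in parallel;
// without those flags it degrades to an ordinary serial loop.
#ifdef WITH_OMP
#pragma omp parallel for
#endif
  for (int i = 0; i < static_cast<int>(batch.size()); ++i) {
    lengths[i] = batch[i].size();  // stand-in for per-sequence encoding work
  }
  for (size_t n : lengths) std::printf("%zu\n", n);
  return 0;
}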
