
Commit 707a94d

[FastTokenizer] Update fast_tokenizer doc (#3787)

* update link
* rename demo to examples
* add python readme
* update the cmakelist
* add ernie python demo
* add status print after compiling
* Add README for ernie fast tokenizer
* Add clip fast tokenizer cpp readme
* Update docs
* Add some tips
* add deps
* add deps
* update
* update
* update
* update shell

1 parent 0ce21ea commit 707a94d

File tree

18 files changed: +301 −113 lines changed


fast_tokenizer/README.md

Lines changed: 7 additions & 2 deletions

@@ -1,3 +1,4 @@
  # ⚡ FastTokenizer: High-Performance Text Processing Library

  ------------------------------------------------------------------------------------------
@@ -19,12 +20,12 @@ FastTokenizer is an easy-to-use, powerful, cross-platform, high-performance text-preprocessing library
  - High performance. The core is implemented in C++, so it runs far faster than typical pure-Python tokenizers; on text classification tasks, FastTokenizer reaches up to a 20x speedup over the Python tokenizer. Multi-threaded batch tokenization is supported; single-threaded tokenization is the default.
  - Cross-platform. FastTokenizer runs on Windows x64, Linux x64, and MacOS 10.14+.
- - Multi-language support. FastTokenizer can be used from C++ and Python.
+ - Multi-language support. FastTokenizer can be used from [C++](./docs/cpp/README.md) and [Python](./docs/python/README.md).
  - Highly flexible. Users can assemble a customized tokenizer from different FastTokenizer components to meet their needs.

  ## Quick Start

- The following covers the Python version of FastTokenizer. For the C++ version, see the [FastTokenizer C++ Demo](./fast_tokenizer/demo/README.md).
+ The following covers the Python version of FastTokenizer. For the C++ version, see the [FastTokenizer C++ Library Tutorial](./docs/cpp/README.md).

  ### Environment requirements

@@ -128,3 +129,7 @@ A: You can call `fast_tokenizer.set_thread_num(xxx)` to use multiple threads for
  ## Related Documentation

  [FastTokenizer Compilation Guide](docs/compile/README.md)
+
+ [FastTokenizer C++ Library Tutorial](./docs/cpp/README.md)
+
+ [FastTokenizer Python Library Tutorial](./docs/python/README.md)

fast_tokenizer/docs/cpp/README.md

Lines changed: 66 additions & 0 deletions

# FastTokenizer C++ Library Tutorial

## 1. Quick Installation

The current FastTokenizer C++ library supports several operating systems and hardware platforms, and ships prebuilt packages for the following:

|System|Download|
|---|---|
|Linux-x64| [fast_tokenizer-linux-x64-1.0.0.tgz](https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-x64-1.0.0.tgz) |
|Linux-aarch64| [fast_tokenizer-linux-aarch64-1.0.0.tgz](https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-aarch64-1.0.0.tgz) |
|Windows| [fast_tokenizer-win-x64-1.0.0.zip](https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-win-x64-1.0.0.zip) |
|MacOS-x64| [fast_tokenizer-osx-x86_64-1.0.0.tgz](https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-osx-x86_64-1.0.0.tgz) |
|MacOS-arm64| [fast_tokenizer-osx-arm64-1.0.0.tgz](https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-osx-arm64-1.0.0.tgz) |

### Environment requirements

#### Supported systems
|System|Version|
|---|---|
|Linux|Ubuntu 16.04+, CentOS 7+|
|Windows|10|
|MacOS|11.4+|

#### Linux/Mac build requirements
|Dependency|Version|
|---|---|
|cmake|>=3.16|
|gcc|>=8.2.0|

#### Windows build requirements
|Dependency|Version|
|---|---|
|cmake|>=3.16|
|VisualStudio|2019|

### Download and extract

```shell
wget -c https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-x64-1.0.0.tgz

tar xvfz fast_tokenizer-linux-x64-1.0.0.tgz
# Extracts into a fast_tokenizer directory
```

The extracted fast_tokenizer directory is laid out as follows:

```shell
fast_tokenizer
|__ commit.log          # Commit id of the build
|__ FastTokenizer.cmake # FastTokenizer CMake file defining the header and shared-library directory variables
|__ include             # FastTokenizer header directory
|__ lib                 # FastTokenizer shared-library directory
|__ third_party         # Third-party dependencies of FastTokenizer
```

We recommend pulling FastTokenizer in via CMake: a single `include(FastTokenizer.cmake)` line exposes the predefined CMake variables `FAST_TOKENIZER_INCS` and `FAST_TOKENIZER_LIBS`, which point to FastTokenizer's header directory and shared-library directory, respectively.

## 2. Quick Start

FastTokenizer currently provides the following C++ examples:

[ErnieFastTokenizer C++ example](../../examples/ernie/)
[ClipFastTokenizer C++ example](../../examples/clip/)
fast_tokenizer/docs/python/README.md

Lines changed: 16 additions & 0 deletions

# FastTokenizer Python Library Tutorial

## 1. Quick Installation

### Environment requirements

- Windows 64-bit
- Linux x64
- MacOS 10.14+ (on M1-chip MacOS, an x86_64 build of Anaconda must be used as the Python environment)
- Python 3.6 ~ 3.10

### Installation

```shell
pip install --upgrade fast_tokenizer
```

fast_tokenizer/examples/clip/README.md

Whitespace-only changes.
Lines changed: 2 additions & 15 deletions

@@ -1,17 +1,7 @@
  cmake_minimum_required(VERSION 3.10)
  project(cpp_fast_tokenizer_demo CXX C)
-
  option(FAST_TOKENIZER_INSTALL_DIR "Path of downloaded fast_tokenizer sdk.")

- # Download ernie vocab for demo
- set(ERNIE_VOCAB_PATH ${CMAKE_CURRENT_BINARY_DIR}/ernie_vocab.txt)
- if (EXISTS ${ERNIE_VOCAB_PATH})
-   message(STATUS "The ${ERNIE_VOCAB_PATH} exists already.")
- else()
-   file(DOWNLOAD "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt" ${ERNIE_VOCAB_PATH} SHOW_PROGRESS)
-   message(STATUS "Already download the vocab.txt of ernie to ${CMAKE_CURRENT_BINARY_DIR} for demo.")
- endif()
-
  # Download clip vocab and merge files
  set(CLIP_VOCAB_PATH ${CMAKE_CURRENT_BINARY_DIR}/clip_vocab.json)
  set(CLIP_MERGES_PATH ${CMAKE_CURRENT_BINARY_DIR}/clip_merges.txt)
@@ -32,10 +22,7 @@ endif()

  # Get FAST_TOKENIZER_INCS and FAST_TOKENIZER_LIBS
  include(${FAST_TOKENIZER_INSTALL_DIR}/FastTokenizer.cmake)
-
  include_directories(${FAST_TOKENIZER_INCS})

- add_executable(ernie_fast_tokenizer_demo ${PROJECT_SOURCE_DIR}/ernie_fast_tokenizer_demo.cc)
- add_executable(clip_fast_tokenizer_demo ${PROJECT_SOURCE_DIR}/clip_fast_tokenizer_demo.cc)
- target_link_libraries(ernie_fast_tokenizer_demo ${FAST_TOKENIZER_LIBS})
- target_link_libraries(clip_fast_tokenizer_demo ${FAST_TOKENIZER_LIBS})
+ add_executable(demo ${PROJECT_SOURCE_DIR}/demo.cc)
+ target_link_libraries(demo ${FAST_TOKENIZER_LIBS})
Lines changed: 99 additions & 0 deletions

# ClipFastTokenizer C++ Example

## 1. Quick Installation

The current FastTokenizer C++ library supports several operating systems and hardware platforms; choose the prebuilt package that matches your environment:

|System|Download|
|---|---|
|Linux-x64| [fast_tokenizer-linux-x64-1.0.0.tgz](https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-x64-1.0.0.tgz) |
|Linux-aarch64| [fast_tokenizer-linux-aarch64-1.0.0.tgz](https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-aarch64-1.0.0.tgz) |
|Windows| [fast_tokenizer-win-x64-1.0.0.zip](https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-win-x64-1.0.0.zip) |
|MacOS-x64| [fast_tokenizer-osx-x86_64-1.0.0.tgz](https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-osx-x86_64-1.0.0.tgz) |
|MacOS-arm64| [fast_tokenizer-osx-arm64-1.0.0.tgz](https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-osx-arm64-1.0.0.tgz) |

### Environment requirements

#### Supported systems
|System|Version|
|---|---|
|Linux|Ubuntu 16.04+, CentOS 7+|
|Windows|10|
|MacOS|11.4+|

#### Linux/Mac build requirements
|Dependency|Version|
|---|---|
|cmake|>=3.16|
|gcc|>=8.2.0|

#### Windows build requirements
|Dependency|Version|
|---|---|
|cmake|>=3.16|
|VisualStudio|2019|

## 2. Quick Start

The following uses the Linux platform as an example to show how to build and run this demo with the FastTokenizer C++ prebuilt package. The build produces an executable named `demo`.

### 2.1 Download and extract

```shell
wget -c https://bj.bcebos.com/paddlenlp/fast_tokenizer/fast_tokenizer-linux-x64-1.0.0.tgz

tar xvfz fast_tokenizer-linux-x64-1.0.0.tgz
# Extracts into a fast_tokenizer directory
```

The extracted fast_tokenizer directory is laid out as follows:

```shell
fast_tokenizer
|__ commit.log          # Commit id of the build
|__ FastTokenizer.cmake # FastTokenizer CMake file defining the header and shared-library directory variables
|__ include             # FastTokenizer header directory
|__ lib                 # FastTokenizer shared-library directory
|__ third_party         # Third-party dependencies of FastTokenizer
```

We recommend pulling FastTokenizer in via CMake: a single `include(FastTokenizer.cmake)` line exposes the predefined CMake variables `FAST_TOKENIZER_INCS` and `FAST_TOKENIZER_LIBS`, which point to FastTokenizer's header directory and shared-library directory, respectively.

### 2.2 Build

The example ships a simple CMakeLists.txt; you only need to point it at the fast_tokenizer package path to build:

```shell
# Create the build directory
mkdir build
cd build

# Run cmake, pointing it at the fast_tokenizer package, to generate the Makefile
cmake .. -DFAST_TOKENIZER_INSTALL_DIR=/path/to/fast_tokenizer

# Build
make
```

### 2.3 Run

```shell
./demo
```

### 2.4 Sample output

The output contains the original input text and the resulting token id sequence (including padding).

```shell
text = "a photo of an astronaut riding a horse on mars"
ids = [49406, 320, 1125, 539, 550, 18376, 6765, 320, 4558, 525, 7496, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407]
```
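The trailing run of 49407s in the sample output is the padding the demo enables via `EnablePadMethod`: ids are padded on the right with the pad token id up to `max_length` (77). Judging from the output, CLIP's end-of-text id 49407 also serves as the pad id. A minimal Python sketch of that padding step (illustrative only, not FastTokenizer's implementation):

```python
# Illustrative right-padding, as applied by the demo's pad configuration
# (assumption: pad id 49407 is CLIP's end-of-text token, per the sample output).
def pad_right(ids, max_length, pad_id):
    """Pad ids on the right with pad_id until the sequence reaches max_length."""
    if len(ids) >= max_length:
        return ids[:max_length]  # truncate if already at or over max_length
    return ids + [pad_id] * (max_length - len(ids))

# Unpadded ids for the sample text (start token 49406, end token 49407).
ids = [49406, 320, 1125, 539, 550, 18376, 6765, 320, 4558, 525, 7496, 49407]
padded = pad_right(ids, max_length=77, pad_id=49407)
print(len(padded))         # 77
print(padded[:12] == ids)  # True
```

With `pad_to_max_length` disabled in the demo, only the 12 unpadded ids would be printed.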

fast_tokenizer/fast_tokenizer/demo/clip_fast_tokenizer_demo.cc renamed to fast_tokenizer/examples/clip/cpp/demo.cc

Lines changed: 13 additions & 9 deletions

@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. */

- #include "fast_tokenizer/tokenizers/clip_fast_tokenizer.h"
  #include <iostream>
  #include <vector>
+ #include "fast_tokenizer/tokenizers/clip_fast_tokenizer.h"
  using namespace paddlenlp;

  template <typename T>
@@ -35,10 +35,10 @@ fast_tokenizer::tokenizers_impl::ClipFastTokenizer CreateClipFastTokenizer(
      const std::string& vocab_path,
      const std::string& merge_path,
      uint32_t max_length,
-     bool pad = true) {
+     bool pad_to_max_length = true) {
    fast_tokenizer::tokenizers_impl::ClipFastTokenizer tokenizer(
        vocab_path, merge_path, max_length);
-   if (pad) {
+   if (pad_to_max_length) {
      tokenizer.EnablePadMethod(fast_tokenizer::core::RIGHT,
                                tokenizer.GetPadTokenId(),
                                0,
@@ -51,16 +51,20 @@ fast_tokenizer::tokenizers_impl::ClipFastTokenizer CreateClipFastTokenizer(

  int main() {
    // 1. Define a clip fast tokenizer
-   auto tokenizer =
-       CreateClipFastTokenizer("clip_vocab.json", "clip_merges.txt", 77, true);
+   auto tokenizer = CreateClipFastTokenizer("clip_vocab.json",
+                                            "clip_merges.txt",
+                                            /* max_length = */ 77,
+                                            /* pad_to_max_length = */ true);
    // 2. Tokenize the input strings
    std::vector<fast_tokenizer::core::Encoding> encodings;
    std::vector<std::string> texts = {
        "a photo of an astronaut riding a horse on mars"};
    tokenizer.EncodeBatchStrings(texts, &encodings);
-   for (auto&& encoding : encodings) {
-     auto ids = encoding.GetIds();
-     std::cout << ids << std::endl;
+
+   for (size_t i = 0; i < texts.size(); ++i) {
+     std::cout << "text = \"" << texts[i] << "\"" << std::endl;
+     std::cout << "ids = " << encodings[i].GetIds() << std::endl;
    }
+
    return 0;
  }

fast_tokenizer/examples/clip/python/README.md

Whitespace-only changes.
Lines changed: 13 additions & 0 deletions

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Lines changed: 20 additions & 0 deletions

# ErnieFastTokenizer Tokenization Example

The FastTokenizer library exposes the ErnieFastTokenizer interface in both C++ and Python; users only need to pass in the model's vocabulary to call it and tokenize efficiently. Under the hood the interface tokenizes with the `WordPiece` algorithm. For `WordPiece`, FastTokenizer implements the `MinMaxMatch`-based `FastWordPiece` algorithm proposed in "Fast WordPiece Tokenization". The original `WordPiece` algorithm is quadratic in sequence length, so tokenizing long texts is expensive; `FastWordPiece` uses the `Aho–Corasick` automaton to bring the time complexity down to linear in sequence length, greatly improving tokenization throughput. Besides the ERNIE models, `ErnieFastTokenizer` also supports other models that tokenize with `WordPiece`, such as `BERT` and `TinyBERT`; the full model list is below:
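To make the complexity claim above concrete, here is a minimal pure-Python sketch of classic greedy longest-match-first WordPiece (illustrative only; FastTokenizer's actual FastWordPiece is a C++ Aho–Corasick implementation). Each emitted piece may scan up to O(n) candidate substrings of a length-n word, which is where the quadratic worst case of classic WordPiece comes from:

```python
# Classic greedy longest-match-first WordPiece (illustrative sketch).
# Continuation pieces carry the conventional "##" prefix.
def wordpiece(word, vocab, unk="[UNK]"):
    pieces, start = [], 0
    while start < len(word):
        end, cur = len(word), None
        while start < end:  # try the longest remaining substring first
            sub = word[start:end]
            if start > 0:
                sub = "##" + sub
            if sub in vocab:
                cur = sub
                break
            end -= 1  # shrink the candidate by one character
        if cur is None:
            return [unk]  # no piece matches: the whole word is unknown
        pieces.append(cur)
        start = end
    return pieces

vocab = {"un", "##aff", "##able", "aff"}
print(wordpiece("unaffable", vocab))  # ['un', '##aff', '##able']
```

FastWordPiece avoids the repeated substring rescans by matching all vocabulary pieces in one left-to-right pass over the word with an Aho–Corasick automaton.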
## Supported models

- ERNIE
- BERT
- TinyBERT
- ERNIE Gram
- ERNIE ViL

## Detailed tokenization examples

[C++ tokenization example](./cpp)
[Python tokenization example](./python)

## References

- Xinying Song, Alex Salcianu et al. "Fast WordPiece Tokenization", EMNLP, 2021
