This repository was archived by the owner on Jan 15, 2024. It is now read-only.

Commit 1704ab8

Davis Liang authored and leezu committed
Move BERTTokenizer to Cython and add caching support (#921)
1 parent 184a000 · commit 1704ab8

File tree: 8 files changed, +82 −33 lines changed


env/cpu/py3-master.yml

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@ channels:
 dependencies:
 - python=3.6
 - pip=18.1
+- cython
 - perl
 - pylint=2.3.1
 - flake8

env/cpu/py3.yml

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@ channels:
 dependencies:
 - python=3.6
 - pip=18.1
+- cython
 - perl
 - pylint=2.3.1
 - flake8

env/docker/py3.yml

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@ channels:
 dependencies:
 - python=3.6
 - pip=18.1
+- cython
 - perl
 - pylint=1.9.2
 - flake8

env/gpu/py3-master.yml

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@ channels:
 dependencies:
 - python=3.6
 - pip=18.1
+- cython
 - perl
 - pylint=2.3.1
 - flake8

env/gpu/py3.yml

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@ channels:
 dependencies:
 - python=3.6
 - pip=18.1
+- cython
 - perl
 - pylint=2.3.1
 - flake8

setup.py

Lines changed: 10 additions & 1 deletion
@@ -4,7 +4,7 @@
 import re
 import shutil
 import sys
-from setuptools import setup, find_packages
+from setuptools import setup, find_packages, Extension


 def read(*names, **kwargs):
@@ -30,6 +30,7 @@ def find_version(*file_paths):

 requirements = [
     'numpy',
+    'cython'
 ]

 setup(
@@ -53,6 +54,11 @@ def find_version(*file_paths):
     package_dir={"": "src"},
     zip_safe=True,
     include_package_data=True,
+    setup_requires=[
+        # Setuptools 18.0 properly handles Cython extensions.
+        'setuptools>=18.0',
+        'cython',
+    ],
     install_requires=requirements,
     extras_require={
         'extras': [
@@ -82,4 +88,7 @@ def find_version(*file_paths):
             'flaky',
         ],
     },
+    ext_modules=[
+        Extension('gluonnlp.data.wordpiece', sources=['src/gluonnlp/data/wordpiece.pyx']),
+    ],
 )
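
Declaring the .pyx file as an Extension, together with setup_requires pinning setuptools>=18.0 and cython, lets the build compile the Cython source automatically. For reference, a minimal sketch of the same idea that calls cythonize explicitly (illustrative only: the module name and source path are taken from the diff above, everything else is an assumption, not the project's actual setup.py):

    # sketch_setup.py -- illustrative stand-alone example
    from setuptools import Extension, setup
    from Cython.Build import cythonize  # requires cython at build time

    setup(
        name='wordpiece-demo',  # hypothetical package name
        ext_modules=cythonize(
            [Extension('gluonnlp.data.wordpiece',
                       sources=['src/gluonnlp/data/wordpiece.pyx'])],
            language_level=3,  # generate Python-3 semantics in the emitted C
        ),
    )

Either way, building the package (for example with python setup.py build_ext --inplace during development) produces the compiled gluonnlp.data.wordpiece module imported by transforms.py below.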

src/gluonnlp/data/transforms.py

Lines changed: 25 additions & 32 deletions
@@ -30,6 +30,7 @@
 ]

 import errno
+import functools
 import io
 import os
 import time
@@ -42,7 +43,9 @@
 import numpy as np

 from ..base import get_home_dir
+from ..vocab.vocab import Vocab
 from .utils import _extract_archive
+from .wordpiece import tokenize as wordpiece_tokenize


 class ClipSequence:
@@ -790,14 +793,17 @@ class BERTTokenizer:

     Parameters
     ----------
-    vocab : gluonnlp.Vocab or None, default None
+    vocab
         Vocabulary for the corpus.
-    lower : bool, default True
+    lower
         whether the text strips accents and converts to lower case.
         If you use the BERT pre-training model,
         lower is set to False when using the cased model,
         otherwise it is set to True.
-    max_input_chars_per_word : int, default 200
+    max_input_chars_per_word
+    lru_cache_size
+        Maximum size of a least-recently-used cache to speed up tokenization.
+        Use size of 2**20 for example.

     Examples
     --------
@@ -812,10 +818,14 @@ class BERTTokenizer:

     _special_prefix = '##'

-    def __init__(self, vocab, lower=True, max_input_chars_per_word=200):
+    def __init__(self, vocab: Vocab, lower: bool = True, max_input_chars_per_word: int = 200,
+                 lru_cache_size: Optional[int] = None):
         self.vocab = vocab
         self.max_input_chars_per_word = max_input_chars_per_word
         self.basic_tokenizer = BERTBasicTokenizer(lower=lower)
+        if lru_cache_size:
+            self._word_to_wordpiece_optimized = functools.lru_cache(maxsize=lru_cache_size)(
+                self._word_to_wordpiece_optimized)

     def __call__(self, sample):
         """
@@ -841,6 +851,10 @@ def _tokenizer(self, text):

         return split_tokens

+    def _word_to_wordpiece_optimized(self, text):  # pylint: disable=method-hidden
+        return wordpiece_tokenize(text, self.vocab, self.vocab.unknown_token,
+                                  self.max_input_chars_per_word)
+
     def _tokenize_wordpiece(self, text):
         """Tokenizes a piece of text into its word pieces.

@@ -861,35 +875,14 @@ def _tokenize_wordpiece(self, text):
         ret : A list of wordpiece tokens.
         """

+        # case where text is a single token
+        whitespace_tokenized_tokens = self.basic_tokenizer._whitespace_tokenize(text)
+        if len(whitespace_tokenized_tokens) == 1:
+            return self._word_to_wordpiece_optimized(whitespace_tokenized_tokens[0])
+
         output_tokens = []
-        for token in self.basic_tokenizer._whitespace_tokenize(text):
-            chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.vocab.unknown_token)
-                continue
-            is_bad = False
-            start = 0
-            sub_tokens = []
-            while start < len(chars):
-                end = len(chars)
-                cur_substr = None
-                while start < end:
-                    substr = ''.join(chars[start:end])
-                    if start > 0:
-                        substr = self._special_prefix + substr
-                    if substr in self.vocab:
-                        cur_substr = substr
-                        break
-                    end -= 1
-                if cur_substr is None:
-                    is_bad = True
-                    break
-                sub_tokens.append(cur_substr)
-                start = end
-            if is_bad:
-                output_tokens.append(self.vocab.unknown_token)
-            else:
-                output_tokens.extend(sub_tokens)
+        for token in whitespace_tokenized_tokens:
+            output_tokens.extend(self._word_to_wordpiece_optimized(token))
         return output_tokens

     def convert_tokens_to_ids(self, tokens):
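
The caching works by rebinding the instance attribute _word_to_wordpiece_optimized to an lru_cache-wrapped copy of the bound method, so repeated words skip the wordpiece search entirely (hence the pylint method-hidden suppression). A self-contained sketch of that pattern, using a toy tokenizer class rather than the real BERTTokenizer:

    import functools

    class CachedTokenizer:
        """Toy illustration of wrapping a bound method with functools.lru_cache."""

        def __init__(self, lru_cache_size=None):
            if lru_cache_size:
                # Rebind on the instance; the class-level method stays intact,
                # which is why pylint flags method-hidden in the real code.
                self._word_to_pieces = functools.lru_cache(maxsize=lru_cache_size)(
                    self._word_to_pieces)

        def _word_to_pieces(self, word):
            # Stand-in for the Cython wordpiece_tokenize call.
            return [word[:1], '##' + word[1:]] if len(word) > 1 else [word]

        def __call__(self, text):
            out = []
            for token in text.split():
                out.extend(self._word_to_pieces(token))
            return out

    tok = CachedTokenizer(lru_cache_size=2**20)
    assert tok('hello hello world') == ['h', '##ello', 'h', '##ello', 'w', '##orld']

Because the wrapped callable is already bound, the cache key is just the word string itself, so only hashable arguments ever reach the cache.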

src/gluonnlp/data/wordpiece.pyx

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+from typing import Dict, List, Tuple
+
+import cython
+
+__all__ = ['tokenize']
+
+
+def tokenize(text: str, vocab: Dict[str, int], unknown_token: str, max_input_chars_per_word: cython.int = 200):
+    """
+    Cython implementation of single token tokenization. Average latency
+    decreases to 95ms (from 144ms using original Python code).
+    """
+    output_tokens: List[str] = []
+    token_size: cython.int = len(text)
+    if token_size > max_input_chars_per_word:
+        output_tokens.append(unknown_token)
+        return output_tokens
+    is_bad: cython.int = 0
+    start: cython.int = 0
+    sub_tokens: List[str] = []
+    while start < token_size:
+        end: cython.int = token_size
+        cur_substr: str = None
+        while start < end:
+            substr = text[start:end]
+            if start > 0:
+                substr = '##' + substr
+            if substr in vocab:
+                cur_substr = substr
+                break
+            end -= 1
+        if cur_substr is None:
+            is_bad = 1
+            break
+        sub_tokens.append(cur_substr)
+        start = end
+    if is_bad == 1:
+        output_tokens.append(unknown_token)
+    else:
+        output_tokens.extend(sub_tokens)
+
+    return output_tokens
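
Once compiled, the function can be exercised directly. A quick sketch, assuming the extension has been built and using a made-up toy vocabulary (the real code passes a gluonnlp Vocab, which also supports the `in` membership check used above):

    from gluonnlp.data.wordpiece import tokenize  # available after build_ext

    toy_vocab = {'jack': 0, '##son': 1, '##ville': 2}  # illustrative only
    print(tokenize('jacksonville', toy_vocab, '[UNK]'))  # ['jack', '##son', '##ville']
    print(tokenize('zzz', toy_vocab, '[UNK]'))           # ['[UNK]'] -- no matching pieces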
