
Commit 5385d0d

joey12300 and linjieccc authored
[FastTokenizer] Fix fast_tokenizer not found when using auto tokenizer (#4060)
* Fix fast_tokenizer not found when using auto tokenizer

* faster -> fast

Co-authored-by: Linjie Chen <[email protected]>
1 parent cc92f28 commit 5385d0d

File tree

2 files changed: +16 -22 lines changed


paddlenlp/transformers/auto/tokenizer.py

Lines changed: 7 additions & 8 deletions
@@ -21,7 +21,6 @@
 from huggingface_hub import hf_hub_download

 from paddlenlp import __version__
-from paddlenlp.transformers import *
 from paddlenlp.utils.downloader import COMMUNITY_MODEL_PREFIX, get_path_from_url
 from paddlenlp.utils.env import HF_CACHE_HOME, MODEL_HOME
 from paddlenlp.utils.import_utils import is_fast_tokenizer_available
@@ -119,7 +118,7 @@ def get_configurations():
         # So same config would map more than one tokenizer
         if MAPPING_NAMES.get(name, None) is None:
             MAPPING_NAMES[name] = []
-        # (tokenizer_name, is_faster)
+        # (tokenizer_name, is_fast)
         MAPPING_NAMES[name].append((tokenizer_name, fast_name != ""))
     return MAPPING_NAMES

@@ -135,7 +134,7 @@ class AutoTokenizer:
     MAPPING_NAMES = get_configurations()
     _tokenizer_mapping = MAPPING_NAMES
     _name_mapping = TOKENIZER_MAPPING_NAMES
-    _faster_name_mapping = FAST_TOKENIZER_MAPPING_NAMES
+    _fast_name_mapping = FAST_TOKENIZER_MAPPING_NAMES
     tokenizer_config_file = "tokenizer_config.json"

     def __init__(self, *args, **kwargs):
@@ -158,10 +157,10 @@ def _get_tokenizer_class_from_config(cls, pretrained_model_name_or_path, config_
             import_class = importlib.import_module(f"paddlenlp.transformers.{class_name}.tokenizer")
             tokenizer_class = getattr(import_class, init_class)
             if use_fast:
-                for faster_tokenizer_class, name in cls._faster_name_mapping.items():
+                for fast_tokenizer_class, name in cls._fast_name_mapping.items():
                     if name == class_name:
-                        import_class = importlib.import_module(f"paddlenlp.transformers.{class_name}.faster_tokenizer")
-                        tokenizer_class = getattr(import_class, faster_tokenizer_class)
+                        import_class = importlib.import_module(f"paddlenlp.transformers.{class_name}.fast_tokenizer")
+                        tokenizer_class = getattr(import_class, fast_tokenizer_class)
             return tokenizer_class
         # If no `init_class`, we use pattern recognition to recognize the tokenizer class.
         else:
@@ -219,7 +218,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, from_hf_hub=False, *mode
                 print(type(tokenizer))
                 # <class 'paddlenlp.transformers.bert.tokenizer.BertTokenizer'>
         """
-        # Default not to use faster tokenizer
+        # Default not to use fast tokenizer
         use_fast = kwargs.pop("use_fast", False)
         if "use_fast" in kwargs:
             use_fast = kwargs.pop("use_fast", False)
@@ -267,7 +266,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, from_hf_hub=False, *mode
                         break
                 if not is_support_fast_tokenizer:
                     logger.warning(
-                        f"The tokenizer {actual_tokenizer_class} doesn't have the faster version."
+                        f"The tokenizer {actual_tokenizer_class} doesn't have the fast version."
                         " Please check the map `paddlenlp.transformers.auto.tokenizer.FAST_TOKENIZER_MAPPING_NAMES`"
                         " to see which fast tokenizers are currently supported."
                     )
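In practice these hunks mean that `use_fast=True` now resolves to a module that actually exists. A minimal sketch of the call path this commit fixes, assuming `fast_tokenizer` is installed; the model name here is only illustrative:

    from paddlenlp.transformers import AutoTokenizer

    # With the fix, AutoTokenizer imports from the `fast_tokenizer` module
    # for the model (the old code tried the removed `faster_tokenizer` path
    # and failed with "not found").
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
    print(type(tokenizer))
    # Per the warning in the last hunk, models without a fast version fall
    # back to the Python tokenizer instead of raising.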

paddlenlp/utils/import_utils.py

Lines changed: 9 additions & 14 deletions
@@ -13,16 +13,16 @@
 # limitations under the License.
 from __future__ import annotations

-import sys
+import importlib.util
 import os
-import site
 import shutil
+import site
+import sys
 from typing import Optional, Type
+
 import pip
-import importlib.util
+
 from paddlenlp.utils.log import logger
-import importlib.util
-import importlib_metadata


 def is_torch_available() -> bool:
@@ -44,12 +44,12 @@ def is_package_available(package_name: str) -> bool:
     return package_spec is not None and package_spec.has_location


-def is_faster_tokenizer_available() -> bool:
-    """check if `faster_tokenizer` ia avaliable
+def is_fast_tokenizer_available() -> bool:
+    """check if `fast_tokenizer` ia avaliable
     Returns:
-        bool: if `faster_tokenizer` is avaliable
+        bool: if `fast_tokenizer` is avaliable
     """
-    return is_package_available("faster_tokenizer")
+    return is_package_available("fast_tokenizer")


 def is_transformers_available() -> bool:
@@ -60,11 +60,6 @@ def is_transformers_available() -> bool:
     return is_package_available("transformers")


-def is_fast_tokenizer_available():
-    package_spec = importlib.util.find_spec("fast_tokenizer")
-    return package_spec is not None and package_spec.has_location
-
-
 def install_package(
     package_name: str,
     version: Optional[str] = None,
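After this change the duplicate `is_fast_tokenizer_available` is gone and the check delegates to the shared `is_package_available` helper visible in the hunk context. A self-contained sketch of that probe, restating the code from the diff:

    import importlib.util

    def is_package_available(package_name: str) -> bool:
        # find_spec probes the import machinery without importing the
        # package; it returns None when the package is not installed.
        package_spec = importlib.util.find_spec(package_name)
        return package_spec is not None and package_spec.has_location

    # The renamed helper simply delegates to the generic check:
    def is_fast_tokenizer_available() -> bool:
        return is_package_available("fast_tokenizer")

    print(is_fast_tokenizer_available())  # False unless fast_tokenizer is installed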
