Commit d6d181b

[Tokenizer] Support fast tokenizer within AutoTokenizer import (#9466)
* add tokenizerfast import
* add test case
1 parent 6141f80 commit d6d181b

File tree

2 files changed: +10 -1 lines changed


paddlenlp/transformers/auto/tokenizer.py

Lines changed: 6 additions & 1 deletion
@@ -178,7 +178,12 @@ def tokenizer_class_from_name(class_name: str):
 
             return getattr(module, class_name)
         except AttributeError:
-            raise ValueError(f"Tokenizer class {class_name} is not currently imported.")
+            try:
+                module = importlib.import_module(f".{module_name}.tokenizer_fast", "paddlenlp.transformers")
+
+                return getattr(module, class_name)
+            except AttributeError:
+                raise ValueError(f"Tokenizer class {class_name} is not currently imported.")
 
     for config, tokenizers in TOKENIZER_MAPPING._extra_content.items():
         for tokenizer in tokenizers:
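
In effect, class resolution now tries the model's tokenizer module first and, if the class is not found there, retries against the matching tokenizer_fast module before raising. A minimal standalone sketch of that fallback pattern (the resolve_tokenizer_class helper and its arguments are illustrative, not part of the PaddleNLP API):

import importlib


def resolve_tokenizer_class(module_name: str, class_name: str):
    # Mirror the lookup added above: check the regular (slow) tokenizer module first,
    # e.g. paddlenlp.transformers.bert.tokenizer.
    module = importlib.import_module(f".{module_name}.tokenizer", "paddlenlp.transformers")
    try:
        return getattr(module, class_name)
    except AttributeError:
        # Not found there: fall back to the fast-tokenizer module,
        # e.g. paddlenlp.transformers.bert.tokenizer_fast.
        try:
            module = importlib.import_module(f".{module_name}.tokenizer_fast", "paddlenlp.transformers")
            return getattr(module, class_name)
        except AttributeError:
            raise ValueError(f"Tokenizer class {class_name} is not currently imported.")

With PaddleNLP installed, resolve_tokenizer_class("bert", "BertTokenizerFast") would then be expected to return the fast class even though it lives outside the slow tokenizer module.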

tests/transformers/auto/test_tokenizer.py

Lines changed: 4 additions & 0 deletions
@@ -48,6 +48,10 @@ def test_from_pretrained_cache_dir(self):
             # check against double appending model_name in cache_dir
             self.assertFalse(os.path.exists(os.path.join(tempdir, model_name, model_name)))
 
+    def test_from_pretrained_tokenizer_fast(self):
+        tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-base-v2", use_fast=True)
+        self.assertIsInstance(tokenizer, BertTokenizerFast)
+
     def test_new_tokenizer_registration(self):
         try:
             AutoConfig.register("custom", CustomConfig)
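
The new test exercises the path end to end: with use_fast=True, AutoTokenizer should resolve and return the fast tokenizer class for a BERT-based checkpoint. A usage sketch along the same lines (assumes a PaddleNLP build that ships BertTokenizerFast and network access to download intfloat/e5-base-v2):

from paddlenlp.transformers import AutoTokenizer, BertTokenizerFast

# Ask AutoTokenizer for the fast tokenizer implementation explicitly.
tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-base-v2", use_fast=True)
assert isinstance(tokenizer, BertTokenizerFast)

# The fast tokenizer is then used like the slow one.
encoded = tokenizer("query: how are fast tokenizers resolved?")
print(encoded["input_ids"])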
