Commit c239ca8

[fix] ADDTOKEN (#2545)
1 parent 73442e1 commit c239ca8

File tree

3 files changed: +48 -1 lines changed


paddleformers/transformers/__init__.py

Lines changed: 4 additions & 1 deletion

@@ -18,6 +18,7 @@
 from typing import TYPE_CHECKING
 from ..utils.lazy_import import _LazyModule
 
+
 # from .auto.modeling import AutoModelForCausalLM
 import_structure = {
     "kto_criterion": [
@@ -35,10 +36,10 @@
         "BPETokenizer",
         "tokenize_chinese_chars",
         "is_chinese_char",
-        "AddedToken",
         "normalize_chars",
         "tokenize_special_chars",
         "convert_to_unicode",
+        "AddedToken",
     ],
     "attention_utils": ["create_bigbird_rand_mask_idx_list"],
     "tensor_parallel_utils": [],
@@ -334,6 +335,8 @@
     "qwen2_moe": [],
     "qwen3_moe": [],
     "auto": ["AutoModelForCausalLM"],
+    "legacy.tokenizer_utils_base": ["EncodingFast"],
+    "legacy": [],
 }
 
 if TYPE_CHECKING:
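
Note on the change above: each key in import_structure names a submodule and each value lists the public names it provides, so the lazy loader can resolve, e.g., "AddedToken" to tokenizer_utils on first access. The reorder plus the new "legacy" and "legacy.tokenizer_utils_base" keys should make imports like the following resolve lazily (a sketch, assuming paddleformers' _LazyModule, like the transformers one it mirrors, lifts these names to the package level):

    # Illustrative usage, not part of the diff; each line triggers the
    # deferred import of the submodule that defines the name.
    from paddleformers.transformers import AddedToken, EncodingFast
    from paddleformers.transformers.legacy import PretrainedTokenizerBase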

paddleformers/transformers/legacy/__init__.py

Lines changed: 43 additions & 0 deletions

@@ -11,3 +11,46 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+import sys
+from typing import TYPE_CHECKING
+
+from ...utils.lazy_import import _LazyModule
+
+import_structure = {
+    "tokenizer_utils": [
+        "PretrainedTokenizer",
+        "BPETokenizer",
+        "tokenize_chinese_chars",
+        "is_chinese_char",
+        "normalize_chars",
+        "tokenize_special_chars",
+        "convert_to_unicode",
+    ],
+    "tokenizer_utils_base": [
+        "import_protobuf_decode_error",
+        "ExplicitEnum",
+        "PaddingStrategy",
+        "TensorType",
+        "to_py_obj",
+        "_is_numpy",
+        "TruncationStrategy",
+        "CharSpan",
+        "TokenSpan",
+        "BatchEncoding",
+        "SpecialTokensMixin",
+        "PretrainedTokenizerBase",
+        "EncodingFast",
+    ],
+}
+
+if TYPE_CHECKING:
+    from .tokenizer_utils import *
+    from .tokenizer_utils_base import *
+else:
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        import_structure,
+        module_spec=__spec__,
+    )
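
This new package __init__ follows the lazy-module pattern popularized by huggingface/transformers: the module object in sys.modules is replaced by a proxy that imports a submodule only when one of its exported names is first accessed. A minimal sketch of how such a proxy can be built on importlib (paddleformers' actual _LazyModule may differ in details):

    import importlib
    import types

    class LazyModule(types.ModuleType):
        # Sketch of a lazy module proxy; not paddleformers' real _LazyModule.
        def __init__(self, name, module_file, import_structure, module_spec=None):
            super().__init__(name)
            self.__file__ = module_file
            self.__spec__ = module_spec
            # Map each exported name back to the submodule that defines it.
            self._name_to_module = {
                attr: mod
                for mod, attrs in import_structure.items()
                for attr in attrs
            }
            self.__all__ = list(self._name_to_module)

        def __getattr__(self, name):
            # Called only when normal attribute lookup fails, i.e. on
            # first access to a lazily exported name.
            if name in self._name_to_module:
                submodule = importlib.import_module(
                    "." + self._name_to_module[name], self.__name__
                )
                value = getattr(submodule, name)
                setattr(self, name, value)  # cache for subsequent lookups
                return value
            raise AttributeError(
                f"module {self.__name__!r} has no attribute {name!r}"
            )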

paddleformers/transformers/tokenizer_utils.py

Lines changed: 1 addition & 0 deletions

@@ -21,6 +21,7 @@
 from functools import wraps
 from typing import Any, Dict, List, Optional, Tuple, Union
 
+from tokenizers import AddedToken  # noqa: F401
 from transformers import BatchEncoding
 from transformers.tokenization_utils import (
     PreTrainedTokenizer as PreTrainedTokenizer_tf,
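
The # noqa: F401 marks the import as intentionally "unused": it exists only to re-export AddedToken from this module, which is what lets the __init__.py change above list "AddedToken" under "tokenizer_utils". The class itself comes from the HuggingFace tokenizers package; a small standalone illustration:

    from tokenizers import AddedToken

    # AddedToken wraps a token string with matching-behavior flags, e.g.
    # whole-word matching and whitespace stripping on the left/right.
    tok = AddedToken("<custom>", single_word=True, lstrip=True, rstrip=False)
    print(tok.content)  # -> <custom>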
