
Commit 48387c8

[FIx_v0.2] PreTrainedTokenizer (#2498)

1 parent: c7012a1
File tree: 5 files changed, +13 -10 lines

paddleformers/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -60,12 +60,13 @@
     "peft",
     "quantization",
     "trainer",
-    "transformers",
     "trl",
     "utils",
     "version",
+    "transformers",
 ]
 import_structure = {module: [] for module in modules}
+import_structure["transformers.tokenizer_utils"] = ["PreTrainedTokenizer"]
 
 if TYPE_CHECKING:
     from . import (
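
The new import_structure entry presumably feeds paddleformers' lazy-import machinery. Assuming it follows the HF-style lazy-module convention (a mapping from submodule path to exported names), the registration makes the symbol importable without eagerly loading tokenizer_utils; a minimal sketch:

# A minimal usage sketch, assuming paddleformers resolves import_structure
# through an HF-style lazy module (submodule path -> exported names):
from paddleformers import PreTrainedTokenizer  # lazy, via the new entry

# Eager equivalent, importing straight from the submodule:
from paddleformers.transformers.tokenizer_utils import PreTrainedTokenizer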

paddleformers/transformers/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -30,6 +30,7 @@
     "sequence_parallel_utils": ["AllGatherVarlenOp", "sequence_parallel_sparse_mask_labels"],
     "model_utils": ["PretrainedModel", "register_base_model"],
     "tokenizer_utils": [
+        "PreTrainedTokenizer",
         "PretrainedTokenizer",
         "BPETokenizer",
         "tokenize_chinese_chars",
@@ -38,7 +39,6 @@
         "normalize_chars",
         "tokenize_special_chars",
         "convert_to_unicode",
-        "PreTrainedTokenizer",
     ],
     "attention_utils": ["create_bigbird_rand_mask_idx_list"],
     "tensor_parallel_utils": [],

paddleformers/transformers/ernie4_5vl/tokenizer.py

Lines changed: 2 additions & 3 deletions
@@ -17,11 +17,10 @@
 import os
 
 import sentencepiece as spm
-from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.utils import logging
 
 # Fix relative import issues
-from ..tokenizer_utils import PaddleTokenizerMixin
+from ..tokenizer_utils import PreTrainedTokenizer
 
 logger = logging.get_logger(__name__)
 
@@ -30,7 +29,7 @@
 ]
 
 
-class Ernie4_5_VLTokenizer(PaddleTokenizerMixin, PreTrainedTokenizer):
+class Ernie4_5_VLTokenizer(PreTrainedTokenizer):
     """
     ERNIE 4.5 VL Tokenizer based on SentencePiece with smart tensor support.
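
The tokenizer drops the explicit PaddleTokenizerMixin base because the PreTrainedTokenizer re-exported from tokenizer_utils already carries it (see the last file in this commit); QWenTokenizer below gets the identical treatment. A sketch of the pattern, with a hypothetical tokenizer class:

# Sketch of the simplified subclassing pattern; MyTokenizer is a hypothetical
# example, not part of this commit.
from paddleformers.transformers.tokenizer_utils import PreTrainedTokenizer

class MyTokenizer(PreTrainedTokenizer):
    """Inherits PaddleTokenizerMixin behavior through PreTrainedTokenizer's MRO."""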

paddleformers/transformers/qwen/tokenizer.py

Lines changed: 3 additions & 3 deletions
@@ -21,9 +21,9 @@
 from typing import Collection, Dict, List, Set, Tuple, Union
 
 import tiktoken
-from transformers import AddedToken, PreTrainedTokenizer
+from transformers import AddedToken
 
-from ..tokenizer_utils import PaddleTokenizerMixin
+from ..tokenizer_utils import PreTrainedTokenizer
 
 logger = logging.getLogger(__name__)
 
@@ -64,7 +64,7 @@ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
 }
 
 
-class QWenTokenizer(PaddleTokenizerMixin, PreTrainedTokenizer):
+class QWenTokenizer(PreTrainedTokenizer):
     """QWen tokenizer."""
 
     vocab_files_names = VOCAB_FILES_NAMES

paddleformers/transformers/tokenizer_utils.py

Lines changed: 5 additions & 2 deletions
@@ -22,6 +22,9 @@
 from typing import Any, Dict, List, Union
 
 from transformers import BatchEncoding
+from transformers.tokenization_utils import (
+    PreTrainedTokenizer as PreTrainedTokenizer_tf,
+)
 from transformers.tokenization_utils_base import (
     ADDED_TOKENS_FILE,
     CHAT_TEMPLATE_FILE,
@@ -390,10 +393,10 @@ def encode_chat_inputs(
         return query
 
 
-def warp_tokenizer(hf_tokenizer_class: PreTrainedTokenizer):
+def warp_tokenizer(hf_tokenizer_class: PreTrainedTokenizer_tf):
     return type(hf_tokenizer_class.__name__, (PaddleTokenizerMixin, hf_tokenizer_class), {})
 
 
-class PreTrainedTokenizer(PaddleTokenizerMixin, PretrainedTokenizer):
+class PreTrainedTokenizer(PaddleTokenizerMixin, PreTrainedTokenizer_tf):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
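
Taken together, tokenizer_utils now offers the combination two ways: a concrete PreTrainedTokenizer class, and warp_tokenizer, which builds the same PaddleTokenizerMixin-plus-HF-class pairing dynamically. A small sketch of the dynamic path; BertTokenizer stands in for any HF tokenizer class:

# Sketch of what warp_tokenizer produces, per the definition above.
# BertTokenizer is only an arbitrary example of a wrapped HF class.
from transformers import BertTokenizer

from paddleformers.transformers.tokenizer_utils import (
    PaddleTokenizerMixin,
    warp_tokenizer,
)

PaddleBertTokenizer = warp_tokenizer(BertTokenizer)
assert issubclass(PaddleBertTokenizer, PaddleTokenizerMixin)
assert PaddleBertTokenizer.__name__ == "BertTokenizer"  # name copied from the wrapped class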
