
Commit 6dfc06d

SdeeRK and miao200years authored
hf tokenizer adaptation (#2445)
Co-authored-by: miao200years <[email protected]>
1 parent ed05711 commit 6dfc06d

38 files changed: +900, -13779 lines

paddleformers/data/data_collator.py

Lines changed: 25 additions & 14 deletions
@@ -12,20 +12,31 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
 
 import copy
 from collections.abc import Mapping
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    List,
+    NewType,
+    Optional,
+    Tuple,
+    Union,
+)
 
 import numpy as np
 import paddle
 
-from ..transformers.tokenizer_utils_base import (
-    BatchEncoding,
-    PaddingStrategy,
-    PretrainedTokenizerBase,
-)
+if TYPE_CHECKING:
+    from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+    from transformers.utils import PaddingStrategy
+
+from transformers.tokenization_utils_base import BatchEncoding
 
 __all__ = [
     "DataCollatorWithPadding",
@@ -177,11 +188,11 @@ class DataCollatorWithPadding:
     Data collator that will dynamically pad the inputs to the longest sequence in the batch.
 
     Args:
-        tokenizer (`paddleformers.transformers.PretrainedTokenizer`):
+        tokenizer (`transformers.PreTrainedTokenizer`):
             The tokenizer used for encoding the data.
     """
 
-    tokenizer: PretrainedTokenizerBase
+    tokenizer: PreTrainedTokenizerBase
     padding: Union[bool, str, PaddingStrategy] = True
     max_length: Optional[int] = None
     pad_to_multiple_of: Optional[int] = None
@@ -216,7 +227,7 @@ class DataCollatorForTokenClassification(DataCollatorMixin):
     Data collator that will dynamically pad the inputs received, as well as the labels.
 
     Args:
-        tokenizer ([`PretrainedTokenizer`] or [`PretrainedFasterTokenizer`]):
+        tokenizer ([`PreTrainedTokenizer`] or [`PretrainedFasterTokenizer`]):
            The tokenizer used for encoding the data.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
@@ -241,7 +252,7 @@ class DataCollatorForTokenClassification(DataCollatorMixin):
            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
     """
 
-    tokenizer: PretrainedTokenizerBase
+    tokenizer: PreTrainedTokenizerBase
     padding: Union[bool, str, PaddingStrategy] = True
     max_length: Optional[int] = None
     pad_to_multiple_of: Optional[int] = None
@@ -321,7 +332,7 @@ class DataCollatorForSeq2Seq:
     Data collator that will dynamically pad the inputs received, as well as the labels.
 
     Args:
-        tokenizer ([`PretrainedTokenizer`] or [`PretrainedFasterTokenizer`]):
+        tokenizer ([`PreTrainedTokenizer`] or [`PretrainedFasterTokenizer`]):
            The tokenizer used for encoding the data.
        model ([`PreTrainedModel`]):
            The model that is being trained. If set and has the *prepare_decoder_input_ids_from_labels*, use it to
@@ -352,7 +363,7 @@ class DataCollatorForSeq2Seq:
        max_label_length (`int`, *optional*, Pad label to max_label_length. defaults to `None`):
     """
 
-    tokenizer: PretrainedTokenizerBase
+    tokenizer: PreTrainedTokenizerBase
     model: Optional[Any] = None
     padding: Union[bool, str, PaddingStrategy] = True
     max_length: Optional[int] = None
@@ -421,7 +432,7 @@ def __call__(self, features, return_tensors=None):
 
 @dataclass
 class DataCollatorForEmbedding:
-    tokenizer: PretrainedTokenizerBase
+    tokenizer: PreTrainedTokenizerBase
     model: Optional[Any] = None
     padding: Union[bool, str, PaddingStrategy] = True
     pad_to_multiple_of: Optional[int] = None
@@ -651,7 +662,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
     [`PreTrainedTokenizerFast`] with the argument `return_special_tokens_mask=True`.
     </Tip>"""
 
-    tokenizer: PretrainedTokenizerBase
+    tokenizer: PreTrainedTokenizerBase
     mlm: bool = True
     mlm_probability: float = 0.15
    pad_to_multiple_of: Optional[int] = None
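
The recurring pattern in this file is `from __future__ import annotations` plus a `TYPE_CHECKING`-guarded import, so the Hugging Face tokenizer classes appear only in type hints and are never imported at module load time. A minimal sketch of how that pattern behaves (the collator class and its fields below are illustrative, not part of this diff):

    from __future__ import annotations

    from dataclasses import dataclass
    from typing import TYPE_CHECKING, Optional

    if TYPE_CHECKING:
        # Seen only by static type checkers; no runtime dependency on transformers.
        from transformers.tokenization_utils_base import PreTrainedTokenizerBase


    @dataclass
    class ExampleCollator:
        # With PEP 563 lazy annotations this stays a string at runtime, so the
        # guarded import above never has to execute when the module is loaded.
        tokenizer: PreTrainedTokenizerBase
        max_length: Optional[int] = None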

paddleformers/datasets/rlhf_datasets/rl_dataset.py

Lines changed: 7 additions & 4 deletions
@@ -11,17 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
 
 import os
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
 import paddle
 from datasets import load_dataset
 from paddle.io import Dataset
 
-from ...transformers import PretrainedTokenizer
-from ...transformers.tokenizer_utils import PaddingStrategy
+if TYPE_CHECKING:
+    from transformers.tokenization_utils import PreTrainedTokenizer
+
+from transformers.utils import PaddingStrategy
 
 
 def left_padding(sequences, padding_value=0, max_length=None):
@@ -85,7 +88,7 @@ class RLHFDataset(Dataset):
    def __init__(
        self,
        dataset_name_or_path,
-        tokenizer: PretrainedTokenizer,
+        tokenizer: PreTrainedTokenizer,
        max_prompt_len=1024,
        filter_prompts=True,
        prompt_key="src",

paddleformers/generation/streamers.py

Lines changed: 6 additions & 4 deletions
@@ -11,11 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
 
 from queue import Queue
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 
-from ..transformers.tokenizer_utils import PretrainedTokenizer
+if TYPE_CHECKING:
+    from transformers.tokenization_utils import PreTrainedTokenizer
 
 
 class BaseStreamer:
@@ -59,7 +61,7 @@ class TextStreamer(BaseStreamer):
    ```
    """
 
-    def __init__(self, tokenizer: PretrainedTokenizer, skip_prompt: bool = False, **decode_kwargs):
+    def __init__(self, tokenizer: PreTrainedTokenizer, skip_prompt: bool = False, **decode_kwargs):
        self.tokenizer = tokenizer
        self.skip_prompt = skip_prompt
        self.decode_kwargs = decode_kwargs
@@ -189,7 +191,7 @@ class TextIteratorStreamer(TextStreamer):
 
    def __init__(
        self,
-        tokenizer: PretrainedTokenizer,
+        tokenizer: PreTrainedTokenizer,
        skip_prompt: bool = False,
        timeout: Optional[float] = None,
        **decode_kwargs
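
With the updated signatures, the streamers accept a Hugging Face tokenizer directly. A hedged usage sketch (the checkpoint name is a placeholder, and the streamer is assumed to be handed afterwards to a streaming-capable generate() call):

    from transformers import AutoTokenizer

    from paddleformers.generation.streamers import TextStreamer

    # Hypothetical checkpoint; any tokenizer compatible with transformers should do.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # skip_special_tokens is forwarded through **decode_kwargs to tokenizer.decode().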

paddleformers/trainer/trainer.py

Lines changed: 6 additions & 4 deletions
@@ -15,6 +15,7 @@
 
 # This file is modified from
 # https://github.com/huggingface/transformers/blob/main/src/transformers/trainer.py
+from __future__ import annotations
 
 import collections
 import contextlib
@@ -33,7 +34,7 @@
 from collections import OrderedDict
 from collections.abc import Mapping
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import paddle
@@ -97,6 +98,8 @@
    )
 except:
    pass
+if TYPE_CHECKING:
+    from transformers.tokenization_utils import PreTrainedTokenizer
 
 from ..transformers.context_parallel_utils import split_inputs_sequence_dim_load_balance
 from ..transformers.image_processing_utils import ImageProcessingMixin
@@ -107,7 +110,6 @@
    unwrap_model,
 )
 from ..transformers.segment_parallel_utils import split_inputs_sequence_dim
-from ..transformers.tokenizer_utils import PretrainedTokenizer
 from ..utils import empty_device_cache
 from ..utils.batch_sampler import DistributedBatchSampler as NlpDistributedBatchSampler
 from ..utils.env import (
@@ -255,7 +257,7 @@ class Trainer:
            The dataset to use for evaluation. If it is a [`~datasets.Dataset`], columns not accepted by the
            `model.forward()` method are automatically removed. If it is a dictionary, it will evaluate on each
            dataset prepending the dictionary key to the metric name.
-        tokenizer ([`PretrainedTokenizer`], *optional*):
+        tokenizer ([`PreTrainedTokenizer`], *optional*):
            The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs the
            maximum length when batching inputs, and it will be saved along the model to make it easier to rerun an
            interrupted training or reuse the fine-tuned model.
@@ -294,7 +296,7 @@ def __init__(
        data_collator: Optional[DataCollator] = None,
        train_dataset: Optional[Dataset] = None,
        eval_dataset: Union[Dataset, Dict[str, Dataset]] = None,
-        tokenizer: Optional[PretrainedTokenizer] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
        callbacks: Optional[List[TrainerCallback]] = None,
        optimizers: Tuple[paddle.optimizer.Optimizer, paddle.optimizer.lr.LRScheduler] = (None, None),
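
The Trainer keeps its `tokenizer` keyword but now annotates it with transformers' `PreTrainedTokenizer`. A rough sketch of construction under that assumption; the model, args, and dataset objects are placeholders, and only the `tokenizer=` keyword is taken from this diff:

    from transformers import AutoTokenizer

    from paddleformers.trainer import Trainer  # assuming Trainer is re-exported here

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # hypothetical checkpoint
    trainer = Trainer(
        model=model,                  # placeholder paddleformers model
        args=training_args,           # placeholder training arguments
        train_dataset=train_dataset,  # placeholder dataset
        tokenizer=tokenizer,          # an HF tokenizer is now the expected type
    )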

paddleformers/trainer/trainer_utils.py

Lines changed: 5 additions & 3 deletions
@@ -19,6 +19,8 @@
 """
 Utilities for the Trainer class.
 """
+from __future__ import annotations
+
 import datetime
 import gc
 import inspect
@@ -39,10 +41,10 @@
 from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker
 from paddle.io import IterableDataset
 from paddle.optimizer.lr import LambdaDecay
+from transformers.tokenization_utils_base import BatchEncoding
 
 from ..ops import Topology
 from ..trainer.argparser import strtobool
-from ..transformers.tokenizer_utils_base import BatchEncoding
 from ..utils.env import PREFIX_CHECKPOINT_DIR, _re_checkpoint  # noqa for compatibility
 from ..utils.fault_tolerance import PDC_DOWNLOAD_ERROR
 from ..utils.import_utils import is_paddle_cuda_available, is_psutil_available
@@ -653,11 +655,11 @@ def metrics_format(self, metrics: Dict[str, float]) -> Dict[str, float]:
        metrics_copy = metrics.copy()
        for k, v in metrics_copy.items():
            if "_mem_" in k:
-                metrics_copy[k] = f"{ v >> 20 }MB"
+                metrics_copy[k] = f"{v >> 20}MB"
            elif "_runtime" in k:
                metrics_copy[k] = _secs2timedelta(v)
            elif k == "total_flos":
-                metrics_copy[k] = f"{ int(v) >> 30 }GF"
+                metrics_copy[k] = f"{int(v) >> 30}GF"
            elif isinstance(metrics_copy[k], float):
                metrics_copy[k] = round(v, 4)
663665

paddleformers/transformers/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -61,7 +61,8 @@
        "BertPretrainingCriterion",
        "BertForQuestionAnswering",
    ],
-    "bert.tokenizer": ["BasicTokenizer", "BertTokenizer", "WordpieceTokenizer"],
+    "bert.tokenizer": ["BertTokenizer"],
+    "bert.tokenizer_fast": ["BertTokenizerFast"],
    "bert.configuration": ["BERT_PRETRAINED_INIT_CONFIGURATION", "BertConfig", "BERT_PRETRAINED_RESOURCE_FILES_MAP"],
    "auto.configuration": ["AutoConfig"],
    "auto.image_processing": ["AutoImageProcessor"],
