@@ -12,20 +12,31 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
 
 import copy
 from collections.abc import Mapping
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    List,
+    NewType,
+    Optional,
+    Tuple,
+    Union,
+)
 
 import numpy as np
 import paddle
 
-from ..transformers.tokenizer_utils_base import (
-    BatchEncoding,
-    PaddingStrategy,
-    PretrainedTokenizerBase,
-)
+if TYPE_CHECKING:
+    from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+    from transformers.utils import PaddingStrategy
+
+from transformers.tokenization_utils_base import BatchEncoding
 
 __all__ = [
     "DataCollatorWithPadding",
@@ -177,11 +188,11 @@ class DataCollatorWithPadding:
     Data collator that will dynamically pad the inputs to the longest sequence in the batch.
 
     Args:
-        tokenizer (`paddleformers.transformers.PretrainedTokenizer`):
+        tokenizer (`transformers.PreTrainedTokenizer`):
             The tokenizer used for encoding the data.
     """
 
-    tokenizer: PretrainedTokenizerBase
+    tokenizer: PreTrainedTokenizerBase
     padding: Union[bool, str, PaddingStrategy] = True
     max_length: Optional[int] = None
     pad_to_multiple_of: Optional[int] = None
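A usage sketch for the collator after the rename, assuming the class is exposed from `paddleformers.data` (import path borrowed from PaddleNLP, not confirmed by this diff) and that a `bert-base-uncased` tokenizer is available:

```python
from transformers import AutoTokenizer

from paddleformers.data import DataCollatorWithPadding  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

# Two encodings of different lengths; the collator pads both to the longest.
features = [tokenizer("short text"), tokenizer("a somewhat longer example sentence")]
batch = collator(features)
print(batch["input_ids"].shape)  # [2, L] where L is the longest sequence
```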
@@ -216,7 +227,7 @@ class DataCollatorForTokenClassification(DataCollatorMixin):
     Data collator that will dynamically pad the inputs received, as well as the labels.
 
     Args:
-        tokenizer ([`PretrainedTokenizer`] or [`PretrainedFasterTokenizer`]):
+        tokenizer ([`PreTrainedTokenizer`] or [`PretrainedFasterTokenizer`]):
             The tokenizer used for encoding the data.
         padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
             Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
@@ -241,7 +252,7 @@ class DataCollatorForTokenClassification(DataCollatorMixin):
             The type of Tensor to return. Allowable values are "np", "pt" and "tf".
     """
 
-    tokenizer: PretrainedTokenizerBase
+    tokenizer: PreTrainedTokenizerBase
     padding: Union[bool, str, PaddingStrategy] = True
     max_length: Optional[int] = None
     pad_to_multiple_of: Optional[int] = None
@@ -321,7 +332,7 @@ class DataCollatorForSeq2Seq:
     Data collator that will dynamically pad the inputs received, as well as the labels.
 
     Args:
-        tokenizer ([`PretrainedTokenizer`] or [`PretrainedFasterTokenizer`]):
+        tokenizer ([`PreTrainedTokenizer`] or [`PretrainedFasterTokenizer`]):
             The tokenizer used for encoding the data.
         model ([`PreTrainedModel`]):
             The model that is being trained. If set and has the *prepare_decoder_input_ids_from_labels*, use it to
@@ -352,7 +363,7 @@ class DataCollatorForSeq2Seq:
         max_label_length (`int`, *optional*, defaults to `None`): Pad labels to `max_label_length`.
     """
 
-    tokenizer: PretrainedTokenizerBase
+    tokenizer: PreTrainedTokenizerBase
     model: Optional[Any] = None
     padding: Union[bool, str, PaddingStrategy] = True
     max_length: Optional[int] = None
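For context on the label fields: seq2seq labels are padded separately from the encoder inputs, conventionally with `-100` so padded positions are ignored by the loss. A sketch under that assumption (the feature dicts and the `label_pad_token_id` value are illustrative, taken from the usual collator convention rather than from this diff):

```python
from transformers import AutoTokenizer

from paddleformers.data import DataCollatorForSeq2Seq  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=-100)

features = [
    {"input_ids": [5, 6, 7], "labels": [11, 12]},
    {"input_ids": [8, 9], "labels": [13, 14, 15, 16]},
]
batch = collator(features)
# Labels are padded to the longest label row with -100:
# [[11, 12, -100, -100], [13, 14, 15, 16]]
```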
@@ -421,7 +432,7 @@ def __call__(self, features, return_tensors=None):
 
 @dataclass
 class DataCollatorForEmbedding:
-    tokenizer: PretrainedTokenizerBase
+    tokenizer: PreTrainedTokenizerBase
     model: Optional[Any] = None
     padding: Union[bool, str, PaddingStrategy] = True
     pad_to_multiple_of: Optional[int] = None
@@ -651,7 +662,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
     [`PreTrainedTokenizerFast`] with the argument `return_special_tokens_mask=True`.
     </Tip>"""
 
-    tokenizer: PretrainedTokenizerBase
+    tokenizer: PreTrainedTokenizerBase
     mlm: bool = True
     mlm_probability: float = 0.15
     pad_to_multiple_of: Optional[int] = None
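Finally, typical wiring for the MLM collator touched by the last hunk; the `-100` label convention is the standard one and assumed here rather than read off this diff:

```python
from transformers import AutoTokenizer

from paddleformers.data import DataCollatorForLanguageModeling  # assumed path

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,              # masked LM objective: randomly mask input tokens
    mlm_probability=0.15,  # fraction of tokens selected for masking
)

batch = collator([tokenizer("masked language modeling example")])
# batch["labels"] keeps the original ids at masked positions and -100
# everywhere else, so the loss is computed only on the masked tokens.
```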