@@ -24,7 +24,14 @@
     extract_commit_hash,
     is_offline_mode,
     try_to_load_from_cache,
+    PaddingStrategy,
+    is_tf_tensor,
+    is_torch_tensor,
+    to_py_obj,
 )
+from transformers.tokenization_utils_base import BatchEncoding, EncodedInput
+from collections.abc import Mapping, Sized
+import numpy as np
 
 
 def _get_relative_imports(module_file):
@@ -465,3 +472,209 @@ def _get_class_from_dynamic_module( |
         class_name, final_module.replace(".py", "").replace("-", "_")
     )
     return get_class_in_module(class_name, final_module.replace("-", "_"))
+
+
+def _pad(
+    self,
+    encoded_inputs: Union[
+        BatchEncoding,
+        List[BatchEncoding],
+        Dict[str, EncodedInput],
+        Dict[str, List[EncodedInput]],
+        List[Dict[str, EncodedInput]],
+    ],
+    padding=True,
+    max_length: Optional[int] = None,
+    pad_to_multiple_of: Optional[int] = None,
+    padding_side: Optional[str] = None,
+    return_attention_mask: Optional[bool] = None,
+    return_tensors=None,
+    verbose: bool = True,
+) -> BatchEncoding:
+    """
+    Pad a single encoded input or a batch of encoded inputs up to a predefined length or to the max sequence length
+    in the batch.
+
+    Padding side (left/right) and padding token ids are defined at the tokenizer level (with `self.padding_side`,
+    `self.pad_token_id` and `self.pad_token_type_id`).
+
+    Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the
+    text followed by a call to the `pad` method to get a padded encoding.
+
+    <Tip>
+
+    If the `encoded_inputs` passed are a dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
+    result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
+    PyTorch tensors, you will however lose the specific device of your tensors.
+
+    </Tip>
+
+    Args:
+        encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]]`
+            or `List[Dict[str, List[int]]]`):
+            Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of
+            tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str,
+            List[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
+            collate function.
+
+            Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see
+            the note above for the return type.
+        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+            Select a strategy to pad the returned sequences (according to the model's padding side and padding
+            index) among:
+
+            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+              sequence is provided).
+            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+              acceptable input length for the model if that argument is not provided.
+            - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
+              lengths).
+        max_length (`int`, *optional*):
+            Maximum length of the returned list and optionally padding length (see above).
+        pad_to_multiple_of (`int`, *optional*):
+            If set, will pad the sequence to a multiple of the provided value.
+
+            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
+            `>= 7.5` (Volta).
+        padding_side (`str`, *optional*):
+            The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+            Default value is picked from the class attribute of the same name.
+        return_attention_mask (`bool`, *optional*):
+            Whether to return the attention mask. If left to the default, will return the attention mask according
+            to the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+            [What are attention masks?](../glossary#attention-mask)
+        return_tensors (`str` or [`~utils.TensorType`], *optional*):
+            If set, will return tensors instead of a list of python integers. Acceptable values are:
+
+            - `'tf'`: Return TensorFlow `tf.constant` objects.
+            - `'pt'`: Return PyTorch `torch.Tensor` objects.
+            - `'np'`: Return Numpy `np.ndarray` objects.
+        verbose (`bool`, *optional*, defaults to `True`):
+            Whether or not to print more information and warnings.
+    """
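+    # Warn (only once per tokenizer, via `deprecation_warnings`) that padding with a fast tokenizer
+    # is slower than calling the tokenizer directly.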
+    if self.__class__.__name__.endswith("Fast"):
+        if not self.deprecation_warnings.get("Asking-to-pad-a-fast-tokenizer", False):
+            logger.warning_advice(
+                f"You're using a {self.__class__.__name__} tokenizer. Please note that with a fast tokenizer,"
+                " using the `__call__` method is faster than using a method to encode the text followed by a call"
+                " to the `pad` method to get a padded encoding."
+            )
+            self.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
+
+    # If we have a list of dicts, let's convert it in a dict of lists
+    # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
+    if isinstance(encoded_inputs, (list, tuple)) and isinstance(
+        encoded_inputs[0], Mapping
+    ):
+        encoded_inputs = {
+            key: [example[key] for example in encoded_inputs]
+            for key in encoded_inputs[0].keys()
+        }
+
+    # The model's main input name, usually `input_ids`, has been passed for padding
+    if self.model_input_names[0] not in encoded_inputs:
+        raise ValueError(
+            "You should supply an encoding or a list of encodings to this method "
+            f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
+        )
+
+    required_input = encoded_inputs[self.model_input_names[0]]
+
+    if required_input is None or (
+        isinstance(required_input, Sized) and len(required_input) == 0
+    ):
+        if return_attention_mask:
+            encoded_inputs["attention_mask"] = []
+        return encoded_inputs
+
+    # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
+    # and rebuild them afterwards if no return_tensors is specified
+    # Note that we lose the specific device the tensor may be on for PyTorch
+
+    first_element = required_input[0]
+    if isinstance(first_element, (list, tuple)):
+        # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
+        for item in required_input:
+            if len(item) != 0:
+                first_element = item[0]
+                break
+    # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
+    if not isinstance(first_element, (int, list, tuple)):
+        if is_tf_tensor(first_element):
+            return_tensors = "tf" if return_tensors is None else return_tensors
+        elif is_torch_tensor(first_element):
+            return_tensors = "pt" if return_tensors is None else return_tensors
+        elif isinstance(first_element, np.ndarray):
+            return_tensors = "np" if return_tensors is None else return_tensors
+        else:
+            raise ValueError(
+                f"type of {first_element} unknown: {type(first_element)}. "
+                "Should be one of a python, numpy, pytorch or tensorflow object."
+            )
+
+        for key, value in encoded_inputs.items():
+            encoded_inputs[key] = to_py_obj(value)
+
+    # Convert the `padding` argument to a PaddingStrategy
+    padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
+        padding=padding, max_length=max_length, verbose=verbose
+    )
+
+    required_input = encoded_inputs[self.model_input_names[0]]
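+    # A single, non-batched encoding: pad it directly and return.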
+    if required_input and not isinstance(required_input[0], (list, tuple)):
+        try:
+            encoded_inputs = self._pad(
+                encoded_inputs,
+                max_length=max_length,
+                padding_strategy=padding_strategy,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_attention_mask=return_attention_mask,
+            )
+        except TypeError:
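+            # Fall back for `_pad` implementations that do not accept the `padding_side` keyword.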
+            encoded_inputs = self._pad(
+                encoded_inputs,
+                max_length=max_length,
+                padding_strategy=padding_strategy,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_attention_mask=return_attention_mask,
+            )
+        return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
+
+    batch_size = len(required_input)
+    assert all(
+        len(v) == batch_size for v in encoded_inputs.values()
+    ), "Some items in the output dictionary have a different batch size than others."
+
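+    # With LONGEST padding, resolve the target length up front so every example below is padded to it.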
+    if padding_strategy == PaddingStrategy.LONGEST:
+        max_length = max(len(inputs) for inputs in required_input)
+        padding_strategy = PaddingStrategy.MAX_LENGTH
+
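+    # Pad each example individually, then regroup the padded values into one batch dictionary.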
+    batch_outputs = {}
+    for i in range(batch_size):
+        inputs = {k: v[i] for k, v in encoded_inputs.items()}
+        try:
+            outputs = self._pad(
+                inputs,
+                max_length=max_length,
+                padding_strategy=padding_strategy,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_attention_mask=return_attention_mask,
+            )
+        except TypeError:
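+            # As above, retry without `padding_side` if the tokenizer's `_pad` does not accept it.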
+            outputs = self._pad(
+                inputs,
+                max_length=max_length,
+                padding_strategy=padding_strategy,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_attention_mask=return_attention_mask,
+            )
+
+        for key, value in outputs.items():
+            if key not in batch_outputs:
+                batch_outputs[key] = []
+            batch_outputs[key].append(value)
+
+    return BatchEncoding(batch_outputs, tensor_type=return_tensors)
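
For reference, the added function mirrors `PreTrainedTokenizerBase.pad` from `transformers`, plus the `padding_side` fallback, so once it is wired in as the tokenizer's `pad` method (the wiring is not shown in this hunk) it is exercised like the upstream API, for example as a DataLoader collate function. A minimal sketch, assuming a standard checkpoint such as `bert-base-uncased`:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Two encodings of different lengths, as produced during preprocessing.
features = [
    {"input_ids": tokenizer("short text")["input_ids"]},
    {"input_ids": tokenizer("a somewhat longer piece of text")["input_ids"]},
]

# pad() accepts a list of dicts (the collate_fn case), pads to the longest
# sequence in the batch, and returns PyTorch tensors because of return_tensors="pt".
batch = tokenizer.pad(features, padding=True, return_tensors="pt")
print(batch["input_ids"].shape, batch["attention_mask"].shape)
```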