Commit f7a372a

rls2.5: Fix compatibility issues with transformers 4.45.0 (#3320) (#3326)
1 parent 4dbda87 commit f7a372a

5 files changed: +308 -3 lines changed


intel_extension_for_pytorch/llm/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -17,6 +17,7 @@
         _get_class_from_dynamic_module,
         _get_cached_module_file,
         _get_imports,
+        _pad,
     )
     import transformers

@@ -32,5 +33,6 @@
     transformers.modeling_utils.PreTrainedModel.gradient_checkpointing_enable = (
         _gradient_checkpointing_enable
     )
+    transformers.tokenization_utils_base.PreTrainedTokenizerBase.pad = _pad
 except ImportError:
     pass
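
Note (not part of the commit): the hunk above relies on monkey-patching, i.e. assigning a function onto transformers.tokenization_utils_base.PreTrainedTokenizerBase.pad so that every tokenizer instance, and every downstream call to tokenizer.pad(...), transparently picks up the replacement. A minimal sketch of that pattern, assuming transformers is installed; the wrapper name and model id below are illustrative only:

import transformers
from transformers import AutoTokenizer

# keep a handle to the original method so the sketch can delegate to it
_original_pad = transformers.tokenization_utils_base.PreTrainedTokenizerBase.pad

def _patched_pad(self, encoded_inputs, **kwargs):
    # A real patch (such as `_pad` in this commit) reimplements padding in a
    # version-compatible way; this sketch simply delegates to the original method.
    return _original_pad(self, encoded_inputs, **kwargs)

transformers.tokenization_utils_base.PreTrainedTokenizerBase.pad = _patched_pad

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # any tokenizer with a pad token
batch = tokenizer.pad({"input_ids": [[101, 2023, 102], [101, 102]]}, padding=True)
print(batch["input_ids"])  # the shorter sequence is padded to length 3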

intel_extension_for_pytorch/llm/utils.py

Lines changed: 213 additions & 0 deletions
@@ -24,7 +24,14 @@
     extract_commit_hash,
     is_offline_mode,
     try_to_load_from_cache,
+    PaddingStrategy,
+    is_tf_tensor,
+    is_torch_tensor,
+    to_py_obj,
 )
+from transformers.tokenization_utils_base import BatchEncoding, EncodedInput
+from collections.abc import Mapping, Sized
+import numpy as np


 def _get_relative_imports(module_file):
@@ -465,3 +472,209 @@ def _get_class_from_dynamic_module(
         class_name, final_module.replace(".py", "").replace("-", "_")
     )
     return get_class_in_module(class_name, final_module.replace("-", "_"))
+
+
+def _pad(
+    self,
+    encoded_inputs: Union[
+        BatchEncoding,
+        List[BatchEncoding],
+        Dict[str, EncodedInput],
+        Dict[str, List[EncodedInput]],
+        List[Dict[str, EncodedInput]],
+    ],
+    padding=True,
+    max_length: Optional[int] = None,
+    pad_to_multiple_of: Optional[int] = None,
+    padding_side: Optional[bool] = None,
+    return_attention_mask: Optional[bool] = None,
+    return_tensors=None,
+    verbose: bool = True,
+) -> BatchEncoding:
+    """
+    Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
+    in the batch.
+
+    Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`,
+    `self.pad_token_id` and `self.pad_token_type_id`).
+
+    Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the
+    text followed by a call to the `pad` method to get a padded encoding.
+
+    <Tip>
+
+    If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
+    result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
+    PyTorch tensors, you will lose the specific device of your tensors however.
+
+    </Tip>
+
+    Args:
+        encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]`
+            or `List[Dict[str, List[int]]]`):
+            Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of
+            tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str,
+            List[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
+            collate function.
+
+            Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see
+            the note above for the return type.
+        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+            Select a strategy to pad the returned sequences (according to the model's padding side and padding
+            index) among:
+
+            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+              sequence if provided).
+            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+              acceptable input length for the model if that argument is not provided.
+            - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+              lengths).
+        max_length (`int`, *optional*):
+            Maximum length of the returned list and optionally padding length (see above).
+        pad_to_multiple_of (`int`, *optional*):
+            If set will pad the sequence to a multiple of the provided value.
+
+            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
+            `>= 7.5` (Volta).
+        padding_side (`str`, *optional*):
+            The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+            Default value is picked from the class attribute of the same name.
+        return_attention_mask (`bool`, *optional*):
+            Whether to return the attention mask. If left to the default, will return the attention mask according
+            to the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+            [What are attention masks?](../glossary#attention-mask)
+        return_tensors (`str` or [`~utils.TensorType`], *optional*):
+            If set, will return tensors instead of list of python integers. Acceptable values are:
+
+            - `'tf'`: Return TensorFlow `tf.constant` objects.
+            - `'pt'`: Return PyTorch `torch.Tensor` objects.
+            - `'np'`: Return Numpy `np.ndarray` objects.
+        verbose (`bool`, *optional*, defaults to `True`):
+            Whether or not to print more information and warnings.
+    """
+    if self.__class__.__name__.endswith("Fast"):
+        if not self.deprecation_warnings.get("Asking-to-pad-a-fast-tokenizer", False):
+            logger.warning_advice(
+                f"You're using a {self.__class__.__name__} tokenizer. Please note that with a fast tokenizer,"
+                " using the `__call__` method is faster than using a method to encode the text followed by a call"
+                " to the `pad` method to get a padded encoding."
+            )
+            self.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
+
+    # If we have a list of dicts, let's convert it in a dict of lists
+    # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
+    if isinstance(encoded_inputs, (list, tuple)) and isinstance(
+        encoded_inputs[0], Mapping
+    ):
+        encoded_inputs = {
+            key: [example[key] for example in encoded_inputs]
+            for key in encoded_inputs[0].keys()
+        }
+
+    # The model's main input name, usually `input_ids`, has been passed for padding
+    if self.model_input_names[0] not in encoded_inputs:
+        raise ValueError(
+            "You should supply an encoding or a list of encodings to this method "
+            f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
+        )
+
+    required_input = encoded_inputs[self.model_input_names[0]]
+
+    if required_input is None or (
+        isinstance(required_input, Sized) and len(required_input) == 0
+    ):
+        if return_attention_mask:
+            encoded_inputs["attention_mask"] = []
+        return encoded_inputs
+
+    # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
+    # and rebuild them afterwards if no return_tensors is specified
+    # Note that we lose the specific device the tensor may be on for PyTorch
+
+    first_element = required_input[0]
+    if isinstance(first_element, (list, tuple)):
+        # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
+        for item in required_input:
+            if len(item) != 0:
+                first_element = item[0]
+                break
+    # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
+    if not isinstance(first_element, (int, list, tuple)):
+        if is_tf_tensor(first_element):
+            return_tensors = "tf" if return_tensors is None else return_tensors
+        elif is_torch_tensor(first_element):
+            return_tensors = "pt" if return_tensors is None else return_tensors
+        elif isinstance(first_element, np.ndarray):
+            return_tensors = "np" if return_tensors is None else return_tensors
+        else:
+            raise ValueError(
+                f"type of {first_element} unknown: {type(first_element)}. "
+                "Should be one of a python, numpy, pytorch or tensorflow object."
+            )
+
+        for key, value in encoded_inputs.items():
+            encoded_inputs[key] = to_py_obj(value)
+
+    # Convert padding_strategy in PaddingStrategy
+    padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
+        padding=padding, max_length=max_length, verbose=verbose
+    )
+
+    required_input = encoded_inputs[self.model_input_names[0]]
+    if required_input and not isinstance(required_input[0], (list, tuple)):
+        try:
+            encoded_inputs = self._pad(
+                encoded_inputs,
+                max_length=max_length,
+                padding_strategy=padding_strategy,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_attention_mask=return_attention_mask,
+            )
+        except TypeError:
+            encoded_inputs = self._pad(
+                encoded_inputs,
+                max_length=max_length,
+                padding_strategy=padding_strategy,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_attention_mask=return_attention_mask,
+            )
+        return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
+
+    batch_size = len(required_input)
+    assert all(
+        len(v) == batch_size for v in encoded_inputs.values()
+    ), "Some items in the output dictionary have a different batch size than others."
+
+    if padding_strategy == PaddingStrategy.LONGEST:
+        max_length = max(len(inputs) for inputs in required_input)
+        padding_strategy = PaddingStrategy.MAX_LENGTH
+
+    batch_outputs = {}
+    for i in range(batch_size):
+        inputs = {k: v[i] for k, v in encoded_inputs.items()}
+        try:
+            outputs = self._pad(
+                inputs,
+                max_length=max_length,
+                padding_strategy=padding_strategy,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_attention_mask=return_attention_mask,
+            )
+        except TypeError:
+            outputs = self._pad(
+                inputs,
+                max_length=max_length,
+                padding_strategy=padding_strategy,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_attention_mask=return_attention_mask,
+            )
+
+        for key, value in outputs.items():
+            if key not in batch_outputs:
+                batch_outputs[key] = []
+            batch_outputs[key].append(value)
+
+    return BatchEncoding(batch_outputs, tensor_type=return_tensors)
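
Note (not part of the commit): the docstring above already spells out the padding strategies; the practical difference from older transformers releases is the `padding_side` keyword, which this copy forwards to `self._pad` inside a try/except TypeError so tokenizers built against either signature keep working. A hedged usage sketch, assuming transformers is installed; the model id and token ids are illustrative:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# A batch of already-tokenized examples, e.g. what a DataLoader hands to a collate_fn
features = [
    {"input_ids": [101, 7592, 102]},
    {"input_ids": [101, 7592, 2088, 999, 102]},
]

# Pad to the longest sequence in the batch and return PyTorch tensors
batch = tokenizer.pad(features, padding="longest", return_tensors="pt")
print(batch["input_ids"].shape)    # torch.Size([2, 5])
print(batch["attention_mask"][0])  # tensor([1, 1, 1, 0, 0]) -- zeros mark the padded positions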

intel_extension_for_pytorch/transformers/generation/utils.py

Lines changed: 6 additions & 3 deletions
@@ -174,7 +174,7 @@ def whisper_generate(
     synced_gpus: bool = False,
     return_timestamps: Optional[bool] = None,
     task: Optional[str] = None,
-    language: Optional[str] = None,
+    language: Optional[Union[str, List[str]]] = None,
     is_multilingual: Optional[bool] = None,
     prompt_ids: Optional[torch.Tensor] = None,
     prompt_condition_type: Optional[str] = None,  # first-segment, all-segments
@@ -216,7 +216,7 @@ def whisper_generate(

     # 3. Make sure generation config is correctly set
     # Make sure the generation config is correctly set depending on whether timestamps are to be returned or not
-    self._set_return_outputs(
+    return_dict_in_generate = self._set_return_outputs(
         return_dict_in_generate=return_dict_in_generate,
         return_token_timestamps=return_token_timestamps,
         logprob_threshold=logprob_threshold,
@@ -407,6 +407,8 @@ def whisper_generate(
             return_token_timestamps=return_token_timestamps,
             do_condition_on_prev_tokens=do_condition_on_prev_tokens,
             is_shortform=is_shortform,
+            batch_size=batch_size,
+            attention_mask=attention_mask,
             kwargs=kwargs,
         )

@@ -482,7 +484,7 @@ def whisper_generate(
         else:
             outputs = sequences

-        if generation_config.return_dict_in_generate:
+        if return_dict_in_generate and generation_config.return_dict_in_generate:
             dict_outputs = self._stack_split_outputs(
                 seek_outputs, model_output_type, sequences.device, kwargs
             )
@@ -507,6 +509,7 @@
             if return_token_timestamps:
                 dict_outputs["token_timestamps"] = outputs["token_timestamps"]
             return dict_outputs
+
         return outputs

     return sequences
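
Note (not part of the commit): two things change here. `whisper_generate` now captures the value returned by `self._set_return_outputs` (in transformers 4.45 that helper appears to return the resolved `return_dict_in_generate` flag rather than only storing it on the generation config), and the `language` argument is widened so a batch can mix languages. A hedged sketch of the per-sample language usage, assuming transformers >= 4.45; the model id and dummy features are illustrative only:

import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

# Two dummy 30-second inputs; real features come from processor(audio, sampling_rate=16000, return_tensors="pt")
input_features = torch.zeros(2, model.config.num_mel_bins, 3000)

# A single string applies to the whole batch; a list supplies one language per sample
ids = model.generate(input_features, language=["en", "fr"], task="transcribe")
print(processor.batch_decode(ids, skip_special_tokens=True))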

intel_extension_for_pytorch/transformers/models/reference/models.py

Lines changed: 72 additions & 0 deletions
@@ -5723,6 +5723,78 @@ def prepare_inputs_labels_for_multimodal_llavallama(
     return model_inputs


+def prepare_inputs_for_generation_gptneox(
+    self,
+    input_ids,
+    past_key_values=None,
+    attention_mask=None,
+    inputs_embeds=None,
+    **kwargs,
+):
+    input_shape = input_ids.shape
+    # cut decoder_input_ids if past is used
+    if past_key_values is not None:
+        past_length = past_key_values[0][0].shape[2]
+
+        # Some generation methods already pass only the last input ID
+        if input_ids.shape[1] > past_length:
+            remove_prefix_length = past_length
+        else:
+            # Default to old behavior: keep only final ID
+            remove_prefix_length = input_ids.shape[1] - 1
+
+        input_ids = input_ids[:, remove_prefix_length:]
+
+    position_ids = kwargs.get("position_ids", None)
+    if attention_mask is not None and position_ids is None:
+        # create position_ids on the fly for batch generation
+        position_ids = attention_mask.long().cumsum(-1) - 1
+        position_ids.masked_fill_(attention_mask == 0, 1)
+        if past_key_values:
+            position_ids = position_ids[:, -input_ids.shape[1] :]
+
+    # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+    if attention_mask is None:
+        attention_mask = input_ids.new_ones(input_shape)
+
+    # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+    if inputs_embeds is not None and past_key_values is None:
+        model_inputs = {"inputs_embeds": inputs_embeds}
+    else:
+        model_inputs = {"input_ids": input_ids}
+    model_inputs.update(
+        {
+            "attention_mask": attention_mask,
+            "past_key_values": past_key_values,
+            "position_ids": position_ids,
+            "use_cache": kwargs.get("use_cache"),
+        }
+    )
+
+    return model_inputs
+
+
+def prepare_inputs_for_generation_git(
+    self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
+):
+    # cut decoder_input_ids if past_key_values is used
+    if past_key_values is not None:
+        input_ids = input_ids[:, -1:]
+
+    # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+    input_shape = input_ids.shape
+    if attention_mask is None:
+        attention_mask = input_ids.new_ones(input_shape)
+
+    return {
+        "input_ids": input_ids,
+        "attention_mask": attention_mask,
+        "pixel_values": kwargs.get("pixel_values", None),
+        "past_key_values": past_key_values,
+        "use_cache": use_cache,
+    }
+
+
 def _postprocess_outputs_whisper(
     self,
     seek_outputs,
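
Note (not part of the commit): both helpers follow the usual `prepare_inputs_for_generation` contract: when a KV cache is present, only the not-yet-processed suffix of `input_ids` is fed to the model, and `position_ids` are rebuilt from the attention mask so left-padded batches keep correct positions. A standalone sketch of that trimming logic, with a hypothetical helper name and made-up tensors, for illustration only:

import torch

def trim_for_next_step(input_ids, attention_mask, past_length):
    # keep only the tokens the model has not processed yet
    if input_ids.shape[1] > past_length:
        remove_prefix_length = past_length
    else:
        # fall back to the last token, mirroring the old behavior
        remove_prefix_length = input_ids.shape[1] - 1
    trimmed = input_ids[:, remove_prefix_length:]

    # rebuild position ids from the attention mask so left padding is skipped
    position_ids = attention_mask.long().cumsum(-1) - 1
    position_ids.masked_fill_(attention_mask == 0, 1)
    position_ids = position_ids[:, -trimmed.shape[1]:]
    return trimmed, position_ids

# Batch of 2, first row left-padded; 3 tokens already sit in the KV cache
input_ids = torch.tensor([[0, 11, 22, 33], [44, 55, 66, 77]])
attention_mask = torch.tensor([[0, 1, 1, 1], [1, 1, 1, 1]])
trimmed, position_ids = trim_for_next_step(input_ids, attention_mask, past_length=3)
print(trimmed)        # tensor([[33], [77]])
print(position_ids)   # tensor([[2], [3]])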
