Commit a3ab3a4

Fix transformers v4.42.0 compatibility (#793)
* fix transformers v4.42.0 compatibility
* fix inc modeling
* update setup
* add missing argument
* fix patching
* format
* fix num quant op
* remove incompatible transformers generation
* update setup
1 parent 480eea1 commit a3ab3a4

12 files changed: 92 additions & 78 deletions

.github/workflows/test_openvino.yml

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ jobs:
       fail-fast: false
       matrix:
         python-version: ["3.8", "3.12"]
-        transformers-version: ["4.36.0", "4.41.*"]
+        transformers-version: ["4.36.0", "4.42.*"]
         os: [ubuntu-latest]

     runs-on: ${{ matrix.os }}

optimum/exporters/openvino/model_patcher.py

Lines changed: 23 additions & 14 deletions
@@ -1161,7 +1161,7 @@ def __exit__(self, exc_type, exc_value, traceback):
             block.attention.forward = block.attention._orig_forward


-# Adapted from https://github.com/huggingface/transformers/blob/ccdabc5642bf84849af93f591e207dc625c8e1e1/src/transformers/models/phi3/modeling_phi3.py#L426
+# Adapted from https://github.com/huggingface/transformers/blob/ccdabc5642bf84849af93f591e207dc625c8e1e1/src/transformers/models/phi3/modeling_phi3.py#L729
 def _phi3_self_attn_sdpa_forward(
     self,
     hidden_states: torch.Tensor,
@@ -1170,6 +1170,7 @@ def _phi3_self_attn_sdpa_forward(
     past_key_value: Optional[Tuple[torch.Tensor]] = None,
     output_attentions: bool = False,
     use_cache: bool = False,
+    cache_position: Optional[torch.LongTensor] = None,
 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
     if output_attentions:
         return self._orig_forward(
@@ -1181,10 +1182,9 @@ def _phi3_self_attn_sdpa_forward(
             use_cache=use_cache,
         )

-    # TO DO: remove llama imports when transformers with phi3 support will be released
-    try:
+    if is_transformers_version(">=", "4.41.0"):
         from transformers.models.phi3.modeling_phi3 import apply_rotary_pos_emb, repeat_kv
-    except ImportError:
+    else:
         from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv

     bsz, q_len, _ = hidden_states.size()
@@ -1206,17 +1206,15 @@ def _phi3_self_attn_sdpa_forward(
     query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

     if past_key_value is not None:
-        cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
         key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

     key_states = repeat_kv(key_states, self.num_key_value_groups)
     value_states = repeat_kv(value_states, self.num_key_value_groups)

+    causal_mask = attention_mask
     if attention_mask is not None:
-        if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-            raise ValueError(
-                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-            )
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

     # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
     # Reference: https://github.com/pytorch/pytorch/issues/112577.
@@ -1229,7 +1227,7 @@ def _phi3_self_attn_sdpa_forward(
         query_states,
         key_states,
         value_states,
-        attn_mask=attention_mask,
+        attn_mask=causal_mask,
         dropout_p=self.attention_dropout if self.training else 0.0,
         # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
         is_causal=self.is_causal and attention_mask is None and q_len > 1,
@@ -1561,7 +1559,7 @@ def __exit__(self, exc_type, exc_value, traceback):
             layer.attn._attn = layer.attn._orig_attn


-# adapted from https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/dbrx/modeling_dbrx.py#L763
+# Adapted from https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/dbrx/modeling_dbrx.py#L763
 def _dbrx_experts_forward(
     self, x: torch.Tensor, weights: torch.Tensor, top_weights: torch.Tensor, top_experts: torch.LongTensor
 ):
@@ -1606,7 +1604,7 @@ def _dbrx_experts_forward(
     return out


-# adapted from https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/dbrx/modeling_dbrx.py#L1228
+# Adapted from https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/dbrx/modeling_dbrx.py#L1228
 def _dbrx_update_causal_mask_legacy(
     self, attention_mask: Optional[torch.Tensor], input_tensor: torch.Tensor, cache_position: torch.Tensor
 ) -> Optional[torch.Tensor]:
@@ -1803,6 +1801,7 @@ def __exit__(self, exc_type, exc_value, traceback):
             block.ffn.experts.forward = block.ffn.experts._orig_forward


+# Adapted from https://github.com/huggingface/transformers/blob/v4.41.0/src/transformers/models/persimmon/modeling_persimmon.py#L264
 def _persimmon_self_attn_sdpa_forward(
     self,
     hidden_states: torch.Tensor,
@@ -1811,6 +1810,7 @@ def _persimmon_self_attn_sdpa_forward(
     past_key_value: Optional["Cache"] = None,
     output_attentions: bool = False,
     use_cache: bool = False,
+    cache_position: Optional[torch.LongTensor] = None,
 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
     from transformers.models.persimmon.modeling_persimmon import apply_rotary_pos_emb

@@ -1865,14 +1865,23 @@ def _persimmon_self_attn_sdpa_forward(

     if past_key_value is not None:
         # Specific to RoPE models with partial rotation
-        cache_kwargs = {"sin": sin, "cos": cos, "partial_rotation_size": self.rotary_emb.dim}
+        cache_kwargs = {
+            "sin": sin,
+            "cos": cos,
+            "partial_rotation_size": self.rotary_emb.dim,
+            "cache_position": cache_position,
+        }
         key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

+    causal_mask = attention_mask
+    if attention_mask is not None:  # no matter the length, we just slice it
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+
     attn_output = F.scaled_dot_product_attention(
         query_states,
         key_states,
         value_states,
-        attention_mask,
+        causal_mask,
         scale=1 / math.sqrt(self.head_dim),
         dropout_p=self.attention_dropout.p,
     )
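
The common thread in these patches is the mask handling: instead of validating that attention_mask has shape (bsz, 1, q_len, kv_seq_len), the patched SDPA forwards slice the prepared mask to the current key length and pass the result (together with the new cache_position argument) through. A minimal sketch of that slicing with dummy tensors; everything below is illustrative and not taken from the patcher itself:

import torch
import torch.nn.functional as F

# Illustrative only: recent transformers releases can hand the SDPA path a prepared
# 4D mask that is longer than the cached key/value length, so the patched forwards
# slice it instead of asserting on its exact size.
bsz, num_heads, q_len, kv_len, head_dim = 1, 2, 1, 5, 8
query_states = torch.randn(bsz, num_heads, q_len, head_dim)
key_states = torch.randn(bsz, num_heads, kv_len, head_dim)
value_states = torch.randn(bsz, num_heads, kv_len, head_dim)

attention_mask = torch.zeros(bsz, 1, q_len, kv_len + 3)  # deliberately longer than kv_len

causal_mask = attention_mask
if attention_mask is not None:
    causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

attn_output = F.scaled_dot_product_attention(
    query_states, key_states, value_states, attn_mask=causal_mask, is_causal=False
)
print(attn_output.shape)  # torch.Size([1, 2, 1, 8])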

optimum/intel/generation/modeling.py

Lines changed: 1 addition & 0 deletions
@@ -90,6 +90,7 @@ class BaseModelForCausalLM(OptimizedModel, GenerationMixin):
     export_feature = "text-generation"
     main_input_name = "input_ids"
     base_model_prefix = "torch_script_model"
+    _supports_cache_class = False

     def __init__(
         self,
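
The _supports_cache_class = False flag added here (and to the IPEX, INC and OpenVINO base models below) signals to the transformers generation utilities that these wrappers still consume past_key_values in the legacy tuple format rather than the newer Cache objects that recent releases prefer during generation. A rough sketch of the two formats, assuming the DynamicCache API available in recent transformers:

# Rough sketch, assuming transformers >= 4.36 where DynamicCache and to_legacy_cache() exist.
import torch
from transformers import DynamicCache

cache = DynamicCache()
# Fill one layer with dummy key/value states (batch=1, heads=2, seq=3, head_dim=4).
cache.update(torch.randn(1, 2, 3, 4), torch.randn(1, 2, 3, 4), layer_idx=0)

# Wrappers declaring _supports_cache_class = False keep receiving the legacy tuple format:
legacy_past_key_values = cache.to_legacy_cache()
print(type(legacy_past_key_values), len(legacy_past_key_values))  # <class 'tuple'> 1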

optimum/intel/ipex/modeling_base.py

Lines changed: 1 addition & 0 deletions
@@ -127,6 +127,7 @@ class IPEXModel(OptimizedModel):
     base_model_prefix = "ipex_model"
     main_input_name = "input_ids"
     output_name = "last_hidden_state"
+    _supports_cache_class = False

     def __init__(
         self,

optimum/intel/neural_compressor/modeling_base.py

Lines changed: 1 addition & 0 deletions
@@ -71,6 +71,7 @@ class INCModel(OptimizedModel):
     auto_model_class = AutoModel
     export_feature = "feature-extraction"
     base_model_prefix = "inc_model"
+    _supports_cache_class = False

     def __init__(
         self,

optimum/intel/neural_compressor/trainer.py

Lines changed: 2 additions & 0 deletions
@@ -682,6 +682,7 @@ def _inner_training_loop(
     def save_model(
         self,
         output_dir: Optional[str] = None,
+        _internal_call: bool = False,
         save_onnx_model: bool = False,
     ):
         """
@@ -696,6 +697,7 @@ def save_model(
             output_dir=output_dir,
             save_onnx_model=save_onnx_model,
         )
+        # TODO: push to hub if self.args.push_to_hub and not _internal_call

     def _save(
         self,
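
The new _internal_call parameter keeps this save_model override signature-compatible with transformers.Trainer, which forwards _internal_call=True when it saves checkpoints from inside the training loop. A hypothetical, self-contained sketch of the mismatch this avoids (the class names below are illustrative, not the actual trainer):

from typing import Optional


class BaseTrainer:
    """Stands in for transformers.Trainer, which calls save_model with _internal_call=True."""

    def _save_checkpoint(self, output_dir: str):
        self.save_model(output_dir, _internal_call=True)

    def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
        print(f"saving to {output_dir} (internal call: {_internal_call})")


class PatchedTrainer(BaseTrainer):
    # Accepting _internal_call (even if unused) prevents a TypeError when the base
    # training loop forwards the flag to this override.
    def save_model(
        self, output_dir: Optional[str] = None, _internal_call: bool = False, save_onnx_model: bool = False
    ):
        super().save_model(output_dir, _internal_call)


PatchedTrainer()._save_checkpoint("/tmp/out")  # saving to /tmp/out (internal call: True)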

optimum/intel/openvino/modeling_base.py

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@
 class OVBaseModel(OptimizedModel):
     auto_model_class = None
     export_feature = None
+    _supports_cache_class = False

     def __init__(
         self,

setup.py

Lines changed: 3 additions & 2 deletions
@@ -28,8 +28,9 @@

 INSTALL_REQUIRE = [
     "torch>=1.11",
-    "transformers>=4.36.0,<4.42.0",
-    "optimum~=1.20",
+    "transformers>=4.36.0,<4.43.0",
+    "optimum~=1.21",
+    # "optimum>=1.21.2,<1.22.0",
     "datasets>=1.4.0",
     "sentencepiece",
     "setuptools",

tests/openvino/test_modeling.py

Lines changed: 0 additions & 1 deletion
@@ -697,7 +697,6 @@ def test_compare_to_transformers(self, model_arch):
         ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, **model_kwargs)
         self.assertIsInstance(ov_model.config, PretrainedConfig)
         self.assertTrue(ov_model.use_cache)
-        self.assertEqual(ov_model.stateful, ov_model.config.model_type not in not_stateful)
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
         tokens = tokenizer("This is a sample output", return_tensors="pt")
         tokens.pop("token_type_ids", None)

tests/openvino/test_quantization.py

Lines changed: 2 additions & 5 deletions
@@ -75,8 +75,8 @@

 class OVQuantizerTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES_TORCH_MODEL = (
-        (OVModelForSequenceClassification, "bert", 22, 35),
-        (OVModelForCausalLM, "gpt2", 41, 3),
+        (OVModelForSequenceClassification, "bert", 32 if is_transformers_version("<", "4.41.0") else 22, 35),
+        (OVModelForCausalLM, "gpt2", 41 if is_transformers_version("<", "4.42.0") else 21, 3),
     )
     SUPPORTED_ARCHITECTURES_OV_MODEL = (
         (OVModelForSequenceClassification, "bert", 32, 35),
@@ -90,9 +90,6 @@ def test_automodel_static_quantization(self, model_cls, model_name, expected_fak
         dataset_name, dataset_config_name, column_name = _TASK_TO_DATASET[task]
         file_name = "openvino_quantized_model.xml"

-        if model_name == "bert" and is_transformers_version("<", "4.41.0"):
-            expected_fake_quantize = 32
-
         def preprocess_function(examples, tokenizer):
             return tokenizer(examples[column_name], padding="max_length", max_length=128, truncation=True)

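
The expected number of fake-quantize nodes now depends on the installed transformers version, since the traced graphs changed between releases, so the condition moves into the parameterized tuples instead of being patched inside the test body. A short sketch of the gating pattern, assuming is_transformers_version is importable from optimum.intel.utils.import_utils as in the test module:

# Sketch only: pick the expected operator count based on the installed transformers version,
# the same pattern the parameterized test tuples above rely on.
from optimum.intel.utils.import_utils import is_transformers_version

expected_fake_quantize = 41 if is_transformers_version("<", "4.42.0") else 21
print(f"expecting {expected_fake_quantize} fake-quantize ops for gpt2")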
