Commit 3a6cec3

calpt and lenglaender authored
Upgrade Transformers to v4.45.x (#751)
Changes:
- add SDPA attention to Roberta, Albert, MBart, XLM-R
- re-copy GPT-J

Co-authored-by: Leon Engländer <[email protected]>
1 parent bcace97 commit 3a6cec3
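Since this upgrade adds SDPA support to several adapters model classes, they can now run on the scaled-dot-product-attention backend that Transformers selects via `attn_implementation`. As a rough, hedged illustration only (not part of this commit; the checkpoint and adapter names are placeholders), loading a model with an explicit attention implementation and attaching an adapter might look like this:

# Illustrative sketch; assumes the public adapters/transformers APIs, names are placeholders.
import adapters
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "roberta-base",
    attn_implementation="sdpa",  # or "eager" to force the original attention path
)
adapters.init(model)  # enable adapter support on the plain Transformers model
model.add_adapter("example_adapter")  # hypothetical adapter name
model.set_active_adapters("example_adapter")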

File tree

7 files changed (+610, -21 lines)


hf_transformers

Submodule hf_transformers updated 972 files

setup.py

Lines changed: 5 additions & 4 deletions
@@ -21,9 +21,10 @@
 # We try to follow their general layout wherever sensible.
 
 _deps = [
-    "accelerate>=0.21.0",
+    "accelerate>=0.26.0",
     "beautifulsoup4",
     "black~=24.4.0",
+    "dataclasses",
     "datasets!=2.5.0",
     "dill<0.3.5",
     "docutils==0.16.0",
@@ -38,7 +39,7 @@
     "protobuf",
     "psutil",
     "pytest>=7.2.0,<8.0.0",
-    "pytest-subtests",
+    "pytest-rich",
     "pytest-timeout",
     "pytest-xdist",
     "markupsafe==2.0.1",
@@ -58,7 +59,7 @@
     "sphinx-multiversion==0.2.4",
     "timeout-decorator",
     "torch",
-    "transformers~=4.44.0",
+    "transformers~=4.45.2",
 ]
 
 
@@ -84,7 +85,7 @@ def deps_list(*pkgs):
 extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
 extras["testing"] = deps_list(
     "pytest",
-    "pytest-subtests",
+    "pytest-rich",
     "pytest-xdist",
     "timeout-decorator",
     "parameterized",

src/adapters/models/albert/modeling_albert.py

Lines changed: 73 additions & 1 deletion
@@ -20,14 +20,18 @@
 import torch
 from torch import nn
 
-from transformers.models.albert.modeling_albert import AlbertAttention, AlbertLayer
+from transformers.models.albert.modeling_albert import AlbertAttention, AlbertLayer, AlbertSdpaAttention
 from transformers.pytorch_utils import apply_chunking_to_forward
+from transformers.utils import logging
 
 from ...composition import adjust_tensors_for_parallel, match_attn_matrices_for_parallel
 from ...utils import prefix_attention_mask
 from .mixin_albert import AlbertAttentionAdaptersMixin, AlbertEncoderLayerAdaptersMixin
 
 
+logger = logging.get_logger(__name__)
+
+
 class AlbertAttentionWithAdapters(AlbertAttentionAdaptersMixin, AlbertAttention):
     def forward(
         self,
@@ -101,6 +105,74 @@ def forward(
         return (layernormed_context_layer, attention_probs) if output_attentions else (layernormed_context_layer,)
 
 
+class AlbertSdpaAttentionWithAdapters(AlbertAttentionAdaptersMixin, AlbertSdpaAttention):
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: bool = False,
+    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
+        # >>> START AH Changes <<<
+        attention_mask = prefix_attention_mask(attention_mask, [2, 3])  # type: ignore
+        # >>> END AH Changes <<<
+
+        if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None:
+            logger.warning(
+                "AlbertSdpaAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support "
+                "non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to "
+                "the eager attention implementation, but specifying the eager implementation will be required from "
+                "Transformers version v5.0.0 onwards. This warning can be removed using the argument "
+                '`attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(hidden_states, attention_mask, head_mask, output_attentions)
+
+        batch_size, seq_len, _ = hidden_states.size()
+        query_layer = self.transpose_for_scores(self.query(hidden_states))
+        key_layer = self.transpose_for_scores(self.key(hidden_states))
+        value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+        # >>> START AH Changes <<<
+        query_layer, key_layer, value_layer = match_attn_matrices_for_parallel(query_layer, key_layer, value_layer)
+        (attention_mask,) = adjust_tensors_for_parallel(query_layer, attention_mask)
+
+        key_layer, value_layer, attention_mask = self.prefix_tuning(
+            key_layer, value_layer, hidden_states, attention_mask
+        )
+        (query_layer,) = adjust_tensors_for_parallel(key_layer, query_layer)
+        batch_size = query_layer.size(0)
+        # >>> END AH Changes <<<
+
+        # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom
+        # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0.
+        # Reference: https://github.com/pytorch/pytorch/issues/112577
+        if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None:
+            query_layer = query_layer.contiguous()
+            key_layer = key_layer.contiguous()
+            value_layer = value_layer.contiguous()
+
+        attention_output = torch.nn.functional.scaled_dot_product_attention(
+            query=query_layer,
+            key=key_layer,
+            value=value_layer,
+            attn_mask=attention_mask,
+            dropout_p=self.dropout_prob if self.training else 0.0,
+            is_causal=False,
+        )
+
+        attention_output = attention_output.transpose(1, 2)
+        attention_output = attention_output.reshape(batch_size, seq_len, self.all_head_size)
+
+        projected_context_layer = self.dense(attention_output)
+        projected_context_layer_dropout = self.output_dropout(projected_context_layer)
+
+        layernormed_context_layer = self.attention_adapters(
+            hidden_states, projected_context_layer_dropout, self.LayerNorm
+        )
+
+        return (layernormed_context_layer,)
+
+
 class AlbertLayerWithAdapters(AlbertEncoderLayerAdaptersMixin, AlbertLayer):
     def forward(
         self,
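A side note on the SDPA path added above: with zero dropout and no head mask, `torch.nn.functional.scaled_dot_product_attention` is numerically equivalent to the eager softmax(Q K^T / sqrt(d)) V computation used by the fallback branch. A small standalone sanity check (not part of the commit; shapes are illustrative):

# Verify that SDPA matches eager attention for an additive mask and no dropout.
import math
import torch
import torch.nn.functional as F

q = torch.randn(2, 12, 7, 64)   # (batch, heads, seq_len, head_dim)
k = torch.randn(2, 12, 7, 64)
v = torch.randn(2, 12, 7, 64)
mask = torch.zeros(2, 1, 7, 7)  # additive mask: 0 = attend, large negative = masked

sdpa_out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False)

scores = q @ k.transpose(-1, -2) / math.sqrt(q.size(-1)) + mask
eager_out = torch.softmax(scores, dim=-1) @ v

assert torch.allclose(sdpa_out, eager_out, atol=1e-5)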

src/adapters/models/gptj/modeling_gptj.py

Lines changed: 15 additions & 13 deletions
@@ -19,6 +19,7 @@
 import torch
 import torch.utils.checkpoint
 
+from transformers.cache_utils import Cache
 from transformers.models.gptj.modeling_gptj import GPTJAttention, GPTJBlock, apply_rotary_pos_emb, get_embed_positions
 from transformers.utils.import_utils import is_torch_fx_proxy
 
@@ -30,12 +31,13 @@ class GPTJAttentionWithAdapters(GPTJAttentionAdaptersMixin, GPTJAttention):
     def forward(
         self,
         hidden_states: torch.FloatTensor,
-        layer_past: Optional[Tuple[torch.Tensor]] = None,
+        layer_past: Optional[Cache] = None,
         attention_mask: Optional[torch.FloatTensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         head_mask: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = False,
         output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
     ) -> Union[
         Tuple[torch.Tensor, Tuple[torch.Tensor]],
         Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]],
@@ -82,15 +84,13 @@ def forward(
         query = query.permute(0, 2, 1, 3)
 
         if layer_past is not None:
-            past_key = layer_past[0]
-            past_value = layer_past[1]
-            key = torch.cat((past_key, key), dim=-2)
-            value = torch.cat((past_value, value), dim=-2)
-
-        if use_cache is True:
-            present = (key, value)
-        else:
-            present = None
+            cache_kwargs = {
+                "sin": sin,
+                "cos": cos,
+                "partial_rotation_size": self.rotary_dim,
+                "cache_position": cache_position,
+            }
+            key, value = layer_past.update(key, value, self.layer_idx, cache_kwargs)
 
         key, value, attention_mask = self.prefix_tuning(key, value, hidden_states, attention_mask)
         (query,) = adjust_tensors_for_parallel(key, query)
@@ -102,7 +102,7 @@ def forward(
         attn_output = self.out_proj(attn_output)
         attn_output = self.resid_dropout(attn_output)
 
-        outputs = (attn_output, present)
+        outputs = (attn_output, layer_past)
         if output_attentions:
             outputs += (attn_weights,)
 
@@ -113,24 +113,26 @@ class GPTJBlockWithAdapters(GPTJDecoderBlockAdaptersMixin, GPTJBlock):
     def forward(
         self,
         hidden_states: Optional[torch.FloatTensor],
-        layer_past: Optional[Tuple[torch.Tensor]] = None,
+        layer_past: Optional[Cache] = None,
         attention_mask: Optional[torch.FloatTensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         head_mask: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = False,
         output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
     ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
         adjust_tensors_for_parallel_(hidden_states, attention_mask)
         residual = hidden_states
         hidden_states = self.ln_1(hidden_states)
         attn_outputs = self.attn(
-            hidden_states,
+            hidden_states=hidden_states,
             layer_past=layer_past,
             attention_mask=attention_mask,
             position_ids=position_ids,
             head_mask=head_mask,
             use_cache=use_cache,
             output_attentions=output_attentions,
+            cache_position=cache_position,
         )
         attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
         outputs = attn_outputs[1:]
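The `layer_past` rewrite above follows the Transformers `Cache` API that replaces the legacy `(past_key, past_value)` tuples: each attention layer hands its new key/value states to `update()` and receives the full cached history back. A rough usage sketch (not from this diff; assumes transformers>=4.45, shapes are illustrative):

# Minimal Cache API sketch using DynamicCache from transformers.cache_utils.
import torch
from transformers.cache_utils import DynamicCache

cache = DynamicCache()
layer_idx = 0

# Prefill step: the cache stores the key/value states for this layer and returns them.
key = torch.randn(1, 16, 4, 64)    # (batch, heads, seq_len, head_dim)
value = torch.randn(1, 16, 4, 64)
key, value = cache.update(key, value, layer_idx)

# Decode step: update() appends along the sequence dimension and returns the full
# history that the attention layer then consumes (4 + 1 = 5 cached positions).
new_key = torch.randn(1, 16, 1, 64)
new_value = torch.randn(1, 16, 1, 64)
key, value = cache.update(new_key, new_value, layer_idx)
print(key.shape)  # torch.Size([1, 16, 5, 64])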

0 commit comments
