 import torch
 from torch import nn
 
-from transformers.models.distilbert.modeling_distilbert import MultiHeadSelfAttention, TransformerBlock
+from transformers.models.distilbert.modeling_distilbert import (
+    DistilBertFlashAttention2,
+    DistilBertSdpaAttention,
+    MultiHeadSelfAttention,
+    TransformerBlock,
+)
+from transformers.utils import is_flash_attn_2_available, logging
 
 from ...composition import adjust_tensors_for_parallel, adjust_tensors_for_parallel_, match_attn_matrices_for_parallel
 from ...utils import prefix_attention_mask
 from .mixin_distilbert import DistilBertMultiHeadSelfAttentionMixin, DistilBertTransfomerBlockAdaptersMixin
 
 
+if is_flash_attn_2_available():
+    from transformers.modeling_flash_attention_utils import _flash_attention_forward
+
+
+logger = logging.get_logger(__name__)
+
+
 class MultiHeadSelfAttentionWithAdapters(DistilBertMultiHeadSelfAttentionMixin, MultiHeadSelfAttention):
     def forward(
         self,
@@ -66,18 +79,20 @@ def shape(x: torch.Tensor) -> torch.Tensor:
 
         def unshape(x: torch.Tensor) -> torch.Tensor:
             """group heads"""
-            return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
+            return x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.n_heads * dim_per_head)
 
         q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
         k = shape(self.k_lin(key))  # (bs, n_heads, k_length, dim_per_head)
         v = shape(self.v_lin(value))  # (bs, n_heads, k_length, dim_per_head)
 
+        # >>> START AH Changes <<<
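+        # Adapter support: match q/k/v (and the attention mask) across parallel composition branches,
+        # then let prefix tuning prepend prefix states to k/v, which extends the key length and the mask.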
         q, k, v = match_attn_matrices_for_parallel(q, k, v)
         (mask,) = adjust_tensors_for_parallel(q, mask)
 
         k, v, mask = self.prefix_tuning(k, v, value, mask, invert_mask=False)
         bs = k.size(0)  # reset for Parallel block
         (q,) = adjust_tensors_for_parallel(k, q)
+        # >>> END AH Changes <<<
 
         mask_reshp = (bs, 1, 1, k.size(2))
 
@@ -105,6 +120,172 @@ def unshape(x: torch.Tensor) -> torch.Tensor:
             return (context,)
 
 
+class DistilBertSdpaAttentionWithAdapters(DistilBertMultiHeadSelfAttentionMixin, DistilBertSdpaAttention):
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        mask: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, ...]:
+        """
+        Parameters:
+            query: torch.tensor(bs, seq_length, dim)
+            key: torch.tensor(bs, seq_length, dim)
+            value: torch.tensor(bs, seq_length, dim)
+            mask: torch.tensor(bs, seq_length)
+
+        Returns:
+            weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights
+            context: torch.tensor(bs, seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True`
+        """
+        if output_attentions or head_mask is not None:
+            logger.warning_once(
+                "DistilBertSdpaAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support"
+                " `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying"
+                " the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be"
+                ' removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                query,
+                key,
+                value,
+                mask,
+                head_mask,
+                output_attentions,
+            )
+
+        batch_size, _, _ = query.size()
+        dim_per_head = self.dim // self.n_heads
+
+        def shape(x: torch.Tensor) -> torch.Tensor:
+            """separate heads"""
+            # keep first dim due to parallel composition
+            return x.view(x.shape[0], -1, self.n_heads, dim_per_head).transpose(1, 2)
+
+        def unshape(x: torch.Tensor) -> torch.Tensor:
+            """group heads"""
+            return x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.n_heads * dim_per_head)
+
+        q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
+        k = shape(self.k_lin(key))  # (bs, n_heads, k_length, dim_per_head)
+        v = shape(self.v_lin(value))  # (bs, n_heads, k_length, dim_per_head)
+
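+        # Same adapter hooks as in the eager implementation: batch-align q/k/v for parallel
+        # composition and insert prefix-tuning key/value states before calling SDPA.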
+        # >>> START AH Changes <<<
+        q, k, v = match_attn_matrices_for_parallel(q, k, v)
+        (mask,) = adjust_tensors_for_parallel(q, mask)
+
+        k, v, mask = self.prefix_tuning(k, v, value, mask, invert_mask=False)
+        (q,) = adjust_tensors_for_parallel(k, q)
+        # >>> END AH Changes <<<
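+        # Note: unlike the eager path, no `bs` reset is needed here because `unshape`
+        # reads the batch size from the tensor itself.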
+
+        # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom
+        # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0.
+        # Reference: https://github.com/pytorch/pytorch/issues/112577
+        if self.require_contiguous_qkv and q.device.type == "cuda" and mask is not None:
+            q = q.contiguous()
+            k = k.contiguous()
+            v = v.contiguous()
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            q,
+            k,
+            v,
+            attn_mask=mask,
+            dropout_p=self.dropout_prob if self.training else 0.0,
+            is_causal=False,
+        )
+
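+        # Merge heads: (bs, n_heads, q_length, dim_per_head) -> (bs, q_length, dim) before the output projection.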
+        attn_output = unshape(attn_output)
+        attn_output = self.out_lin(attn_output)
+
+        return (attn_output,)
+
+
+class DistilBertFlashAttention2WithAdapters(DistilBertMultiHeadSelfAttentionMixin, DistilBertFlashAttention2):
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        mask: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, ...]:
+        """
+        Parameters:
+            query: torch.tensor(bs, seq_length, dim)
+            key: torch.tensor(bs, seq_length, dim)
+            value: torch.tensor(bs, seq_length, dim)
+            mask: torch.tensor(bs, seq_length)
+
+        Returns:
+            weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights
+            context: torch.tensor(bs, seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True`
+        """
+        batch_size, q_length, dim = query.size()
+
+        dim_per_head = self.dim // self.n_heads
+
+        def reshape(x: torch.Tensor) -> torch.Tensor:
+            """separate heads"""
+            return x.view(x.shape[0], -1, self.n_heads, dim_per_head)
+
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x n_heads x head_dim
+        query_states = reshape(self.q_lin(query))
+        key_states = reshape(self.k_lin(key))
+        value_states = reshape(self.v_lin(value))
+
+        attn_dropout = self.config.attention_dropout if self.training else 0.0
+
+        # In PEFT, the layer norms are usually cast to float32 for training stability, so the input
+        # hidden states may be silently upcast to float32. We therefore cast them back to the
+        # expected dtype just to be sure everything works as intended. This might slow down training
+        # and inference, so it is recommended not to cast the LayerNorms to fp32.
+        # (LlamaRMSNorm handles it correctly.)
+
+        if query_states.dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_lin.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
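+        # `_flash_attention_forward` expects q/k/v in (bs, seq_length, n_heads, head_dim) layout and
+        # uses the 2D padding mask to un-pad the sequences before the kernel call.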
+        attn_weights = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            mask,
+            q_length,
+            dropout=attn_dropout,
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+            is_causal=self.is_causal,
+        )
+
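+        # Flash attention returns (bs, q_length, n_heads, head_dim); flatten the head dimensions
+        # before applying the output projection.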
+        attn_weights_reshaped = attn_weights.reshape(batch_size, q_length, self.n_heads * dim_per_head)
+        attn_output = self.out_lin(attn_weights_reshaped)
+
+        if output_attentions:
+            return (attn_output, attn_weights)
+        else:
+            return (attn_output,)
+
+
 class TransformerBlockWithAdapters(DistilBertTransfomerBlockAdaptersMixin, TransformerBlock):
     def forward(
         self,
@@ -123,7 +304,7 @@ def forward(
             torch.tensor(bs, seq_length, dim) The output of the transformer block contextualization.
         """
         adjust_tensors_for_parallel_(x, attn_mask)
-        attn_mask = prefix_attention_mask(attn_mask, dim=1, prefix_value=1)  # type: ignore
+        attn_mask = prefix_attention_mask(attn_mask, dim=[2, 3], prefix_value=1)  # type: ignore
 
         # Self-Attention
         sa_output = self.attention(