 from paddle.nn.layer.transformer import _convert_attention_mask
 
 from .. import PretrainedModel, register_base_model
-from ..model_outputs import (BaseModelOutput, SequenceClassifierOutput,
-                             TokenClassifierOutput,
+from ..model_outputs import (BaseModelOutputWithPastAndCrossAttentions,
+                             SequenceClassifierOutput, TokenClassifierOutput,
                              QuestionAnsweringModelOutput,
                              MultipleChoiceModelOutput, MaskedLMOutput,
                              tuple_output)
@@ -153,9 +153,12 @@ def forward(self,
                              src_mask=src_mask,
                              output_attentions=output_attentions)
             else:
+                cache_wrapper = cache[i] if isinstance(
+                    cache[i], nn.MultiHeadAttention.Cache
+                ) else nn.MultiHeadAttention.Cache(*cache[i])
                 output, new_cache = mod(output,
                                         src_mask=src_mask,
-                                        cache=cache[i],
+                                        cache=cache_wrapper,
                                         output_attentions=output_attentions)
                 new_caches.append(new_cache)
             if output_attentions:
@@ -174,14 +177,13 @@ def forward(self,
         if not return_dict:
             if output_attentions or output_hidden_states:
                 output = (output, all_attentions, all_hidden_states)
-
             return output if cache is None else (output, new_caches)
 
-        return BaseModelOutput(
+        return BaseModelOutputWithPastAndCrossAttentions(
             last_hidden_state=output,
             hidden_states=all_hidden_states,
             attentions=all_attentions,
-        )
+            past_key_values=new_caches)
 
 
 class ElectraEmbeddings(nn.Layer):
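Note on the encoder change above: the incoming `cache` may arrive either as `paddle.nn.MultiHeadAttention.Cache` objects or as plain `(key, value)` tuples (the `past_key_values` format documented later), and the per-layer caches are now returned in the new `past_key_values` field. A minimal sketch of that normalization step; the `as_cache` helper and the tensor shapes are illustrative only, not part of the diff:

    # Sketch of the per-layer cache normalization done inside the encoder loop.
    # paddle.nn.MultiHeadAttention.Cache is a namedtuple with fields (k, v), so a
    # plain (k, v) tuple can be wrapped without copying the tensors.
    import paddle
    import paddle.nn as nn

    def as_cache(layer_past):
        """Return layer_past as an nn.MultiHeadAttention.Cache (illustrative helper)."""
        if isinstance(layer_past, nn.MultiHeadAttention.Cache):
            return layer_past
        return nn.MultiHeadAttention.Cache(*layer_past)

    # Hypothetical shapes: [batch_size, num_heads, past_length, head_dim].
    k = paddle.zeros([2, 4, 6, 64])
    v = paddle.zeros([2, 4, 6, 64])
    cache = as_cache((k, v))
    assert cache.k is k and cache.v is v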
@@ -199,11 +201,17 @@ def __init__(self, vocab_size, embedding_size, hidden_dropout_prob,
         self.layer_norm = nn.LayerNorm(embedding_size, epsilon=layer_norm_eps)
         self.dropout = nn.Dropout(hidden_dropout_prob)
 
-    def forward(self, input_ids, token_type_ids=None, position_ids=None):
+    def forward(self,
+                input_ids,
+                token_type_ids=None,
+                position_ids=None,
+                past_key_values_length=None):
         if position_ids is None:
             ones = paddle.ones_like(input_ids, dtype="int64")
             seq_length = paddle.cumsum(ones, axis=-1)
             position_ids = seq_length - ones
+            if past_key_values_length is not None:
+                position_ids += past_key_values_length
             position_ids.stop_gradient = True
         position_ids = position_ids.astype("int64")
 
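When `past_key_values` is supplied, only the new tokens are fed through `ElectraEmbeddings`, so the generated position ids must be shifted by the cached length, which is what the hunk above adds. A small self-contained illustration with made-up values:

    # Position ids for 3 new tokens when 5 positions are already cached.
    import paddle

    input_ids = paddle.to_tensor([[101, 2023, 102]])  # made-up token ids
    past_key_values_length = 5

    ones = paddle.ones_like(input_ids, dtype="int64")
    position_ids = paddle.cumsum(ones, axis=-1) - ones  # [[0, 1, 2]]
    if past_key_values_length is not None:
        position_ids += past_key_values_length          # [[5, 6, 7]]
    print(position_ids.numpy())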
@@ -550,6 +558,8 @@ def forward(self,
                 token_type_ids=None,
                 position_ids=None,
                 attention_mask=None,
+                past_key_values=None,
+                use_cache=None,
                 output_attentions=False,
                 output_hidden_states=False,
                 return_dict=False):
@@ -585,6 +595,17 @@ def forward(self,
                 When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values.
                 It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`.
                 Defaults to `None`, which means nothing needed to be prevented attention to.
+            past_key_values (tuple(tuple(Tensor)), optional):
+                Precomputed key and value hidden states of the attention blocks of each layer. This can be used to speed up
+                auto-regressive decoding for generation tasks, or to support use cases such as Prefix-Tuning where vectors are
+                prepended to each attention layer. The length of the tuple equals the number of layers, and each inner tuple
+                holds 2 tensors of shape `(batch_size, num_heads, past_key_values_length, embed_size_per_head)`.
+                If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that
+                don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+                `input_ids` of shape `(batch_size, sequence_length)`.
+            use_cache (bool, optional):
+                If set to `True`, `past_key_values` key value states are returned.
+                Defaults to `None`.
             output_hidden_states (bool, optional):
                 Whether to return the hidden states of all layers.
                 Defaults to `False`.
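For reference, a hedged usage sketch of the two new arguments, matching the shapes documented above. The checkpoint name, the ELECTRA-small geometry (12 layers, 4 heads, 64 dims per head) and the random prefix tensors are illustrative assumptions; a real Prefix-Tuning setup would learn the prefix, and the updated cache is expected to come back as `past_key_values` on the output when `return_dict=True`, per the encoder return change above.

    # Illustrative only: prepend a random "prefix" cache to every layer and run
    # the model on the new tokens. Assumes the usual ELECTRA-small geometry
    # (12 layers, 4 heads, 64 dims per head).
    import paddle
    from paddlenlp.transformers import ElectraModel, ElectraTokenizer

    tokenizer = ElectraTokenizer.from_pretrained("electra-small")
    model = ElectraModel.from_pretrained("electra-small")
    model.eval()

    prefix_len = 8
    past_key_values = tuple(
        (paddle.randn([1, 4, prefix_len, 64]),
         paddle.randn([1, 4, prefix_len, 64])) for _ in range(12))

    input_ids = paddle.to_tensor([tokenizer("Electra rocks!")["input_ids"]])
    with paddle.no_grad():
        outputs = model(input_ids,
                        past_key_values=past_key_values,
                        use_cache=True,
                        return_dict=True)
    print(outputs.last_hidden_state.shape)  # [1, seq_len, hidden_size]
    print(len(outputs.past_key_values))     # one cache entry per layer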
@@ -613,26 +634,40 @@ def forward(self,
                 output = model(**inputs)
 
         '''
+        past_key_values_length = None
+        if past_key_values is not None:
+            past_key_values_length = past_key_values[0][0].shape[2]
 
         if attention_mask is None:
             attention_mask = paddle.unsqueeze(
                 (input_ids == self.pad_token_id).astype(
                     paddle.get_default_dtype()) * -1e4,
                 axis=[1, 2])
+            if past_key_values is not None:
+                batch_size = past_key_values[0][0].shape[0]
+                past_mask = paddle.zeros(
+                    [batch_size, 1, 1, past_key_values_length],
+                    dtype=attention_mask.dtype)
+                attention_mask = paddle.concat([past_mask, attention_mask],
+                                               axis=-1)
         else:
             if attention_mask.ndim == 2:
                 attention_mask = attention_mask.unsqueeze(axis=[1, 2])
 
-        embedding_output = self.embeddings(input_ids=input_ids,
-                                           position_ids=position_ids,
-                                           token_type_ids=token_type_ids)
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            past_key_values_length=past_key_values_length)
 
         if hasattr(self, "embeddings_project"):
             embedding_output = self.embeddings_project(embedding_output)
 
+        self.encoder._use_cache = use_cache  # To be consistent with HF
         encoder_outputs = self.encoder(
             embedding_output,
             attention_mask,
+            cache=past_key_values,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict)
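The mask handling in the final hunk prepends an all-zero block for the cached positions, so the padding mask built from the (shorter) current `input_ids` still lets every query attend to the past. A standalone sketch with made-up ids and a hypothetical `pad_token_id`:

    # Extend a padding-style additive mask to cover past (cached) positions.
    import paddle

    input_ids = paddle.to_tensor([[2054, 2003, 102]])  # 3 new, made-up tokens
    pad_token_id = 0                                    # hypothetical pad id
    past_key_values_length = 5

    attention_mask = paddle.unsqueeze(
        (input_ids == pad_token_id).astype(paddle.get_default_dtype()) * -1e4,
        axis=[1, 2])                                    # shape [1, 1, 1, 3]
    past_mask = paddle.zeros([1, 1, 1, past_key_values_length],
                             dtype=attention_mask.dtype)
    attention_mask = paddle.concat([past_mask, attention_mask], axis=-1)
    print(attention_mask.shape)                         # [1, 1, 1, 8]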