
Commit aec3119

Authored by tianxin
add attn_mask input for encoder-decoder (#1431) (#1438)
1 parent f079f91 commit aec3119

2 files changed, +16 -15 lines

examples/semantic_indexing/faster_predict.py

Lines changed: 2 additions & 3 deletions
@@ -98,8 +98,8 @@ def get_pooled_embedding(self,
                              input_ids,
                              token_type_ids=None,
                              position_ids=None):
-        src_mask = (input_ids != self.bos_id
-                    ).astype(self.ptm.encoder.layers[0].norm1.bias.dtype)
+        src_mask = input_ids == self.bos_id
+        src_mask = paddle.cast(src_mask, "float32")
         # [bs, 1, 1, max_len]
         src_mask = paddle.unsqueeze(src_mask, axis=[1, 2])
         src_mask.stop_gradient = True
@@ -116,7 +116,6 @@ def get_pooled_embedding(self,
 
         if self.use_fp16:
             embedding_output = paddle.cast(embedding_output, 'float16')
-            src_mask = paddle.cast(src_mask, 'float16')
 
         sequence_output = self.ptm.encoder(embedding_output, src_mask)
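The caller-side mask construction after this change reduces to the following self-contained sketch. It is a minimal illustration, not code from the repository: the toy input_ids and the bos_id value are assumptions. Positions equal to self.bos_id receive a non-zero value and are therefore dropped by the patched encoder below, while all other positions stay 0.0 and are attended to.

import paddle

bos_id = 0  # assumed id marking positions to mask out (illustrative only)
input_ids = paddle.to_tensor([[5, 7, 9, 0, 0],
                              [3, 4, 0, 0, 0]])  # [batch_size, max_len], toy data

# 0.0 = attend, non-zero = mask out: the convention the patched encoder expects
src_mask = paddle.cast(input_ids == bos_id, "float32")

# reshape to [batch_size, 1, 1, max_len] so it broadcasts over heads and queries
src_mask = paddle.unsqueeze(src_mask, axis=[1, 2])
src_mask.stop_gradient = True
print(src_mask.shape)  # [2, 1, 1, 5]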

paddlenlp/ops/faster_transformer/transformer/encoder.py

Lines changed: 14 additions & 12 deletions
@@ -201,19 +201,13 @@ def encoder_forward(self, src, src_mask=None, cache=None):
         src (Tensor):
             The input of Transformer encoder. It is a tensor
             with shape `[batch_size, sequence_length, d_model]`. The data
-            type should be float32 or float64.
+            type should be float32 or float16.
         src_mask (Tensor, optional):
             A tensor used in multi-head attention to prevents attention to
             some unwanted positions, usually the paddings or the subsequent
             positions. It is a tensor with shape `[batch_size, 1, 1, sequence_length]`.
-            When the data type is bool, the unwanted positions have `False`
-            values and the others have `True` values. When the data type is
-            int, the unwanted positions have 0 values and the others have 1
-            values. When the data type is float, the unwanted positions have
-            `-INF` values and the others have 0 values. It can be None when
-            nothing wanted or needed to be prevented attention to. Defaults
-            to None.
-
+            The data type must be float, the unwanted positions have `-INF` values or other non-zeros
+            and the wanted positions must be 0.0.
     Returns:
         output (Tensor|tuple):
             It is a tensor that has the same shape and data type as `src`,
@@ -225,9 +219,17 @@ def encoder_forward(self, src, src_mask=None, cache=None):
             `paddle.nn.MultiHeadAttention.forward` for more details.
     """
 
-    max_seq_len = src.shape[1]
-    # broadcast
-    src_mask = paddle.concat(x=[src_mask] * max_seq_len, axis=2)
+    if src_mask.dtype == paddle.float16:
+        src_mask = paddle.cast(src_mask, "float32")
+
+    src_mask = src_mask == 0.0
+    src_mask = paddle.cast(src_mask, src.dtype)
+
+    # transpose_src_mask: [batch_size, 1, sequence_length, 1]
+    transpose_src_mask = paddle.transpose(src_mask, perm=[0, 1, 3, 2])
+
+    # src_mask: [batch_size, 1, sequence_length, sequence_length]
+    src_mask = src_mask * transpose_src_mask
     output = src
     for i, layer in enumerate(self.layers):
         output = layer(output, src_mask)
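To see what the new preprocessing in encoder_forward does to that mask, here is a standalone sketch with assumed toy values (variable names are illustrative and the float16-to-float32 cast from the diff is omitted). The [batch_size, 1, 1, seq_len] mask with 0.0 for wanted and non-zero for unwanted positions is turned into a multiplicative [batch_size, 1, seq_len, seq_len] mask; unlike the removed paddle.concat broadcast, the outer product also zeroes the rows of masked query positions.

import paddle

# [batch_size=1, 1, 1, seq_len=5]; 0.0 = attend, non-zero (e.g. 1.0 or -INF) = mask out
src_mask = paddle.to_tensor([[[[0.0, 0.0, 0.0, 1.0, 1.0]]]])
dtype = "float32"  # stands in for src.dtype

keep = paddle.cast(src_mask == 0.0, dtype)          # 1.0 = keep, 0.0 = drop
keep_t = paddle.transpose(keep, perm=[0, 1, 3, 2])  # [batch_size, 1, seq_len, 1]
attn_mask = keep * keep_t                           # [batch_size, 1, seq_len, seq_len]

print(attn_mask.shape)  # [1, 1, 5, 5]
# Column j is zeroed for every masked key j, and row i is zeroed for every
# masked query i; the old concat-based broadcast only covered the key axis.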
