@@ -79,6 +79,8 @@ class BertEncoderV2(tf_keras.layers.Layer):
       attention scores of all transformer layers. This will be a list of length
       `num_layers`, and each element will be in the shape [batch_size,
       num_attention_heads, seq_dim, seq_dim].
+    return_word_embeddings: If `True`, also return the input word embedding
+      sequence in the BERT inference output.
   """

   def __init__(
@@ -101,6 +103,7 @@ def __init__(
       norm_first: bool = False,
       with_dense_inputs: bool = False,
       return_attention_scores: bool = False,
+      return_word_embeddings: bool = False,
       **kwargs):
     # Pops kwargs that are used in V1 implementation.
     if 'dict_outputs' in kwargs:
@@ -208,6 +211,7 @@ def __init__(
         'norm_first': norm_first,
         'with_dense_inputs': with_dense_inputs,
         'return_attention_scores': return_attention_scores,
+        'return_word_embeddings': return_word_embeddings,
     }
     if with_dense_inputs:
       self.inputs = dict(
@@ -278,6 +282,10 @@ def call(self, inputs):
         encoder_outputs=encoder_outputs)
     if self._config['return_attention_scores']:
       output['attention_scores'] = attention_outputs
+
+    if self._config['return_word_embeddings']:
+      output['word_embeddings'] = embeddings
+
     return output

   def get_embedding_table(self):
@@ -390,6 +398,8 @@ class BertEncoder(tf_keras.Model):
       attention scores of all transformer layers. This will be a list of length
       `num_layers`, and each element will be in the shape [batch_size,
       num_attention_heads, seq_dim, seq_dim].
+    return_word_embeddings: If `True`, also return the input word embedding
+      sequence in the BERT inference output.
   """

   def __init__(
@@ -412,6 +422,7 @@ def __init__(
       dict_outputs=False,
       return_all_encoder_outputs=False,
       return_attention_scores: bool = False,
+      return_word_embeddings: bool = False,
       **kwargs):
     if 'sequence_length' in kwargs:
       kwargs.pop('sequence_length')
@@ -538,6 +549,9 @@ def __init__(
     if return_attention_scores:
       outputs['attention_scores'] = attention_outputs

+    if return_word_embeddings:
+      outputs['word_embeddings'] = embeddings
+
     if dict_outputs:
       super().__init__(
           inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs)
@@ -587,6 +601,7 @@ def __init__(
         'norm_first': norm_first,
         'dict_outputs': dict_outputs,
         'return_attention_scores': return_attention_scores,
+        'return_word_embeddings': return_word_embeddings,
     }
     # pylint: disable=protected-access
     self._setattr_tracking = False
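A minimal usage sketch, not part of the diff above: assuming this change is applied, that the encoder is exported under `official.nlp.modeling.networks`, and using small illustrative hyperparameters, the new flag would surface the embedding tensor in the output dictionary like this.

```python
import numpy as np
from official.nlp.modeling import networks

# Hypothetical small configuration; any valid BertEncoderV2 config would do.
encoder = networks.BertEncoderV2(
    vocab_size=30522,
    num_layers=2,
    hidden_size=128,
    num_attention_heads=2,
    return_word_embeddings=True)

batch_size, seq_len = 2, 16
inputs = dict(
    input_word_ids=np.ones((batch_size, seq_len), dtype=np.int32),
    input_mask=np.ones((batch_size, seq_len), dtype=np.int32),
    input_type_ids=np.zeros((batch_size, seq_len), dtype=np.int32))

outputs = encoder(inputs)
# In addition to 'sequence_output', 'pooled_output', etc., the output dict
# should now carry the embedding-layer output recorded by this change under
# 'word_embeddings'; expected shape: (2, 16, 128).
print(outputs['word_embeddings'].shape)
```

The V1 `BertEncoder` path in the second half of the diff exposes the same key, so the same lookup should work for models built with `dict_outputs=True`.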