@@ -637,12 +637,14 @@ def __init__(self,
                kernel_initializer=None,
                bias_initializer="zeros",
                activation=None,
+               fp32_activation=False,
                **kwargs):
     super(Dense2DProjection, self).__init__(**kwargs)
     self.output_size = output_size
     self.kernel_initializer = kernel_initializer
     self.bias_initializer = bias_initializer
     self.activation = activation
+    self.fp32_activation = fp32_activation
 
   def build(self, input_shape):
     """Implements build() for the layer."""
@@ -685,6 +687,8 @@ def call(self, inputs):
     ret = tf.einsum("abc,cd->abd", inputs, self.kernel)
     ret += self.bias
     if self.activation is not None:
+      if self.dtype == tf.float16 and self.fp32_activation:
+        ret = tf.cast(ret, tf.float32)
       return self.activation(ret)
     return ret
 
@@ -753,7 +757,7 @@ def build(self, unused_input_shapes):
         kernel_initializer=get_initializer(self.initializer_range),
         activation=self.intermediate_activation,
         # Uses float32 so that gelu activation is done in float32.
-        dtype=tf.float32,
+        fp32_activation=True,
         name="intermediate")
     self.output_dense = Dense2DProjection(
         output_size=self.hidden_size,
@@ -788,23 +792,16 @@ def call(self, inputs):
     attention_output = self.attention_dropout(attention_output)
     # Use float32 in keras layer norm and the gelu activation in the
     # intermediate dense layer for numeric stability
-    # TODO(reedwm): These casts are probably unnecessary, as we passed
-    # dtype=tf.float32 to the layer norm constructor, so it will cast its inputs
-    # to float32 automatically. These manual casts additionally do the "+"
-    # operator in float32, but "+" is numerically stable in float16.
-    if self.float_type == tf.float16:
-      input_tensor = tf.cast(input_tensor, tf.float32)
-      attention_output = tf.cast(attention_output, tf.float32)
     attention_output = self.attention_layer_norm(input_tensor +
                                                  attention_output)
+    if self.float_type == tf.float16:
+      attention_output = tf.cast(attention_output, tf.float16)
     intermediate_output = self.intermediate_dense(attention_output)
     if self.float_type == tf.float16:
       intermediate_output = tf.cast(intermediate_output, tf.float16)
     layer_output = self.output_dense(intermediate_output)
     layer_output = self.output_dropout(layer_output)
     # Use float32 in keras layer norm for numeric stability
-    if self.float_type == tf.float16:
-      layer_output = tf.cast(layer_output, tf.float32)
     layer_output = self.output_layer_norm(layer_output + attention_output)
     if self.float_type == tf.float16:
       layer_output = tf.cast(layer_output, tf.float16)
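
For readers skimming the diff, the following is a minimal sketch of the fp32-activation pattern the change introduces. It is illustrative only: the free-standing function name and signature are invented for this example and are not the repository's Dense2DProjection API. The idea is that the einsum and bias add stay in the layer's compute dtype (float16 under mixed precision), and only the numerically sensitive activation (e.g. gelu) is promoted to float32.

```python
import tensorflow as tf

def dense_2d_projection(inputs, kernel, bias, activation=None,
                        fp32_activation=False):
  """Projects [batch, seq, in_dim] -> [batch, seq, out_dim].

  Optionally runs the activation in float32 when the math is in float16.
  (Hypothetical helper; mirrors the diff above, not the actual layer.)
  """
  ret = tf.einsum("abc,cd->abd", inputs, kernel)
  ret += bias
  if activation is not None:
    if ret.dtype == tf.float16 and fp32_activation:
      # Promote only the activation input to float32 for numeric stability.
      ret = tf.cast(ret, tf.float32)
    return activation(ret)
  return ret
```

This keeps the expensive matmul in float16 while avoiding float16 precision loss inside the activation; the caller (as in the transformer block above) is then responsible for casting the float32 result back to float16 before the next float16 op.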