This repository was archived by the owner on Jan 21, 2025. It is now read-only.

Commit 8aa9b39

nshazeer authored and Copybara-Service committed
Mesh-TensorFlow:
- Remove redefined-builtins from ops.py and move them to __init__.py
- Add a few potentially useful operations: mtf.sign, mtf.abs, mtf.layers.sigmoid_cross_entropy_with_logits

mtf Transformer implementation:
- Remove logit-jittering and replace it with "z_loss", which seems to work better.
- Hard-code the broadcast dimensions for the dropout layers.

PiperOrigin-RevId: 224581601
1 parent 8b721b0 commit 8aa9b39

9 files changed, +134 −50 lines changed

mesh_tensorflow/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -26,7 +26,8 @@
 from mesh_tensorflow import simd_mesh_impl
 from mesh_tensorflow import tpu_variables
 from mesh_tensorflow import utils
-from mesh_tensorflow.ops import *  # pylint: disable=wildcard-import
+from mesh_tensorflow.ops_with_redefined_builtins import *  # pylint: disable=wildcard-import
+
 
 # TODO(trandustin): Seal module.
 # from tensorflow.python.util.all_util import remove_undocumented  # pylint: disable=line-too-long

mesh_tensorflow/layers.py

Lines changed: 38 additions & 7 deletions
@@ -19,7 +19,7 @@
 from __future__ import division
 from __future__ import print_function
 
-from mesh_tensorflow import ops as mtf
+from mesh_tensorflow import ops_with_redefined_builtins as mtf
 
 import tensorflow as tf
 
@@ -51,9 +51,6 @@ def dense(x, output_dim, reduced_dims=None, expert_dims=None,
   """
   if variable_dtype is None:
     variable_dtype = mtf.VariableDType(master_dtype, slice_dtype, x.dtype)
-  if variable_dtype.activation_dtype != x.dtype:
-    raise ValueError("variable_dtype.activation_dtype must match x.dtype "
-                     "variable_dtype=%s x=%s" % (variable_dtype, x))
   if expert_dims is None:
     expert_dims = []
   if reduced_dims is None:
@@ -70,6 +67,7 @@ def dense(x, output_dim, reduced_dims=None, expert_dims=None,
       w_shape,
       initializer=tf.random_normal_initializer(stddev=stddev),
       dtype=variable_dtype)
+  w = mtf.cast(w, x.dtype)
   y = mtf.einsum([x, w], output_shape)
   if use_bias:
     b = mtf.get_variable(
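The dense() hunks above replace a hard dtype check with a cast: the weight can now be stored in one dtype (say float32) and cast to the activation dtype before the einsum. A minimal sketch of that pattern in plain TF1-style TensorFlow (illustrative only, not part of the commit; shapes are hypothetical):

```python
import tensorflow as tf

# Keep the master weight in float32 for stable updates, but cast it to the
# activation dtype (here bfloat16) before the matmul, as dense() now does.
x = tf.cast(tf.random_normal([8, 512]), tf.bfloat16)     # activations
w = tf.get_variable("w", [512, 1024], dtype=tf.float32)  # master weight
y = tf.matmul(x, tf.cast(w, x.dtype))                    # compute in bfloat16
```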
@@ -186,13 +184,20 @@ def batch_norm(x, is_training, momentum, epsilon=1e-9,
   return (norm_x * scale) + bias
 
 
-def softmax_cross_entropy_with_logits(logits, targets, vocab_dim):
+def softmax_cross_entropy_with_logits(logits, targets, vocab_dim, z_loss=0.0):
   """Per-example softmax loss.
 
+  if z_loss is nonzero, we add a loss equal to z_loss*log(z)^2, where z is the
+  partition function.  Example value: z_loss=1e-4.  Two uses of z_loss are:
+  - To keep the logits from drifting too far from zero, which can cause
+    unacceptable roundoff errors in bfloat16.
+  - To encourage the logits to be normalized log-probabilities.
+
   Args:
     logits: a mtf.Tensor whose shape contains vocab_dim
     targets: a mtf.Tensor with the same shape as logits
     vocab_dim: a mtf.Dimension
+    z_loss: a float
 
   Returns:
     a mtf.Tensor whose shape is equal to logits.shape - vocab_dim
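The docstring change above defines the extra term; the hunk below implements it. As an illustrative NumPy sketch (not part of the commit) of the full computation for a single example:

```python
import numpy as np

def softmax_xent_with_z_loss(logits, targets, z_loss=1e-4):
  # log(z) is the log partition function (logsumexp of the logits).
  log_z = np.log(np.sum(np.exp(logits)))
  log_softmax = logits - log_z             # normalized log-probabilities
  loss = -np.sum(log_softmax * targets)    # ordinary cross-entropy
  return loss + z_loss * log_z ** 2        # penalty pulls log(z) toward 0

logits = np.array([2.0, -1.0, 0.5])
targets = np.array([1.0, 0.0, 0.0])        # one-hot target
print(softmax_xent_with_z_loss(logits, targets))
```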
@@ -206,9 +211,35 @@ def softmax_cross_entropy_with_logits(logits, targets, vocab_dim):
         "logits=%s targets=%s" % (logits.to_string, targets.to_string))
   if vocab_dim not in logits.shape.dims:
     raise ValueError("vocab_dim must be in logits.shape.dims")
-  log_softmax = mtf.log_softmax(logits, vocab_dim)
-  return mtf.negative(
+  log_z = mtf.reduce_logsumexp(logits, vocab_dim)
+  log_softmax = logits - log_z
+  loss = mtf.negative(
       mtf.reduce_sum(log_softmax * targets, reduced_dim=vocab_dim))
+  if z_loss != 0:
+    loss += z_loss * mtf.square(log_z)
+  return loss
+
+
+def sigmoid_cross_entropy_with_logits(logits, targets):
+  """Sigmoid cross-entropy loss.
+
+  Args:
+    logits: a mtf.Tensor
+    targets: a mtf.Tensor with the same shape as logits
+
+  Returns:
+    a mtf.Tensor whose shape is equal to logits.shape
+
+  Raises:
+    ValueError: if the shapes do not match.
+  """
+  if logits.shape != targets.shape:
+    raise ValueError(
+        "logits shape must equal targets shape"
+        "logits=%s targets=%s" % (logits.to_string, targets.to_string))
+  x = logits
+  z = targets
+  return mtf.relu(x) - x * z + mtf.log(1 + mtf.exp(-mtf.abs(x)))
 
 
 def weights_nonzero(targets, dtype=tf.float32):
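The returned expression is the standard numerically stable form of sigmoid cross-entropy (the same one tf.nn.sigmoid_cross_entropy_with_logits uses): for logits x and targets z, max(x, 0) - x*z + log(1 + exp(-|x|)). A quick NumPy check (illustrative only, not part of the commit) that it matches the naive definition for moderate x:

```python
import numpy as np

def stable_form(x, z):
  # max(x, 0) - x*z + log1p(exp(-|x|)), as in the mtf code above
  return np.maximum(x, 0.0) - x * z + np.log1p(np.exp(-np.abs(x)))

def naive_form(x, z):
  s = 1.0 / (1.0 + np.exp(-x))               # sigmoid
  return -z * np.log(s) - (1.0 - z) * np.log(1.0 - s)

x = np.array([-3.0, -0.5, 0.0, 2.0])
z = np.array([0.0, 1.0, 0.5, 1.0])
print(np.allclose(stable_form(x, z), naive_form(x, z)))  # True
```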

mesh_tensorflow/ops.py

Lines changed: 36 additions & 12 deletions
@@ -1420,6 +1420,7 @@ def _square_grad(op, dy):
       output_dtype: a dtype
       splittable_dims: a list of Dimensions which are ok to split
       grad_function: an optional python function. Default to using tf.gradients
+        pass in the number 0 to indicate no gradient
       name: an optional string
     """
     super(SlicewiseOperation, self).__init__(inputs, name=name or "slicewise")
@@ -1428,6 +1429,12 @@ def _square_grad(op, dy):
     self._splittable_dims = splittable_dims
     self._grad_function = grad_function
 
+  @property
+  def has_gradient(self):
+    if self._grad_function == 0:
+      return False
+    return super(SlicewiseOperation, self).has_gradient
+
   def gradient(self, grad_ys):
     if self._grad_function is not None:
       return self._grad_function(self, grad_ys[0])
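With the new has_gradient property, passing grad_function=0 marks a slicewise op as non-differentiable outright, rather than falling back to tf.gradients. A hedged sketch of how a caller might use this (illustrative only, not part of the commit):

```python
import mesh_tensorflow as mtf
import tensorflow as tf

graph = mtf.Graph()
mesh = mtf.Mesh(graph, "my_mesh")
dim = mtf.Dimension("d", 4)
x = mtf.constant(mesh, 1.5, shape=mtf.Shape([dim]), dtype=tf.float32)

# grad_function=0: no gradient at all, instead of the tf.gradients fallback.
rounded = mtf.cwise(tf.round, [x], name="round", grad_function=0)
print(rounded.operation.has_gradient)  # False
```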
@@ -1547,7 +1554,8 @@ def grad_function(op, dy):
   return cwise(tf.tanh, [x], name=name, grad_function=grad_function)
 
 
-def pow(x, y):  # pylint: disable=redefined-builtin
+def mtf_pow(x, y):
+  """Call externally as mtf.pow()."""
   return exp(log(x) * y)
 
 
@@ -1574,6 +1582,16 @@ def relu(x, name="relu"):
   return cwise(tf.nn.relu, [x], name=name, grad_function=_relu_grad)
 
 
+def sign(x, name="sign"):
+  ret = cwise(tf.sign, [x], name=name, grad_function=0)
+  return ret
+
+
+def mtf_abs(x):
+  """Call externally as mtf.abs()."""
+  return x * sign(x)
+
+
 def cast(x, dtype, name="cast"):
   if dtype == x.dtype:
     return x
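Note how mtf_abs is built: sign() is declared gradient-free (grad_function=0), so differentiating x * sign(x) treats sign(x) as a constant, and the gradient of abs comes out as sign(x), the usual subgradient. A one-line NumPy check of the forward identity (illustrative only):

```python
import numpy as np

x = np.array([-2.5, 0.0, 3.0])
print(np.allclose(x * np.sign(x), np.abs(x)))  # True: |x| == x * sign(x)
```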
@@ -2174,8 +2192,8 @@ def cumsum(x, dim, exclusive=False):
   new_shape = x.shape.rename_dimension(dim.name, new_name)
   comparator = less if exclusive else less_equal
   m = cast(
-      comparator(range(x.mesh, dim, dtype=tf.float32),
-                 range(x.mesh, new_dim, dtype=tf.float32)), x.dtype)
+      comparator(mtf_range(x.mesh, dim, dtype=tf.float32),
+                 mtf_range(x.mesh, new_dim, dtype=tf.float32)), x.dtype)
   ret = einsum([x, m], output_shape=new_shape)
   return reshape(ret, x.shape)
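This cumsum is a masked contraction: a 0/1 comparator matrix m[i, j] = 1 where i <= j (strictly i < j when exclusive) turns the einsum into a running sum over the renamed dimension. A NumPy sketch of the same trick (illustrative only, not part of the commit):

```python
import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])
i = np.arange(len(x))
m = (i[:, None] <= i[None, :]).astype(x.dtype)  # m[i, j] = 1 iff i <= j

print(x @ m)                                    # [ 1.  3.  6. 10.]
print(np.allclose(x @ m, np.cumsum(x)))         # True
```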

@@ -3577,7 +3595,7 @@ def top_1(x, reduced_dim, dtype=tf.int32, name=None):
   with tf.name_scope(name, default_name="top_1"):
     max_val = reduce_max(x, reduced_dim=reduced_dim)
     is_max = to_float(equal(x, max_val))
-    pos = range(x.mesh, reduced_dim, tf.float32)
+    pos = mtf_range(x.mesh, reduced_dim, tf.float32)
     ret = reduce_max(is_max * pos, reduced_dim=reduced_dim)
     ret = cast(ret, dtype)
     return ret, max_val
@@ -3717,9 +3735,11 @@ def divide(x1, x2, output_shape=None, name=None):
   return multiply(x1, reciprocal(x2), output_shape=output_shape)
 
 
-def slice(x, begin, size, slice_dim_name, name=None):  # pylint: disable=redefined-builtin
+def mtf_slice(x, begin, size, slice_dim_name, name=None):
   """Slice operation.
 
+  Call externally as mtf.slice()
+
   Args:
     x: a list of Tensors
     begin: integer, where to begin slicing from along the axis
@@ -3754,7 +3774,7 @@ def one_hot(indices, output_dim, on_value=1.0,
 
   TODO(noam): Is there a good reason we need a special mtf.Operation here?
   We could just use some code like this:
-  cast(equal(indices, range(indices.mesh, output_dim, dtype=indices.dtype)),
+  cast(equal(indices, mtf_range(indices.mesh, output_dim, dtype=indices.dtype)),
        dtype)
 
   Args:
@@ -4067,9 +4087,11 @@ def softmax(x, reduced_dim, extra_logit=None, name=None):
   return exp(log_softmax(x, reduced_dim, extra_logit=extra_logit))
 
 
-def range(mesh, dim, dtype, name=None):  # pylint: disable=redefined-builtin
+def mtf_range(mesh, dim, dtype, name=None):
   """Create a 1d mesh tensor with a range from [0, dim.size).
 
+  Call externally as mtf.range()
+
   Args:
     mesh: a Mesh
     dim: a Dimension
@@ -4563,9 +4585,10 @@ def halo_exchange(x, blocks_dim, block_size_dim, halo_size, wrap=False):
     parts = ([shift(x, i, blocks_dim, wrap)] + parts +
              [shift(x, -i, blocks_dim, wrap)])
   if partial_size > 0:
-    left_margin = slice(x, 0, partial_size, block_size_dim.name)
-    right_margin = slice(x, block_size_dim.size - partial_size, partial_size,
-                         block_size_dim.name)
+    left_margin = mtf_slice(x, 0, partial_size, block_size_dim.name)
+    right_margin = mtf_slice(
+        x, block_size_dim.size - partial_size, partial_size,
+        block_size_dim.name)
     parts = (
         [shift(right_margin, num_complete_blocks + 1, blocks_dim, wrap)]
         + parts +
@@ -4600,8 +4623,9 @@ def left_halo_exchange(x, blocks_dim, block_size_dim, halo_size, wrap=False):
   for i in xrange(1, num_complete_blocks + 1):
     parts = ([shift(x, i, blocks_dim, wrap)] + parts)
   if partial_size > 0:
-    right_margin = slice(x, block_size_dim.size - partial_size, partial_size,
-                         block_size_dim.name)
+    right_margin = mtf_slice(
+        x, block_size_dim.size - partial_size, partial_size,
+        block_size_dim.name)
     parts = ([shift(right_margin, num_complete_blocks + 1, blocks_dim, wrap)]
              + parts)
   return concat(parts, block_size_dim.name)
mesh_tensorflow/ops_with_redefined_builtins.py (new file)

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+# coding=utf-8
+# Copyright 2018 The Mesh TensorFlow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Mesh TensorFlow."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from mesh_tensorflow.ops import *  # pylint: disable=wildcard-import
+from mesh_tensorflow.ops import mtf_abs as abs  # pylint: disable=redefined-builtin,unused-import
+from mesh_tensorflow.ops import mtf_pow as pow  # pylint: disable=redefined-builtin,unused-import
+from mesh_tensorflow.ops import mtf_range as range  # pylint: disable=redefined-builtin,unused-import
+from mesh_tensorflow.ops import mtf_slice as slice  # pylint: disable=redefined-builtin,unused-import
+
+
+
+# TODO(trandustin): Seal module.
+# from tensorflow.python.util.all_util import remove_undocumented  # pylint: disable=line-too-long
+#
+# _allowed_symbols = None
+#
+# remove_undocumented(__name__, _allowed_symbols)
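The net effect: ops.py no longer shadows Python builtins, while the public mtf namespace (re-exported through __init__.py) keeps the short names. A hedged usage sketch (illustrative only, not part of the commit):

```python
import mesh_tensorflow as mtf
import tensorflow as tf

graph = mtf.Graph()
mesh = mtf.Mesh(graph, "my_mesh")
length = mtf.Dimension("length", 8)

pos = mtf.range(mesh, length, dtype=tf.float32)  # resolves to ops.mtf_range
sq = mtf.pow(pos + 1.0, 2.0)   # ops.mtf_pow is exp(log(x) * y), so x > 0
mag = mtf.abs(sq - 10.0)       # ops.mtf_abs, i.e. x * sign(x)
head = mtf.slice(pos, 0, 4, "length")            # resolves to ops.mtf_slice
```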

mesh_tensorflow/optimize.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@
 from __future__ import division
 from __future__ import print_function
 
-from mesh_tensorflow import ops as mtf
+from mesh_tensorflow import ops_with_redefined_builtins as mtf
 import tensorflow as tf
 
 
mesh_tensorflow/placement_mesh_impl.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@
 
 import functools
 
-from mesh_tensorflow import ops as mtf
+from mesh_tensorflow import ops_with_redefined_builtins as mtf
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 import tensorflow as tf

mesh_tensorflow/simd_mesh_impl.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@
 from __future__ import division
 from __future__ import print_function
 
-from mesh_tensorflow import ops as mtf
+from mesh_tensorflow import ops_with_redefined_builtins as mtf
 from mesh_tensorflow import tpu_variables
 from mesh_tensorflow import utils
 from six.moves import xrange  # pylint: disable=redefined-builtin

mesh_tensorflow/transformer/transformer.py

Lines changed: 12 additions & 10 deletions
@@ -323,6 +323,7 @@ def __init__(self,
                max_length,
                shared_embedding_and_softmax_weights=False,
                label_smoothing=0.0,
+               z_loss=1e-4,
                name="transformer"):
     self.layer_stack = layer_stack
     self.model_dim = mtf.Dimension("d_model", d_model)
@@ -338,6 +339,7 @@ def __init__(self,
     self.shared_embedding_and_softmax_weights = (
         shared_embedding_and_softmax_weights)
     self.label_smoothing = label_smoothing
+    self.z_loss = z_loss
     self.name = name
 
   def _call_internal(self, context, inputs, targets=None):
@@ -381,27 +383,25 @@ def _call_internal(self, context, inputs, targets=None):
     if self.output_vocab_dim is None:
       return x
     if self.shared_embedding_and_softmax_weights:
-      logits = tf.einsum(
+      logits = mtf.einsum(
           [x * (self.model_dim ** -0.5), embedding_weights],
           reduced_dims=[self.model_dim])
     else:
       logits = mtf.layers.dense(
           x, self.output_vocab_dim, use_bias=False,
           variable_dtype=context.variable_dtype,
          name="logits")
-    if context.train:
-      logits = mtf.layers.multiplicative_jitter(logits, epsilon=1e-2)
     if targets is not None and context.losses is not None:
       off_value = self.label_smoothing / self.output_vocab_dim.size
       on_value = 1.0 - self.label_smoothing + off_value
+      soft_targets = mtf.one_hot(
+          targets, self.output_vocab_dim,
+          dtype=context.activation_dtype,
+          on_value=on_value,
+          off_value=off_value)
       loss = mtf.layers.softmax_cross_entropy_with_logits(
-          logits,
-          mtf.one_hot(
-              targets, self.output_vocab_dim,
-              dtype=context.activation_dtype,
-              on_value=on_value,
-              off_value=off_value),
-          self.output_vocab_dim)
+          logits, soft_targets, self.output_vocab_dim,
+          z_loss=self.z_loss if context.train else 0.0)
       weights = mtf.layers.weights_nonzero(
           targets, dtype=context.activation_dtype)
       loss = mtf.reduce_mean(loss * weights)
@@ -674,6 +674,7 @@ def __init__(self,
                max_length,
                shared_embedding=True,
                label_smoothing=0.0,
+               z_loss=1e-4,
                encoder_name="encoder",
                decoder_name="decoder"):
     self.encoder = Unitransformer(
@@ -692,6 +693,7 @@ def __init__(self,
         autoregressive=True,
         max_length=max_length,
         label_smoothing=label_smoothing,
+        z_loss=z_loss,
         name=decoder_name)
     self.shared_embedding = shared_embedding
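On the arithmetic in the loss hunk: label smoothing spreads off_value = smoothing / vocab_size over every id and puts on_value = 1 - smoothing + off_value on the target id, so the soft targets still sum to 1; z_loss is applied only when context.train is true. A small NumPy check (illustrative only, hypothetical sizes):

```python
import numpy as np

vocab_size = 8
label_smoothing = 0.1
off_value = label_smoothing / vocab_size
on_value = 1.0 - label_smoothing + off_value

soft_targets = np.full(vocab_size, off_value)
soft_targets[3] = on_value        # suppose the target id is 3
print(soft_targets.sum())         # 1.0 -- still a valid distribution
```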
