
Commit f3075ff

Dustin Tran authored and Copybara-Service committed
Remove "mtf_" prefix to module and use mtf namespace.
PiperOrigin-RevId: 217225608
1 parent: 4119b79 · commit f3075ff

15 files changed: +49, -65 lines
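The change is mechanical: call sites that used the private `mtf_*` modules now reach the same functions through the `mtf` namespace. A minimal before/after sketch (the dimensions and the dense call mirror `examples/toy_model_tpu.py` below; the exact sizes are illustrative):

import mesh_tensorflow as mtf
import tensorflow as tf

graph = mtf.Graph()
mesh = mtf.Mesh(graph, "my_mesh")
batch_dim = mtf.Dimension("batch", 4)
io_dim = mtf.Dimension("io", 8)
hidden_dim = mtf.Dimension("hidden", 16)

x = mtf.import_tf_tensor(
    mesh, tf.zeros([4, 8]), mtf.Shape([batch_dim, io_dim]))

# Before this commit:
#   from mesh_tensorflow import mtf_layers
#   h = mtf_layers.dense(x, hidden_dim, name="layer1", use_bias=False)
# After this commit, the submodule is reached through the package namespace:
h = mtf.layers.dense(x, hidden_dim, name="layer1", use_bias=False)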

README.md

Lines changed: 5 additions & 5 deletions
@@ -11,7 +11,7 @@
 
 Transformer for EN-FR WMT with model splitting | Transformer for EN-FR WMT with data splitting
 :-------------------------:|:-------------------------:
-![model_splitting](./mtf_transformer_model_splitting.png) | ![data_splitting](./mtf_transformer_data_splitting.png)
+![model_splitting](./transformer_model_splitting.png) | ![data_splitting](./transformer_data_splitting.png)
 
 # Introduction
 
@@ -116,7 +116,7 @@ w2 = mtf.get_variable(mesh, "w2", [hidden_dim, classes_dim])
 # einsum is a generalization of matrix multiplication (see numpy.einsum)
 hidden = mtf.relu(mtf.einsum(images, w1, output_shape=[batch_dim, hidden_dim]))
 logits = mtf.einsum(hidden, w2, output_shape=[batch_dim, classes_dim])
-loss = mtf.reduce_mean(mtf_layers.softmax_cross_entropy_with_logits(
+loss = mtf.reduce_mean(mtf.layers.softmax_cross_entropy_with_logits(
     logits, mtf.one_hot(labels, classes_dim), classes_dim))
 w1_grad, w2_grad = mtf.gradients([loss], [w1, w2])
 update_w1_op = mtf.assign(w1, w1 - w1_grad * 0.001)
@@ -132,7 +132,7 @@ computation.
 devices = ["gpu:0", "gpu:1", "gpu:2", "gpu:3"]
 mesh_shape = [("all_processors", 4)]
 layout_rules = [("batch", "all_processors")]
-mesh_impl = placement_mesh_impl.PlacementMeshImpl(
+mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
     mesh_shape, layout_rules, devices)
 lowering = mtf.Lowering(graph, {mesh:mesh_impl})
 tf_update_ops = [lowering.lowered_operation(update_w1_op),
@@ -371,7 +371,7 @@ TPU_NAME=ylc-mtf-donut
 # 2 ways data-parallelism and 4 ways model-parallelism.
 # In this configuration, we split the batch dimension into 2 cores and the
 # hidden dimension into 4 cores.
-python examples/mtf_toy_model_tpu.py \
+python examples/toy_model_tpu.py \
   --tpu=$TPU \
   --model_dir=$MODEL_DIR \
   --io_size=8 \
@@ -381,7 +381,7 @@ python examples/mtf_toy_model_tpu.py \
 
 # 8 ways model-parallelism.
 # In this configuration, We split the hidden dimension into 8 cores.
-python examples/mtf_toy_model_tpu.py \
+python examples/toy_model_tpu.py \
   --tpu=$TPU \
   --model_dir=$MODEL_DIR \
   --io_size=8 \

examples/mnist.py

Lines changed: 6 additions & 11 deletions
@@ -23,11 +23,6 @@
 from __future__ import print_function
 
 import mesh_tensorflow as mtf
-
-from mesh_tensorflow import mtf_layers
-from mesh_tensorflow import mtf_optimize
-from mesh_tensorflow import placement_mesh_impl
-
 import mnist_dataset as dataset  # local file import
 import tensorflow as tf
 
@@ -104,20 +99,20 @@ def mnist_model(image, labels, mesh):
   hidden_dim1 = mtf.Dimension("hidden1", FLAGS.hidden_size)
   hidden_dim2 = mtf.Dimension("hidden2", FLAGS.hidden_size)
 
-  h1 = mtf_layers.dense(
+  h1 = mtf.layers.dense(
       x, hidden_dim1,
       reduced_dims=x.shape.dims[-4:],
       activation=mtf.relu, name="hidden1")
-  h2 = mtf_layers.dense(
+  h2 = mtf.layers.dense(
       h1, hidden_dim2,
       activation=mtf.relu, name="hidden2")
-  logits = mtf_layers.dense(h2, classes_dim, name="logits")
+  logits = mtf.layers.dense(h2, classes_dim, name="logits")
   if labels is None:
     loss = None
   else:
     labels = mtf.import_tf_tensor(
         mesh, tf.reshape(labels, [FLAGS.batch_size]), mtf.Shape([batch_dim]))
-    loss = mtf_layers.softmax_cross_entropy_with_logits(
+    loss = mtf.layers.softmax_cross_entropy_with_logits(
         logits, mtf.one_hot(labels, classes_dim), classes_dim)
   loss = mtf.reduce_mean(loss)
   return logits, loss
@@ -135,13 +130,13 @@ def model_fn(features, labels, mode, params):
   layout_rules = mtf.convert_to_layout_rules(FLAGS.layout)
   mesh_size = mesh_shape.size
   mesh_devices = [""] * mesh_size
-  mesh_impl = placement_mesh_impl.PlacementMeshImpl(
+  mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
       mesh_shape, layout_rules, mesh_devices)
 
   if mode == tf.estimator.ModeKeys.TRAIN:
     var_grads = mtf.gradients(
         [loss], [v.outputs[0] for v in graph.trainable_variables])
-    optimizer = mtf_optimize.AdafactorOptimizer()
+    optimizer = mtf.optimize.AdafactorOptimizer()
     update_ops = []
     for grad, var in zip(var_grads, graph.trainable_variables):
       update_ops.extend(optimizer.apply_grad(grad, var))

examples/mtf_toy_model_tpu.py renamed to examples/toy_model_tpu.py

Lines changed: 6 additions & 12 deletions
@@ -20,12 +20,6 @@
 from __future__ import print_function
 
 import mesh_tensorflow as mtf
-
-from mesh_tensorflow import mtf_layers
-from mesh_tensorflow import mtf_optimize
-from mesh_tensorflow import mtf_utils
-from mesh_tensorflow.simd_mesh_impl import SimdMeshImpl
-
 import numpy
 import tensorflow as tf
 
@@ -107,8 +101,8 @@ def toy_model(features, mesh):
   io_dim = mtf.Dimension('io', FLAGS.io_size)
 
   x = mtf.import_tf_tensor(mesh, features, mtf.Shape([batch_dim, io_dim]))
-  h = mtf_layers.dense(x, hidden_dim, name='layer1', use_bias=False)
-  y = mtf_layers.dense(h, io_dim, name='layer2', use_bias=False)
+  h = mtf.layers.dense(x, hidden_dim, name='layer1', use_bias=False)
+  y = mtf.layers.dense(h, io_dim, name='layer2', use_bias=False)
 
   loss = mtf.reduce_sum(mtf.square(y - x))
   return y, loss
@@ -122,17 +116,17 @@ def model_fn(features, labels, mode, params):
   mesh = mtf.Mesh(graph, 'my_mesh')
   mesh_shape = mtf.convert_to_shape(FLAGS.mesh_shape)
   mesh_devices = [''] * mesh_shape.size
-  mesh_impl = SimdMeshImpl(
+  mesh_impl = mtf.simd_mesh_impl.SimdMeshImpl(
      mesh_shape, mtf.convert_to_layout_rules(FLAGS.layout),
      mesh_devices, params['context'].device_assignment)
-  with mtf_utils.outside_all_rewrites():
+  with mtf.utils.outside_all_rewrites():
     logits, loss = toy_model(features, mesh)
 
   # TRAIN mode
   if mode == tf.estimator.ModeKeys.TRAIN:
     var_grads = mtf.gradients([loss],
                               [v.outputs[0] for v in graph.trainable_variables])
-    optimizer = mtf_optimize.AdafactorOptimizer()
+    optimizer = mtf.optimize.AdafactorOptimizer()
     update_ops = []
     for grad, var in zip(var_grads, graph.trainable_variables):
       update_ops.extend(optimizer.apply_grad(grad, var))
@@ -152,7 +146,7 @@ def model_fn(features, labels, mode, params):
   else:
     tf_logits = lowering.export_to_tf_tensor(fully_replicated_logits)
 
-  with mtf_utils.outside_all_rewrites():
+  with mtf.utils.outside_all_rewrites():
     # Copy master variables to slices. Must be called first.
     restore_hook = mtf.MtfRestoreHook(lowering)
   if mode == tf.estimator.ModeKeys.TRAIN:

mesh_tensorflow/__init__.py

Lines changed: 4 additions & 4 deletions
@@ -19,13 +19,13 @@
 from __future__ import division
 from __future__ import print_function
 
-from mesh_tensorflow import mtf_beam_search
-from mesh_tensorflow import mtf_layers
-from mesh_tensorflow import mtf_optimize
-from mesh_tensorflow import mtf_utils
+from mesh_tensorflow import beam_search
+from mesh_tensorflow import layers
+from mesh_tensorflow import optimize
 from mesh_tensorflow import placement_mesh_impl
 from mesh_tensorflow import simd_mesh_impl
 from mesh_tensorflow import tpu_variables
+from mesh_tensorflow import utils
 from mesh_tensorflow.ops import *  # pylint: disable=wildcard-import
 
 # TODO(trandustin): Seal module.
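These `__init__.py` imports are what create the public namespace: importing a submodule inside a package binds it as an attribute of the package object, so `mtf.layers`, `mtf.optimize`, and `mtf.utils` resolve without any further imports at call sites. A quick check of that effect, assuming the package is installed under the name `mesh_tensorflow`:

import mesh_tensorflow as mtf

# Each submodule imported in __init__.py becomes an attribute of the
# package, which is why the renamed call sites in this commit work:
assert hasattr(mtf, "layers")
assert hasattr(mtf.optimize, "AdafactorOptimizer")
assert callable(mtf.utils.outside_all_rewrites)
assert hasattr(mtf.placement_mesh_impl, "PlacementMeshImpl")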
File renamed without changes.

mesh_tensorflow/mtf_layers.py renamed to mesh_tensorflow/layers.py

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Layers for mesh tensorflow."""
+"""Layers implemented in Mesh TensorFlow."""
 
 from __future__ import absolute_import
 from __future__ import division

mesh_tensorflow/mtf_layers_test.py renamed to mesh_tensorflow/layers_test.py

Lines changed: 15 additions & 17 deletions
@@ -23,14 +23,12 @@
 
 import mesh_tensorflow as mtf
 
-from mesh_tensorflow import mtf_layers
-from mesh_tensorflow import placement_mesh_impl
 from tensor2tensor.layers import common_layers
 
 import tensorflow as tf
 
 
-class MtfLayersTest(parameterized.TestCase, tf.test.TestCase):
+class LayersTest(parameterized.TestCase, tf.test.TestCase):
 
   @parameterized.parameters(
       (4, True),
@@ -49,12 +47,12 @@ def testDense(self, units, use_bias):
 
     mtf_inputs = mtf.import_tf_tensor(
         mesh, inputs, shape=mtf.Shape([batch_dim, channels_dim]))
-    mtf_outputs = mtf_layers.dense(mtf_inputs,
+    mtf_outputs = mtf.layers.dense(mtf_inputs,
                                    output_dim=depth_dim,
                                    reduced_dims=[channels_dim],
                                    activation=mtf.relu,
                                    use_bias=use_bias)
-    mesh_impl = placement_mesh_impl.PlacementMeshImpl(
+    mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
        shape=[], layout={}, devices=[""])
     lowering = mtf.Lowering(graph, {mesh: mesh_impl})
     actual_outputs = lowering.export_to_tf_tensor(mtf_outputs)
@@ -83,9 +81,9 @@ def testLayerNorm(self):
 
     mtf_inputs = mtf.import_tf_tensor(
         mesh, inputs, shape=mtf.Shape([batch_dim, channels_dim]))
-    mtf_outputs = mtf_layers.layer_norm(mtf_inputs,
+    mtf_outputs = mtf.layers.layer_norm(mtf_inputs,
                                         dim=channels_dim)
-    mesh_impl = placement_mesh_impl.PlacementMeshImpl(
+    mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
        shape=[], layout={}, devices=[""])
     lowering = mtf.Lowering(graph, {mesh: mesh_impl})
     actual_outputs = lowering.export_to_tf_tensor(mtf_outputs)
@@ -110,8 +108,8 @@ def testWeightsNonzero(self):
 
     mtf_inputs = mtf.import_tf_tensor(
         mesh, inputs, shape=mtf.Shape([batch_dim, channels_dim]))
-    mtf_outputs = mtf_layers.weights_nonzero(mtf_inputs)
-    mesh_impl = placement_mesh_impl.PlacementMeshImpl(
+    mtf_outputs = mtf.layers.weights_nonzero(mtf_inputs)
+    mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
        shape=[], layout={}, devices=[""])
     lowering = mtf.Lowering(graph, {mesh: mesh_impl})
     actual_outputs = lowering.export_to_tf_tensor(mtf_outputs)
@@ -138,9 +136,9 @@ def testDenseReluDense(self):
 
     mtf_inputs = mtf.import_tf_tensor(
         mesh, inputs, shape=mtf.Shape([batch_dim, channels_dim]))
-    mtf_outputs = mtf_layers.dense_relu_dense(mtf_inputs,
+    mtf_outputs = mtf.layers.dense_relu_dense(mtf_inputs,
                                               hidden_channels=hidden_dim)
-    mesh_impl = placement_mesh_impl.PlacementMeshImpl(
+    mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
        shape=[], layout={}, devices=[""])
     lowering = mtf.Lowering(graph, {mesh: mesh_impl})
     actual_outputs = lowering.export_to_tf_tensor(mtf_outputs)
@@ -179,13 +177,13 @@ def testMaskedLocalAttention1D(self, batch, length, io_channels, kv_channels,
     mtf_memory = mtf.import_tf_tensor(
         mesh, memory,
         shape=mtf.Shape([batch_dim, length_m_dim, io_channels_dim]))
-    mtf_outputs = mtf_layers.masked_local_attention_1d(
+    mtf_outputs = mtf.layers.masked_local_attention_1d(
        mtf_query,
        mtf_memory,
        kv_channels=kv_channels_dim,
        heads=heads_dim,
        block_length=block_length)
-    mesh_impl = placement_mesh_impl.PlacementMeshImpl(
+    mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
        shape=[], layout={}, devices=[""])
     lowering = mtf.Lowering(graph, {mesh: mesh_impl})
     actual_outputs = lowering.export_to_tf_tensor(mtf_outputs)
@@ -228,12 +226,12 @@ def testDotProductAttention(
        mesh, value,
        shape=mtf.Shape(
            [batch_dim, heads_dim, length_kv_dim, depth_v_dim]))
-    mtf_outputs = mtf_layers.dot_product_attention(
+    mtf_outputs = mtf.layers.dot_product_attention(
        mtf_query,
        mtf_key,
        mtf_value,
        mask=None)
-    mesh_impl = placement_mesh_impl.PlacementMeshImpl(
+    mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
        shape=[], layout={}, devices=[""])
     lowering = mtf.Lowering(graph, {mesh: mesh_impl})
     actual_outputs = lowering.export_to_tf_tensor(mtf_outputs)
@@ -267,13 +265,13 @@ def testMultiheadAttention(self, kv_channels, heads):
     mtf_query = mtf.import_tf_tensor(
        mesh, query,
        shape=mtf.Shape([batch_dim, length_dim, channels_dim]))
-    mtf_outputs = mtf_layers.multihead_attention(
+    mtf_outputs = mtf.layers.multihead_attention(
        mtf_query,
        memory_antecedent=None,
        mask=None,
        kv_channels=kv_channels_dim,
        heads=heads_dim)
-    mesh_impl = placement_mesh_impl.PlacementMeshImpl(
+    mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
        shape=[], layout={}, devices=[""])
     lowering = mtf.Lowering(graph, {mesh: mesh_impl})
     actual_outputs = lowering.export_to_tf_tensor(mtf_outputs)
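Each test above follows the same three-step recipe: build ops against the public `mtf` namespace, lower the graph with a single-device `PlacementMeshImpl` (empty shape and layout), and export the result back to an ordinary TensorFlow tensor. A minimal self-contained sketch of that recipe, using `weights_nonzero` since it creates no variables (so no restore hook is needed), written against the TF1-era session API the tests assume:

import mesh_tensorflow as mtf
import tensorflow as tf

graph = mtf.Graph()
mesh = mtf.Mesh(graph, "my_mesh")
batch_dim = mtf.Dimension("batch", 2)
channels_dim = mtf.Dimension("channels", 3)

inputs = tf.constant([[0., 1., 0.], [2., 0., 3.]])
mtf_inputs = mtf.import_tf_tensor(
    mesh, inputs, shape=mtf.Shape([batch_dim, channels_dim]))
mtf_outputs = mtf.layers.weights_nonzero(mtf_inputs)

# An empty shape/layout places the whole computation on a single device.
mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
    shape=[], layout={}, devices=[""])
lowering = mtf.Lowering(graph, {mesh: mesh_impl})
actual_outputs = lowering.export_to_tf_tensor(mtf_outputs)

with tf.Session() as sess:
  print(sess.run(actual_outputs))  # 1.0 wherever the input is nonzero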

mesh_tensorflow/ops.py

Lines changed: 3 additions & 3 deletions
@@ -23,7 +23,7 @@
 from operator import mul
 import re
 
-from mesh_tensorflow import mtf_utils
+from mesh_tensorflow import utils
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
@@ -2526,7 +2526,7 @@ def __init__(self, mesh, name, shape, dtype, initializer,
                trainable, **kwargs):
     super(Variable, self).__init__([], mesh, name="name_will_be_set_later")
     self._trainable = trainable
-    with tf.device(mesh.variable_placer_fn), mtf_utils.outside_all_rewrites():
+    with tf.device(mesh.variable_placer_fn), utils.outside_all_rewrites():
       self.master = tf.get_variable(
           name, shape.to_integer_list, dtype=dtype, initializer=initializer,
           **kwargs)
@@ -2538,7 +2538,7 @@ def __init__(self, mesh, name, shape, dtype, initializer,
 
   def lower(self, lowering):
     mesh_impl = lowering.mesh_impl(self)
-    with mtf_utils.outside_all_rewrites():
+    with utils.outside_all_rewrites():
       sv = mesh_impl.LaidOutVariable(self, mesh_impl)
       lowering.variables[self] = sv
       lowering.set_tensor_lowering(self.outputs[0], sv.laid_out_tensor)
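Note that inside the package, `ops.py` imports the sibling module directly (`from mesh_tensorflow import utils`) rather than spelling `mtf.utils`; since `__init__.py` itself re-exports `ops`, relying on the fully initialized package alias here could be circular. A small sketch of the pattern `Variable.__init__` uses, with an illustrative fixed device string standing in for `mesh.variable_placer_fn`:

import tensorflow as tf
from mesh_tensorflow import utils  # direct sibling import, as in ops.py

def make_master_variable(name, shape):
  # Mirrors Variable.__init__ above: pin the master copy of a variable to a
  # chosen device and keep its creation outside TPU graph rewrites.
  with tf.device("/cpu:0"), utils.outside_all_rewrites():
    return tf.get_variable(
        name, shape, dtype=tf.float32, initializer=tf.zeros_initializer())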

mesh_tensorflow/ops_test.py

Lines changed: 1 addition & 3 deletions
@@ -22,8 +22,6 @@
 from absl.testing import parameterized
 
 import mesh_tensorflow as mtf
-from mesh_tensorflow import placement_mesh_impl
-
 import tensorflow as tf
 
 
@@ -126,7 +124,7 @@ def testLowering(self):
     mtf_inputs = mtf.import_tf_tensor(mesh,
                                       tf_tensor=inputs,
                                       shape=mtf.Shape([]))
-    mesh_impl = placement_mesh_impl.PlacementMeshImpl(
+    mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
        shape=[], layout={}, devices=[""])
     lowering = mtf.Lowering(graph, {mesh: mesh_impl})
 

mesh_tensorflow/mtf_optimize.py renamed to mesh_tensorflow/optimize.py

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Mesh-Tensorflow Optimizers."""
+"""Mesh Tensorflow Optimizers."""
 
 
 from __future__ import absolute_import
