
Commit 3d8ce2b

nshazeer authored and Copybara-Service committed
Option to combine variables in mesh-tensorflow to improve graph construction time for models with many variables on many cores. A more transparent solution would still be preferable. To use this feature, call Graph.rewrite_stack_variables() after the forward pass; the graph is rewritten to contain fewer variables.
PiperOrigin-RevId: 223029564
1 parent 07417c9 commit 3d8ce2b
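A minimal usage sketch of the new call (an assumed call pattern only, not code from this commit: build_model is a hypothetical placeholder for user model code, and optimizer wiring is elided):

    import mesh_tensorflow as mtf
    import tensorflow as tf

    graph = mtf.Graph()
    mesh = mtf.Mesh(graph, "my_mesh")

    # Forward pass only: build_model stands in for model code that creates
    # many mtf.get_variable() calls on this mesh.
    loss = build_model(mesh)

    # Rewrite the graph so that similar variables are stacked into larger
    # ones. This must run before any Assign operations exist (e.g. before
    # optimizer update ops are built), otherwise it raises ValueError.
    graph.rewrite_stack_variables(max_combined_size=2 ** 30)

    # Gradients and optimizer update ops are created afterwards as usual.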

4 files changed: +168 -33 lines changed

mesh_tensorflow/ops.py

Lines changed: 150 additions & 12 deletions
@@ -347,7 +347,6 @@ class Graph(object):
 
   def __init__(self):
     self._operations = []
-    self._tensors = []
     self._trainable_variables = []
     self._all_variables = []
     # Maps a name used in the graph to the next id to use for that name.
@@ -360,10 +359,6 @@ def __repr__(self):
   def operations(self):
     return self._operations
 
-  @property
-  def tensors(self):
-    return self._tensors
-
   @property
   def trainable_variables(self):
     return self._trainable_variables
@@ -407,6 +402,91 @@ def unique_name(self, name, mark_as_used=True):
 
     return name
 
+  def rewrite_stack_variables(self, max_combined_size=2 ** 30):
+    """Rewrite the current graph to combine variables.
+
+    This helps speed up graph construction times in the case of large meshes
+    and large numbers of variables.
+
+    This function should be called after the forward pass (before any variable
+    assignments). Some similar variables are stacked to form larger variables.
+
+    Variables created prior to this call are checkpointed as separate variables,
+    even though they are combined internally. So the checkpoints are
+    compatible for inference purposes with/without this call. However, the
+    optimizer accumulators, which are created after this call, are checkpointed
+    as combined variables.
+
+    When we find a set of variables with the same shape/dtype/etc., we replace
+    them with one StackedVariable and an "unstack" operation. The
+    StackedVariable has multiple master variables (so as to maintain
+    checkpoint compatibility), but only one slice variable per device. We
+    point the inputs of later operations to the outputs of the
+    "unstack" operations, instead of the outputs of the defunct single
+    variables.
+
+    TODO(noam, ylc): Rewrite assignments as well, so that this can be applied at
+    the end of graph construction and be fully checkpoint-compatible.
+    Alternatively, find another solution for speeding up graph construction.
+
+    Args:
+      max_combined_size: an integer - maximum size for combined variables.
+    """
+    all_variables = self._all_variables
+    operations = self._operations
+    self._operations = []
+    self._all_variables = []
+    self._trainable_variables = []
+    def var_key(v):
+      return str([v.shape,
+                  v.master_dtype,
+                  v.slice_dtype,
+                  v.activation_dtype,
+                  v.trainable])
+    key_to_vars = collections.defaultdict(list)
+    for v in all_variables:
+      key_to_vars[var_key(v)].append(v)
+    deleted_vars = set()
+    # We need to point the inputs of other operations at the outputs of unstack
+    # instead of the outputs of the deleted Variables. We construct this
+    # mapping from old input tensors to new input tensors.
+    tensor_mapping = {}
+    for op in operations:
+      if isinstance(op, Assign):
+        raise ValueError("stack_variables() should be called before any "
+                         "variable assignment.")
+      if isinstance(op, StackedVariable):
+        raise ValueError("stack_variables() should not be called twice.")
+      if isinstance(op, Variable):
+        if op in deleted_vars:
+          continue
+        similar_vars = key_to_vars[var_key(op)]
+        num_to_stack = max(1, min(
+            len(similar_vars),
+            max_combined_size // op.shape.size))
+        to_stack = similar_vars[:num_to_stack]
+        key_to_vars[var_key(op)] = similar_vars[num_to_stack:]
+        if num_to_stack > 1:
+          stacked_var = StackedVariable(to_stack)
+          stack_dim = stacked_var.shape.dims[0]
+          deleted_vars.update(to_stack)
+          unstacked = unstack(stacked_var.outputs[0], stack_dim)
+          for v, t in zip(to_stack, unstacked):
+            tensor_mapping[v.outputs[0]] = t
+        else:
+          self._operations.append(op)
+          self._all_variables.append(op)
+          if op.trainable:
+            self.trainable_variables.append(op)
+      else:
+        self._operations.append(op)
+        # Point inputs of other operations to the outputs of unstack.
+        # pylint: disable=protected-access
+        for i in xrange(len(op._inputs)):
+          if op._inputs[i] in tensor_mapping:
+            op._inputs[i] = tensor_mapping[op._inputs[i]]
+        # pylint: enable=protected-access
+
 
 class Lowering(object):
   """Lowering of a Graph from Mesh-TensorFlow to TensorFlow.
@@ -1087,7 +1167,6 @@ def __init__(self, operation, shape, dtype, name=None, index=0):
     if name is None:
       name = self.operation.name + ":" + str(index)
     self._name = name
-    self._mesh.graph.tensors.append(self)
 
   @property
   def shape(self):
@@ -2204,7 +2283,7 @@ def conv2d(conv_input, conv_filter, strides, padding, name=None):
 
 
 class Conv2dBackpropInputOperation(Operation):
-  """like tf.nn.conv2d_backprop_input"""
+  """like tf.nn.conv2d_backprop_input."""
 
   def __init__(self, input_shape, conv_filter, dy, strides, padding, name=None):
     super(Conv2dBackpropInputOperation, self).__init__(
@@ -2618,11 +2697,12 @@ def __init__(self, mesh, name, shape, master_dtype, slice_dtype,
     self._slice_dtype = slice_dtype
     self._activation_dtype = activation_dtype
     self._trainable = trainable
-    with tf.device(mesh.variable_placer_fn), utils.outside_all_rewrites():
-      self.master = tf.get_variable(
-          name, shape.to_integer_list, dtype=master_dtype,
-          initializer=initializer, **kwargs)
-    self._name = self.master.name[:self.master.name.find(":")]
+    if not isinstance(self, StackedVariable):
+      with tf.device(mesh.variable_placer_fn), utils.outside_all_rewrites():
+        self._master = tf.get_variable(
+            name, shape.to_integer_list, dtype=master_dtype,
+            initializer=initializer, **kwargs)
+      self._name = self._master.name[:self._master.name.find(":")]
     self._outputs = [Tensor(self, shape, activation_dtype)]
     self.graph.all_variables.append(self)
     if trainable:
@@ -2667,6 +2747,61 @@ def slice_dtype(self):
   def activation_dtype(self):
     return self._activation_dtype
 
+  @property
+  def trainable(self):
+    return self._trainable
+
+  @property
+  def master_device(self):
+    return self._master.device
+
+  def get_master(self):
+    return self._master
+
+  def assign_to_master(self, val):
+    return tf.assign(self._master, val)
+
+
+class StackedVariable(Variable):
+  """A Variable which combines many variables into one.
+
+  This is a performance optimization to reduce the time associated with large
+  numbers of slice variables. See Graph.rewrite_stack_variables() for usage.
+  """
+
+  def __init__(self, vs):
+    """Create a StackedVariable.
+
+    Args:
+      vs: a list of Variables
+    """
+    shape = Shape([Dimension("stacked", len(vs))] + vs[0].shape.dims)
+    name = "stacked/" + vs[0].name
+    # TODO(noam): verify that vs are the same shape, etc.
+    super(StackedVariable, self).__init__(
+        vs[0].mesh, name, shape, vs[0].master_dtype, vs[0].slice_dtype,
+        vs[0].activation_dtype, None, vs[0].trainable)
+    self._name = name
+    self._masters = [v.get_master() for v in vs]
+    self._original_names = [v.name for v in vs]
+
+  @property
+  def original_names(self):
+    return self._original_names
+
+  @property
+  def master_device(self):
+    return self._masters[0].device
+
+  def get_master(self):
+    with tf.device(self.master_device):
+      return tf.stack(self._masters)
+
+  def assign_to_master(self, val):
+    return tf.group([
+        tf.assign(var_slice, val_slice) for var_slice, val_slice
+        in zip(self._masters, tf.unstack(val))])
+
 
 class ReadVariable(Operation):
   """Read a variable."""
@@ -4014,6 +4149,9 @@ def log_variable_sizes(var_list, tag, verbose=True):
       tf.logging.info("Weight %s\tshape %s\tsize %d",
                       v.name.ljust(80),
                       str(v.shape).ljust(30), v_size)
+      if isinstance(v, StackedVariable):
+        for n in v.original_names:
+          tf.logging.info(" " + n)
     total_size += v_size
   tf.logging.info("%s Total size: %d", tag, total_size)

mesh_tensorflow/ops_test.py

Lines changed: 6 additions & 10 deletions
@@ -93,26 +93,22 @@ def testTensorLayout(self):
 
   def testGraph(self):
     graph = mtf.Graph()
-    self.assertLen(graph.operations, 0)
-    self.assertLen(graph.tensors, 0)
-    self.assertLen(graph.trainable_variables, 0)
-    self.assertLen(graph.all_variables, 0)
+    self.assertEmpty(graph.operations)
+    self.assertEmpty(graph.trainable_variables)
+    self.assertEmpty(graph.all_variables)
     mesh = mtf.Mesh(graph, "mesh_test")
     _ = mtf.import_tf_tensor(mesh,
                              tf_tensor=tf.constant(0.),
                              shape=mtf.Shape([]))
     self.assertLen(graph.operations, 1)
-    self.assertLen(graph.tensors, 1)
-    self.assertLen(graph.trainable_variables, 0)
-    self.assertLen(graph.all_variables, 0)
+    self.assertEmpty(graph.trainable_variables)
+    self.assertEmpty(graph.all_variables)
     _ = mtf.get_variable(mesh, "variable_0", mtf.Shape([]), trainable=True)
     self.assertLen(graph.operations, 2)
-    self.assertLen(graph.tensors, 2)
     self.assertLen(graph.trainable_variables, 1)
     self.assertLen(graph.all_variables, 1)
     _ = mtf.get_variable(mesh, "variable_1", mtf.Shape([]), trainable=False)
     self.assertLen(graph.operations, 3)
-    self.assertLen(graph.tensors, 3)
     self.assertLen(graph.trainable_variables, 1)
     self.assertLen(graph.all_variables, 2)

@@ -172,7 +168,7 @@ def testMeshImpl(self):
                                      ("heads", "model")])
     mesh_impl = mtf.MeshImpl(shape=shape, layout_rules=layout_rules)
     self.assertEqual(mesh_impl.shape, shape)
-    self.assertEqual(mesh_impl.ndims, len(shape))
+    self.assertLen(shape, mesh_impl.ndims)
     self.assertEqual(mesh_impl.layout_rules, layout_rules)
     self.assertEqual(mesh_impl.size, shape.size)
     self.assertTrue(mesh_impl.supports_control_dependencies)

mesh_tensorflow/placement_mesh_impl.py

Lines changed: 7 additions & 5 deletions
@@ -79,7 +79,7 @@ def __init__(self, variable, mesh_impl):
     if self.slice_is_master:
       tf.logging.info(
          "Single slice is indentical to master - avoid creating extra vars.")
-      slices = [variable.master]
+      slices = [variable.get_master()]
      self._laid_out_tensor = mesh_impl.LaidOutTensor(slices)
      self._copy_slices_to_master = tf.group([])
      self._copy_master_to_slices = tf.group([])
@@ -96,9 +96,9 @@ def __init__(self, variable, mesh_impl):
           tf.cast(slices[-1], variable.master_dtype))
       self._laid_out_tensor = mesh_impl.LaidOutTensor(slices)
       self._copy_master_to_slices = self.assign_to_slices(
-          mtf.assign_slice, mesh_impl.make_slices(variable.master, shape))
-      self._copy_slices_to_master = tf.assign(
-          variable.master,
+          mtf.assign_slice, mesh_impl.make_slices(
+              variable.get_master(), shape))
+      self._copy_slices_to_master = variable.assign_to_master(
           mesh_impl.combine_slices(slices_with_master_dtype, shape))
 
   @property
@@ -108,7 +108,9 @@ def slice_is_master(self):
       return False
     if self._variable.master_dtype != self._variable.slice_dtype:
       return False
-    master_device = self._variable.master.device
+    if isinstance(self._variable, mtf.StackedVariable):
+      return False
+    master_device = self._variable.master_device
     slice_device = self._mesh_impl.devices[0]
     return slice_device == master_device or not slice_device

mesh_tensorflow/simd_mesh_impl.py

Lines changed: 5 additions & 6 deletions
@@ -101,7 +101,7 @@ def __init__(self, variable, mesh_impl):
     base_name = variable.name
     slices = []
     slices_with_master_dtype = []
-    with tf.device(variable.master.device), utils.outside_all_rewrites():
+    with tf.device(variable.master_device), utils.outside_all_rewrites():
       zero_tensor = tf.zeros(slice_shape)
 
     # pylint: disable=protected-access
@@ -138,15 +138,14 @@ def __init__(self, variable, mesh_impl):
 
     self._laid_out_tensor = mesh_impl.LaidOutTensor(
         [tpu_variables.ReplicatedVariable(base_name, slices)])
-    with tf.device(variable.master.device), utils.outside_all_rewrites():
+    with tf.device(variable.master_device), utils.outside_all_rewrites():
       self._copy_master_to_slices = self._generate_copy_master_to_slices_op(
-          variable.master, shape, slices, slice_shape)
+          variable.get_master(), shape, slices, slice_shape)
       slices_with_master_dtype = [
           tf.cast(s, variable.master_dtype) for s in slices]
-      self._copy_slices_to_master = tf.assign(
-          variable.master,
+      self._copy_slices_to_master = variable.assign_to_master(
           mesh_impl.combine_slices(slices_with_master_dtype, shape,
-                                   device=variable.master.device))
+                                   device=variable.master_device))
 
   def _generate_copy_master_to_slices_op(self, master_variable, master_shape,
                                          slices, slice_shape):
