Fix pruning with distribution strategy.

liyunlu0618 · alanchiao · commit a0bf7a4a7df0 · 2019-05-02T12:44:24.000-07:00
PiperOrigin-RevId: 246366931
diff --git a/tensorflow_model_optimization/python/core/sparsity/keras/BUILD b/tensorflow_model_optimization/python/core/sparsity/keras/BUILD
@@ -148,11 +148,10 @@ py_library(
         # python:array_ops tensorflow dep2,
         # python:control_flow_ops tensorflow dep2,
         # python:dtypes tensorflow dep2,
-        # python:framework tensorflow dep2,
         # python:framework_ops tensorflow dep2,
         # python:math_ops tensorflow dep2,
         # python:nn_ops tensorflow dep2,
-        # python:platform tensorflow dep2,
+        # python:state_ops tensorflow dep2,
         # python:summary tensorflow dep2,
         # python:variables tensorflow dep2,
     ],
diff --git a/tensorflow_model_optimization/python/core/sparsity/keras/prune_distributed_test.py b/tensorflow_model_optimization/python/core/sparsity/keras/prune_distributed_test.py
@@ -14,6 +14,7 @@
 # ==============================================================================
 """Distributed pruning test."""
 
+import tempfile
 from absl.testing import parameterized
 import numpy as np
 
@@ -71,6 +72,14 @@ def testPrunesSimpleDenseModel(self, distribution):
     model.predict(np.random.rand(20, 10))
     test_utils.assert_model_sparsity(self, 0.5, model)
 
+    _, keras_file = tempfile.mkstemp('.h5')
+    keras.models.save_model(model, keras_file)
+
+    with prune.prune_scope():
+      loaded_model = keras.models.load_model(keras_file)
+
+    test_utils.assert_model_sparsity(self, 0.5, loaded_model)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow_model_optimization/python/core/sparsity/keras/pruning_impl.py b/tensorflow_model_optimization/python/core/sparsity/keras/pruning_impl.py
@@ -26,12 +26,12 @@
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import summary_ops_v2
 from tensorflow.python.ops import variables
 from tensorflow.python.summary import summary as summary_ops_v1
 from tensorflow_model_optimization.python.core.sparsity.keras import pruning_utils
 
-
 class Pruning(object):
   """Implementation of magnitude-based weight pruning."""
 
@@ -55,15 +55,9 @@ def __init__(self, training_step_fn, pruning_vars, pruning_schedule,
     self._block_pooling_type = block_pooling_type
     self._validate_block()
 
-    # List of tensorflow assignments ops for new masks and thresholds
-    self._assign_ops = []
-
     # Training step
     self._step_fn = training_step_fn
 
-    # List of tensorflow assignment ops for the weights
-    self._weight_assign_ops = []
-
     self._validate_block()
 
   def _validate_block(self):
@@ -73,9 +67,6 @@ def _validate_block(self):
           raise ValueError('Block Sparsity can only be used for layers which '
                            'have 2-dimensional weights.')
 
-  def get_weight_sparsity(self):
-    return [math_ops.reduce_mean(weight) for weight, _, _ in self._pruning_vars]
-
   def _update_mask(self, weights):
     """Updates the mask for a given weight tensor.
 
@@ -161,69 +152,99 @@ def _maybe_update_block_mask(self, weights):
     return new_threshold, array_ops.reshape(sliced_mask,
                                             array_ops.shape(weights))
 
-  def _get_assign_ops(self):
-    """Gather the assign ops for assigning updated masks and threshold."""
-    # Make sure the assignment ops have not already been added to the list
-    if self._assign_ops:
-      raise ValueError(
-          'Assign op list not empty. _get_assign_ops() called twice?')
-
-    for weight, mask, threshold in self._pruning_vars:
-      is_partitioned = isinstance(weight, variables.PartitionedVariable)
-      weight_as_tensor = weight
-      if is_partitioned:
-        weight_as_tensor = weight.as_tensor()
-
-      new_threshold, new_mask = self._maybe_update_block_mask(weight_as_tensor)
-      self._assign_ops.append(
-          pruning_utils.variable_assign(threshold, new_threshold))
-
-      self._assign_ops.append(
-          pruning_utils.partitioned_variable_assign(mask, new_mask)
-          if is_partitioned else pruning_utils.variable_assign(mask, new_mask))
-
   def _get_weight_assign_ops(self):
     """Gather the assign ops for assigning weights<=weights*mask."""
-    if self._weight_assign_ops:
-      raise ValueError(
-          'Assign op list not empty. _get_weight_assign_ops() called twice?')
-
-    for weight, mask, _ in self._pruning_vars:
-      is_partitioned = isinstance(weight, variables.PartitionedVariable)
-      masked_weight = math_ops.multiply(weight, mask)
-      self._weight_assign_ops.append(
-          pruning_utils.partitioned_variable_assign(weight, masked_weight)
-          if is_partitioned else pruning_utils
-          .variable_assign(weight, masked_weight))
-
-  def weight_mask_op(self):
-    if tf.executing_eagerly() or not self._weight_assign_ops:
-      self._weight_assign_ops = []
-      self._get_weight_assign_ops()
-
-    with ops.control_dependencies(self._weight_assign_ops):
-      return control_flow_ops.no_op('mask_weights')
 
-  def mask_update_op(self):
-    self._assign_ops = []
-    self._get_assign_ops()
+    def update_fn(distribution, values_and_vars):
+      # TODO(yunluli): This ReduceOp is needed because the weight is created by
+      # the wrapped layer, so we don't control its aggregation policy. We may
+      # be able to optimize this once distribution strategy supports easier
+      # updates to mirrored variables in replica context.
+      reduced_values = distribution.extended.batch_reduce_to(
+          tf.distribute.ReduceOp.MEAN, values_and_vars)
+      var_list = [v for _, v in values_and_vars]
+      values_and_vars = zip(reduced_values, var_list)
+
+      def update_var(variable, reduced_value):
+        return state_ops.assign(variable, reduced_value)
+
+      update_ops = []
+      for value, var in values_and_vars:
+        update_ops.append(
+            distribution.extended.update(var, update_var, args=(value,)))
+
+      return control_flow_ops.group(update_ops)
+
+    assign_ops = []
+
+    if tf.distribute.get_replica_context():
+      values_and_vars = []
+      for weight, mask, _ in self._pruning_vars:
+        masked_weight = math_ops.multiply(weight, mask)
+        values_and_vars.append((masked_weight, weight))
+      assign_ops.append(tf.distribute.get_replica_context().merge_call(
+          update_fn, args=(values_and_vars,)))
+    else:
+      for weight, mask, _ in self._pruning_vars:
+        masked_weight = math_ops.multiply(weight, mask)
+        assign_ops.append(state_ops.assign(weight, masked_weight))
+
+    return assign_ops
 
-    with ops.control_dependencies(self._assign_ops):
-      return control_flow_ops.no_op('mask_update')
+  def weight_mask_op(self):
+    return control_flow_ops.group(self._get_weight_assign_ops())
 
   def conditional_mask_update(self):
     """Returns an op to updates masks as per the pruning schedule."""
 
     def maybe_update_masks():
       return self._pruning_schedule(self._step_fn())[0]
 
-    def mask_update_op():
-      return self.mask_update_op()
-
-    def no_op():
+    def no_update():
       return control_flow_ops.no_op()
 
-    return control_flow_ops.cond(maybe_update_masks(), mask_update_op, no_op)
+    def mask_update():
+      """Updates mask without distribution strategy."""
+
+      def update():
+        assign_ops = []
+
+        for weight, mask, threshold in self._pruning_vars:
+          new_threshold, new_mask = self._maybe_update_block_mask(weight)
+          assign_ops.append(state_ops.assign(threshold, new_threshold))
+          assign_ops.append(state_ops.assign(mask, new_mask))
+
+        return control_flow_ops.group(assign_ops)
+
+      return control_flow_ops.cond(maybe_update_masks(), update, no_update)
+
+    def mask_update_distributed(distribution):
+      """Updates mask with distribution strategy."""
+
+      def update(var, value):
+        return state_ops.assign(var, value)
+
+      def update_distributed():
+        """Gather distributed update ops."""
+        assign_ops = []
+
+        for weight, mask, threshold in self._pruning_vars:
+          new_threshold, new_mask = self._maybe_update_block_mask(weight)
+          assign_ops.append(
+              distribution.extended.update(mask, update, (new_mask,)))
+          assign_ops.append(
+              distribution.extended.update(threshold, update, (new_threshold,)))
+
+        return control_flow_ops.group(assign_ops)
+
+      return control_flow_ops.cond(maybe_update_masks(), update_distributed,
+                                   no_update)
+
+    if tf.distribute.get_replica_context():
+      return tf.distribute.get_replica_context().merge_call(
+          mask_update_distributed)
+    else:
+      return mask_update()
 
   def add_pruning_summaries(self):
     """Adds summaries of weight sparsities and thresholds."""
diff --git a/tensorflow_model_optimization/python/core/sparsity/keras/pruning_impl_test.py b/tensorflow_model_optimization/python/core/sparsity/keras/pruning_impl_test.py
@@ -67,9 +67,9 @@ def testUpdateSingleMask(self):
     self.assertAllEqual(np.count_nonzero(mask_before_pruning), 100)
 
     if context.executing_eagerly():
-      p.mask_update_op()
+      p.conditional_mask_update()
     else:
-      K.get_session().run(p.mask_update_op())
+      K.get_session().run(p.conditional_mask_update())
 
     mask_after_pruning = K.get_value(mask)
     self.assertAllEqual(np.count_nonzero(mask_after_pruning), 50)
@@ -143,31 +143,6 @@ def testBlockMaskingWithHigherDimensionsRaisesError(self):
     with self.assertRaises(ValueError):
       self._blockMasking(block_size, block_pooling_type, weight, expected_mask)
 
-  def testPartitionedVariableMasking(self):
-    partitioner = partitioned_variables.variable_axis_size_partitioner(40)
-    with self.cached_session():
-      with variable_scope.variable_scope("", partitioner=partitioner):
-        weight = variable_scope.get_variable(
-            "weights", initializer=math_ops.linspace(1.0, 100.0, 100))
-        mask = pruning_utils.mask_variable(weight)
-        threshold = pruning_utils.threshold_variable(weight)
-
-      p = pruning_impl.Pruning(
-          pruning_vars=[(weight, mask, threshold)],
-          training_step_fn=self.training_step_fn,
-          pruning_schedule=self.constant_sparsity,
-          block_size=self.block_size,
-          block_pooling_type=self.block_pooling_type)
-
-      if context.executing_eagerly():
-        p.mask_update_op()
-      else:
-        variables.global_variables_initializer().run()
-        K.get_session().run(p.mask_update_op())
-
-      mask_after_pruning = K.get_value(mask.as_tensor())
-      self.assertAllEqual(np.count_nonzero(mask_after_pruning), 50)
-
   def testConditionalMaskUpdate(self):
     weight = K.variable(np.linspace(1.0, 100.0, 100), name="weights")
     mask = K.ones(weight.get_shape())
diff --git a/tensorflow_model_optimization/python/core/sparsity/keras/pruning_utils.py b/tensorflow_model_optimization/python/core/sparsity/keras/pruning_utils.py
@@ -34,52 +34,6 @@
 from tensorflow.python.ops import variable_scope
 
 
-def mask_variable(var, scope=''):
-  """Create a mask for the weights.
-
-  This function adds a variable 'mask' to the graph.
-
-  Args:
-    var: the weight variable that needs to be masked
-    scope: The variable scope of the variable var
-
-  Returns:
-    the mask variable of the same size and shape as var, initialized to all 1s.
-  """
-  with variable_scope.variable_scope(scope):
-    # TODO(suyoggupta): Remove variable_scope dependency
-    mask = variable_scope.get_variable(
-        'mask',
-        var.get_shape(),
-        initializer=init_ops.ones_initializer(),
-        trainable=False,
-        dtype=var.dtype)
-  return mask
-
-
-def threshold_variable(var, scope=''):
-  """Create a scalar threshold for the weights.
-
-  This function adds a variable
-  'threshold' to the graph.
-
-  Args:
-    var: The weight variable that needs to be masked
-    scope: The variable scope of the variable var
-
-  Returns:
-    A scalar threshold variable initialized to 0.
-  """
-  with variable_scope.variable_scope(scope):
-    # TODO(suyoggupta): Remove variable_scope dependency
-    threshold = variable_scope.get_variable(
-        'threshold', [],
-        initializer=init_ops.zeros_initializer(),
-        trainable=False,
-        dtype=var.dtype)
-    return threshold
-
-
 def kronecker_product(mat1, mat2):
   """Computes the Kronecker product of two matrices mat1 and mat2.
 
@@ -97,7 +51,6 @@ def kronecker_product(mat1, mat2):
   mat2_rsh = array_ops.reshape(mat2, [1, m2, 1, n2])
   return array_ops.reshape(mat1_rsh * mat2_rsh, [m1 * m2, n1 * n2])
 
-
 def expand_tensor(tensor, block_size):
   """Expands a 2D tensor by replicating the tensor values.
 
@@ -213,50 +166,3 @@ def factorized_pool(input_tensor,
 
   return array_ops.squeeze(
       array_ops.transpose(width_pooling, perm=[0, 1, 3, 2]))
-
-
-def determine_partitioned_axis(partitioned_variable):
-  partitioned_axis = 0
-  concatenated_variable_shape = partitioned_variable.get_shape()
-  for partition in partitioned_variable:
-    partition_shape = partition.get_shape()
-    maybe_partitioned_axis = np.less(partition_shape,
-                                     concatenated_variable_shape)
-    # Sanity check: make sure number of partitioned axis == 1
-    if np.count_nonzero(maybe_partitioned_axis) != 1:
-      raise ValueError('Number of partitioned axes %s not equal to 1' %
-                       np.count_nonzero(maybe_partitioned_axis))
-    partitioned_axis = np.where(maybe_partitioned_axis)[0][0]
-  return partitioned_axis
-
-
-def variable_assign(var, new_value):
-  return state_ops.assign(var, new_value)
-
-
-def partitioned_variable_assign(partitioned_var, new_value):
-  """Assign op for partitioned variables.
-
-  Args:
-    partitioned_var: A partitioned tensorflow variable
-    new_value: Value to be assigned to the variable var
-
-  Returns:
-    A tensorflow op that groups the assign ops for each of the variable slices
-  """
-  # Determine which axis was used to partition the variable. Currently
-  # tensorflow allows partitioning variable only along 1 axis.
-  axis = 0 if len(partitioned_var) == 1 else determine_partitioned_axis(
-      partitioned_var)
-
-  partition_sizes = np.array(
-      [partition.get_shape()[axis] for partition in partitioned_var])
-  new_partitioned_values = array_ops.split(
-      new_value,
-      ops.convert_to_tensor(partition_sizes, dtype=dtypes.int32),
-      axis=axis)
-  op_list = []
-  for partition in partitioned_var:
-    op_list.append(
-        variable_assign(partition, new_partitioned_values[len(op_list)]))
-  return control_flow_ops.group(*op_list)
diff --git a/tensorflow_model_optimization/python/core/sparsity/keras/pruning_wrapper.py b/tensorflow_model_optimization/python/core/sparsity/keras/pruning_wrapper.py
@@ -231,9 +231,9 @@ def no_op():
       return control_flow_ops.no_op('no_update')
 
     update_op = tf_utils.smart_cond(training, add_update, no_op)
-    self.layer.add_update(update_op)
+    self.add_update(update_op)
     # Always execute the op that performs weights = weights * mask
-    self.layer.add_update(self.pruning_obj.weight_mask_op())
+    self.add_update(self.pruning_obj.weight_mask_op())
 
     return self.layer.call(inputs)