
Commit 6c29df8

sharadmv authored and tensorflower-gardener committed

Use pbroadcast while executing sharded JDs to ensure proper gradients for
both sample and log_prob.

PiperOrigin-RevId: 377414022

1 parent 5d1fd40 · commit 6c29df8
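
Why the pbroadcast is needed: a `Sharded` distribution psums its `log_prob` across devices, so any input that is not itself sharded over those axes must be pbroadcast for reverse-mode gradients to be reduced across devices as well. Below is a rough usage sketch of the behavior this commit fixes, modeled on the new tests; the `'data'` axis name and the surrounding pmap/strategy context are assumptions, not part of the commit:

```python
import tensorflow.compat.v2 as tf
import tensorflow_probability as tfp

tfd = tfp.distributions
tfde = tfp.experimental.distribute
Root = tfd.JointDistributionCoroutine.Root

# A joint model whose final variable is sharded across devices. This is
# meant to run inside a distributed context (e.g. pmap or a TPU strategy)
# that defines the 'data' axis.
@tfde.JointDistributionCoroutine
def model():
  x = yield Root(tfd.Normal(0., 1.))
  y = yield tfd.Normal(x, 1.)
  yield tfde.Sharded(tfd.Normal(x + y, 1.), shard_axis_name='data')

# With this commit, gradients through both `model.sample` and
# `model.log_prob` match those of the equivalent unsharded model.
```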

3 files changed: +162 −50 lines


tensorflow_probability/python/distributions/joint_distribution.py

Lines changed: 6 additions & 4 deletions
```diff
@@ -753,6 +753,7 @@ def _execute_model(self,
                      sample_shape=(),
                      seed=None,
                      value=None,
+                     stop_index=None,
                      sample_and_trace_fn=trace_distributions_and_values):
     """Executes `model`, creating both samples and distributions."""
     values_out = []
@@ -833,6 +834,8 @@ def _execute_model(self,
         values_out.append(traced_values)
 
         index += 1
+        if stop_index is not None and index == stop_index:
+          break
         d = gen.send(next_value)
     except StopIteration:
       pass
@@ -1205,7 +1208,6 @@ def _jd_log_prob_ratio(p, x, q, y, name=None):
   ps, _ = p.sample_distributions(value=x, seed=samplers.zeros_seed())
   qs, _ = q.sample_distributions(value=y, seed=samplers.zeros_seed())
   tf.nest.assert_same_structure(ps, qs)
-  parts = []
-  for p_, x_, q_, y_ in zip(ps, x, qs, y):
-    parts.append(log_prob_ratio.log_prob_ratio(p_, x_, q_, y_))
-  return tf.add_n(parts)
+  log_prob_ratio_parts = nest.map_structure_up_to(
+      ps, log_prob_ratio.log_prob_ratio, ps, x, qs, y)
+  return tf.add_n(tf.nest.flatten(log_prob_ratio_parts))
```
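
In the last hunk, the explicit `zip` loop becomes a `nest.map_structure_up_to` call, which maps an n-ary function over parallel structures while recursing only as deep as a shallow reference structure (here `ps`), so the value structures may be richer than the distribution structure. A toy sketch using the public `tf.nest` API; the values are illustrative, not from the commit:

```python
import tensorflow.compat.v2 as tf

ps = [None, None]  # shallow reference structure: two parts
xs = [tf.constant(1.), tf.constant(2.)]
ys = [tf.constant(3.), tf.constant(5.)]

# Map a 4-ary function over the parallel structures, recursing only to the
# depth of `ps`, then reduce the flattened parts, as the new code does.
parts = tf.nest.map_structure_up_to(
    ps, lambda p, x, q, y: y - x, ps, xs, ps, ys)
total = tf.add_n(tf.nest.flatten(parts))  # 2.0 + 3.0 = 5.0
```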

tensorflow_probability/python/experimental/distribute/joint_distribution.py

Lines changed: 73 additions & 45 deletions
```diff
@@ -20,43 +20,86 @@
 
 import tensorflow.compat.v2 as tf
 from tensorflow_probability.python import distributions as distribution_lib
+from tensorflow_probability.python.distributions import joint_distribution as jd_lib
 from tensorflow_probability.python.distributions import log_prob_ratio as lp_ratio
 from tensorflow_probability.python.internal import distribute_lib
-from tensorflow_probability.python.internal import samplers
 
-from tensorflow.python.util import nest  # pylint: disable=g-direct-tensorflow-import
+
+def pbroadcast_value(value, value_axis_names, output_axis_names):
+  value_axis_names = distribute_lib.canonicalize_axis_name(value_axis_names)
+  pbroadcast_axes = [
+      axis_name for axis_name in output_axis_names
+      if axis_name not in value_axis_names
+  ]
+  return distribute_lib.pbroadcast(value, pbroadcast_axes)
+
+
+def _maybe_substitute_or_add_value_in_tuple(value_tuple, index, value):
+  if index > len(value_tuple):
+    raise ValueError('Cannot add value to tuple without available slot.')
+  if index == len(value_tuple):
+    return value_tuple + (value,)
+  curr_value = value_tuple[index]
+  if curr_value is not None:
+    return value_tuple
+  return value_tuple[:index] + (value,) + value_tuple[index + 1:]
 
 
 class JointDistributionDistributedMixin(object):
   """A JDMixin that shards the log_prob calculation."""
 
-  def _map_measure_over_dists(self, attr, value):
-    """Override the default implementation to shard its log_prob calculation."""
-    if any(x is None for x in tf.nest.flatten(value)):
-      raise ValueError('No `value` part can be `None`; saw: {}.'.format(value))
-    if (attr in ('log_prob', 'unnormalized_log_prob')) and any(
-        self.experimental_shard_axis_names):
-
-      def inner_log_prob_parts(value):
-        ds, xs = self._call_flat_sample_distributions(
-            value=value, seed=samplers.zeros_seed())
-        # We need to flatten and unflatten here to ensure the output structure
-        # matches `flat_sharded_distributions`.
-        return self._model_unflatten(
-            [getattr(d, attr)(x) for d, x in zip(ds, xs)])
-
-      axis_names = self.experimental_shard_axis_names
-      # Individual distributions will apply psum in their `log_prob` methods
-      # so we need to pbroadcast `value` according to `axis_names` to provide
-      # correct gradients. We are safe to add pbroadcasts to functions with
-      # psums already in them.
-      log_prob_parts = distribute_lib.make_pbroadcast_function(
-          inner_log_prob_parts, (axis_names,), axis_names,
-          out_dtype=value)(value)
-      return iter(tf.nest.flatten(log_prob_parts))
-    ds, xs = self._call_flat_sample_distributions(
-        value=value, seed=samplers.zeros_seed())
-    return (getattr(d, attr)(x) for d, x in zip(ds, xs))
+  def _call_execute_model(
+      self,
+      sample_shape=(),
+      seed=None,
+      value=None,
+      sample_and_trace_fn=jd_lib.trace_distributions_and_values):
+    return self._distribute_execute_model(
+        sample_shape=sample_shape,
+        seed=seed,
+        value=value if value is None else self._model_flatten(value),
+        sample_and_trace_fn=sample_and_trace_fn)
+
+  def _distribute_execute_model(
+      self,
+      sample_shape=(),
+      seed=None,
+      value=None,
+      sample_and_trace_fn=jd_lib.trace_distributions_and_values):
+    """Executes a model, adding `pbroadcast`s to ensure correct gradients."""
+    shard_axis_names = self._model_flatten(self.experimental_shard_axis_names)
+    final_values_out = []
+    if value is None:
+      value = ()
+
+    def sample_and_trace_value_fn(dist,
+                                  sample_shape,
+                                  seed,
+                                  value=None):
+      value, traced = sample_and_trace_fn(
+          dist=dist, sample_shape=sample_shape, seed=seed, value=value)
+      # We trace `next_value` here so we can pass it back in as part of
+      # `value` in the next iteration of the coroutine.
+      return value, (value, traced)
+
+    for output_index, output_axes in enumerate(shard_axis_names):
+      # We pbroadcast all values according to the difference between the
+      # current `output_axes` and their own active axes.
+      previous_shard_axes = shard_axis_names[:len(value)]
+      pbroadcasted_value = tuple(
+          pbroadcast_value(v, v_axis_names, output_axes)
+          for v, v_axis_names in zip(value, previous_shard_axes)
+      )
+      pbroadcasted_values, traced_values = zip(*super()._execute_model(
+          sample_shape=sample_shape,
+          seed=seed,
+          value=pbroadcasted_value + (None,),
+          stop_index=output_index + 1,
+          sample_and_trace_fn=sample_and_trace_value_fn))
+      value = _maybe_substitute_or_add_value_in_tuple(
+          value, output_index, pbroadcasted_values[output_index])
+      final_values_out.append(traced_values[output_index])
+    return final_values_out
 
 
 class JointDistributionSequential(JointDistributionDistributedMixin,
```
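
Two details of the new helpers, isolated as toy sketches (plain Python; `pbroadcast_axes` is a hypothetical stand-in for the list comprehension inside `pbroadcast_value`, and `_maybe_substitute_or_add_value_in_tuple` is the private helper defined above):

```python
# Axis-difference logic from `pbroadcast_value`: broadcast a value over
# exactly the output axes it is not already sharded on.
def pbroadcast_axes(value_axes, output_axes):
  return [a for a in output_axes if a not in value_axes]

assert pbroadcast_axes([], ['i']) == ['i']          # unsharded value: broadcast
assert pbroadcast_axes(['i'], ['i', 'j']) == ['j']  # partially aligned
assert pbroadcast_axes(['i'], ['i']) == []          # fully aligned: no-op

# Slot-filling semantics of `_maybe_substitute_or_add_value_in_tuple`:
# append when `index` is one past the end, fill a `None` slot, and never
# clobber a value the caller pinned.
assert _maybe_substitute_or_add_value_in_tuple((), 0, 'a') == ('a',)
assert _maybe_substitute_or_add_value_in_tuple(('a', None), 1, 'b') == ('a', 'b')
assert _maybe_substitute_or_add_value_in_tuple(('a',), 0, 'zzz') == ('a',)
```

The execution loop ties these together: on each pass it pbroadcasts every previously realized value to the current output's shard axes, re-runs the model one variable further (via the new `stop_index` argument to `_execute_model`), and splices the newly produced value back into `value` for the next pass.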
```diff
@@ -91,19 +134,4 @@ def _dist_jd_log_prob_ratio(p, x, q, y, name=None):
   if p_axis_names != q_axis_names:
     raise ValueError('p and q must use the same sharding. '
                      f'Saw: p: {p}, {p_axis_names}, q: {q}, {q_axis_names}')
-
-  def log_prob_ratio_parts_fn(x, y):
-    p_dists = p.sample_distributions(value=x, seed=samplers.zeros_seed())[0]
-    q_dists = q.sample_distributions(value=y, seed=samplers.zeros_seed())[0]
-    return nest.map_structure_up_to(
-        p_dists,
-        lp_ratio.log_prob_ratio,
-        p_dists, x, q_dists, y)
-
-  return tf.add_n(
-      tf.nest.flatten(
-          distribute_lib.make_pbroadcast_function(
-              log_prob_ratio_parts_fn,
-              in_axes=(p_axis_names, p_axis_names),
-              out_axes=p_axis_names,
-              out_dtype=x)(x, y)))
+  return jd_lib._jd_log_prob_ratio(p, x, q, y, name=name)  # pylint: disable=protected-access
```
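
The hand-rolled pbroadcast wrapper is deleted here, presumably because the pbroadcasts now happen during model execution itself, so the sharded case can delegate to the base `_jd_log_prob_ratio`. For reference, `log_prob_ratio(p, x, q, y)` has the semantics of the naive difference below; registered specializations may compute it more accurately or with the appropriate cross-device reductions (a sketch of the semantics, not TFP's dispatch code):

```python
def naive_log_prob_ratio(p, x, q, y):
  # Semantics of `log_prob_ratio`: log p(x) - log q(y).
  return p.log_prob(x) - q.log_prob(y)
```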

tensorflow_probability/python/experimental/distribute/joint_distribution_test.py

Lines changed: 83 additions & 1 deletion
```diff
@@ -30,6 +30,8 @@
 tfb = tfp.bijectors
 tfd = tfp.distributions
 
+Root = tfd.JointDistributionCoroutine.Root
+
 
 def true_log_prob_fn(w, x, data):
   return (tfd.Normal(0., 1.).log_prob(w) +
@@ -65,7 +67,7 @@ def make_jd_named(axis_name):
 def make_jd_coroutine(axis_name):
 
   def model_coroutine():
-    w = yield tfd.JointDistributionCoroutine.Root(tfd.Normal(0., 1.))
+    w = yield Root(tfd.Normal(0., 1.))
     x = yield sharded.Sharded(
         tfd.Sample(tfd.Normal(w, 1.), 1), shard_axis_name=axis_name)
     yield sharded.Sharded(
@@ -219,6 +221,86 @@ def _lpr(x, y):
     self.assertAllClose(
         true_lp_diff_grad, dist_lp_diff_grad)
 
+  def test_jd_has_correct_sample_path_gradients(self):
+
+    def log_prob_fn(x_loc):
+      @tfd.JointDistributionCoroutine
+      def surrogate():
+        x = yield Root(tfd.Normal(x_loc, 1.))
+        y = yield tfd.Normal(x, 1.)
+        yield tfd.Sample(tfd.Normal(x + y, 1.), test_lib.NUM_DEVICES)
+
+      @tfd.JointDistributionCoroutine
+      def model():
+        yield Root(tfd.Normal(1., 1.))
+        yield Root(tfd.Normal(1., 1.))
+        yield tfd.Sample(tfd.Normal(1., 1.), test_lib.NUM_DEVICES)
+      return tf.reduce_mean(
+          model.log_prob(surrogate.sample(sample_shape=1e6, seed=self.key)))
+
+    true_log_prob, true_log_prob_grad = tfp.math.value_and_gradient(
+        log_prob_fn, 0.)
+
+    def run(seed):
+      def sharded_log_prob_fn(x_loc):
+        @jd.JointDistributionCoroutine
+        def surrogate():
+          x = yield Root(tfd.Normal(x_loc, 1.))
+          y = yield tfd.Normal(x, 1.)
+          yield sharded.Sharded(tfd.Normal(x + y, 1.), self.axis_name)
+
+        @jd.JointDistributionCoroutine
+        def model():
+          yield Root(tfd.Normal(1., 1.))
+          yield Root(tfd.Normal(1., 1.))
+          yield sharded.Sharded(tfd.Normal(1., 1.), self.axis_name)
+        return tf.reduce_mean(
+            model.log_prob(surrogate.sample(sample_shape=1e6, seed=seed)))
+      sharded_log_prob, sharded_log_prob_grad = tfp.math.value_and_gradient(
+          sharded_log_prob_fn, 0.)
+      return sharded_log_prob, sharded_log_prob_grad
+
+    sharded_log_prob, sharded_log_prob_grad = self.per_replica_to_tensor(
+        self.strategy_run(
+            run, (self.key,), in_axes=None))
+    for i in range(test_lib.NUM_DEVICES):
+      self.assertAllClose(sharded_log_prob[i], true_log_prob, atol=1e-2)
+      self.assertAllClose(sharded_log_prob_grad[i], true_log_prob_grad,
+                          atol=1e-2)
+
+  def test_jd_has_correct_sample_path_gradients_with_partial_values(self):
+
+    def run(seed):
+      @jd.JointDistributionCoroutine
+      def model():
+        yield Root(tfd.Normal(0., 1., name='x'))
+        yield tfd.Normal(0., 1., name='y')
+        yield sharded.Sharded(tfd.Normal(1., 1.), self.axis_name, name='z')
+
+      sample = model.sample(seed=seed)
+
+      def lp_fn1(x, y, z):
+        return model.log_prob((x, y, z))
+
+      def lp_fn2(x, z):
+        return model.log_prob(model.sample(value=(x, None, z), seed=seed))
+
+      lp_and_grad1 = tfp.math.value_and_gradient(
+          lp_fn1, [*sample])
+      (lp2, grad2) = tfp.math.value_and_gradient(
+          lp_fn2, [sample.x, sample.z])
+      return lp_and_grad1, (lp2, grad2)
+
+    (lp1, grad1), (lp2, grad2) = self.per_replica_to_tensor(
+        self.strategy_run(
+            run, (self.key,), in_axes=None))
+    grad2 = [grad2[0], None, grad2[1]]
+    for i in range(test_lib.NUM_DEVICES):
+      for j in range(3):
+        self.assertAllClose(lp1[i], lp2[i])
+        if grad2[j] is not None:
+          self.assertAllClose(grad1[j][i], grad2[j][i])
+
   def test_default_event_space_bijector_non_interacting(self):
 
     root = jd.JointDistributionCoroutine.Root
```
