
Commit 7d28f9b

SiegeLordEx authored and tensorflower-gardener committed
Make interacting default event space bijectors work for sharded JDs in simple cases.

The cases where it should work are those where all RVs are tensor-valued. This excludes nested JDs, as well as JDs with non-JD multipart components (which we can't express anyway).

A nontrivial change here is that `_sanitize_value` was moved into the sample_and_trace functions: using `_execute_model` inside the bijector, as done here, opens up the possibility that elements of the `value` arg to `_execute_model` have a different structure than the actual distribution. This feels like a good thing anyway if we interpret `_execute_model` as a primitive effect system.

Ideally I'd make the base JDs use the new `_conditioned_bijectors` function, but doing so interacted poorly with autobatched JDs for reasons not yet explored.

PiperOrigin-RevId: 379403363
1 parent fe2d35e commit 7d28f9b
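To ground the commit message, here is a minimal sketch, not taken from this commit, of the simple case it targets: a sharded JD whose RVs are all tensor-valued, now usable with the default event space bijector. The model, RV names, and shard axis below are illustrative; `Sharded` parts must actually execute under a matching named axis (e.g. inside `jax.pmap` or a `tf.distribute` strategy).

```python
# A hypothetical sharded model; names and axis setup are assumptions.
import tensorflow.compat.v2 as tf
import tensorflow_probability as tfp

tfd = tfp.distributions
tfde = tfp.experimental.distribute
root = tfd.JointDistributionCoroutine.Root


@tfde.JointDistributionCoroutine
def sharded_model():
  x = yield root(tfd.LogNormal(0., 1., name='x'))
  yield tfde.Sharded(
      tfd.Uniform(0., x), shard_axis_name='data', name='y')

# After this commit, the default event space bijector works for such JDs:
bij = sharded_model.experimental_default_event_space_bijector()
# Pulling a sample back to unconstrained space (needs an active shard axis):
# unconstrained = bij.inverse(sharded_model.sample(seed=some_seed))
```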

File tree: 4 files changed (+36, -38 lines)


tensorflow_probability/python/distributions/joint_distribution.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -79,6 +79,7 @@ class ValueWithTrace(collections.namedtuple(
 
 def trace_distributions_and_values(dist, sample_shape, seed, value=None):
   """Draws a sample, and traces both the distribution and sampled value."""
+  value = _sanitize_value(dist, value)
   if value is None:
     value = dist.sample(sample_shape, seed=seed)
   elif tf.nest.is_nested(dist.dtype) and any(
@@ -103,6 +104,7 @@ def trace_values_only(dist, sample_shape, seed, value=None):
 
 def trace_values_and_log_probs(dist, sample_shape, seed, value=None):
   """Draws a sample, and traces both the sampled value and its log density."""
+  value = _sanitize_value(dist, value)
   if value is None:
     value, lp = dist.experimental_sample_and_log_prob(sample_shape, seed=seed)
   elif tf.nest.is_nested(dist.dtype) and any(
@@ -786,7 +788,7 @@ def _execute_model(self,
       value_at_index = None
       if (value is not None and len(value) > index and
           value[index] is not None):
-        value_at_index = _sanitize_value(actual_distribution, value[index])
+        value_at_index = value[index]
       try:
         next_value, traced_values = sample_and_trace_fn(
            actual_distribution,
```
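The `_execute_model` hunk is the flip side of moving `_sanitize_value`: the driver now passes user-supplied values through untouched, and each sample-and-trace hook sanitizes its own input. Below is a toy sketch of the "primitive effect system" reading of `_execute_model` mentioned in the commit message; all names and signatures are illustrative simplifications, not TFP's actual implementation.

```python
import collections

# Simplified stand-in for jd_lib.ValueWithTrace.
ValueWithTrace = collections.namedtuple('ValueWithTrace', ['value', 'traced'])


def execute_model(model_fns, sample_and_trace_fn, value=None):
  """Replays the model one distribution at a time, threading values through."""
  values, traces = [], []
  for index, make_dist in enumerate(model_fns):
    dist = make_dist(values)  # later RVs may depend on earlier values
    value_at_index = None
    if (value is not None and len(value) > index and
        value[index] is not None):
      # After this commit, `value[index]` is passed through untouched; any
      # per-distribution sanitization is the hook's job, not the driver's.
      value_at_index = value[index]
    result = sample_and_trace_fn(dist, value=value_at_index)
    values.append(result.value)
    traces.append(result.traced)
  return traces
```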

tensorflow_probability/python/experimental/distribute/BUILD

Lines changed: 2 additions & 1 deletion
```diff
@@ -77,9 +77,11 @@ multi_substrate_py_library(
     deps = [
         ":sharded",
         # tensorflow dep,
+        "//tensorflow_probability/python/bijectors:identity",
         "//tensorflow_probability/python/distributions",
         "//tensorflow_probability/python/distributions:log_prob_ratio",
         "//tensorflow_probability/python/internal:distribute_lib",
+        "//tensorflow_probability/python/internal:samplers",
     ],
 )
 
@@ -112,7 +114,6 @@ multi_substrate_py_test(
         # absl/testing:parameterized dep,
         # tensorflow dep,
         "//tensorflow_probability",
-        "//tensorflow_probability/python/internal:distribute_lib",
         "//tensorflow_probability/python/internal:distribute_test_lib",
         "//tensorflow_probability/python/internal:samplers",
         "//tensorflow_probability/python/internal:test_util",
```

tensorflow_probability/python/experimental/distribute/joint_distribution.py

Lines changed: 30 additions & 0 deletions
```diff
@@ -20,9 +20,11 @@
 
 import tensorflow.compat.v2 as tf
 from tensorflow_probability.python import distributions as distribution_lib
+from tensorflow_probability.python.bijectors import identity as identity_bijector
 from tensorflow_probability.python.distributions import joint_distribution as jd_lib
 from tensorflow_probability.python.distributions import log_prob_ratio as lp_ratio
 from tensorflow_probability.python.internal import distribute_lib
+from tensorflow_probability.python.internal import samplers
 
 
 def pbroadcast_value(value, value_axis_names, output_axis_names):
@@ -101,6 +103,11 @@ def sample_and_trace_value_fn(dist,
         final_values_out.append(traced_values[output_index])
       return final_values_out
 
+  def _default_event_space_bijector(self, *args, **kwargs):
+    if args or kwargs:
+      return _DefaultJointBijector(self.experimental_pin(*args, **kwargs))
+    return _DefaultJointBijector(self)
+
 
 class JointDistributionSequential(JointDistributionDistributedMixin,
                                   distribution_lib.JointDistributionSequential):
@@ -135,3 +142,26 @@ def _dist_jd_log_prob_ratio(p, x, q, y, name=None):
     raise ValueError('p and q must use the same sharding. '
                      f'Saw: p: {p}, {p_axis_names}, q: {q}, {q_axis_names}')
   return jd_lib._jd_log_prob_ratio(p, x, q, y, name=name)  # pylint: disable=protected-access
+
+
+class _DefaultJointBijector(jd_lib._DefaultJointBijector):  # pylint: disable=protected-access
+  """Sharding-compatible event space bijector for JDs."""
+
+  def _conditioned_bijectors(self, samples, constrained=False):
+    if samples is None:
+      return self.bijectors
+
+    def sample_and_trace_fn(dist, value, **_):
+      bij = self._bijector_fn(dist)
+      if bij is None:
+        bij = identity_bijector.Identity()
+
+      # If the RV is not yet constrained, transform it.
+      value = value if constrained else bij.forward(value)
+      return jd_lib.ValueWithTrace(value=value, traced=bij)
+
+    return self._jd._call_execute_model(  # pylint: disable=protected-access
+        sample_shape=(),
+        value=samples,
+        seed=samplers.zeros_seed(),
+        sample_and_trace_fn=sample_and_trace_fn)
```
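Hypothetical usage of the `_default_event_space_bijector` override added above, assuming `model` is a sharded JD with an RV named `x` (like the sketch near the top of this page). With no arguments the bijector spans every RV; keyword pins route through `experimental_pin`, so the resulting bijector covers only the unpinned RVs.

```python
# `model` and the RV name `x` are assumptions for illustration.
bij_all = model.experimental_default_event_space_bijector()
bij_given_x = model.experimental_default_event_space_bijector(x=1.)
```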

tensorflow_probability/python/experimental/distribute/joint_distribution_test.py

Lines changed: 1 addition & 36 deletions
```diff
@@ -23,7 +23,6 @@
 import tensorflow_probability as tfp
 from tensorflow_probability.python.experimental.distribute import joint_distribution as jd
 from tensorflow_probability.python.experimental.distribute import sharded
-from tensorflow_probability.python.internal import distribute_lib
 from tensorflow_probability.python.internal import distribute_test_lib as test_lib
 from tensorflow_probability.python.internal import test_util
 
@@ -381,19 +380,6 @@ def sharded_model():
           shard_axis_name=self.axis_name,
           name='z')
 
-    @tfd.JointDistributionCoroutine
-    def manual_sharded_model():
-      # This one has manual pbroadcasts; the goal is to get sharded_model above
-      # to do this automatically.
-      x = yield root(tfd.LogNormal(0., 1., name='x'))
-      x = distribute_lib.pbroadcast(x, axis_name=self.axis_name)
-      yield sharded.Sharded(
-          tfd.Uniform(0., x), shard_axis_name=self.axis_name, name='y')
-      yield sharded.Sharded(
-          tfb.Scale(x)(tfd.Normal(0., 1.)),
-          shard_axis_name=self.axis_name,
-          name='z')
-
     sample = model.sample(seed=self.key)
     unconstrained_sample = (
         model.experimental_default_event_space_bijector().inverse(sample))
@@ -416,12 +402,6 @@ def run(unconstrained_sample):
           lambda unconstrained_sample: unconstrained_lp(  # pylint: disable=g-long-lambda
               sharded_model, unconstrained_sample), (unconstrained_sample,))
 
-    def manual_run(unconstrained_sample):
-      return tfp.math.value_and_gradient(
-          lambda unconstrained_sample: unconstrained_lp(  # pylint: disable=g-long-lambda
-              manual_sharded_model, unconstrained_sample),
-          (unconstrained_sample,))
-
     sharded_unconstrained_sample = unconstrained_sample._replace(
         y=self.shard_values(unconstrained_sample.y),
         z=self.shard_values(unconstrained_sample.z))
@@ -433,23 +413,8 @@ def manual_run(unconstrained_sample):
     lp = lp[0]
     g = g._replace(x=g.x[0])
 
-    manual_lp, (manual_g,) = self.per_replica_to_tensor(
-        self.strategy_run(
-            manual_run, (sharded_unconstrained_sample,),
-            in_axes=(model.dtype._replace(x=None, y=0, z=0),)))
-    manual_lp = manual_lp[0]
-    manual_g = manual_g._replace(x=manual_g.x[0])
-
     self.assertAllClose(true_lp, lp)
-    # TODO(b/175084455): This will fail because there are sharded <->
-    # non-sharded edges in the gradient graph not accounted for. The edges arise
-    # because the sharded bijectors' parameterizations depend non-sharded
-    # parameters.
-    with self.assertRaises(AssertionError):
-      self.assertAllCloseNested(true_g, g)
-
-    self.assertAllClose(true_lp, manual_lp)
-    self.assertAllCloseNested(true_g, manual_g)
+    self.assertAllCloseNested(true_g, g)
 
 if __name__ == '__main__':
   tf.test.main()
```
