
Commit d3398a7

davmre authored and tensorflower-gardener committed
Extract static JD attributes via tracing.

The attributes are cached, so that a model is only traced once. This is intended to limit the need to actually run the model in `self._get_single_sample_distributions()`. This change also includes tweaks to enable numpy-mode tests for joint distributions.

PiperOrigin-RevId: 380928836
1 parent 47407b8 commit d3398a7

11 files changed (+180 −60 lines)
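
The core idea: a `tf.function` trace records symbolic shapes and dtypes without executing any ops, so even a model too large to run can report its metadata. The commit routes this through the internal helper `callable_util.get_output_spec`; the sketch below shows the same idea using only public TensorFlow APIs, with an illustrative model that is not from the commit.

import tensorflow as tf

def sample_model():
  # Tensors this large could never be materialized, but tracing only
  # records symbolic shapes and dtypes.
  x = tf.random.stateless_normal([10**9], seed=[0, 0])
  y = x + tf.random.stateless_normal([10**9], seed=[0, 1])
  return {'x': x, 'y': y}

# Tracing builds a graph without running it.
concrete_fn = tf.function(sample_model).get_concrete_function()
outputs = concrete_fn.structured_outputs  # symbolic tensors
print(tf.nest.map_structure(lambda t: t.dtype, outputs))
# {'x': tf.float32, 'y': tf.float32}
print(tf.nest.map_structure(lambda t: t.shape, outputs))
# {'x': TensorShape([1000000000]), 'y': TensorShape([1000000000])}
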

tensorflow_probability/python/distributions/BUILD

Lines changed: 0 additions & 1 deletion

@@ -3100,7 +3100,6 @@ multi_substrate_py_test(
     name = "joint_distribution_sequential_test",
     size = "medium",
     srcs = ["joint_distribution_sequential_test.py"],
-    numpy_tags = ["notap"],
     shard_count = 2,
     deps = [
         # absl/testing:parameterized dep,

tensorflow_probability/python/distributions/joint_distribution.py

Lines changed: 89 additions & 19 deletions

@@ -33,6 +33,8 @@
 from tensorflow_probability.python.distributions import distribution as distribution_lib
 from tensorflow_probability.python.distributions import log_prob_ratio
 from tensorflow_probability.python.internal import assert_util
+from tensorflow_probability.python.internal import auto_composite_tensor
+from tensorflow_probability.python.internal import callable_util
 from tensorflow_probability.python.internal import distribution_util
 from tensorflow_probability.python.internal import docstring_util
 from tensorflow_probability.python.internal import nest_util

@@ -53,6 +55,38 @@
 JAX_MODE = False


+@auto_composite_tensor.auto_composite_tensor
+class StaticDistributionAttributes(auto_composite_tensor.AutoCompositeTensor):
+  """Container to smuggle static attributes out of a tf.function trace."""
+
+  def __init__(self,
+               batch_shape,
+               dtype,
+               event_shape,
+               experimental_shard_axis_names,
+               name,
+               reparameterization_type):
+    self.batch_shape = batch_shape
+    self.dtype = dtype
+    self.event_shape = event_shape
+    self.experimental_shard_axis_names = experimental_shard_axis_names
+    self.name = name
+    self.reparameterization_type = reparameterization_type
+
+  def __iter__(self):
+    """Yields parameters in order matching __init__ signature."""
+    return iter((self.batch_shape, self.dtype, self.event_shape,
+                 self.experimental_shard_axis_names, self.name,
+                 self.reparameterization_type))
+
+if JAX_MODE:
+  from jax import tree_util  # pylint: disable=g-import-not-at-top
+  tree_util.register_pytree_node(
+      StaticDistributionAttributes,
+      flatten_func=lambda sda: ([], list(sda)),
+      unflatten_func=lambda attrs, _: StaticDistributionAttributes(*attrs))
+
+
 class ValueWithTrace(collections.namedtuple(
     'ValueWithTrace',
     ['value', 'traced'])):
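
For the JAX substrate, the registration above marks every field as auxiliary (static) data with no array children, so tracing passes the container through untouched. A self-contained sketch of that pattern, with hypothetical names:

from jax import tree_util

class Box:
  """A container whose fields are all static metadata, not arrays."""

  def __init__(self, label, size):
    self.label = label
    self.size = size

tree_util.register_pytree_node(
    Box,
    # No children to transform; everything rides along as aux data.
    flatten_func=lambda box: ([], (box.label, box.size)),
    unflatten_func=lambda aux, children: Box(*aux))

# tree_map finds no leaves inside, so the Box passes through unchanged.
box = tree_util.tree_map(lambda leaf: leaf * 2, Box('weights', 3))
print(box.label, box.size)  # weights 3
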
@@ -119,6 +153,22 @@ def trace_values_and_log_probs(dist, sample_shape, seed, value=None):
   return ValueWithTrace(value=value, traced=(value, lp))


+def trace_static_attributes(dist, sample_shape, seed, value):
+  """Extracts the current distribution's static attributes as Tensor specs."""
+  del sample_shape
+  if value is None:
+    value = dist.sample(seed=seed)
+  return ValueWithTrace(
+      value=value,
+      traced=StaticDistributionAttributes(
+          batch_shape=dist.batch_shape,
+          dtype=dist.dtype,
+          experimental_shard_axis_names=dist.experimental_shard_axis_names,
+          event_shape=dist.event_shape,
+          name=get_explicit_name_for_component(dist),
+          reparameterization_type=dist.reparameterization_type))
+
+
 CALLING_CONVENTION_DESCRIPTION = """
 The measure methods of `JointDistribution` (`log_prob`, `prob`, etc.)
 can be called either by passing a single structure of tensors or by using

@@ -269,6 +319,17 @@ def _get_single_sample_distributions(self, candidate_dists=None):
       self._single_sample_distributions[graph_id] = ds
     return ds

+  def _get_static_distribution_attributes(self, seed=None):
+    if not hasattr(self, '_cached_static_attributes'):
+      flat_list_of_static_attributes = callable_util.get_output_spec(
+          lambda: self._execute_model(  # pylint: disable=g-long-lambda
+              sample_and_trace_fn=trace_static_attributes,
+              seed=seed if seed is not None else samplers.zeros_seed()))
+      self._cached_static_attributes = StaticDistributionAttributes(
+          *zip(*flat_list_of_static_attributes))
+
+    return self._cached_static_attributes
+
   # Override `tf.Module`'s `_flatten` method to ensure that distributions are
   # instantiated, so that accessing `.variables` or `.trainable_variables` gives
   # consistent results.
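
The `StaticDistributionAttributes(*zip(*...))` construction above transposes a per-distribution list of attribute bundles (each iterable via `__iter__`) into per-attribute tuples, one entry per component distribution. In isolation, illustrated with two fields instead of six:

# Each traced component yields a tuple of attributes; zip(*...) regroups
# them attribute-wise across components.
per_component = [('shape_x', 'float32'), ('shape_y', 'float32')]
per_attribute = list(zip(*per_component))
print(per_attribute)  # [('shape_x', 'shape_y'), ('float32', 'float32')]
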
@@ -287,8 +348,8 @@ def _model_flatten(self, xs):
   @property
   def dtype(self):
     """The `DType` of `Tensor`s handled by this `Distribution`."""
-    return self._model_unflatten([
-        d.dtype for d in self._get_single_sample_distributions()])
+    return self._model_unflatten(
+        self._get_static_distribution_attributes().dtype)

   @property
   def reparameterization_type(self):

@@ -301,37 +362,31 @@ def reparameterization_type(self):
       reparameterization_type: `ReparameterizationType` of each distribution in
         `model`.
     """
-    return self._model_unflatten([
-        d.reparameterization_type
-        for d in self._get_single_sample_distributions()])
+    return self._model_unflatten(
+        self._get_static_distribution_attributes().reparameterization_type)

   @property
   def experimental_shard_axis_names(self):
     """Indicates whether part distributions have active shard axis names."""
-    return self._model_unflatten([
-        d.experimental_shard_axis_names
-        for d in self._get_single_sample_distributions()])
+    return self._model_unflatten(
+        self._get_static_distribution_attributes().
+        experimental_shard_axis_names)

   @property
   def use_vectorized_map(self):
     return False

   def _batch_shape(self):
-    return self._model_unflatten([
-        d.batch_shape for d in self._get_single_sample_distributions()])
+    return self._model_unflatten(
+        self._get_static_distribution_attributes().batch_shape)

   def _batch_shape_tensor(self):
     return self._model_unflatten(
         self._map_attr_over_dists('batch_shape_tensor'))

   def _event_shape(self):
-    if not hasattr(self, '_cached_event_shape'):
-      self._cached_event_shape = [
-          d.event_shape
-          for d in self._get_single_sample_distributions()]
-    # Unflattening *after* retrieving from cache prevents tf.Module from
-    # wrapping the returned value.
-    return self._model_unflatten(self._cached_event_shape)
+    return self._model_unflatten(
+        self._get_static_distribution_attributes().event_shape)

   def _event_shape_tensor(self):
     return self._model_unflatten(

@@ -363,6 +418,11 @@ def sample_distributions(self, sample_shape=(), seed=None, value=None,
       samples: a `tuple` of `Tensor`s with prepended dimensions `sample_shape`
         for each of `distribution_fn`.
     """
+    # Use the user-provided seed to trace static distribution attributes, if
+    # they're not already cached. This ensures we don't try to pass a stateless
+    # seed to a stateful sampler, or vice versa.
+    self._get_static_distribution_attributes(seed=seed)
+
     with self._name_and_control_scope(name):
       value = self._resolve_value(value=value,
                                   allow_partially_specified=True,
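
The comment about seed kinds refers to TF's two sampling regimes: stateless ops take a seed tensor of two integers, while stateful ops take a Python int (or no seed at all). A quick illustration of why the two are not interchangeable, using public APIs only:

import tensorflow as tf

# Stateless sampling: the seed must be a shape-[2] integer tensor, and the
# same seed always reproduces the same draw.
a = tf.random.stateless_normal([2], seed=[7, 42])
b = tf.random.stateless_normal([2], seed=[7, 42])
assert (a.numpy() == b.numpy()).all()

# Stateful sampling: the seed is a Python int feeding a global RNG state.
c = tf.random.normal([2], seed=42)

# Passing one kind of seed where the other is expected raises, which is why
# the trace reuses whichever kind of seed the caller supplied.
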
@@ -516,6 +576,11 @@ def _unnormalized_log_prob(self, value):
           'corresponding distribution. Default value: `None` '
           '(i.e., draw a sample from each distribution).')})
   def _sample_n(self, sample_shape, seed, value=None):
+    # Use the user-provided seed to trace static distribution attributes, if
+    # they're not already cached. This ensures we don't try to pass a stateless
+    # seed to a stateful sampler, or vice versa.
+    self._get_static_distribution_attributes(seed=seed)
+
     might_have_batch_dims = (
         distribution_util.shape_may_be_nontrivial(sample_shape)
         or value is not None)

@@ -539,6 +604,11 @@ def _sample_n(self, sample_shape, seed, value=None):

   # TODO(b/189122177): Implement _sample_and_log_prob for distributed JDs.
   def _sample_and_log_prob(self, sample_shape, seed, value=None, **kwargs):
+    # Use the user-provided seed to trace static distribution attributes, if
+    # they're not already cached. This ensures we don't try to pass a stateless
+    # seed to a stateful sampler, or vice versa.
+    self._get_static_distribution_attributes(seed=seed)
+
     xs, lps = zip(
         *self._call_execute_model(
             sample_shape,

@@ -673,8 +743,8 @@ def _flat_resolve_names(self, dummy_name='var'):
     """Resolves a name for each random variable in the model."""
     names = []
     names_used = set()
-    for dummy_idx, d in enumerate(self._get_single_sample_distributions()):
-      name = get_explicit_name_for_component(d)
+    for dummy_idx, name in enumerate(
+        self._get_static_distribution_attributes().name):
       if name is None:
         name = '{}{}'.format(dummy_name, dummy_idx)
       if name in names_used:

tensorflow_probability/python/distributions/joint_distribution_auto_batched_test.py

Lines changed: 9 additions & 11 deletions

@@ -523,19 +523,13 @@ def dist():
         [value_partial_batch_dim, num_rows, num_columns])

   def test_unit_sample_shape_avoids_vectorization(self):
-    if not tf.executing_eagerly():
-      self.skipTest('Test relies on eager execution.')
-
+    xs = []  # Collect (possibly symbolic) Tensors sampled inside the model.
     @tfd.JointDistributionCoroutineAutoBatched
     def dist():
-      # Because `pfor` operates by tracing its loop body, to ensure we're
-      # not inside of a `pfor` loop body it's sufficient to check that we're
-      # not inside of a tf.function.
-      if not tf.executing_eagerly():
-        raise ValueError('Model is running inside tf.function. This may '
-                         'indicate that auto-vectorization is being '
-                         'triggered unnecessarily.')
-      yield tfd.Normal(0., 1., name='x')
+      x = yield tfd.Normal(0., 1., name='x')
+      xs.append(x)
+
+    # Try sampling with a variety of unit sample shapes.
     self.assertEqual(
         [1],
         dist.sample(

@@ -549,6 +543,10 @@ def dist():
         dist.sample([1, 1],
                     seed=test_util.test_seed(sampler_type='seedless')).x.shape)

+    # Check that the model only ever saw the trivial sample shape.
+    for x in xs:
+      self.assertEqual(x.shape, [])
+
   def test_unit_sample_shape(self):
     @tfd.JointDistributionCoroutineAutoBatched
     def dist():

tensorflow_probability/python/distributions/joint_distribution_coroutine_test.py

Lines changed: 31 additions & 0 deletions

@@ -1395,6 +1395,37 @@ def desired_unnorm_lp(cprior, c1, c0):
         tfp.math.value_and_gradient(lp_fn, (cprior, c1, c0))[1],
         tfp.math.value_and_gradient(ulp_fn, (cprior, c1, c0))[1])

+  @test_util.numpy_disable_test_missing_functionality('symbolic tracing')
+  @test_util.jax_disable_test_missing_functionality(
+      'https://github.com/google/jax/issues/7011')
+  def test_symbolic_trace_dtype(self):
+    # A model that will definitely OOM. (1 billion squared floats).
+    @tfd.JointDistributionCoroutine
+    def model():
+      x = yield Root(tfd.MultivariateNormalDiag(
+          tf.zeros(int(1e9)), tf.ones(int(1e9)), name='x'))
+      loc = tf.einsum('i,j->ij', x, x)
+      yield tfd.Independent(
+          tfd.MultivariateNormalDiag(loc, tf.ones(int(1e9))),
+          reinterpreted_batch_ndims=1,
+          name='y')
+    self.assertEqual((tf.float32, tf.float32), model.dtype)
+
+  @test_util.numpy_disable_test_missing_functionality('symbolic tracing')
+  def test_symbolic_trace_is_cached(self):
+    model_executions = []
+
+    @tfd.JointDistributionCoroutine
+    def model():
+      x = yield Root(tfd.Normal(0., 1., name='x'))
+      y = yield tfd.Normal(x, 1., name='y')
+      model_executions.append(y)
+
+    self.assertAllEqual(((), ()), model.event_shape)
+    self.assertAllEqual(((), ()), model.batch_shape)
+    self.assertAllEqual((tf.float32, tf.float32), model.dtype)
+    self.assertAllEqual(('x', 'y'), model._flat_resolve_names())
+    self.assertLen(model_executions, 1)

 if __name__ == '__main__':
   tf.test.main()

tensorflow_probability/python/distributions/joint_distribution_sample_path_mixin.py

Lines changed: 3 additions & 2 deletions

@@ -78,8 +78,9 @@ def batch_ndims(self):
     return self._batch_ndims

   def _batch_shape_parts(self):
-    return [d.batch_shape[:self.batch_ndims]
-            for d in self._get_single_sample_distributions()]
+    return [batch_shape[:self.batch_ndims]
+            for batch_shape in self._get_static_distribution_attributes().
+            batch_shape]

   def _batch_shape(self):
     # Caching will not leak graph Tensors since this is a static attribute.

tensorflow_probability/python/distributions/joint_distribution_sequential_test.py

Lines changed: 3 additions & 2 deletions

@@ -29,6 +29,7 @@
 import tensorflow_probability as tfp

 from tensorflow_probability.python.distributions import joint_distribution_sequential
+from tensorflow_probability.python.internal import prefer_static as ps
 from tensorflow_probability.python.internal import test_util

 from tensorflow.python.util import tf_inspect  # pylint: disable=g-direct-tensorflow-import

@@ -501,6 +502,7 @@ def test_matrix_factorization(self):
     self.assertEqual(lp.shape, [7, 9])

   @test_util.jax_disable_variable_test
+  @test_util.numpy_disable_variable_test
   def test_latent_dirichlet_allocation(self):
     """Tests Latent Dirichlet Allocation joint model.

@@ -587,8 +589,7 @@ def test_poisson_switchover_graphical_model(self):
             indices=tf.cast(
                 tau[..., tf.newaxis] < tf.linspace(0., 1., n),
                 dtype=tf.int32),
-            # TODO(b/139204153): Remove static value hack after bug closed.
-            batch_dims=int(tf.get_static_value(tf.rank(tau))))
+            batch_dims=ps.rank(tau))

     alpha = tf.math.reciprocal(tf.reduce_mean(count_data))

tensorflow_probability/python/internal/backend/numpy/numpy_array.py

Lines changed: 2 additions & 4 deletions

@@ -197,11 +197,9 @@ def _linspace(start, stop, num, name=None, axis=0):  # pylint: disable=unused-argument
   if np.issubdtype(start.dtype, np.integer):
     start = start.astype(np.float64)
   stop = ops.convert_to_tensor(stop, dtype=start.dtype)
-  num = ops.convert_to_tensor(num, dtype_hint=np.int32)
-  if not np.issubdtype(num.dtype, np.integer):
+  if not np.issubdtype(np.array(num).dtype, np.integer):
     raise TypeError('`num` must be an integer but got {}'.format(num.dtype))
-  num = num.astype(np.int32)
-  return np.linspace(start, stop, num, axis=axis).astype(start.dtype)
+  return np.linspace(start, stop, int(num), axis=axis).astype(start.dtype)


 def _one_hot(  # pylint: disable=unused-argument

tensorflow_probability/python/internal/backend/numpy/random_generators.py

Lines changed: 3 additions & 8 deletions

@@ -19,7 +19,6 @@
 from __future__ import print_function

 import numpy as np
-import numpy as onp  # Avoids JAX rewrite.  # pylint: disable=reimported

 from tensorflow_probability.python.internal.backend.numpy import _utils as utils
 from tensorflow_probability.python.internal.backend.numpy import ops

@@ -64,14 +63,10 @@ def _ensure_shape_tuple(t):


 def _bcast_shape(base_shape, args):
-  base_shape = _ensure_shape_tuple(base_shape)
-  if not args:
-    return base_shape
-  bc_arr = onp.zeros(base_shape + (0,))
+  bcast_shape = _ensure_shape_tuple(base_shape)
   for arg in args:
-    if arg is not None:
-      bc_arr = bc_arr + onp.zeros(np.asarray(arg).shape + (0,))
-  return bc_arr.shape[:-1]
+    bcast_shape = ops.broadcast_shape(bcast_shape, np.asarray(arg).shape)
+  return bcast_shape


 def _binomial(shape, seed, counts, probs, output_dtype=np.int32, name=None):  # pylint: disable=unused-argument
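
The rewrite replaces the zeros-array trick (materializing empty arrays just to let NumPy compute a broadcast shape) with a direct shape-level reduction via the backend's `ops.broadcast_shape`. Plain NumPy has an equivalent, shown here as a point of comparison:

import numpy as np

# Broadcast several shapes without creating any arrays.
print(np.broadcast_shapes((5, 1), (4,), (1, 1)))  # (5, 4)

# np.asarray(None).shape is (), which broadcasts as a no-op -- this is why
# the new loop no longer needs the explicit `arg is not None` guard.
print(np.asarray(None).shape)  # ()
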

tensorflow_probability/python/internal/backend/numpy/tensor_spec.py

Lines changed: 6 additions & 3 deletions

@@ -21,6 +21,9 @@

 class TensorSpec(object):

-  def __init__(self, *args, **kwargs):
-    del args, kwargs
-    self.dtype = None
+  def __init__(self, shape, dtype):
+    self.shape = shape
+    self.dtype = dtype
+
+  def __repr__(self):
+    return f'TensorSpec(shape={self.shape}, dtype={self.dtype})'
