Commit 0df33ac

davmre authored and tensorflower-gardener committed
Use batch shape annotations to ensure that autodiff computes correct bijector LDJs.
PiperOrigin-RevId: 375770777
1 parent 193fc50 commit 0df33ac

4 files changed: +77 −20 lines changed

tensorflow_probability/python/bijectors/bijector.py

Lines changed: 36 additions & 2 deletions

@@ -1415,7 +1415,23 @@ def _call_inverse_log_det_jacobian(self, y, event_ndims, name, **kwargs):
         x = self.inverse(y, **kwargs)  # Fall back to computing `-fldj(x)`
         ildj = attrs['ildj'] = -self._forward_log_det_jacobian(x, **kwargs)
       elif self._is_scalar:
-        ildj = _autodiff_log_det_jacobian(self._inverse, y)
+        try:
+          scalar_batch_shape = self.experimental_batch_shape_tensor(
+              y_event_ndims=0)
+        except NotImplementedError:
+          raise NotImplementedError(
+              'Cannot derive `inverse_log_det_jacobian` using automatic '
+              'differentiation because its shape could not be determined. '
+              'Please implement at least one of:\n'
+              '`{bijector_type}._parameter_properties`\n'
+              '`{bijector_type}._batch_shape_tensor`\n'
+              '`{bijector_type}._forward_log_det_jacobian`\n '
+              '`{bijector_type}._inverse_log_det_jacobian`.'.format(
+                  bijector_type=type(self).__name__))
+        ildj = _autodiff_log_det_jacobian(
+            self.inverse,
+            tf.broadcast_to(y, ps.broadcast_shape(ps.shape(y),
+                                                  scalar_batch_shape)))
       else:
         raise NotImplementedError(
             'Neither _forward_log_det_jacobian nor _inverse_log_det_jacobian '
@@ -1524,7 +1540,23 @@ def _call_forward_log_det_jacobian(self, x, event_ndims, name, **kwargs):
         y = self.forward(x, **kwargs)  # Fall back to computing `ildj(y)`
         ildj = attrs['ildj'] = self._inverse_log_det_jacobian(y, **kwargs)
       elif self._is_scalar:
-        ildj = -_autodiff_log_det_jacobian(self._forward, x)
+        try:
+          scalar_batch_shape = self.experimental_batch_shape_tensor(
+              x_event_ndims=0)
+        except NotImplementedError:
+          raise NotImplementedError(
+              'Cannot derive `forward_log_det_jacobian` using automatic '
+              'differentiation because its shape could not be determined. '
+              'Please implement at least one of:\n'
+              '`{bijector_type}._parameter_properties`\n'
+              '`{bijector_type}._batch_shape_tensor`\n'
+              '`{bijector_type}._forward_log_det_jacobian`\n '
+              '`{bijector_type}._inverse_log_det_jacobian`.'.format(
+                  bijector_type=type(self).__name__))
+        ildj = -_autodiff_log_det_jacobian(
+            self.forward,
+            tf.broadcast_to(x, ps.broadcast_shape(ps.shape(x),
+                                                  scalar_batch_shape)))
       else:
         raise NotImplementedError(
             'Neither _forward_log_det_jacobian nor _inverse_log_det_jacobian '
@@ -2111,6 +2143,8 @@ def ldj_reduction_shape(shape_structure,

 def _autodiff_log_det_jacobian(fn, x):
   """Automatically compute the log det jacobian of a scalar function."""
+  # Note: x must be fully broadcast (`shape(x) == shape(fn(x))`); otherwise
+  # the gradients will be (incorrectly) summed.
   _, grads = gradient.value_and_gradient(fn, x)
   if grads is None:
     raise ValueError('Cannot compute log det jacobian; function {} has `None` '
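To illustrate the note added to `_autodiff_log_det_jacobian`, here is a small standalone sketch (not part of the commit, and the Scale-like `fn` below is made up): if the input is not broadcast to the bijector's full batch shape before differentiating, reverse-mode autodiff sums the per-batch-member derivatives and produces a single, incorrect log-det-Jacobian.

# Illustrative sketch only; `scale` and `fn` are hypothetical stand-ins.
import tensorflow as tf

scale = tf.constant([1., 2., 3.])   # a batch of three scalar bijectors
fn = lambda x: scale * x            # forward fn with batch shape [3]

x = tf.constant(2.)
with tf.GradientTape() as tape:
  tape.watch(x)
  y = fn(x)                         # shape [3]
# Gradient w.r.t. the scalar input sums over the batch: 1. + 2. + 3. = 6.
wrong_fldj = tf.math.log(tf.abs(tape.gradient(y, x)))        # log(6.)

x_b = tf.broadcast_to(x, tf.shape(scale))                     # shape [3]
with tf.GradientTape() as tape:
  tape.watch(x_b)
  y_b = fn(x_b)
# With the broadcast input, each batch member keeps its own derivative.
correct_fldj = tf.math.log(tf.abs(tape.gradient(y_b, x_b)))   # [0., log 2., log 3.]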

tensorflow_probability/python/bijectors/bijector_test.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from tensorflow_probability.python import bijectors as tfb
3030
from tensorflow_probability.python.bijectors import bijector as bijector_lib
3131
from tensorflow_probability.python.internal import cache_util
32+
from tensorflow_probability.python.internal import parameter_properties
3233
from tensorflow_probability.python.internal import prefer_static as ps
3334
from tensorflow_probability.python.internal import tensor_util
3435
from tensorflow_probability.python.internal import test_util
@@ -84,12 +85,12 @@ def __init__(self):
8485

8586
with self.assertRaisesRegexp(
8687
NotImplementedError,
87-
'inverse not implemented'):
88+
'Cannot derive `inverse_log_det_jacobian`'):
8889
bij.inverse_log_det_jacobian(0, event_ndims=0)
8990

9091
with self.assertRaisesRegexp(
9192
NotImplementedError,
92-
'forward not implemented'):
93+
'Cannot derive `forward_log_det_jacobian`'):
9394
bij.forward_log_det_jacobian(0, event_ndims=0)
9495

9596
@test_util.disable_test_for_backend(
@@ -128,8 +129,11 @@ def _forward(self, x):
128129
error_clazz, 'Tensor conversion requested dtype'):
129130
b64.forward(x32)
130131

132+
@parameterized.named_parameters(
133+
('no_batch_shape', 1.4),
134+
('with_batch_shape', [[[2., 3.], [5., 7.]]]))
131135
@test_util.numpy_disable_gradient_test
132-
def testAutodiffLogDetJacobian(self):
136+
def testAutodiffLogDetJacobian(self, bijector_scale):
133137

134138
class NoJacobianBijector(tfb.Bijector):
135139
"""Bijector with no log det jacobian methods."""
@@ -148,7 +152,12 @@ def _forward(self, x):
148152
def _inverse(self, y):
149153
return tf.math.log(y) / self._scale
150154

151-
b = NoJacobianBijector(scale=1.4)
155+
@classmethod
156+
def _parameter_properties(cls, dtype, num_classes=None):
157+
return dict(
158+
scale=parameter_properties.ParameterProperties(event_ndims=0))
159+
160+
b = NoJacobianBijector(scale=bijector_scale)
152161
x = tf.convert_to_tensor([2., -3.])
153162
[
154163
fldj,
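For readers skimming the diff, here is a self-contained sketch of what the parameterized test now exercises. The bijector's `__init__` below is a plausible reconstruction (the diff omits it), and the printed values are our own illustration, not the test's assertions.

import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability.python.internal import parameter_properties

tfb = tfp.bijectors


class NoJacobianBijector(tfb.Bijector):
  """Bijector with no log det jacobian methods (mirrors the test)."""

  def __init__(self, scale=1., validate_args=True):
    parameters = dict(locals())
    self._scale = tf.convert_to_tensor(scale)
    super().__init__(
        forward_min_event_ndims=0,
        validate_args=validate_args,
        parameters=parameters)

  def _forward(self, x):
    return tf.exp(self._scale * x)

  def _inverse(self, y):
    return tf.math.log(y) / self._scale

  @classmethod
  def _parameter_properties(cls, dtype, num_classes=None):
    return dict(
        scale=parameter_properties.ParameterProperties(event_ndims=0))


# With `event_ndims=0` annotated on `scale`, the batch shape is inferred from
# the parameter's shape, so the autodiff LDJ fallback can broadcast its input.
b = NoJacobianBijector(scale=[[[2., 3.], [5., 7.]]])
print(b.experimental_batch_shape_tensor(x_event_ndims=0))   # [1 2 2]
fldj = b.forward_log_det_jacobian(tf.constant([2., -3.]), event_ndims=0)
print(fldj.shape)   # (1, 2, 2): one log-det-Jacobian per batch member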

tensorflow_probability/python/experimental/bijectors/distribution_bijectors_test.py

Lines changed: 0 additions & 2 deletions

@@ -97,8 +97,6 @@ def test_all_distributions_either_work_or_raise_error(self, dist_name, data):

     dist = data.draw(dhps.base_distributions(
         dist_name=dist_name,
-        # TODO(b/175354524) fix autodiff for batch LDJs and enable batch tests.
-        batch_shape=[],
         enable_vars=False,
         param_strategy_fn=_constrained_zeros_fn))
     try:

tensorflow_probability/python/experimental/bijectors/scalar_function_with_inferred_inverse.py

Lines changed: 28 additions & 12 deletions

@@ -19,8 +19,10 @@

 from tensorflow_probability.python import math as tfp_math
 from tensorflow_probability.python.bijectors import bijector
+from tensorflow_probability.python.internal import callable_util
 from tensorflow_probability.python.internal import custom_gradient as tfp_custom_gradient
 from tensorflow_probability.python.internal import prefer_static as ps
+from tensorflow_probability.python.internal import tensorshape_util

 __all__ = ['ScalarFunctionWithInferredInverse']

@@ -35,6 +37,7 @@ def __init__(self,
                max_iterations=50,
                require_convergence=True,
                additional_scalar_parameters_requiring_gradients=(),
+               dtype=None,
                validate_args=False,
                name='scalar_function_with_inferred_inverse'):
     """Initialize the ScalarFunctionWithInferredInverse bijector.
@@ -72,6 +75,9 @@ def __init__(self,
         anything in the closure of `fn`) will not, in general, receive
         gradients.
         Default value: `()`.
+      dtype: `tf.dtype` supported by this `Bijector`. `None` means dtype is not
+        enforced.
+        Default value: `None`.
       validate_args: Python `bool` indicating whether arguments should be
         checked for correctness.
       name: Python `str` name given to ops managed by this object.
@@ -91,14 +97,14 @@ def __init__(self,
       # VJPs and JVPs can be computed efficiently using actual matrix ops.
       self._additional_scalar_parameters_requiring_gradients = (
          additional_scalar_parameters_requiring_gradients)
-      self._cached_fn_batch_shape = None

       self._bound_fn = (
           lambda x: fn(x, *additional_scalar_parameters_requiring_gradients))
       self._inverse = self._wrap_inverse_with_implicit_gradient()

       super(ScalarFunctionWithInferredInverse, self).__init__(
           parameters=parameters,
+          dtype=dtype,
           forward_min_event_ndims=0,
           inverse_min_event_ndims=0,
           validate_args=validate_args,
@@ -129,15 +135,25 @@ def bound_fn(self):
     """Forward `fn` with any extra args bound, so that `y = bound_fn(x)`."""
     return self._bound_fn

-  def _fn_batch_shape(self):
-    if self._cached_fn_batch_shape is None:
-      # Evaluating at a scalar value (0.) exposes the function's batch shape.
-      # For example, evaluating
-      #   `fn = lambda x: x * constant([1., 2., 3.])`
-      # returns a result of shape `[3]`.
-      self._cached_fn_batch_shape = ps.shape(
-          self.bound_fn(self.domain_constraint_fn(0.)))  # pylint: disable=not-callable
-    return self._cached_fn_batch_shape
+  def _batch_shape(self, x_event_ndims):
+    try:
+      # Trace the function to extract its batch shape without executing it.
+      fn_shape = callable_util.get_output_spec(
+          lambda x: self.bound_fn(self.domain_constraint_fn(x)),  # pylint: disable=not-callable
+          tf.TensorSpec([], dtype=self.dtype if self.dtype else tf.float32)
+      ).shape
+    except TypeError:  # `dtype` wasn't specified.
+      return tf.TensorShape(None)
+
+    fn_rank = tensorshape_util.rank(fn_shape)
+    if fn_rank is not None:
+      return fn_shape[:fn_rank - x_event_ndims]
+    return fn_shape
+
+  def _batch_shape_tensor(self, x_event_ndims):
+    fn_shape = ps.shape(
+        self.bound_fn(self.domain_constraint_fn(0.)))  # pylint: disable=not-callable
+    return fn_shape[:ps.rank_from_shape(fn_shape) - x_event_ndims]

   def _forward(self, x):
     return self.bound_fn(x)
@@ -220,8 +236,8 @@ def _arg_broadcasting_wrapped_inverse(y):
       # TODO(davmre): Do gradient reductions directly in the VJP using
       # `tf.raw_ops.BroadcastGradientArgs` so we can remove this wrapper
       # and avoid spurious broadcasting.
-      full_batch_shape = ps.broadcast_shape(self._fn_batch_shape(),
-                                            ps.shape(y))
+      full_batch_shape = ps.broadcast_shape(
+          self.experimental_batch_shape_tensor(), ps.shape(y))
       args = [tf.broadcast_to(arg, full_batch_shape) for arg in args]
       return _inverse_with_gradient(y, *args)
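As a rough public-API analogue of what `callable_util.get_output_spec` does in the new `_batch_shape` method (an illustration only, not the TFP internal; `scales` and `fn` below are made up): tracing the function at a scalar `tf.TensorSpec` exposes its output shape, and hence its batch shape, without actually running it.

import tensorflow as tf

scales = tf.constant([1., 2., 3.])
fn = lambda x: scales * x   # a batch of three scalar functions

# Tracing at a scalar spec (no concrete value) reveals the output shape [3].
concrete = tf.function(fn).get_concrete_function(tf.TensorSpec([], tf.float32))
print(concrete.output_shapes)   # TensorShape([3])

# The dynamic `_batch_shape_tensor` path instead evaluates the function at a
# concrete scalar (0.) and takes the shape of the result.
print(tf.shape(fn(0.)))         # [3]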
