37 | 37 |     'adaptive_hamiltonian_monte_carlo_init',
38 | 38 |     'adaptive_hamiltonian_monte_carlo_step',
39 | 39 |     'AdaptiveHamiltonianMonteCarloState',
   | 40 | +    'hamiltonian_monte_carlo_with_state_grads_step',
   | 41 | +    'HamiltonianMonteCarloWithStateGradsExtra',
40 | 42 |     'interactive_trace',
41 | 43 |     'step_size_adaptation_init',
42 | 44 |     'step_size_adaptation_step',
@@ -457,8 +459,8 @@ def interactive_trace(
457 | 459 |     iteration_axis: Integer. Indicates the axis of the trace outputs that should
458 | 460 |       be flattened with the first axis. This is most useful when `fn` is
459 | 461 |       `trace`. E.g. if the trace has shape `[num_steps, 2, 5]` and
460 |     | -     `iteration_axis=2`, the trace outputs will be reshaped/transposed to
461 |     | -     `[2, 5 * num_steps]`. A value of 0 disables this operation.
    | 462 | +     `iteration_axis=2`, the trace outputs will be reshaped/transposed to `[2,
    | 463 | +     5 * num_steps]`. A value of 0 disables this operation.
462 | 464 |     block_until_ready: Whether to wait for the computation to finish between
463 | 465 |       steps. This results in smoother progress bars under, e.g., JAX.
464 | 466 |     progress_bar_fn: A callable that will be called with an iterable with length
@@ -504,13 +506,15 @@ def fn_with_progress(state):
504 | 506 |   )
505 | 507 |
506 | 508 |   if iteration_axis != 0:
    | 509 | +
507 | 510 |     def fix_part(x):
508 | 511 |       x = util.move_axis(x, 0, iteration_axis - 1)
509 | 512 |       x = tf.reshape(
510 | 513 |           x,
511 | 514 |           tuple(x.shape[:iteration_axis - 1]) + (-1,) +
512 | 515 |           tuple(x.shape[iteration_axis + 1:]))
513 | 516 |       return x
    | 517 | +
514 | 518 |     trace = util.map_tree(fix_part, trace)
515 | 519 |   return state, trace
516 | 520 |
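Not part of the diff: a minimal NumPy sketch of what `fix_part` does to a single trace array, assuming `util.move_axis` behaves like `np.moveaxis`, using the `[num_steps, 2, 5]`, `iteration_axis=2` example from the docstring above.

```python
import numpy as np

num_steps, iteration_axis = 3, 2
trace = np.arange(num_steps * 2 * 5).reshape(num_steps, 2, 5)

# Move the leading num_steps axis to just before the iteration axis:
# [num_steps, 2, 5] -> [2, num_steps, 5].
x = np.moveaxis(trace, 0, iteration_axis - 1)

# Flatten num_steps into the iteration axis:
# [2, num_steps, 5] -> [2, num_steps * 5].
new_shape = x.shape[:iteration_axis - 1] + (-1,) + x.shape[iteration_axis + 1:]
x = x.reshape(new_shape)

print(x.shape)  # (2, 15), i.e. [2, 5 * num_steps]
```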
@@ -649,3 +653,120 @@ def step_size_adaptation_step(
649 | 653 |       opt_state=opt_state, rms_state=rms_state, step=state.step + 1)
650 | 654 |   extra = StepSizeAdaptationExtra(opt_extra=opt_extra, accept_prob=accept_prob)
651 | 655 |   return state, extra
    | 656 | +
    | 657 | +
    | 658 | +class HamiltonianMonteCarloWithStateGradsExtra(NamedTuple):
    | 659 | +  """Extra outputs for `hamiltonian_monte_carlo_with_state_grads_step`."""
    | 660 | +  hmc_extra: 'fun_mc.HamiltonianMonteCarloExtra'
    | 661 | +  num_integrator_steps: 'fun_mc.IntTensor'
    | 662 | +  proposed_state: 'fun_mc.State'
    | 663 | +
    | 664 | +
    | 665 | +def hamiltonian_monte_carlo_with_state_grads_step(
    | 666 | +    hmc_state: 'fun_mc.HamiltonianMonteCarloState',
    | 667 | +    trajectory_length: 'fun_mc.FloatTensor',
    | 668 | +    scalar_step_size: 'fun_mc.FloatTensor',
    | 669 | +    step_size_scale: 'fun_mc.FloatNest' = 1.,
    | 670 | +    shard_axis_names: 'fun_mc.StringNest' = (),
    | 671 | +    **hmc_kwargs
    | 672 | +) -> ('Tuple[fun_mc.HamiltonianMonteCarloState, '
    | 673 | +      'HamiltonianMonteCarloWithStateGradsExtra]'):
    | 674 | +  """Hamiltonian Monte Carlo (HMC) step with gradients for the proposed state.
    | 675 | +
    | 676 | +  This acts like `fun_mc.hamiltonian_monte_carlo_step`, where
    | 677 | +  `num_integrator_steps` is defined as `ceil(trajectory_length /
    | 678 | +  scalar_step_size)` and `step_size` is defined as `scalar_step_size *
    | 679 | +  step_size_scale`. The main feature of this function is that it propagates
    | 680 | +  the gradients from `hmc_with_state_grads_extra.proposed_state` to
    | 681 | +  `trajectory_length` (these are the only gradients propagated at the moment).
    | 682 | +  This feature can be used for gradient-based optimization of
    | 683 | +  `trajectory_length` based on criteria that depend on the `proposed_state`
    | 684 | +  (e.g. [1]).
    | 685 | +
    | 686 | +  This function supports SPMD via sharded states, in the same sense as
    | 687 | +  TensorFlow Probability's `tfp.experimental.distribute.Sharded`. Certain
    | 688 | +  state tensors can be annotated as having different values on different
    | 689 | +  devices, with cross-device reductions being inserted accordingly.
    | 690 | +
    | 691 | +  Args:
    | 692 | +    hmc_state: `fun_mc.HamiltonianMonteCarloState`.
    | 693 | +    trajectory_length: Trajectory length used by HMC.
    | 694 | +    scalar_step_size: Scalar step size (used to compute the number of leapfrog
    | 695 | +      steps).
    | 696 | +    step_size_scale: Step size scale, a structure broadcastable to
    | 697 | +      `hmc_state.state`.
    | 698 | +    shard_axis_names: Shard axis names, used for SPMD.
    | 699 | +    **hmc_kwargs: Passed to `fun_mc.hamiltonian_monte_carlo_step`.
    | 700 | +
    | 701 | +  Returns:
    | 702 | +    hmc_state: `fun_mc.HamiltonianMonteCarloState`.
    | 703 | +    hmc_with_grads_extra: Extra outputs.
    | 704 | +
    | 705 | +  #### References
    | 706 | +
    | 707 | +  [1]: Hoffman, M., Radul, A., & Sountsov, P. (2021). An Adaptive MCMC Scheme
    | 708 | +    for Setting Trajectory Lengths in Hamiltonian Monte Carlo.
    | 709 | +    http://proceedings.mlr.press/v130/hoffman21a.html
    | 710 | +  """
    | 711 | +
    | 712 | +  @tf.custom_gradient
    | 713 | +  def hmc(trajectory_length):
    | 714 | +    trajectory_length = tf.convert_to_tensor(trajectory_length)
    | 715 | +    num_integrator_steps = tf.cast(
    | 716 | +        tf.math.ceil(trajectory_length / scalar_step_size), tf.int32)
    | 717 | +    # Ensure at least one step, in case trajectory_length is zero or negative.
    | 718 | +    num_integrator_steps = tf.maximum(1, num_integrator_steps)
    | 719 | +    new_hmc_state, hmc_extra = fun_mc.hamiltonian_monte_carlo_step(
    | 720 | +        hmc_state,
    | 721 | +        num_integrator_steps=num_integrator_steps,
    | 722 | +        step_size=util.map_tree(lambda s: s * scalar_step_size,
    | 723 | +                                step_size_scale),
    | 724 | +        **hmc_kwargs)
    | 725 | +    hmc_with_grads_extra = HamiltonianMonteCarloWithStateGradsExtra(
    | 726 | +        proposed_state=hmc_extra.proposed_hmc_state.state,
    | 727 | +        hmc_extra=hmc_extra,
    | 728 | +        num_integrator_steps=num_integrator_steps)
    | 729 | +    res = (new_hmc_state, hmc_with_grads_extra)
    | 730 | +
    | 731 | +    def grad(*grads):
    | 732 | +      grads = util.unflatten_tree(res, util.flatten_tree(grads))
    | 733 | +
    | 734 | +      step_size_scale_bc = fun_mc.maybe_broadcast_structure(
    | 735 | +          step_size_scale, hmc_extra.integrator_extra.momentum_grads)
    | 736 | +
    | 737 | +      # We wish to compute `grads^T @
    | 738 | +      # jacobian(proposed_state(trajectory_length))`.
    | 739 | +      #
    | 740 | +      # The Jacobian is known from Hamilton's equations:
    | 741 | +      #
    | 742 | +      # dx / dt = dK(v) / dv
    | 743 | +      #
    | 744 | +      # where `x` is the state, `v` is the momentum and `K` is the kinetic
    | 745 | +      # energy. Since `step_size_scale` rescales the momentum, the right
    | 746 | +      # hand side of that expression is `momentum_grads * step_size_scale`
    | 747 | +      # by the chain rule. Since the Jacobian in question has 1 row, the
    | 748 | +      # vector-Jacobian product is simply the dot product.
    | 749 | +      state_grads = util.map_tree(lambda s, m, g: s * m * g, step_size_scale_bc,
    | 750 | +                                  hmc_extra.integrator_extra.momentum_grads,
    | 751 | +                                  grads[1].proposed_state)
    | 752 | +
    | 753 | +      def do_sum(x, shard_axis_names):
    | 754 | +        res = tf.reduce_sum(
    | 755 | +            x, list(range(len(trajectory_length.shape), len(x.shape))))
    | 756 | +        if shard_axis_names:
    | 757 | +          res = backend.distribute_lib.psum(res, shard_axis_names)
    | 758 | +        return res
    | 759 | +
    | 760 | +      if shard_axis_names:
    | 761 | +        shard_axis_names_bc = shard_axis_names
    | 762 | +      else:
    | 763 | +        shard_axis_names_bc = util.map_tree(lambda _: [], state_grads)
    | 764 | +
    | 765 | +      return sum(
    | 766 | +          util.flatten_tree(
    | 767 | +              util.map_tree_up_to(state_grads, do_sum, state_grads,
    | 768 | +                                  shard_axis_names_bc)))
    | 769 | +
    | 770 | +    return res, grad
    | 771 | +
    | 772 | +  return hmc(trajectory_length)
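Also not part of the diff: a hypothetical usage sketch. It assumes the new step lands in `fun_mc.prefab`, that the TensorFlow backend is imported as `from fun_mc import using_tensorflow`, and that a `[2]`-shaped int32 stateless seed is acceptable; the standard-normal target and squared-norm criterion are illustrative stand-ins (the criterion from [1] would be ChEES).

```python
import tensorflow as tf
from fun_mc import using_tensorflow as fun_mc  # Assumed import path.

prefab = fun_mc.prefab  # Assumes the new step lives in `fun_mc.prefab`.


def target_log_prob_fn(x):
  # Standard normal target; fun_mc's convention is (log_prob, extra).
  return -0.5 * tf.reduce_sum(tf.square(x), -1), ()


hmc_state = fun_mc.hamiltonian_monte_carlo_init(
    tf.zeros([16, 2]), target_log_prob_fn)
trajectory_length = tf.constant(1.)

with tf.GradientTape() as tape:
  tape.watch(trajectory_length)
  hmc_state, hmc_with_grads_extra = (
      prefab.hamiltonian_monte_carlo_with_state_grads_step(
          hmc_state,
          trajectory_length=trajectory_length,
          scalar_step_size=0.1,
          target_log_prob_fn=target_log_prob_fn,
          seed=tf.constant([0, 1], tf.int32)))  # Assumed seed convention.
  # Illustrative criterion: mean squared norm of the proposed state.
  loss = -tf.reduce_mean(
      tf.reduce_sum(tf.square(hmc_with_grads_extra.proposed_state), -1))

# The custom gradient routes d(loss)/d(proposed_state) back to
# trajectory_length, so this can drive a trajectory-length optimizer.
d_traj = tape.gradient(loss, trajectory_length)
```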