 """Gradient-based trajectory length adaptation kernel."""
 
 import collections
+import functools
 
 import tensorflow.compat.v2 as tf
 
@@ -89,6 +90,16 @@ def _map_structure_up_to_with_axes(structure, fn, *args,
       experimental_shard_axis_names)
 
 
+def _reduce_with_axes(index_op, name_op, x, axis_idx=None, axis_names=None):
+  return name_op(index_op(x, axis_idx), axis_names)
+
+
+_reduce_sum_with_axes = functools.partial(_reduce_with_axes, tf.reduce_sum,
+                                          distribute_lib.psum)
+_reduce_mean_with_axes = functools.partial(_reduce_with_axes, tf.reduce_mean,
+                                           distribute_lib.pmean)
+
+
 def hmc_like_num_leapfrog_steps_getter_fn(kernel_results):
   """Getter for `num_leapfrog_steps` so it can be inspected."""
   return unnest.get_innermost(kernel_results, 'num_leapfrog_steps')
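For readers outside TFP internals: the helper composes a local reduction over positional axes (`tf.reduce_sum`/`tf.reduce_mean`) with a named-axis reduction across devices (`distribute_lib.psum`/`distribute_lib.pmean`). A minimal standalone sketch of the same composition, with a hypothetical `fake_psum` standing in for `distribute_lib.psum`:

```python
import functools

import tensorflow.compat.v2 as tf


def fake_psum(x, axis_names=None):
  # Stand-in for `distribute_lib.psum`, which sums across the named, mapped
  # axes of a distributed computation; with no names it is the identity.
  del axis_names  # unused in this single-device sketch
  return x


def _reduce_with_axes(index_op, name_op, x, axis_idx=None, axis_names=None):
  # First reduce over local (positional) axes, then over named device axes.
  return name_op(index_op(x, axis_idx), axis_names)


reduce_sum_with_axes = functools.partial(_reduce_with_axes, tf.reduce_sum,
                                         fake_psum)

x = tf.constant([[1., 2.], [3., 4.]])
print(reduce_sum_with_axes(x, axis_idx=[0, 1]).numpy())  # 10.0
```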
@@ -132,7 +143,8 @@ def chees_criterion(previous_state,
                     proposed_state,
                     accept_prob,
                     validate_args=False,
-                    experimental_shard_axis_names=None):
+                    experimental_shard_axis_names=None,
+                    experimental_chain_axis_names=None):
   """The ChEES criterion from [1].
 
   ChEES stands for Change in the Estimator of the Expected Square.
@@ -166,6 +178,8 @@ def chees_criterion(previous_state,
     validate_args: Whether to perform non-static argument validation.
     experimental_shard_axis_names: A structure of string names indicating how
       members of the state are sharded.
+    experimental_chain_axis_names: A string or list of string names indicating
+      how batches of chains are sharded.
 
   Returns:
     chees: The value of the ChEES criterion.
@@ -182,7 +196,13 @@ def chees_criterion(previous_state,
   """
   batch_ndims = ps.rank(accept_prob)
   batch_axes = ps.range(batch_ndims, dtype=tf.int32)
-  num_chains = ps.size(accept_prob)
+  experimental_chain_axis_names = distribute_lib.canonicalize_axis_name(
+      experimental_chain_axis_names)
+  # Number of total chains is local batch size * distributed axis size.
+  local_axis_size = ps.maximum(ps.size(accept_prob), 1)
+  distributed_axis_size = int(ps.reduce_prod([
+      distribute_lib.get_axis_size(a) for a in experimental_chain_axis_names]))
+  num_chains = local_axis_size * distributed_axis_size
   num_chains_ = tf.get_static_value(num_chains)
   if num_chains_ is not None:
     if num_chains_ < 2:
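The chain count above generalizes `ps.size(accept_prob)` to the distributed setting: the total is the local batch size times the product of the named chain axis sizes. A toy check with made-up sizes:

```python
# Toy check of the chain-count arithmetic (all numbers are made up):
# 4 chains live on each device, and the chains are additionally spread
# across two named axes of sizes 2 and 3.
local_axis_size = 4
distributed_axis_sizes = [2, 3]

num_chains = local_axis_size
for axis_size in distributed_axis_sizes:
  num_chains *= axis_size

print(num_chains)  # 24 chains in total across all devices
```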
@@ -199,7 +219,9 @@ def _center_previous_state(x):
   def _center_previous_state(x):
     # The empirical mean here is a stand-in for the true mean, so we drop the
     # gradient that flows through this term.
-    return x - tf.stop_gradient(tf.reduce_mean(x, axis=batch_axes))
+    x_mean = _reduce_mean_with_axes(
+        x, batch_axes, experimental_chain_axis_names)
+    return x - tf.stop_gradient(x_mean)
 
   def _center_proposed_state(x):
     # The empirical mean here is a stand-in for the true mean, so we drop the
@@ -216,8 +238,10 @@ def _center_proposed_state(x):
     # If all accept_prob's are zero, the x_center will have a nonsense value,
     # but we'll discard the resultant gradients later on, so it's fine.
     x_center = (
-        tf.reduce_sum(expanded_accept_prob * x_safe, axis=batch_axes) /
-        (tf.reduce_sum(expanded_accept_prob, axis=batch_axes) + 1e-20))
+        _reduce_sum_with_axes(expanded_accept_prob * x_safe, batch_axes,
+                              experimental_chain_axis_names) /
+        (_reduce_sum_with_axes(expanded_accept_prob, batch_axes,
+                               experimental_chain_axis_names) + 1e-20))
 
     return x - tf.stop_gradient(x_center)
 
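The centered proposed state uses an acceptance-probability-weighted mean over all chains (local batch axes plus any named chain axes). A toy NumPy check of the local part, with made-up numbers:

```python
import numpy as np

# Chains with zero acceptance probability contribute nothing to the center,
# and the 1e-20 guard keeps the division finite even if all weights are zero.
accept_prob = np.array([0.9, 0.0, 0.3])
x = np.array([1.0, 100.0, 2.0])
x_center = np.sum(accept_prob * x) / (np.sum(accept_prob) + 1e-20)
print(x_center)  # 1.25; the x=100 chain is ignored since its weight is 0
```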
@@ -358,6 +382,7 @@ def __init__(
       proposed_state_getter_fn=hmc_like_proposed_state_getter_fn,
       validate_args=False,
       experimental_shard_axis_names=None,
+      experimental_chain_axis_names=None,
       name=None):
     """Creates the trajectory length adaptation kernel.
 
@@ -414,6 +439,8 @@ def __init__(
         outputs.
       experimental_shard_axis_names: A structure of string names indicating how
         members of the state are sharded.
+      experimental_chain_axis_names: A string or list of string names indicating
+        how batches of chains are sharded.
       name: Python `str` name prefixed to Ops created by this class. Default:
         'simple_step_size_adaptation'.
 
@@ -452,6 +479,7 @@ class docstring).
         proposed_state_getter_fn=hmc_like_proposed_state_getter_fn,
         validate_args=validate_args,
         experimental_shard_axis_names=experimental_shard_axis_names,
+        experimental_chain_axis_names=experimental_chain_axis_names,
         name=name,
     )
 
@@ -468,12 +496,15 @@ def num_adaptation_steps(self):
     return self._parameters['num_adaptation_steps']
 
   def criterion_fn(self, previous_state, proposed_state, accept_prob):
-    if self.experimental_shard_axis_names is None:
-      return self._parameters['criterion_fn'](previous_state, proposed_state,
-                                              accept_prob)
-    return self._parameters['criterion_fn'](
-        previous_state, proposed_state, accept_prob,
-        experimental_shard_axis_names=self.experimental_shard_axis_names)
+    kwargs = {}
+    if self.experimental_chain_axis_names is not None:
+      kwargs['experimental_chain_axis_names'] = (
+          self.experimental_chain_axis_names)
+    if self.experimental_shard_axis_names is not None:
+      kwargs['experimental_shard_axis_names'] = (
+          self.experimental_shard_axis_names)
+    return self._parameters['criterion_fn'](previous_state, proposed_state,
+                                            accept_prob, **kwargs)
 
   @property
   def max_leapfrog_steps(self):
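The kwargs indirection above keeps `criterion_fn` backward compatible: a user-supplied criterion that accepts neither experimental argument still works, because nothing extra is passed unless the corresponding parameter was set. A hypothetical sketch of the same dispatch outside the class:

```python
# `legacy_criterion` and the stand-in parameter values are illustrative,
# not part of the library.
def legacy_criterion(previous_state, proposed_state, accept_prob):
  return accept_prob  # knows nothing about the experimental kwargs

chain_axis_names = None  # stand-in for self.experimental_chain_axis_names
shard_axis_names = None  # stand-in for self.experimental_shard_axis_names

kwargs = {}
if chain_axis_names is not None:
  kwargs['experimental_chain_axis_names'] = chain_axis_names
if shard_axis_names is not None:
  kwargs['experimental_shard_axis_names'] = shard_axis_names

# With both names unset, kwargs is empty and the legacy signature is fine.
print(legacy_criterion(0., 1., 0.5, **kwargs))  # 0.5
```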
@@ -567,7 +598,8 @@ def one_step(self, current_state, previous_kernel_results, seed=None):
         step_size=step_size,
         criterion_fn=self.criterion_fn,
         max_leapfrog_steps=self.max_leapfrog_steps,
-        experimental_shard_axis_names=self.experimental_shard_axis_names)
+        experimental_shard_axis_names=self.experimental_shard_axis_names,
+        experimental_chain_axis_names=self.experimental_chain_axis_names)
 
     # Undo the effect of adaptation if we're not in the burnin phase. We keep
     # the criterion, however, as that's a diagnostic. We also keep the
@@ -623,9 +655,16 @@ def is_calibrated(self):
   def experimental_shard_axis_names(self):
     return self._parameters['experimental_shard_axis_names']
 
+  @property
+  def experimental_chain_axis_names(self):
+    return self._parameters['experimental_chain_axis_names']
+
   def experimental_with_shard_axes(self, shard_axis_names):
     return self.copy(experimental_shard_axis_names=shard_axis_names)
 
+  def experimental_with_chain_axes(self, chain_axis_names):
+    return self.copy(experimental_chain_axis_names=chain_axis_names)
+
 
 def _forbid_inner_transformed_kernel(inner_kernel):
   """Forbids inner kernel from containing `TransformedTransitionKernel`."""
@@ -669,7 +708,8 @@ def _update_trajectory_grad(previous_kernel_results, previous_state,
                             proposed_state, proposed_velocity,
                             trajectory_jitter, accept_prob, step_size,
                             criterion_fn, max_leapfrog_steps,
-                            experimental_shard_axis_names=None):
+                            experimental_shard_axis_names=None,
+                            experimental_chain_axis_names=None):
   """Updates the trajectory length."""
   # Compute criterion grads.
   def leapfrog_action(dt):
@@ -693,12 +733,16 @@ def adjust_state(x, v, shard_axes=None):
   trajectory_grad *= trajectory_jitter
 
   # Weight by acceptance probability.
+  experimental_chain_axis_names = distribute_lib.canonicalize_axis_name(
+      experimental_chain_axis_names)
   trajectory_grad = tf.where(accept_prob > 1e-4, trajectory_grad, 0.)
   trajectory_grad = tf.where(
       tf.math.is_finite(trajectory_grad), trajectory_grad, 0.)
   trajectory_grad = (
-      tf.reduce_sum(trajectory_grad * accept_prob) /
-      tf.reduce_sum(accept_prob + 1e-20))
+      _reduce_sum_with_axes(trajectory_grad * accept_prob,
+                            None, experimental_chain_axis_names) /
+      _reduce_sum_with_axes(accept_prob + 1e-20, None,
+                            experimental_chain_axis_names))
 
   # Compute Adam/RMSProp step size.
   dtype = previous_kernel_results.adaptation_rate.dtype
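Here `None` for the positional axes makes `tf.reduce_sum` collapse every local axis before the named-axis sum, so the trajectory gradient becomes a single scalar averaged over all chains on all devices. A single-device toy check with made-up numbers:

```python
import tensorflow.compat.v2 as tf

g = tf.constant([[0.1, 0.2], [0.3, 0.4]])  # per-chain trajectory gradients
a = tf.constant([[1.0, 0.5], [0.0, 1.0]])  # per-chain acceptance probabilities
weighted = tf.reduce_sum(g * a, None) / tf.reduce_sum(a + 1e-20, None)
print(float(weighted))  # 0.24
```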