Enable batch support for windowed_mean|variance

nicolaspi · nicolaspi · commit 26752108c679 · 2022-09-26T10:58:37.000Z
diff --git a/tensorflow_probability/python/stats/sample_stats.py b/tensorflow_probability/python/stats/sample_stats.py
@@ -14,10 +14,18 @@
 # ============================================================================
 """Functions for computing statistics of samples."""
 
+JAX_MODE = False
+NUMPY_MODE = False
+
 # Dependency imports
 import numpy as np
 import tensorflow.compat.v2 as tf
 
+if JAX_MODE or NUMPY_MODE:
+  tnp = np
+else:
+  import tensorflow.experimental.numpy as tnp
+
 from tensorflow_probability.python.internal import assert_util
 from tensorflow_probability.python.internal import distribution_util
 from tensorflow_probability.python.internal import dtype_util
@@ -712,7 +720,7 @@ def windowed_variance(
 
   Computes variances among data in the Tensor `x` along the given windows:
 
-    result[i] = variance(x[low_indices[i]:high_indices[i]+1])
+    result[i] = variance(x[low_indices[i]:high_indices[i]])
 
   accurately and efficiently.  To wit, if K is the size of
   `low_indices` and `high_indices`, and `N` is the size of `x` along
@@ -727,10 +735,9 @@ def windowed_variance(
   last half of an MCMC chain.
 
   Suppose `x` has shape `Bx + [N] + E`, where the `Bx` component has
-  rank `axis`, and `low_indices` and `high_indices` broadcast to shape
-  `[M]`.  Then each element of `low_indices` and `high_indices`
-  must be between 0 and N+1, and the shape of the output will be
-  `Bx + [M] + E`.  Batch shape in the indices is not currently supported.
+  rank `axis`, and `low_indices` and `high_indices` either have shape `[M]`
+  or have size `M` along `axis` and broadcast against `x` elsewhere.  Each
+  index must be between 0 and N inclusive; the output shape is `Bx + [M] + E`.
 
   The default windows are
   `[0, 1), [1, 2), [1, 3), [2, 4), [2, 5), ...`
@@ -769,7 +776,7 @@ def windowed_variance(
   """
   with tf.name_scope(name or 'windowed_variance'):
     x = tf.convert_to_tensor(x)
-    low_indices, high_indices, low_counts, high_counts = _prepare_window_args(
+    x, indices, axis = _prepare_window_args(
         x, low_indices, high_indices, axis)
 
     # We have a problem with indexing: the standard convention demands
@@ -786,15 +793,11 @@ def windowed_variance(
     def index_for_cumulative(indices):
       return tf.maximum(indices - 1, 0)
     cum_sums = tf.cumsum(x, axis=axis)
-    low_sums = tf.gather(
-        cum_sums, index_for_cumulative(low_indices), axis=axis)
-    high_sums = tf.gather(
-        cum_sums, index_for_cumulative(high_indices), axis=axis)
+    sums = tnp.take_along_axis(
+        cum_sums, index_for_cumulative(indices), axis=axis)
     cum_variances = cumulative_variance(x, sample_axis=axis)
-    low_variances = tf.gather(
-        cum_variances, index_for_cumulative(low_indices), axis=axis)
-    high_variances = tf.gather(
-        cum_variances, index_for_cumulative(high_indices), axis=axis)
+    variances = tnp.take_along_axis(
+        cum_variances, index_for_cumulative(indices), axis=axis)
 
     # This formula is the binary accurate variance merge from [1],
     # adapted to subtract and batched across the indexed counts, sums,
@@ -812,15 +815,18 @@ def index_for_cumulative(indices):
     # This formula can also be read as implementing the above variance
     # computation by "unioning" A u B with a notional "negative B"
     # multiset.
-    counts = high_counts - low_counts  # |A|
-    discrepancies = (
-        _safe_average(high_sums, high_counts) -
-        _safe_average(low_sums, low_counts))**2  # (mean(A u B) - mean(B))**2
-    adjustments = high_counts * (-low_counts) / counts  # |A u B| * -|B| / |A|
-    residuals = (high_variances * high_counts -
-                 low_variances * low_counts +
+    bounds = ps.cast(indices, sums.dtype)
+    counts = bounds[1] - bounds[0]  # |A|
+    sum_averages = tf.math.divide_no_nan(sums, bounds)
+    # (mean(A u B) - mean(B))**2
+    discrepancies = tf.square(sum_averages[1] - sum_averages[0])
+    # |A u B| * -|B| / |A|
+    adjustments = tf.math.divide_no_nan(bounds[1] * (-bounds[0]), counts)
+    variances_scaled = variances * bounds
+    residuals = (variances_scaled[1] -
+                 variances_scaled[0] +
                  adjustments * discrepancies)
-    return _safe_average(residuals, counts)
+    return tf.math.divide_no_nan(residuals, counts)
 
 
 def windowed_mean(
@@ -829,7 +835,7 @@ def windowed_mean(
 
   Computes means among data in the Tensor `x` along the given windows:
 
-    result[i] = mean(x[low_indices[i]:high_indices[i]+1])
+    result[i] = mean(x[low_indices[i]:high_indices[i]])
 
   efficiently.  To wit, if K is the size of `low_indices` and
   `high_indices`, and `N` is the size of `x` along the given `axis`,
@@ -842,10 +848,9 @@ def windowed_mean(
   last half of an MCMC chain.
 
   Suppose `x` has shape `Bx + [N] + E`, where the `Bx` component has
-  rank `axis`, and `low_indices` and `high_indices` broadcast to shape
-  `[M]`.  Then each element of `low_indices` and `high_indices`
-  must be between 0 and N+1, and the shape of the output will be
-  `Bx + [M] + E`.  Batch shape in the indices is not currently supported.
+  rank `axis`, and `low_indices` and `high_indices` either have shape `[M]`
+  or have size `M` along `axis` and broadcast against `x` elsewhere.  Each
+  index must be between 0 and N inclusive; the output shape is `Bx + [M] + E`.
 
   The default windows are
   `[0, 1), [1, 2), [1, 3), [2, 4), [2, 5), ...`
@@ -878,18 +883,17 @@ def windowed_mean(
   """
   with tf.name_scope(name or 'windowed_mean'):
     x = tf.convert_to_tensor(x)
-    low_indices, high_indices, low_counts, high_counts = _prepare_window_args(
-        x, low_indices, high_indices, axis)
+    x, indices, axis = _prepare_window_args(x, low_indices, high_indices, axis)
 
     raw_cumsum = tf.cumsum(x, axis=axis)
-    cum_sums = tf.concat(
-        [tf.zeros_like(tf.gather(raw_cumsum, [0], axis=axis)), raw_cumsum],
-        axis=axis)
-    low_sums = tf.gather(cum_sums, low_indices, axis=axis)
-    high_sums = tf.gather(cum_sums, high_indices, axis=axis)
-
-    counts = high_counts - low_counts
-    return _safe_average(high_sums - low_sums, counts)
+    rank = ps.rank(x)
+    paddings = ps.reshape(ps.one_hot(2*axis, depth=2*rank, dtype=tf.int32),
+                          (rank, 2))
+    cum_sums = ps.pad(raw_cumsum, paddings)
+    sums = tnp.take_along_axis(cum_sums, indices,
+      axis=axis)
+    counts = ps.cast(indices[1] - indices[0], dtype=sums.dtype)
+    return tf.math.divide_no_nan(sums[1] - sums[0], counts)
 
 
 def _prepare_window_args(x, low_indices=None, high_indices=None, axis=0):
@@ -905,24 +909,20 @@ def _prepare_window_args(x, low_indices=None, high_indices=None, axis=0):
   # Broadcast indices together.
   high_indices = high_indices + tf.zeros_like(low_indices)
   low_indices = low_indices + tf.zeros_like(high_indices)
-
-  # TODO(axch): Support batch low and high indices.  That would
-  # complicate this shape munging (though tf.gather should work
-  # fine).
-
-  # We want to place `low_counts` and `high_counts` at the `axis`
-  # position, so we reshape them to shape `[1, 1, ..., 1, N, 1, ...,
-  # 1]`, where the `N` is at `axis`.  The `counts_shp`, below,
-  # is this shape.
-  size = ps.size(high_indices)
-  counts_shp = ps.one_hot(
-      axis, depth=ps.rank(x), on_value=size, off_value=1)
-
-  low_counts = tf.reshape(tf.cast(low_indices, dtype=x.dtype),
-                          shape=counts_shp)
-  high_counts = tf.reshape(tf.cast(high_indices, dtype=x.dtype),
-                           shape=counts_shp)
-  return low_indices, high_indices, low_counts, high_counts
+  indices = ps.stack([low_indices, high_indices], axis=0)
+  x = tf.expand_dims(x, axis=0)
+  axis += 1
+
+  if ps.rank(indices) != ps.rank(x) and ps.rank(indices) == 2:
+    # Legacy 1-D indices (kept for backward compatibility): place along `axis`.
+    size = ps.size(indices) // 2
+    bc_shape = ps.one_hot(axis, depth=ps.rank(x), on_value=size,
+      off_value=1)
+    bc_shape = ps.concat([[2], bc_shape[1:]], axis=0)
+    indices = ps.reshape(indices, bc_shape)
+  # `take_along_axis` requires the type to be int32
+  indices = ps.cast(indices, dtype=tf.int32)
+  return x, indices, axis
 
 
 def _safe_average(totals, counts):
diff --git a/tensorflow_probability/python/stats/sample_stats_test.py b/tensorflow_probability/python/stats/sample_stats_test.py
@@ -15,7 +15,7 @@
 """Tests for Sample Stats Ops."""
 
 # Dependency imports
-
+import functools
 import numpy as np
 import tensorflow.compat.v1 as tf1
 import tensorflow.compat.v2 as tf
@@ -679,6 +679,85 @@ def test_windowed_mean_corner_cases(self):
                         self.evaluate(sample_stats.windowed_mean(y)))
 
 
+@test_util.test_all_tf_execution_regimes
+class WindowedStatsTest(test_util.TestCase):
+  def apply_slice_along_axis(self, func, arr, low, high, axis):
+    """Applies `func` over slices of `arr` along `axis`. Slice intervals are
+    specified through `low` and `high`, which may broadcast against `arr`.
+    """
+    np.testing.assert_equal(low.shape, high.shape)
+    ni, _, nk = arr.shape[:axis], arr.shape[axis], arr.shape[axis + 1:]
+    si, j, sk = low.shape[:axis], low.shape[axis], low.shape[axis + 1:]
+    mk = max(nk, sk)
+    mi = max(ni, si)
+    out = np.empty(mi + (j,) + mk)
+    for ki in np.ndindex(ni):
+      for kk in np.ndindex(mk):
+        ak = tuple(np.mod(kk, nk))
+        ik = tuple(np.mod(kk, sk))
+        ai = tuple(np.mod(ki, ni))
+        ii = tuple(np.mod(ki, si))
+        a_1d = arr[ai + np.s_[:, ] + ak]
+        out_1d = out[ki + np.s_[:, ] + kk]
+        low_1d = low[ii + np.s_[:, ] + ik]
+        high_1d = high[ii + np.s_[:, ] + ik]
+
+        for r in range(j):
+          out_1d[r] = func(a_1d[low_1d[r]:high_1d[r]])
+    return out
+  def check_gaussian_windowed(self, shape, indice_shape, axis,
+                              window_func, np_func):
+    stat_shape = np.array(shape).astype(np.int32)
+    stat_shape[axis] = 1
+    loc = np.arange(np.prod(stat_shape)).reshape(stat_shape)
+    scale = 0.1 * np.arange(np.prod(stat_shape)).reshape(stat_shape)
+    rng = test_util.test_np_rng()
+    x = rng.normal(loc=loc, scale=scale, size=shape)
+    indice_shape = [2] + list(indice_shape)
+    indices = rng.randint(shape[axis] + 1, size=indice_shape)
+    indices = np.sort(indices, axis=0)
+    low_indices, high_indices = indices[0], indices[1]
+    a = window_func(x, low_indices=low_indices,
+                    high_indices=high_indices, axis=axis)
+    b = self.apply_slice_along_axis(np_func, x, low_indices, high_indices,
+                               axis=axis)
+    b[np.isnan(b)] = 0  # We treat stats computed on empty sets as zeros
+    self.assertAllClose(a, b)
+
+  def check_windowed(self, func, numpy_func):
+    check_fn = functools.partial(self.check_gaussian_windowed,
+                                 window_func=func, np_func=numpy_func)
+    check_fn((64, 4, 8), (128, 1, 1), axis=0)
+    check_fn((64, 4, 8), (32, 1, 1), axis=0)
+    check_fn((64, 4, 8), (32, 4, 1), axis=0)
+    check_fn((64, 4, 8), (32, 4, 8), axis=0)
+    check_fn((64, 4, 8), (64, 64, 1), axis=1)
+    check_fn((64, 4, 8), (1, 64, 1), axis=1)
+    check_fn((64, 4, 8), (64, 2, 8), axis=1)
+    check_fn((64, 4, 8), (64, 4, 64), axis=2)
+    check_fn((64, 4, 8), (1, 1, 64), axis=2)
+    check_fn((64, 4, 8), (64, 4, 4), axis=2)
+    check_fn((64, 4, 8), (1, 1, 4), axis=2)
+
+    with self.assertRaises(Exception):
+      # Non broadcastable shapes
+      check_fn((64, 4, 8), (4, 1, 4), axis=2)
+
+  def test_windowed_mean(self):
+    self.check_windowed(func=tfp.stats.windowed_mean, numpy_func=np.mean)
+
+  def test_windowed_mean_graph(self):
+    func = tf.function(tfp.stats.windowed_mean)
+    self.check_windowed(func=func, numpy_func=np.mean)
+
+  def test_windowed_variance(self):
+    self.check_windowed(func=tfp.stats.windowed_variance, numpy_func=np.var)
+
+  def test_windowed_variance_graph(self):
+    func = tf.function(tfp.stats.windowed_variance)
+    self.check_windowed(func=func, numpy_func=np.var)
+
+
 @test_util.test_all_tf_execution_regimes
 class LogAverageProbsTest(test_util.TestCase):