Implement optional Gauss-Hermite quadrature for computing statistics of the LogitNormal distribution.

axch · tensorflower-gardener · commit 9d88c41f6540 · 2021-07-08T11:56:06.000-07:00
PiperOrigin-RevId: 383682512
diff --git a/tensorflow_probability/python/distributions/logitnormal.py b/tensorflow_probability/python/distributions/logitnormal.py
@@ -18,6 +18,8 @@
 from __future__ import division
 from __future__ import print_function
 
+import numpy as onp
+
 import tensorflow.compat.v2 as tf
 
 from tensorflow_probability.python import math as tfp_math
@@ -47,6 +49,8 @@ def __init__(self,
                loc,
                scale,
                num_probit_terms_approx=2,
+               gauss_hermite_scale_limit=None,
+               gauss_hermite_degree=20,
                validate_args=False,
                allow_nan_stats=True,
                name='LogitNormal'):
@@ -71,6 +75,18 @@ def __init__(self,
         (inclusive). Using `num_probit_terms_approx=2` should result in
         `mean_approx` error not exceeding `10**-4`.
         Default value: `2`.
+      gauss_hermite_scale_limit: Floating-point `Tensor` or `None`.
+        The (batch-wise) scale limit strictly below which statistics use
+        Gauss-Hermite quadrature instead of the Monahan-Stefanski
+        approximation [1].  Default: `None`, which recovers the legacy
+        behavior of using Monahan-Stefanski everywhere and does not
+        add TF ops for Gauss-Hermite.  The best value depends on the
+        working precision and the number of terms in the Gauss-Hermite
+        or Monahan-Stefanski approximations being switched between,
+        as well as the expected range of `loc` parameters; but `1` is
+        not unreasonable.
+      gauss_hermite_degree: Python integer giving the number of
+        sample points to use for Gauss-Hermite quadrature.  Default: `20`.
       validate_args: Python `bool`, default `False`. Whether to validate input
         with asserts. If `validate_args` is `False`, and the inputs are
         invalid, correct behavior is not guaranteed.
@@ -90,6 +106,7 @@ def __init__(self,
          Communications in Statistics-Simulation and Computation 9.4 (1980):
          389-419.
          https://www.tandfonline.com/doi/abs/10.1080/03610918008812164
+
     """
     parameters = dict(locals())
     num_probit_terms_approx = int(num_probit_terms_approx)
@@ -98,6 +115,8 @@ def __init__(self,
           'Argument `num_probit_terms_approx` must be an integer between '
           '`1` and `8` (inclusive).')
     self._num_probit_terms_approx = num_probit_terms_approx
+    self._gauss_hermite_scale_limit = gauss_hermite_scale_limit
+    self._gauss_hermite_degree = gauss_hermite_degree
     with tf.name_scope(name) as name:
       super(LogitNormal, self).__init__(
           distribution=normal_lib.Normal(loc=loc, scale=scale),
@@ -131,6 +150,16 @@ def num_probit_terms_approx(self):
     """Number of `Normal(0, 1).cdf` terms using in `mean_*_approx` functions."""
     return self._num_probit_terms_approx
 
+  @property
+  def gauss_hermite_scale_limit(self):
+    """Scale below which `*_approx` functions use Gauss-Hermite quadrature."""
+    return self._gauss_hermite_scale_limit
+
+  @property
+  def gauss_hermite_degree(self):
+    """Number of points for Gauss-Hermite quadrature in `*_approx` functions."""
+    return self._gauss_hermite_degree
+
   experimental_is_sharded = False
 
   def mean_log_prob_approx(self, y=None, name='mean_log_prob_approx'):
@@ -199,10 +228,19 @@ def mean_approx(self, name='mean_approx'):
          https://www.tandfonline.com/doi/abs/10.1080/03610918008812164
     """
     with self._name_and_control_scope(name):
-      return approx_expected_sigmoid(
-          self.loc, self.scale,
+      loc = tf.convert_to_tensor(self.loc)
+      scale = tf.convert_to_tensor(self.scale)
+      monahan_stefanski_answer = approx_expected_sigmoid(
+          loc, scale,
           MONAHAN_MIX_PROB[self.num_probit_terms_approx],
           MONAHAN_INVERSE_SCALE[self.num_probit_terms_approx])
+      if self.gauss_hermite_scale_limit is None:
+        return monahan_stefanski_answer
+      else:
+        gauss_hermite_answer = logit_normal_mean_gh(
+            loc, scale, self.gauss_hermite_degree)
+        return tf.where(scale < self.gauss_hermite_scale_limit,
+                        gauss_hermite_answer, monahan_stefanski_answer)
 
   def variance_approx(self, name='variance_approx'):
     """Approximate the variance of a LogitNormal.
@@ -233,10 +271,19 @@ def variance_approx(self, name='variance_approx'):
          https://www.tandfonline.com/doi/abs/10.1080/03610918008812164
     """
     with self._name_and_control_scope(name):
-      return approx_variance_sigmoid(
-          self.loc, self.scale,
+      loc = tf.convert_to_tensor(self.loc)
+      scale = tf.convert_to_tensor(self.scale)
+      monahan_stefanski_answer = approx_variance_sigmoid(
+          loc, scale,
           MONAHAN_MIX_PROB[self.num_probit_terms_approx],
           MONAHAN_INVERSE_SCALE[self.num_probit_terms_approx])
+      if self.gauss_hermite_scale_limit is None:
+        return monahan_stefanski_answer
+      else:
+        gauss_hermite_answer = logit_normal_variance_gh(
+            loc, scale, self.gauss_hermite_degree)
+        return tf.where(scale < self.gauss_hermite_scale_limit,
+                        gauss_hermite_answer, monahan_stefanski_answer)
 
   def stddev_approx(self, name='stddev_approx'):
     """Approximate the stdandard deviation of a LogitNormal.
@@ -479,3 +526,37 @@ def approx_variance_sigmoid(
         alpha[tf.newaxis, :] * alpha[:, tf.newaxis] * (b + bt),
         axis=[-2, -1])
     return mom2 - approx_expected_sigmoid(m, s, alpha, c)**2.
+
+
+# The above approximations fail for small scales.  We compute
+# statistics for small scales with Gauss-Hermite quadrature.
+
+
+def logit_normal_mean_gh(loc, scale, deg):
+  """Approximates `E_{N(m,s)}[sigmoid(X)]` by Gauss-Hermite quadrature."""
+  # We want to integrate
+  # A = \int_-inf^inf sigmoid(x) * Normal(loc, scale).pdf(x) dx
+  # To bring it into the right form for Gauss-Hermite quadrature,
+  # we make the substitution y = (x - loc) / scale, to get
+  # A = (1/sqrt(2*pi)) * \int_-inf^inf [
+  #       sigmoid(y * scale + loc) * exp(-1/2 y**2) dy]
+  grid, weights = onp.polynomial.hermite_e.hermegauss(deg)
+  grid = tf.cast(grid, dtype=loc.dtype)
+  weights = tf.cast(weights, dtype=loc.dtype)
+  normalizer = tf.constant(onp.sqrt(2 * onp.pi), dtype=loc.dtype)
+  values = tf.sigmoid(grid * scale[..., tf.newaxis] + loc[..., tf.newaxis])
+  return tf.reduce_sum(values * weights, axis=-1) / normalizer
+
+
+def logit_normal_variance_gh(loc, scale, deg):
+  """Approximates `Var_{N(m,s)}[sigmoid(X)]` by Gauss-Hermite quadrature."""
+  # Since we have to compute sigmoids for variance anyway, we inline
+  # computing the mean by Gauss-Hermite quadrature at the same grid of points.
+  grid, weights = onp.polynomial.hermite_e.hermegauss(deg)
+  grid = tf.cast(grid, dtype=loc.dtype)
+  weights = tf.cast(weights, dtype=loc.dtype)
+  normalizer = tf.constant(onp.sqrt(2 * onp.pi), dtype=loc.dtype)
+  sigmoids = tf.sigmoid(grid * scale[..., tf.newaxis] + loc[..., tf.newaxis])
+  mean = tf.reduce_sum(sigmoids * weights, axis=-1) / normalizer
+  residuals = (sigmoids - mean[..., tf.newaxis])**2
+  return tf.reduce_sum(residuals * weights, axis=-1) / normalizer
diff --git a/tensorflow_probability/python/distributions/logitnormal_test.py b/tensorflow_probability/python/distributions/logitnormal_test.py
@@ -28,6 +28,42 @@
 
 
 tfd = tfp.distributions
+ln_lib = tfd.logitnormal
+
+
+def logit_normal_trapezoid_rule(loc, scale):
+  """Brute-force statistics of LogitNormal(loc, scale) by quadrature."""
+  # LogitNormal samples as
+  #   z ~ Normal(loc, scale)
+  #   return sigmoid(z)
+  # We find the statistics by integrating f(z) * Normal.pdf(z) over z.
+  # The function f is always bounded, and for z outside loc +- 10 * scale,
+  # the Normal tail mass is small enough to be negligible.  Thus it suffices
+  # to integrate from loc - 10 * scale to loc + 10 * scale.
+  n = 10000
+  width = 10.0
+  xs = tf.linspace(loc - width*scale, loc + width*scale, n)
+  def trapezoid(vals):
+    total = tf.reduce_sum(vals, axis=0) - 0.5 * (vals[0] + vals[-1])
+    return total * 2 * width * scale / tf.cast((n-1), xs.dtype)
+  return xs, trapezoid
+
+
+def logit_normal_mean_trapezoid(loc, scale):
+  """Brute-force the mean of LogitNormal(loc, scale) by quadrature."""
+  dist = tfd.Normal(loc, scale)
+  grid, compute = logit_normal_trapezoid_rule(loc, scale)
+  return compute(tf.sigmoid(grid) * dist.prob(grid))
+
+
+def logit_normal_variance_trapezoid(loc, scale):
+  """Brute-force the variance of LogitNormal(loc, scale) by quadrature."""
+  dist = tfd.Normal(loc, scale)
+  grid, compute = logit_normal_trapezoid_rule(loc, scale)
+  probs = dist.prob(grid)
+  sigmoids = tf.sigmoid(grid)
+  mean = compute(sigmoids * probs)
+  return compute((sigmoids - mean)**2 * probs)
 
 
 @test_util.test_all_tf_execution_regimes
@@ -69,6 +105,34 @@ def testLogitNormalVarianceApprox(self):
     self.assertAllClose(
         variance_sample_, variance_approx_, atol=1e-4, rtol=0.03)
 
+  def testLogitNormalMeanGH(self):
+    locs, scales = tf.meshgrid(tf.linspace(-10.0, 10.0, 10),
+                               tf.exp(tf.linspace(-3.0, 0.0, 10)))
+    ghs = ln_lib.logit_normal_mean_gh(locs, scales, deg=50)
+    traps = logit_normal_mean_trapezoid(locs, scales)
+    self.assertAllClose(traps, ghs, rtol=1e-4)
+
+  def testLogitNormalVarianceGH(self):
+    locs, scales = tf.meshgrid(tf.linspace(-10.0, 10.0, 10),
+                               tf.exp(tf.linspace(-3.0, 0.0, 10)))
+    ghs = ln_lib.logit_normal_variance_gh(locs, scales, deg=50)
+    traps = logit_normal_variance_trapezoid(locs, scales)
+    self.assertAllClose(traps, ghs, rtol=1e-4)
+
+  def testLogitNormalMeanAndVariance(self):
+    locs, scales = tf.meshgrid(tf.linspace(-10.0, 10.0, 10),
+                               tf.exp(tf.linspace(-3.0, 3.0, 10)))
+    dist = tfd.LogitNormal(
+        loc=locs, scale=scales, validate_args=True,
+        gauss_hermite_scale_limit=1.,
+        num_probit_terms_approx=6)
+    means = dist.mean_approx()
+    trap_means = logit_normal_mean_trapezoid(locs, scales)
+    self.assertAllClose(trap_means, means, rtol=1e-4)
+    variances = dist.variance_approx()
+    trap_variances = logit_normal_variance_trapezoid(locs, scales)
+    self.assertAllClose(trap_variances, variances, rtol=1e-4)
+
   def testLogitNormalLogitNormalKL(self):
     batch_size = 6
     mu_a = np.array([3.0] * batch_size)