import tensorflow.compat.v2 as tf
import tensorflow_probability as tfp

-from tensorflow_compression.python.distributions import helpers
-from tensorflow_compression.python.ops import math_ops
+from tensorflow_compression.python.distributions import uniform_noise


-__all__ = ["DeepFactorized"]
+__all__ = ["DeepFactorized", "NoisyDeepFactorized"]
+
+
+def log_expm1(x):
+  """Computes log(exp(x)-1) stably.
+
+  For large values of x, exp(x) will return Inf, whereas log(exp(x)-1) ~= x.
+  Here we use this approximation for x > 15, such that the output is non-Inf
+  for all positive values of x.
+
+  Args:
+    x: A tensor.
+
+  Returns:
+    log(exp(x)-1)
+
+  """
+  # If x < 15.0, we can compute it directly. For larger values,
+  # we have log(exp(x)-1) ~= log(exp(x)) = x.
+  cond = (x < 15.0)
+  x_small = tf.minimum(x, 15.0)
+  return tf.where(cond, tf.math.log(tf.math.expm1(x_small)), x)
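# Illustrative sketch (not part of the patch above): a quick check of the
# stable branch, assuming TF 2.x eager execution and the imports at the top of
# this file. The naive formula overflows to inf in float32 once exp(x) does,
# while log_expm1 falls back to log(exp(x)-1) ~= x.
x = tf.constant([0.1, 1.0, 20.0, 100.0])
naive = tf.math.log(tf.math.expm1(x))  # last entry overflows to inf in float32
stable = log_expm1(x)                  # finite everywhere; ~= x for x > 15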


class DeepFactorized(tfp.distributions.Distribution):
@@ -34,7 +54,7 @@ class DeepFactorized(tfp.distributions.Distribution):
  > J. Ballé, D. Minnen, S. Singh, S. J. Hwang, N. Johnston<br />
  > https://openreview.net/forum?id=rkcQFMZRb

-  This implementation already includes convolution with a unit-width uniform
+  but *without* convolution with a unit-width uniform
  density, as described in appendix 6.2 of the same paper. Please cite the paper
  if you use this code for scientific work.

@@ -43,7 +63,8 @@ class DeepFactorized(tfp.distributions.Distribution):
  trainable distribution parameters.
  """

-  def __init__(self, batch_shape=(), num_filters=(3, 3), init_scale=10,
+  def __init__(self,
+               batch_shape=(), num_filters=(3, 3), init_scale=10,
               allow_nan_stats=False, dtype=tf.float32, name="DeepFactorized"):
    """Initializer.

@@ -98,22 +119,31 @@ def _make_variables(self):
    self._factors = []

    for i in range(len(self.num_filters) + 1):
-      init = tf.math.log(tf.math.expm1(1 / scale / filters[i + 1]))
-      init = tf.cast(init, dtype=self.dtype)
-      init = tf.broadcast_to(init, (channels, filters[i + 1], filters[i]))
-      matrix = tf.Variable(init, name="matrix_{}".format(i))
+
+      def matrix_initializer(i=i):
+        init = log_expm1(1 / scale / filters[i + 1])
+        init = tf.cast(init, dtype=self.dtype)
+        init = tf.broadcast_to(init, (channels, filters[i + 1], filters[i]))
+        return init
+
+      matrix = tf.Variable(matrix_initializer, name="matrix_{}".format(i))
      self._matrices.append(matrix)

-      bias = tf.Variable(
-          tf.random.uniform(
-              (channels, filters[i + 1], 1), -.5, .5, dtype=self.dtype),
-          name="bias_{}".format(i))
+      def bias_initializer(i=i):
+        return tf.random.uniform((channels, filters[i + 1], 1),
+                                 -.5,
+                                 .5,
+                                 dtype=self.dtype)
+
+      bias = tf.Variable(bias_initializer, name="bias_{}".format(i))
      self._biases.append(bias)

      if i < len(self.num_filters):
-        factor = tf.Variable(
-            tf.zeros((channels, filters[i + 1], 1), dtype=self.dtype),
-            name="factor_{}".format(i))
+
+        def factor_initializer(i=i):
+          return tf.zeros((channels, filters[i + 1], 1), dtype=self.dtype)
+
+        factor = tf.Variable(factor_initializer, name="factor_{}".format(i))
        self._factors.append(factor)

  def _batch_shape_tensor(self):
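# Illustrative sketch (not part of the patch above): why the initializer
# closures above take an `i=i` default argument. Python closures bind names
# late, so without the default every initializer would read the loop
# variable's final value. A minimal standalone illustration:
late_bound = [lambda: i for i in range(3)]
early_bound = [lambda i=i: i for i in range(3)]
print([f() for f in late_bound])   # [2, 2, 2]
print([f() for f in early_bound])  # [0, 1, 2]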
@@ -132,13 +162,20 @@ def _logits_cumulative(self, inputs):
    """Evaluate logits of the cumulative densities.

    Arguments:
-      inputs: The values at which to evaluate the cumulative densities, expected
-        to be a `tf.Tensor` of shape `(channels, 1, batch)`.
+      inputs: The values at which to evaluate the cumulative densities.

    Returns:
      A `tf.Tensor` of the same shape as `inputs`, containing the logits of the
      cumulative densities evaluated at the given inputs.
    """
+    # Convert to (channels, 1, batch) format by collapsing dimensions and then
+    # commuting channels to front.
+    inputs = tf.broadcast_to(
+        inputs,
+        tf.broadcast_dynamic_shape(tf.shape(inputs), self.batch_shape_tensor()))
+    shape = tf.shape(inputs)
+    inputs = tf.reshape(inputs, (-1, 1, self.batch_shape.num_elements()))
+    inputs = tf.transpose(inputs, (2, 1, 0))
    logits = inputs
    for i in range(len(self.num_filters) + 1):
      matrix = tf.nn.softplus(self._matrices[i])
@@ -147,48 +184,53 @@ def _logits_cumulative(self, inputs):
      if i < len(self.num_filters):
        factor = tf.math.tanh(self._factors[i])
        logits += factor * tf.math.tanh(logits)
-    return logits
-
-  def _prob(self, y):
-    """Called by the base class to compute likelihoods."""
-    # Convert to (channels, 1, batch) format by collapsing dimensions and then
-    # commuting channels to front.
-    y = tf.broadcast_to(
-        y, tf.broadcast_dynamic_shape(tf.shape(y), self.batch_shape_tensor()))
-    shape = tf.shape(y)
-    y = tf.reshape(y, (-1, 1, self.batch_shape.num_elements()))
-    y = tf.transpose(y, (2, 1, 0))
-
-    # Evaluate densities.
-    # We can use the special rule below to only compute differences in the left
-    # tail of the sigmoid. This increases numerical stability: sigmoid(x) is 1
-    # for large x, 0 for small x. Subtracting two numbers close to 0 can be done
-    # with much higher precision than subtracting two numbers close to 1.
-    lower = self._logits_cumulative(y - .5)
-    upper = self._logits_cumulative(y + .5)
-    # Flip signs if we can move more towards the left tail of the sigmoid.
-    sign = tf.stop_gradient(-tf.math.sign(lower + upper))
-    p = abs(tf.sigmoid(sign * upper) - tf.sigmoid(sign * lower))
-    p = math_ops.lower_bound(p, 0.)

    # Convert back to (broadcasted) input tensor shape.
-    p = tf.transpose(p, (2, 1, 0))
-    p = tf.reshape(p, shape)
-    return p
+    logits = tf.transpose(logits, (2, 1, 0))
+    logits = tf.reshape(logits, shape)
+    return logits
+
+  def _log_cdf(self, inputs):
+    logits = self._logits_cumulative(inputs)
+    return tf.math.log_sigmoid(logits)
+
+  def _log_survival_function(self, inputs):
+    logits = self._logits_cumulative(inputs)
+    # 1 - sigmoid(x) = sigmoid(-x)
+    return tf.math.log_sigmoid(-logits)
+
+  def _cdf(self, inputs):
+    logits = self._logits_cumulative(inputs)
+    return tf.math.sigmoid(logits)
+
+  def _prob(self, inputs):
+    with tf.GradientTape() as tape:
+      tape.watch(inputs)
+      cdf = self._cdf(inputs)
+    prob = tape.gradient(cdf, inputs)
+    return prob
+
+  def _log_prob(self, inputs):
+    # Let x = inputs and s(x) = sigmoid(x).
+    with tf.GradientTape() as tape:
+      tape.watch(inputs)
+      logits = self._logits_cumulative(inputs)
+    # We have F(x) = s(logits(x)),
+    # so p(x) = F'(x)
+    #         = s'(logits(x)) * logits'(x)
+    #         = s(logits(x)) * s(-logits(x)) * logits'(x),
+    # so log p(x) = log(s(logits(x))) + log(s(-logits(x))) + log(logits'(x)).
+    log_s_logits = tf.math.log_sigmoid(logits)
+    log_s_neg_logits = tf.math.log_sigmoid(-logits)
+    dlogits = tape.gradient(logits, inputs)
+    return log_s_logits + log_s_neg_logits + tf.math.log(dlogits)
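# Illustrative sketch (not part of the patch above): the chain-rule identity
# used in _log_prob, d/dx sigmoid(l(x)) = sigmoid(l(x)) * sigmoid(-l(x)) * l'(x),
# checked here on a toy logit function l(x) = 3x + 1 with autodiff.
x = tf.constant(0.7)
with tf.GradientTape() as tape:
  tape.watch(x)
  cdf = tf.math.sigmoid(3.0 * x + 1.0)
autodiff = tape.gradient(cdf, x)
l = 3.0 * x + 1.0
closed_form = tf.math.sigmoid(l) * tf.math.sigmoid(-l) * 3.0
# autodiff and closed_form agree up to float32 rounding.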

  def _quantization_offset(self):
    return tf.constant(0, dtype=self.dtype)

-  def _lower_tail(self, tail_mass):
-    tail = helpers.estimate_tails(
-        self._logits_cumulative, -tf.math.log(2 / tail_mass - 1),
-        tf.constant([self.batch_shape.num_elements(), 1, 1], tf.int32),
-        self.dtype)
-    return tf.reshape(tail, self.batch_shape_tensor())
-
-  def _upper_tail(self, tail_mass):
-    tail = helpers.estimate_tails(
-        self._logits_cumulative, tf.math.log(2 / tail_mass - 1),
-        tf.constant([self.batch_shape.num_elements(), 1, 1], tf.int32),
-        self.dtype)
-    return tf.reshape(tail, self.batch_shape_tensor())
+
+
+class NoisyDeepFactorized(uniform_noise.UniformNoiseAdapter):
+  """DeepFactorized that is convolved with uniform noise."""
+
+  def __init__(self, name="NoisyDeepFactorized", **kwargs):
+    super().__init__(DeepFactorized(**kwargs), name=name)
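# Illustrative sketch (not part of the patch above): one way the new class
# might be used. The shapes and arguments below are a hypothetical
# configuration; per the constructor, all keyword arguments are forwarded to
# the underlying DeepFactorized prior.
noisy = NoisyDeepFactorized(batch_shape=(16,), num_filters=(3, 3))
y = tf.random.normal((8, 16))
log_likelihood = noisy.log_prob(y)  # one value per element, shape (8, 16)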