
Commit c0e0fd5

Johannes Ballé authored and copybara-github committed
Tunes estimate_tails.
Uses a new stopping criterion that doesn't assume anything about initialization, and hand-tweaks parameters to deal well with distributions that occur in practice.

PiperOrigin-RevId: 340860445
Change-Id: I515b30cccb1e79703743b7d9f5aac1b1eb0fbacd
1 parent 9054647 · commit c0e0fd5
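As a toy illustration of the criterion change (hypothetical code, not part of the commit): the helper below runs plain sign-gradient descent on the scalar loss |x - opt| and reports the iteration at which each criterion would start the stop counter. The name first_count_iteration, the learning rate, and the omission of the Adam normalization are all simplifications for illustration.

def first_count_iteration(x, opt, criterion, lr=.1, steps=200):
  """Iteration at which the stop counter starts ticking, or None."""
  prev_m = 0.
  for i in range(steps):
    grad = 1. if x > opt else -1.  # gradient of the loss |x - opt|
    if criterion == "old":
      triggered = x * grad > 0  # only meaningful if x starts at 0
    else:
      triggered = prev_m * grad < 0  # pure gradient sign flip
    if triggered:
      return i
    prev_m = (prev_m + grad) / 2  # running mean, as in the patch
    x -= lr * grad  # plain gradient step
  return None

# Optimum at -3. Starting from x = 0, both criteria fire at the
# crossing (around iteration 30):
print(first_count_iteration(0., -3., "old"),
      first_count_iteration(0., -3., "new"))
# Starting from x = -5, the old criterion fires at iteration 0, long
# before the estimate crosses -3; the new one still fires at the crossing:
print(first_count_iteration(-5., -3., "old"),
      first_count_iteration(-5., -3., "new"))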


tensorflow_compression/python/distributions/helpers.py

Lines changed: 10 additions & 9 deletions
@@ -37,7 +37,7 @@ def estimate_tails(func, target, shape, dtype):
   For instance, if `func` is a CDF and the target is a quantile value, this
   would find the approximate location of that quantile. Note that `func` is
   assumed to be monotonic. When each tail estimate has passed the optimal value
-  of `x`, the algorithm does 10 additional iterations and then stops.
+  of `x`, the algorithm does 100 additional iterations and then stops.

   This operation is vectorized. The tensor shape of `x` is given by `shape`, and
   `target` must have a shape that is broadcastable to the output of `func(x)`.
@@ -59,20 +59,21 @@ def estimate_tails(func, target, shape, dtype):

     def loop_cond(tails, m, v, count):
       del tails, m, v  # unused
-      return tf.reduce_min(count) < 10
+      return tf.reduce_min(count) < 100

-    def loop_body(tails, m, v, count):
+    def loop_body(tails, prev_m, prev_v, count):
       with tf.GradientTape(watch_accessed_variables=False) as tape:
         tape.watch(tails)
         loss = abs(func(tails) - target)
       grad = tape.gradient(loss, tails)
-      m = .5 * m + .5 * grad  # Adam mean estimate.
-      v = .9 * v + .1 * tf.square(grad)  # Adam variance estimate.
-      tails -= .5 * m / (tf.sqrt(v) + 1e-7)
-      # Start counting when the gradient flips sign (note that this assumes
-      # `tails` is initialized to zero).
+      m = (prev_m + grad) / 2  # Adam mean estimate.
+      v = (prev_v + tf.square(grad)) / 2  # Adam variance estimate.
+      tails -= .1 * m / (tf.sqrt(v) + 1e-20)
+      # Start counting when the gradient flips sign. Since the function is
+      # monotonic, m must have the same sign in all initial iterations, until
+      # the optimal point is crossed. At that point the gradient flips sign.
       count = tf.where(
-          tf.math.logical_or(count > 0, tails * grad > 0),
+          tf.math.logical_or(count > 0, prev_m * grad < 0),
           count + 1, count)
       return tails, m, v, count

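For context, here is a self-contained sketch of the routine as it reads after this commit. The loop_cond and loop_body follow the diff above; the zero initialization and the tf.while_loop wiring outside the hunks are reconstructed assumptions, not verbatim file contents.

import tensorflow as tf


def estimate_tails(func, target, shape, dtype):
  """Finds x such that func(x) == target, assuming func is monotonic."""
  dtype = tf.as_dtype(dtype)

  def loop_cond(tails, m, v, count):
    del tails, m, v  # unused
    # Run until every element has spent 100 iterations past its optimum.
    return tf.reduce_min(count) < 100

  def loop_body(tails, prev_m, prev_v, count):
    with tf.GradientTape(watch_accessed_variables=False) as tape:
      tape.watch(tails)
      loss = abs(func(tails) - target)
    grad = tape.gradient(loss, tails)
    m = (prev_m + grad) / 2  # Adam mean estimate.
    v = (prev_v + tf.square(grad)) / 2  # Adam variance estimate.
    tails -= .1 * m / (tf.sqrt(v) + 1e-20)
    # The gradient keeps one sign until the optimum is crossed; a sign
    # flip against the running mean starts (or continues) the counter.
    count = tf.where(
        tf.math.logical_or(count > 0, prev_m * grad < 0),
        count + 1, count)
    return tails, m, v, count

  # Assumed initialization: zero estimates and counts.
  init = (tf.zeros(shape, dtype), tf.zeros(shape, dtype),
          tf.zeros(shape, dtype), tf.zeros(shape, tf.int32))
  return tf.while_loop(loop_cond, loop_body, init)[0]


# Example: the logistic CDF is the sigmoid, so the 1e-3 quantile should
# come out near log(1e-3 / (1 - 1e-3)) ~= -6.9.
print(estimate_tails(tf.math.sigmoid, 1e-3, [1], tf.float32))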