
Commit 7bd5363

Johannes Ballé authored and copybara-github committed
Normalizes power law entropy model penalty differently.
This changes the penalty to be normalized such that it is non-negative, rather than representing a normalized distribution. This is more intuitive from an optimization perspective; the only other practical effect is that the effective weight of the penalty changes.

PiperOrigin-RevId: 448326753
Change-Id: I4baf22a3446062622d7c643b3040bb6ac22b5fe7
1 parent: c89da53

2 files changed: +30 −30 lines changed
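The two penalties in the diff below, the old `-log_2 p(x)` with `p(x) = alpha / 2 * (abs(x) + alpha) ** -2` and the new `log((abs(x) + alpha) / alpha)`, differ per element only by a fixed scale and offset. A minimal NumPy sketch, not part of the commit and using an arbitrary choice of `alpha` and sample values, verifying that relationship and the non-negativity:

```python
import numpy as np

alpha = 0.01                                # arbitrary small positive value
x = np.array([-5.0, -0.5, 0.0, 0.5, 5.0])   # arbitrary bottleneck values

# Old penalty: -log_2 p(x), with p(x) = alpha / 2 * (abs(x) + alpha) ** -2.
old = (1.0 - np.log2(alpha)) + 2.0 * np.log2(np.abs(x) + alpha)

# New penalty: zero at x == 0 and non-negative everywhere.
new = np.log((np.abs(x) + alpha) / alpha)

# Per element, old = scale * new + offset, so only the effective weight
# (and a constant) of the penalty term changes.
scale, offset = 2.0 / np.log(2.0), 1.0 + np.log2(alpha)
assert np.allclose(old, scale * new + offset)
assert np.all(new >= 0.0)
```

In a rate-distortion loss, the rescaling is absorbed by retuning the penalty weight, which is the "effective weight" change the commit message mentions.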

tensorflow_compression/python/entropy_models/power_law.py

Lines changed: 14 additions & 14 deletions
@@ -27,11 +27,16 @@
 class PowerLawEntropyModel(tf.Module):
   """Entropy model for power-law distributed random variables.

-  This entropy model handles quantization of a bottleneck tensor and implements
-  a cross entropy penalty that is consistent with the Elias gamma code.
+  This entropy model handles quantization and compression of a bottleneck tensor
+  and implements a penalty that encourages compressibility under the Elias gamma
+  code.

   The gamma code has code lengths `1 + 2 floor(log_2(x))`, for `x` a positive
-  integer. For details on the gamma code, see:
+  integer, and is close to optimal if `x` is distributed according to a power
+  law. Being a universal code, it also guarantees that in the worst case, the
+  expected code length is no more than 3 times the entropy of the empirical
+  distribution of `x`, as long as probability decreases with increasing `x`. For
+  details on the gamma code, see:

   > "Universal Codeword Sets and Representations of the Integers"<br />
   > P. Elias<br />
@@ -43,13 +48,12 @@ class PowerLawEntropyModel(tf.Module):

   The penalty applied by this class is given by:
   ```
-  -log_2 p(x), with p(x) = alpha / 2 * (x + alpha) ** -2
+  log((abs(x) + alpha) / alpha)
   ```
-  Like the gamma code, this follows a symmetrized power law, but only
-  approximately for `alpha > 0`. Without `alpha`, the distribution would not be
-  normalizable, and the penalty would have a singularity at zero. Setting
-  `alpha` to a small positive value ensures that the penalty is non-negative,
-  and that its gradients are useful for optimization.
+  This encourages `x` to follow a symmetrized power law, but only approximately
+  for `alpha > 0`. Without `alpha`, the penalty would have a singularity at
+  zero. Setting `alpha` to a small positive value ensures that the penalty is
+  non-negative, and that its gradients are useful for optimization.
   """

   def __init__(self,
@@ -123,11 +127,7 @@ def penalty(self, bottleneck):
       entropy.
     """
     bottleneck = tf.convert_to_tensor(bottleneck, dtype=self.bottleneck_dtype)
-    log_alpha = tf.math.log(
-        tf.constant(self.alpha, dtype=self.bottleneck_dtype))
-    log_2 = tf.math.log(tf.constant(2, dtype=self.bottleneck_dtype))
-    penalty = ((1. - log_alpha / log_2) +
-               tf.math.log(abs(bottleneck) + self.alpha) * (2. / log_2))
+    penalty = tf.math.log((abs(bottleneck) + self.alpha) / self.alpha)
     return tf.reduce_sum(penalty, axis=tuple(range(-self.coding_rank, 0)))

   @tf.Module.with_name_scope
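For `abs(x)` well above `alpha`, the new penalty grows like `log(x)` while the gamma code length `1 + 2 floor(log_2(x))` grows like `2 log_2(x)`, so the two are affinely related rather than equal, which is why the updated test below checks correlation instead of a fixed gap. A small illustrative sketch (the `alpha` value and sample points are arbitrary):

```python
import math

def gamma_code_length(x: int) -> int:
  """Elias gamma code length per the docstring: 1 + 2 * floor(log_2(x))."""
  return 1 + 2 * math.floor(math.log2(x))

alpha = 0.01  # arbitrary small positive value
for x in (1, 2, 10, 100, 1000):
  penalty = math.log((abs(x) + alpha) / alpha)
  print(f"x={x:5d}  code length={gamma_code_length(x):2d}  penalty={penalty:5.2f}")
```

Both columns grow linearly in `log(x)`, so a high correlation is expected even though the constants differ.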

tensorflow_compression/python/entropy_models/power_law_test.py

Lines changed: 16 additions & 16 deletions
@@ -14,6 +14,7 @@
 # ==============================================================================
 """Tests of power law entropy model."""

+import numpy as np
 import tensorflow as tf
 from tensorflow_compression.python.entropy_models.power_law import PowerLawEntropyModel

@@ -56,27 +57,26 @@ def test_compression_consistent_with_quantization(self):

   def test_penalty_is_proportional_to_code_length(self):
     em = PowerLawEntropyModel(coding_rank=1)
-    # Sample some values from a Laplacian distribution.
-    u = tf.random.uniform((100, 1), minval=-1., maxval=1.)
-    values = 100. * tf.math.log(abs(u)) * tf.sign(u)
-    # Ensure there are some large values.
-    self.assertGreater(tf.reduce_sum(tf.cast(abs(values) > 100, tf.int32)), 0)
-    strings = em.compress(tf.broadcast_to(values, (100, 100)))
+    x = tf.range(-20., 20.)[:, None]
+    x += tf.random.uniform(x.shape, -.49, .49)
+    strings = em.compress(tf.broadcast_to(x, (40, 100)))
     code_lengths = tf.cast(tf.strings.length(strings, unit="BYTE"), tf.float32)
     code_lengths *= 8 / 100
-    penalties = em.penalty(values)
-    self.assertAllInRange(penalties - code_lengths, 4, 7)
+    penalties = em.penalty(x)
+    # There are some fluctuations due to `alpha`, `floor`, and rounding, but we
+    # expect a high degree of correlation between code lengths and penalty.
+    self.assertGreater(np.corrcoef(code_lengths, penalties)[0, 1], .96)

-  def test_penalty_is_differentiable(self):
+  def test_penalty_is_nonnegative_and_differentiable(self):
     em = PowerLawEntropyModel(coding_rank=1)
-    # Sample some values from a Laplacian distribution.
-    u = tf.random.uniform((100, 1), minval=-1., maxval=1.)
-    values = 100. * tf.math.log(abs(u)) * tf.sign(u)
+    x = tf.range(-20., 20.)[:, None]
+    x += tf.random.uniform(x.shape, -.49, .49)
     with tf.GradientTape() as tape:
-      tape.watch(values)
-      penalties = em.penalty(values)
-    gradients = tape.gradient(penalties, values)
-    self.assertAllEqual(tf.sign(gradients), tf.sign(values))
+      tape.watch(x)
+      penalties = em.penalty(x)
+    gradients = tape.gradient(penalties, x)
+    self.assertAllGreaterEqual(penalties, 0)
+    self.assertAllEqual(tf.sign(gradients), tf.sign(x))

   def test_compression_works_in_tf_function(self):
     samples = tf.random.stateless_normal([100], (34, 232))
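Read together, the tests suggest the intended usage pattern: `penalty` as a differentiable rate proxy during training, `compress` for actual entropy coding. A minimal sketch using only the calls exercised in this commit; the shapes and scale of the sample values are arbitrary:

```python
import tensorflow as tf
from tensorflow_compression.python.entropy_models.power_law import PowerLawEntropyModel

em = PowerLawEntropyModel(coding_rank=1)
x = tf.random.normal((8, 100), stddev=10.)  # arbitrary bottleneck values

# Differentiable, non-negative rate proxy: one scalar per batch element,
# suitable as an additive term in a training loss.
rate = em.penalty(x)

# Actual compression: one string per batch element.
strings = em.compress(x)
bits = 8. * tf.cast(tf.strings.length(strings, unit="BYTE"), tf.float32)
```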
