
Commit 19eb16c

Johannes Ballé authored and copybara-github committed
Fixes Keras layer reparameterization in eager mode.
Storing a "computed" layer parameter, such as a reparameterized kernel, as an object attribute in the `build()` function breaks backpropagation in eager mode: if `build()` is called outside of (i.e., before) a `GradientTape` scope, the gradients from the layer output to its variables are disconnected.

One fix is to make `Parameterizer`s return an object that, like a variable, has a `value()` method, and to expose the layer's parameters via `@property`s. Each time a parameter is accessed, the property calls `value()`, recomputing the parameter from its underlying variable. This way, the dependency is tracked by whichever `GradientTape` scope is active when the layer is called.

PiperOrigin-RevId: 292501734
Change-Id: Idec76b516f7799c32e5b0375b09eb76a63e0372f
1 parent 44811b2 commit 19eb16c
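
To illustrate the failure mode and the fix outside of this codebase, here is a minimal, hypothetical sketch (`BrokenLayer`, `FixedLayer`, and the `softplus` reparameterization are stand-ins, not the library's code):

import tensorflow as tf


class BrokenLayer(tf.keras.layers.Layer):
  """Anti-pattern: the reparameterized kernel is computed once in build()."""

  def build(self, input_shape):
    self._var = self.add_weight(name="var", shape=(), initializer="ones")
    # Computed here, outside any later GradientTape scope, so no tape ever
    # records the dependency of `self.kernel` on `self._var`.
    self.kernel = tf.math.softplus(self._var)
    super().build(input_shape)

  def call(self, inputs):
    return inputs * self.kernel


class FixedLayer(tf.keras.layers.Layer):
  """Fix: recompute the parameter on every access via a property."""

  def build(self, input_shape):
    self._var = self.add_weight(name="var", shape=(), initializer="ones")
    super().build(input_shape)

  @property
  def kernel(self):
    # Re-evaluated inside whatever tape scope is active when accessed.
    return tf.math.softplus(self._var)

  def call(self, inputs):
    return inputs * self.kernel


x = tf.ones((2,))
for layer in (BrokenLayer(), FixedLayer()):
  layer.build(x.shape)  # build() runs before (outside) the tape
  with tf.GradientTape() as tape:
    y = layer(x)
  print(tape.gradient(y, layer.trainable_variables))  # [None] vs. a tensor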

6 files changed: +85 −67 lines


tensorflow_compression/python/layers/BUILD

Lines changed: 7 additions & 7 deletions
@@ -34,6 +34,13 @@ py_library(
     deps = [":parameterizers"],
 )
 
+py_test(
+    name = "gdn_test",
+    srcs = ["gdn_test.py"],
+    python_version = "PY3",
+    deps = [":gdn"],
+)
+
 py_library(
     name = "initializers",
     srcs = ["initializers.py"],
@@ -68,13 +75,6 @@ py_test(
     deps = [":entropy_models"],
 )
 
-py_test(
-    name = "gdn_test",
-    srcs = ["gdn_test.py"],
-    python_version = "PY3",
-    deps = [":gdn"],
-)
-
 py_test(
     name = "parameterizers_test",
     srcs = ["parameterizers_test.py"],

tensorflow_compression/python/layers/gdn.py

Lines changed: 15 additions & 5 deletions
@@ -85,7 +85,7 @@ def __init__(self,
         Defaults to `NonnegativeParameterizer` with a minimum value of 0.
       **kwargs: Other keyword arguments passed to superclass (`Layer`).
     """
-    super(GDN, self).__init__(**kwargs)
+    super().__init__(**kwargs)
     self._inverse = bool(inverse)
     self._rectify = bool(rectify)
     self._gamma_init = float(gamma_init)
@@ -136,6 +136,14 @@ def gamma_parameterizer(self, val):
           "Can't set `gamma_parameterizer` once layer has been built.")
     self._gamma_parameterizer = val
 
+  @property
+  def beta(self):
+    return self._beta.value()
+
+  @property
+  def gamma(self):
+    return self._gamma.value()
+
   def _channel_axis(self):
     return {"channels_first": 1, "channels_last": -1}[self.data_format]
 
@@ -152,17 +160,17 @@ def build(self, input_shape):
 
     # Sorry, lint, but these objects really are callable ...
     # pylint:disable=not-callable
-    self.beta = self.beta_parameterizer(
+    self._beta = self.beta_parameterizer(
        name="beta", shape=[num_channels], dtype=self.dtype,
        getter=self.add_weight, initializer=tf.initializers.ones())
 
-    self.gamma = self.gamma_parameterizer(
+    self._gamma = self.gamma_parameterizer(
        name="gamma", shape=[num_channels, num_channels], dtype=self.dtype,
        getter=self.add_weight,
        initializer=tf.initializers.identity(gain=self._gamma_init))
     # pylint:enable=not-callable
 
-    self.built = True
+    super().build(input_shape)
 
   def call(self, inputs):
     inputs = tf.convert_to_tensor(inputs, dtype=self.dtype)
@@ -175,7 +183,9 @@ def call(self, inputs):
     if ndim == 2:
       norm_pool = tf.linalg.matmul(tf.math.square(inputs), self.gamma)
       norm_pool = tf.nn.bias_add(norm_pool, self.beta)
-    elif self.data_format == "channels_last" and ndim <= 5:
+    elif self.data_format == "channels_last" and ndim <= 4:
+      # TODO(unassigned): This branch should also work for ndim == 5, but
+      # currently triggers a bug in TF.
       shape = self.gamma.shape.as_list()
       gamma = tf.reshape(self.gamma, (ndim - 2) * [1] + shape)
       norm_pool = tf.nn.convolution(tf.math.square(inputs), gamma, "VALID")
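
With the properties above, reading `layer.beta` or `layer.gamma` recomputes the parameter under the active tape. A small sketch of the repaired behavior, mirroring the new gradient test below (assumes `tensorflow_compression` is importable):

import tensorflow as tf
from tensorflow_compression.python.layers import gdn

layer = gdn.GDN(inverse=False, rectify=False)
x = tf.random.uniform((1, 8), dtype=tf.float32)
_ = layer(x)  # first call triggers build() outside any tape
with tf.GradientTape() as tape:
  y = layer(x)  # beta and gamma are recomputed here, under the tape
grads = tape.gradient(y, layer.trainable_variables)
assert None not in grads  # both variables receive gradients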

tensorflow_compression/python/layers/gdn_test.py

Lines changed: 40 additions & 45 deletions
@@ -1,3 +1,4 @@
+# Lint as: python3
 # Copyright 2018 Google LLC. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,70 +15,64 @@
 # ==============================================================================
 """Tests of GDN layer."""
 
-import numpy as np
-import tensorflow.compat.v1 as tf
+import tensorflow.compat.v2 as tf
 
-from tensorflow.python.framework import test_util
 from tensorflow_compression.python.layers import gdn
 
 
-@test_util.deprecated_graph_mode_only
 class GDNTest(tf.test.TestCase):
 
-  def _run_gdn(self, x, shape, inverse, rectify, data_format):
-    inputs = tf.placeholder(tf.float32, shape)
-    layer = gdn.GDN(
-        inverse=inverse, rectify=rectify, data_format=data_format)
-    outputs = layer(inputs)
-    with self.cached_session() as sess:
-      tf.global_variables_initializer().run()
-      y, = sess.run([outputs], {inputs: x})
-    return y
-
-  def test_invalid_data_format(self):
-    x = np.random.uniform(size=(1, 2, 3, 4))
+  def test_invalid_data_format_raises_error(self):
+    x = tf.random.uniform((1, 2, 3, 4), dtype=tf.float32)
     with self.assertRaises(ValueError):
-      self._run_gdn(x, x.shape, False, False, "NHWC")
+      gdn.GDN(inverse=False, rectify=False, data_format="NHWC")(x)
 
-  def test_unknown_dim(self):
-    x = np.random.uniform(size=(1, 2, 3, 4))
+  def test_vector_input_raises_error(self):
+    x = tf.random.uniform((3,), dtype=tf.float32)
+    with self.assertRaises(ValueError):
+      gdn.GDN(inverse=False, rectify=False, data_format="channels_last")(x)
     with self.assertRaises(ValueError):
-      self._run_gdn(x, 4 * [None], False, False, "channels_last")
+      gdn.GDN(inverse=True, rectify=True, data_format="channels_first")(x)
 
-  def test_channels_last(self):
+  def test_channels_last_has_correct_output(self):
+    # This tests that the layer produces the correct output for a number of
+    # different input dimensionalities with 'channels_last' data format.
     for ndim in [2, 3, 4, 5, 6]:
-      x = np.random.uniform(size=(1, 2, 3, 4, 5, 6)[:ndim])
-      y = self._run_gdn(x, x.shape, False, False, "channels_last")
+      x = tf.random.uniform((1, 2, 3, 4, 5, 6)[:ndim], dtype=tf.float32)
+      y = gdn.GDN(inverse=False, rectify=False, data_format="channels_last")(x)
       self.assertEqual(x.shape, y.shape)
-      self.assertAllClose(y, x / np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
+      self.assertAllClose(y, x / tf.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
 
-  def test_channels_first(self):
+  def test_channels_first_has_correct_output(self):
+    # This tests that the layer produces the correct output for a number of
+    # different input dimensionalities with 'channels_first' data format.
     for ndim in [2, 3, 4, 5, 6]:
-      x = np.random.uniform(size=(6, 5, 4, 3, 2, 1)[:ndim])
-      y = self._run_gdn(x, x.shape, False, False, "channels_first")
+      x = tf.random.uniform((6, 5, 4, 3, 2, 1)[:ndim], dtype=tf.float32)
+      y = gdn.GDN(inverse=False, rectify=False, data_format="channels_first")(x)
       self.assertEqual(x.shape, y.shape)
-      self.assertAllClose(
-          y, x / np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
-
-  def test_wrong_dims(self):
-    x = np.random.uniform(size=(3,))
-    with self.assertRaises(ValueError):
-      self._run_gdn(x, x.shape, False, False, "channels_last")
-    with self.assertRaises(ValueError):
-      self._run_gdn(x, x.shape, True, True, "channels_first")
+      self.assertAllClose(y, x / tf.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
 
-  def test_igdn(self):
-    x = np.random.uniform(size=(1, 2, 3, 4))
-    y = self._run_gdn(x, x.shape, True, False, "channels_last")
+  def test_igdn_has_correct_output(self):
+    x = tf.random.uniform((1, 2, 3, 4), dtype=tf.float32)
+    y = gdn.GDN(inverse=True, rectify=False)(x)
     self.assertEqual(x.shape, y.shape)
-    self.assertAllClose(y, x * np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
+    self.assertAllClose(y, x * tf.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
 
-  def test_rgdn(self):
-    x = np.random.uniform(-.5, .5, size=(1, 2, 3, 4))
-    y = self._run_gdn(x, x.shape, False, True, "channels_last")
+  def test_rgdn_has_correct_output(self):
+    x = tf.random.uniform((1, 2, 3, 4), -.5, .5, dtype=tf.float32)
+    y = gdn.GDN(inverse=False, rectify=True)(x)
     self.assertEqual(x.shape, y.shape)
-    x = np.maximum(x, 0)
-    self.assertAllClose(y, x / np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
+    x = tf.maximum(x, 0)
+    self.assertAllClose(y, x / tf.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
+
+  def test_variables_receive_gradients(self):
+    x = tf.random.uniform((1, 2), dtype=tf.float32)
+    layer = gdn.GDN(inverse=False, rectify=True)
+    with tf.GradientTape() as g:
+      y = layer(x)
+    grads = g.gradient(y, layer.trainable_variables)
+    self.assertLen(grads, 2)
+    self.assertNotIn(None, grads)
 
 
 if __name__ == "__main__":

tensorflow_compression/python/layers/parameterizers.py

Lines changed: 19 additions & 4 deletions
@@ -21,13 +21,28 @@
 
 
 __all__ = [
+    "Parameter",
     "Parameterizer",
     "StaticParameterizer",
     "RDFTParameterizer",
     "NonnegativeParameterizer",
 ]
 
 
+class Parameter(object):
+  """Reparameterized `Layer` variable.
+
+  This object represents a parameter of a `tf.keras.layers.Layer` object which
+  isn't directly stored in a `tf.Variable`. Instead, the value is computed on
+  demand by calling its `value()` method.
+  """
+
+  def __init__(self, value):
+    if not callable(value):
+      raise TypeError("`value` must be callable without arguments.")
+    self.value = value
+
+
 class Parameterizer(object):
   """Parameterization object (abstract base class).
 
@@ -69,9 +84,9 @@ def __call__(self, getter, name, shape, dtype, initializer, regularizer=None):
           "static parameterizers.")
     if callable(self.value):
       # Treat value as initializer.
-      return self.value(shape, dtype=dtype)
+      return Parameter(lambda: self.value(shape, dtype=dtype))
     else:
-      return self.value
+      return Parameter(lambda: self.value)
 
 
 class RDFTParameterizer(Parameterizer):
@@ -137,7 +152,7 @@ def reparam(rdft):
     rdft = getter(
        name=rdft_name, shape=rdft_shape, dtype=rdft_dtype,
        initializer=rdft_initializer, regularizer=reparam_regularizer)
-    return reparam(rdft)
+    return Parameter(lambda: reparam(rdft))
 
 
 class NonnegativeParameterizer(Parameterizer):
@@ -194,4 +209,4 @@ def reparam(var):
     var = getter(
        name=reparam_name, shape=shape, dtype=dtype,
        initializer=reparam_initializer, regularizer=reparam_regularizer)
-    return reparam(var)
+    return Parameter(lambda: reparam(var))
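
The `Parameter` wrapper just defers a computation until `value()` is called. A standalone sketch of why that restores gradient flow (the `softplus` reparameterization here is a hypothetical stand-in for the library's `reparam` functions):

import tensorflow as tf


class Parameter(object):  # as added in the diff above
  def __init__(self, value):
    if not callable(value):
      raise TypeError("`value` must be callable without arguments.")
    self.value = value


var = tf.Variable(tf.zeros((3,)))
# The lambda captures `var`; nothing is computed until value() is called,
# so the reparameterization runs under whatever tape is active at that point.
param = Parameter(lambda: tf.math.softplus(var))

with tf.GradientTape() as tape:
  y = tf.reduce_sum(param.value())  # reparameterization happens under the tape
print(tape.gradient(y, var))  # non-None: gradient flows through softplus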

tensorflow_compression/python/layers/parameterizers_test.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ class ParameterizersTest(tf.test.TestCase):
   def _test_parameterizer(self, param, init, shape):
     var = param(
        getter=tf.get_variable, name="test", shape=shape, dtype=tf.float32,
-        initializer=init, regularizer=None)
+        initializer=init, regularizer=None).value()
     with self.cached_session() as sess:
       tf.global_variables_initializer().run()
       var, = sess.run([var])

tensorflow_compression/python/layers/signal_conv.py

Lines changed: 3 additions & 5 deletions
@@ -350,11 +350,11 @@ def bias_parameterizer(self):
 
   @property
   def kernel(self):
-    return self._kernel
+    return self._kernel.value()
 
   @property
   def bias(self):
-    return self._bias
+    return self._bias.value()
 
   @property
   def _op_data_format(self):
@@ -432,8 +432,6 @@ def build(self, input_shape):
       self._bias = getter(
          name="bias", shape=(output_channels,), dtype=self.dtype,
          initializer=self.bias_initializer, regularizer=self.bias_regularizer)
-    else:
-      self._bias = None
 
     super(_SignalConv, self).build(input_shape)
@@ -778,7 +776,7 @@ def call(self, inputs):
       self._raise_notimplemented()
 
     # Now, add bias if requested.
-    if self.bias is not None:
+    if self.use_bias:
       bias = self.bias
       if self.data_format == "channels_first":
         # As of Mar 2017, direct addition is significantly slower than
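
The last hunk's guard changes because `build()` no longer sets `self._bias = None` when bias is disabled; the attribute may simply not exist, so the configuration flag `self.use_bias` is the reliable check. A minimal sketch of the pattern (hypothetical `TinyDense` layer, not the library code):

import tensorflow as tf


class TinyDense(tf.keras.layers.Layer):
  """Hypothetical layer illustrating the configuration-flag bias guard."""

  def __init__(self, units, use_bias=True, **kwargs):
    super().__init__(**kwargs)
    self.units = units
    self.use_bias = use_bias

  def build(self, input_shape):
    self._kernel = self.add_weight(
        name="kernel", shape=(int(input_shape[-1]), self.units))
    if self.use_bias:  # `self._bias` simply doesn't exist otherwise
      self._bias = self.add_weight(name="bias", shape=(self.units,))
    super().build(input_shape)

  def call(self, inputs):
    outputs = tf.matmul(inputs, self._kernel)
    # Checking an attribute that may be unset would raise AttributeError;
    # the flag reflects the layer's configuration directly.
    if self.use_bias:
      outputs = tf.nn.bias_add(outputs, self._bias)
    return outputs


print(TinyDense(4, use_bias=False)(tf.ones((1, 3))).shape)  # (1, 4)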
