# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# pylint: disable=missing-docstring
"""Train a simple model with MultiHeadAttention layer on MNIST dataset
and prune it.
"""
import tensorflow as tf

from tensorflow_model_optimization.python.core.keras import test_utils as keras_test_utils
from tensorflow_model_optimization.python.core.sparsity.keras import prune
from tensorflow_model_optimization.python.core.sparsity.keras import pruning_callbacks
from tensorflow_model_optimization.python.core.sparsity.keras import pruning_schedule
from tensorflow_model_optimization.python.core.sparsity.keras import pruning_utils
from tensorflow_model_optimization.python.core.sparsity.keras import pruning_wrapper

# Fix the global seed so successive runs of this example are reproducible.
tf.random.set_seed(42)

# Convenience alias for the pruning schedule used below.
ConstantSparsity = pruning_schedule.ConstantSparsity

# Load MNIST dataset (downloads on first use).
mnist = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Normalize the input images so that each pixel value is between 0 and 1.
train_images = train_images / 255.0
test_images = test_images / 255.0
39+
# Define the model: self-attention over the 28x28 image (rows attend to
# rows), flattened into a 10-way logits layer.
# NOTE: the input tensor is named `model_input` (not `input`) to avoid
# shadowing the Python builtin.
model_input = tf.keras.layers.Input(shape=(28, 28))
x = tf.keras.layers.MultiHeadAttention(num_heads=2, key_dim=16, name="mha")(
    query=model_input, value=model_input
)
x = tf.keras.layers.Flatten()(x)
out = tf.keras.layers.Dense(10)(x)
model = tf.keras.Model(inputs=model_input, outputs=out)

# Train the digit classification model. Labels are integer class ids, so we
# use sparse crossentropy on the raw logits.
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

model.fit(
    train_images, train_labels, epochs=10, validation_split=0.1,
)

# Report baseline (unpruned) test performance.
score = model.evaluate(test_images, test_labels, verbose=0)
print('Model test loss:', score[0])
print('Model test accuracy:', score[1])
63+
# Define parameters for pruning.

batch_size = 128
epochs = 3
validation_split = 0.1  # 10% of training set will be used for validation set.

callbacks = [
    # Required: keeps the pruning step counter in sync with training steps.
    pruning_callbacks.UpdatePruningStep(),
    # Optional: writes sparsity/threshold summaries for TensorBoard.
    pruning_callbacks.PruningSummaries(log_dir='/tmp/logs')
]

pruning_params = {
    # Hold 75% sparsity, starting at step 2000, updating masks every 100 steps.
    'pruning_schedule': ConstantSparsity(0.75, begin_step=2000, frequency=100)
}

# Wrap the trained model so its prunable layers are pruned during fine-tuning.
model_for_pruning = prune.prune_low_magnitude(model, **pruning_params)

# `prune_low_magnitude` requires a recompile.
model_for_pruning.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

model_for_pruning.fit(
    train_images,
    train_labels,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callbacks,
    validation_split=validation_split,
)

# Report test performance of the pruned, fine-tuned model.
score = model_for_pruning.evaluate(test_images, test_labels, verbose=0)
print('Pruned model test loss:', score[0])
print('Pruned model test accuracy:', score[1])