tensorflow
diff --git a/tensorflow_model_optimization/python/core/clustering/keras/cluster.py b/tensorflow_model_optimization/python/core/clustering/keras/cluster.py
Lines changed: 16 additions & 3 deletions
diff --git a/tensorflow_model_optimization/python/core/clustering/keras/cluster_integration_test.py b/tensorflow_model_optimization/python/core/clustering/keras/cluster_integration_test.py
Lines changed: 47 additions & 0 deletions
diff --git a/tensorflow_model_optimization/python/core/clustering/keras/cluster_wrapper.py b/tensorflow_model_optimization/python/core/clustering/keras/cluster_wrapper.py
Lines changed: 76 additions & 19 deletions
@@ -128,12 +128,11 @@ def cluster_weights(
       to_cluster,
       number_of_clusters,
       cluster_centroids_init,
-      preserve_sparsity=False,
       **kwargs)
 
 
 def _cluster_weights(to_cluster, number_of_clusters, cluster_centroids_init,
-                     preserve_sparsity, **kwargs):
+                     preserve_sparsity=False, cluster_per_channel=False, **kwargs):
   """Modifies a keras layer or model to be clustered during training.
 
   This function wraps a keras model or layer with clustering functionality
@@ -158,6 +157,7 @@ def _cluster_weights(to_cluster, number_of_clusters, cluster_centroids_init,
   clustering_params = {
     'number_of_clusters': 8,
     'cluster_centroids_init': CentroidInitialization.DENSITY_BASED,
+    'cluster_per_channel': False,
     'preserve_sparsity': False
   }
 
@@ -170,6 +170,7 @@ def _cluster_weights(to_cluster, number_of_clusters, cluster_centroids_init,
   clustering_params = {
     'number_of_clusters': 8,
     'cluster_centroids_init': CentroidInitialization.DENSITY_BASED,
+    'cluster_per_channel': False,
     'preserve_sparsity': False
   }
 
@@ -202,8 +203,19 @@ def _cluster_weights(to_cluster, number_of_clusters, cluster_centroids_init,
         8 unique values will be used in each weight array.
       cluster_centroids_init: `tfmot.clustering.keras.CentroidInitialization`
         instance that determines how the cluster centroids will be initialized.
+      cluster_per_channel: optional boolean value that determines whether the
+        clustering should be applied separately on the individual channels, as
+        opposed to the whole kernel. Only applicable to Conv2D layers and is
+        ignored otherwise. The total number of clusters in this case is
+        number_of_clusters*num_channels. This is useful for the collaborative
+        optimization pipeline where clustering is followed by quantization:
+        Conv2D is quantized per-channel, so we end up with
+        number_of_clusters*num_channels total clusters at the end anyway.
+        Clustering per-channel from the beginning leads to better accuracy.
       preserve_sparsity (experimental): optional boolean value that determines
         whether or not sparsity preservation will be enforced during training.
+        When used along with cluster_per_channel flag above, the zero centroid
+        is treated separately and maintained individually for each channel.
       **kwargs: Additional keyword arguments to be passed to the keras layer.
         Ignored when to_cluster is not a keras layer.
 
@@ -255,7 +267,8 @@ def _add_clustering_wrapper(layer):
 
     return cluster_wrapper.ClusterWeights(layer, number_of_clusters,
                                           cluster_centroids_init,
-                                          preserve_sparsity, **kwargs)
+                                          preserve_sparsity,
+                                          cluster_per_channel, **kwargs)
 
   def _wrap_list(layers):
     output = []
 
@@ -579,5 +579,52 @@ def testMHA(self):
         nr_unique_weights = len(np.unique(weight.numpy()))
         assert nr_unique_weights == self.nr_of_clusters
 
+class ClusterPerChannelIntegrationTest(tf.test.TestCase, parameterized.TestCase):
+  """Integration tests for per-channel clustering of a Conv2D layer."""
+
+  def setUp(self):
+    self.x_train = np.random.uniform(size=(500, 32, 32))
+    self.y_train = np.random.randint(low=0, high=1024, size=(500,))
+
+    self.nr_of_clusters = 4
+    self.num_channels = 12
+    self.params_clustering = {
+      "number_of_clusters": self.nr_of_clusters,
+      "cluster_centroids_init": CentroidInitialization.KMEANS_PLUS_PLUS,
+      "cluster_per_channel": True
+    }
+
+  def _get_model(self):
+    """Returns functional model with Conv2D layer."""
+    inp = tf.keras.layers.Input(shape=(32,32), batch_size=100)
+    x = tf.keras.layers.Reshape((32, 32, 1))(inp)
+    x = tf.keras.layers.Conv2D(
+        filters=self.num_channels, kernel_size=(3, 3),
+        activation='relu')(x)
+    x = tf.keras.layers.MaxPool2D(2, 2)(x)
+    out = tf.keras.layers.Flatten()(x)
+    model = tf.keras.Model(inputs=inp, outputs=out)
+    return model
+
+  @keras_parameterized.run_all_keras_modes
+  def testPerChannel(self):
+    model = self._get_model()
+
+    clustered_model = cluster.cluster_weights(model, **self.params_clustering)
+
+    clustered_model.compile(
+      optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
+      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+      metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')])
+    clustered_model.fit(self.x_train, self.y_train, epochs=1, batch_size=100, verbose=1)
+
+    stripped_model = cluster.strip_clustering(clustered_model)
+
+    layerConv2D = stripped_model.layers[2]
+    for weight in layerConv2D.weights:
+      if 'kernel' in weight.name:
+        nr_unique_weights = len(np.unique(weight.numpy()))
+        assert nr_unique_weights == self.nr_of_clusters*self.num_channels
+
 if __name__ == "__main__":
   test.main()
@@ -58,6 +58,7 @@ def __init__(self,
                number_of_clusters,
                cluster_centroids_init=CentroidInitialization.KMEANS_PLUS_PLUS,
                preserve_sparsity=False,
+               cluster_per_channel=False,
                cluster_gradient_aggregation=GradientAggregation.SUM,
                **kwargs):
     if not isinstance(layer, Layer):
@@ -101,6 +102,17 @@ def __init__(self,
     # The number of cluster centroids
     self.number_of_clusters = number_of_clusters
 
+    # Whether to cluster Conv2D kernels per-channel.
+    # If the layer isn't a Conv2D, the flag is not
+    # applicable and is forced to False.
+    self.cluster_per_channel = (
+      cluster_per_channel if isinstance(layer, tf.keras.layers.Conv2D)
+        else False)
+
+    # Number of channels in a Conv2D layer, to be
+    # used in the case of per-channel clustering
+    self.num_channels = None
+
     # Whether to apply sparsity preservation or not
     self.preserve_sparsity = preserve_sparsity
 
@@ -137,12 +149,31 @@ def __init__(self,
         hasattr(layer, '_batch_input_shape')):
       self._batch_input_shape = self.layer._batch_input_shape
 
+    # If the layer has a data_format attribute (e.g. Conv2D),
+    # keep it in the wrapper so that it can be used for
+    # per-channel clustering; otherwise store None.
+    if hasattr(layer, 'data_format'):
+      self.data_format = self.layer.data_format
+    else:
+      self.data_format = None
+
     # Save the input shape specified in the build
     self.build_input_shape = None
 
   def _make_layer_name(self, layer):
     return '{}_{}'.format('cluster', layer.name)
 
+  def _get_zero_idx_mask(self, centroids, zero_cluster):
+    zero_idx_mask = (tf.cast(tf.math.not_equal(centroids,
+                                              zero_cluster),
+                                              dtype=tf.float32))
+    return zero_idx_mask
+
+  def _get_zero_centroid(self, centroids, zero_idx_mask):
+    zero_centroid = tf.math.multiply(centroids,
+                                     zero_idx_mask)
+    return zero_centroid
+
   def get_weight_from_layer(self, weight_name):
     return getattr(self.layer, weight_name)
 
@@ -173,15 +204,28 @@ def build(self, input_shape):
           i for i, w in enumerate(self.layer.weights) if w is original_weight)
       self.position_original_weights[position_original_weight] = weight_name
 
+      # For per-channel clustering, record the number of channels in the
+      # wrapper. NOTE(review): Keras Conv2D kernels look like (H, W, in, out)
+      # for either data_format; confirm shape[1] suits channels_first.
+      if self.cluster_per_channel:
+        self.num_channels = (
+          original_weight.shape[1] if self.data_format == "channels_first"
+            else original_weight.shape[-1])
+
+      centroid_init_factory = clustering_centroids.CentroidsInitializerFactory
+      centroid_init = centroid_init_factory.get_centroid_initializer(
+                                            self.cluster_centroids_init)(
+                                            weight, self.number_of_clusters,
+                                            self.cluster_per_channel,
+                                            self.num_channels,
+                                            self.preserve_sparsity)
+
       # Init the cluster centroids
-      cluster_centroids = (
-          clustering_centroids.CentroidsInitializerFactory
-          .get_centroid_initializer(self.cluster_centroids_init)(
-              weight, self.number_of_clusters,
-              self.preserve_sparsity).get_cluster_centroids())
+      cluster_centroids = (centroid_init.get_cluster_centroids())
+
       self.cluster_centroids[weight_name] = self.add_weight(
           '{}{}'.format('cluster_centroids_', weight_name),
-          shape=(self.number_of_clusters,),
+          shape=(cluster_centroids.shape),
           dtype=weight.dtype,
           trainable=True,
           initializer=tf.keras.initializers.Constant(value=cluster_centroids))
@@ -198,10 +242,11 @@ def build(self, input_shape):
         weight_name_no_index = weight_name
       self.clustering_algorithms[weight_name] = (
           clustering_registry.ClusteringLookupRegistry().get_clustering_impl(
-              self.layer, weight_name_no_index)
+              self.layer, weight_name_no_index, self.cluster_per_channel)
           (
               clusters_centroids=self.cluster_centroids[weight_name],
               cluster_gradient_aggregation=self.cluster_gradient_aggregation,
+              data_format=self.data_format,
           ))
 
       # Init the pulling_indices (weights associations)
@@ -233,18 +278,27 @@ def update_clustered_weights_associations(self):
     ):
 
       if self.preserve_sparsity:
-        # Set the smallest centroid to zero to force sparsity
-        # and avoid extra cluster from forming
-        zero_idx_mask = (
-            tf.cast(
-                tf.math.not_equal(
-                    self.cluster_centroids[weight_name],
-                    self.cluster_centroids[weight_name][
-                        self.zero_idx[weight_name]]),
-                dtype=tf.float32))
-        self.cluster_centroids[weight_name].assign(
-            tf.math.multiply(self.cluster_centroids[weight_name],
-                             zero_idx_mask))
+        # In the case of per-channel clustering, sparsity
+        # needs to be preserved per-channel
+        if self.cluster_per_channel:
+          for channel in range(self.num_channels):
+            zero_idx_mask = (
+              self._get_zero_idx_mask(self.cluster_centroids[weight_name][channel],
+                                      self.cluster_centroids[weight_name][channel][
+                                      self.zero_idx[weight_name][channel]]))
+            self.cluster_centroids[weight_name][channel].assign(
+                self._get_zero_centroid(self.cluster_centroids[weight_name][channel],
+                                        zero_idx_mask))
+        else:
+          # Set the smallest centroid to zero to force sparsity
+          # and avoid extra cluster from forming
+          zero_idx_mask = self._get_zero_idx_mask(self.cluster_centroids[weight_name],
+                                                  self.cluster_centroids[weight_name][
+                                                  self.zero_idx[weight_name]])
+          self.cluster_centroids[weight_name].assign(
+              self._get_zero_centroid(self.cluster_centroids[weight_name],
+                                      zero_idx_mask))
+
         # During training, the original zero weights can drift slightly.
         # We want to prevent this by forcing them to stay zero at the places
         # where they were originally zero to begin with.
@@ -284,6 +338,7 @@ def get_config(self):
         'cluster_centroids_init': self.cluster_centroids_init,
         'preserve_sparsity': self.preserve_sparsity,
         'cluster_gradient_aggregation': self.cluster_gradient_aggregation,
+        'cluster_per_channel': self.cluster_per_channel,
         **base_config
     }
     return config
@@ -296,12 +351,14 @@ def from_config(cls, config, custom_objects=None):
     cluster_centroids_init = config.pop('cluster_centroids_init')
     preserve_sparsity = config.pop('preserve_sparsity')
     cluster_gradient_aggregation = config.pop('cluster_gradient_aggregation')
+    cluster_per_channel = config.pop('cluster_per_channel')
 
     config['number_of_clusters'] = number_of_clusters
     config['cluster_centroids_init'] = cluster_config.CentroidInitialization(
         cluster_centroids_init)
     config['preserve_sparsity'] = preserve_sparsity
     config['cluster_gradient_aggregation'] = cluster_gradient_aggregation
+    config['cluster_per_channel'] = cluster_per_channel
 
     layer = tf.keras.layers.deserialize(
         config.pop('layer'), custom_objects=custom_objects)