
Commit 78fbab2

Merge pull request #871 from MohamedNourArm:toupstream/per_channel_clustering
PiperOrigin-RevId: 415427526
2 parents 94374e5 + 97aca5b commit 78fbab2

11 files changed: +575 additions, −144 deletions

tensorflow_model_optimization/python/core/clustering/keras/BUILD

Lines changed: 1 addition & 0 deletions
@@ -139,6 +139,7 @@ py_strict_test(
         ":cluster_config",
         ":clustering_centroids",
         # absl/testing:parameterized dep1,
+        # numpy dep1,
         # tensorflow dep1,
     ],
 )

tensorflow_model_optimization/python/core/clustering/keras/cluster.py

Lines changed: 27 additions & 13 deletions
@@ -124,16 +124,16 @@ def cluster_weights(
     ValueError: if the keras layer is unsupported, or the keras model contains
       an unsupported layer.
   """
-  return _cluster_weights(
-      to_cluster,
-      number_of_clusters,
-      cluster_centroids_init,
-      preserve_sparsity=False,
-      **kwargs)
+  return _cluster_weights(to_cluster, number_of_clusters,
+                          cluster_centroids_init, **kwargs)


-def _cluster_weights(to_cluster, number_of_clusters, cluster_centroids_init,
-                     preserve_sparsity, **kwargs):
+def _cluster_weights(to_cluster,
+                     number_of_clusters,
+                     cluster_centroids_init,
+                     preserve_sparsity=False,
+                     cluster_per_channel=False,
+                     **kwargs):
   """Modifies a keras layer or model to be clustered during training.

   This function wraps a keras model or layer with clustering functionality

@@ -158,6 +158,7 @@ def _cluster_weights(to_cluster, number_of_clusters, cluster_centroids_init,
   clustering_params = {
     'number_of_clusters': 8,
     'cluster_centroids_init': CentroidInitialization.DENSITY_BASED,
+    'cluster_per_channel': False,
     'preserve_sparsity': False
   }

@@ -170,6 +171,7 @@ def _cluster_weights(to_cluster, number_of_clusters, cluster_centroids_init,
   clustering_params = {
     'number_of_clusters': 8,
     'cluster_centroids_init': CentroidInitialization.DENSITY_BASED,
+    'cluster_per_channel': False,
     'preserve_sparsity': False
   }

@@ -204,6 +206,17 @@ def _cluster_weights(to_cluster, number_of_clusters, cluster_centroids_init,
       instance that determines how the cluster centroids will be initialized.
     preserve_sparsity (experimental): optional boolean value that determines
       whether or not sparsity preservation will be enforced during training.
+      When used along with cluster_per_channel flag below, the zero centroid
+      is treated separately and maintained individually for each channel.
+    cluster_per_channel: optional boolean value that determines whether the
+      clustering should be applied separately on the individual channels, as
+      opposed to the whole kernel. Only applicable to Conv2D layers and is
+      ignored otherwise. The number of clusters in this case would be
+      num_clusters*num_channels. This is useful for the collaborative
+      optimization pipeline where clustering is followed by quantization,
+      since Conv2D is quantized per-channel, so we end up with
+      num_clusters*num_channels total clusters at the end. Clustering
+      per-channel from the beginning leads to better accuracy.
     **kwargs: Additional keyword arguments to be passed to the keras layer.
       Ignored when to_cluster is not a keras layer.

@@ -255,7 +268,8 @@ def _add_clustering_wrapper(layer):

     return cluster_wrapper.ClusterWeights(layer, number_of_clusters,
                                           cluster_centroids_init,
-                                          preserve_sparsity, **kwargs)
+                                          preserve_sparsity,
+                                          cluster_per_channel, **kwargs)

   def _wrap_list(layers):
     output = []

@@ -310,11 +324,11 @@ def _strip_clustering_wrapper(layer):
         layer, input_tensors=None, clone_function=_strip_clustering_wrapper)

   elif isinstance(layer, cluster_wrapper.ClusterWeightsMHA):
-       # Update cluster associations in order to get the latest weights
-       layer.update_clustered_weights_associations()
+    # Update cluster associations in order to get the latest weights
+    layer.update_clustered_weights_associations()

-       # In case of MHA layer, use the overloaded implementation
-       return layer.strip_clustering()
+    # In case of MHA layer, use the overloaded implementation
+    return layer.strip_clustering()

   elif isinstance(layer, cluster_wrapper.ClusterWeights):
     # Update cluster associations in order to get the latest weights
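
The docstring above explains the new cluster_per_channel option. Below is a minimal, hedged usage sketch (not part of this commit) of how the flag can be exercised end-to-end through the public tfmot.clustering.keras API, assuming a build of tensorflow-model-optimization that contains this change and that the flag reaches _cluster_weights via **kwargs, as the new integration test does; the toy model, hyperparameters, and the way the Conv2D layer is located are illustrative only.

# Minimal usage sketch (not part of this commit). Assumes a build of
# tensorflow-model-optimization that contains this change.
import numpy as np
import tensorflow as tf
import tensorflow_model_optimization as tfmot

CentroidInitialization = tfmot.clustering.keras.CentroidInitialization

clustering_params = {
    'number_of_clusters': 4,
    'cluster_centroids_init': CentroidInitialization.KMEANS_PLUS_PLUS,
    'cluster_per_channel': True,  # the flag introduced by this commit
}

# Toy model: a Conv2D with 12 output channels, so per-channel clustering
# should leave at most 4 * 12 unique kernel values after stripping.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(32, 32, 1)),
    tf.keras.layers.Conv2D(filters=12, kernel_size=(3, 3), activation='relu'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(10),
])

clustered = tfmot.clustering.keras.cluster_weights(model, **clustering_params)
# ... compile and fine-tune `clustered` here, as in the integration test ...

stripped = tfmot.clustering.keras.strip_clustering(clustered)
conv = next(l for l in stripped.layers
            if isinstance(l, tf.keras.layers.Conv2D))
print('unique kernel values:',
      len(np.unique(conv.kernel.numpy())))  # expected <= 4 * 12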

tensorflow_model_optimization/python/core/clustering/keras/cluster_integration_test.py

Lines changed: 66 additions & 11 deletions
@@ -538,23 +538,26 @@ def testClusterStackedRNNCells(self):
         expected_unique_weights=self.params_clustering["number_of_clusters"],
     )

+
 class ClusterMHAIntegrationTest(tf.test.TestCase, parameterized.TestCase):
   """Integration tests for clustering MHA layer."""

   def setUp(self):
+    super(ClusterMHAIntegrationTest, self).setUp()
     self.x_train = np.random.uniform(size=(500, 32, 32))
     self.y_train = np.random.randint(low=0, high=1024, size=(500,))

     self.nr_of_clusters = 16
     self.params_clustering = {
-      "number_of_clusters": self.nr_of_clusters,
-      "cluster_centroids_init": CentroidInitialization.KMEANS_PLUS_PLUS,
+        "number_of_clusters": self.nr_of_clusters,
+        "cluster_centroids_init": CentroidInitialization.KMEANS_PLUS_PLUS,
     }

   def _get_model(self):
     """Returns functional model with MHA layer."""
-    inp = tf.keras.layers.Input(shape=(32,32), batch_size=100)
-    x = tf.keras.layers.MultiHeadAttention(num_heads=2, key_dim=16)(query=inp, value=inp)
+    inp = tf.keras.layers.Input(shape=(32, 32), batch_size=100)
+    x = tf.keras.layers.MultiHeadAttention(num_heads=2, key_dim=16)(
+        query=inp, value=inp)
     out = tf.keras.layers.Flatten()(x)
     model = tf.keras.Model(inputs=inp, outputs=out)
     return model

@@ -566,18 +569,70 @@ def testMHA(self):
     clustered_model = cluster.cluster_weights(model, **self.params_clustering)

     clustered_model.compile(
-      optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
-      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
-      metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')])
-    clustered_model.fit(self.x_train, self.y_train, epochs=1, batch_size=100, verbose=1)
+        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
+        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+        metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")])
+    clustered_model.fit(
+        self.x_train, self.y_train, epochs=1, batch_size=100, verbose=1)

     stripped_model = cluster.strip_clustering(clustered_model)

-    layerMHA = stripped_model.layers[1]
-    for weight in layerMHA.weights:
-      if 'kernel' in weight.name:
+    layer_mha = stripped_model.layers[1]
+    for weight in layer_mha.weights:
+      if "kernel" in weight.name:
         nr_unique_weights = len(np.unique(weight.numpy()))
         assert nr_unique_weights == self.nr_of_clusters

+
+class ClusterPerChannelIntegrationTest(tf.test.TestCase,
+                                       parameterized.TestCase):
+  """Integration tests for per-channel clustering of Conv2D layer."""
+
+  def setUp(self):
+    super(ClusterPerChannelIntegrationTest, self).setUp()
+    self.x_train = np.random.uniform(size=(500, 32, 32))
+    self.y_train = np.random.randint(low=0, high=1024, size=(500,))
+
+    self.nr_of_clusters = 4
+    self.num_channels = 12
+    self.params_clustering = {
+        "number_of_clusters": self.nr_of_clusters,
+        "cluster_centroids_init": CentroidInitialization.KMEANS_PLUS_PLUS,
+        "cluster_per_channel": True
+    }
+
+  def _get_model(self):
+    """Returns functional model with Conv2D layer."""
+    inp = tf.keras.layers.Input(shape=(32, 32), batch_size=100)
+    x = tf.keras.layers.Reshape((32, 32, 1))(inp)
+    x = tf.keras.layers.Conv2D(
+        filters=self.num_channels, kernel_size=(3, 3),
+        activation="relu")(x)
+    x = tf.keras.layers.MaxPool2D(2, 2)(x)
+    out = tf.keras.layers.Flatten()(x)
+    model = tf.keras.Model(inputs=inp, outputs=out)
+    return model
+
+  @keras_parameterized.run_all_keras_modes
+  def testPerChannel(self):
+    model = self._get_model()
+
+    clustered_model = cluster.cluster_weights(model, **self.params_clustering)
+
+    clustered_model.compile(
+        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
+        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+        metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")])
+    clustered_model.fit(
+        self.x_train, self.y_train, epochs=1, batch_size=100, verbose=1)
+
+    stripped_model = cluster.strip_clustering(clustered_model)
+
+    layer_conv2d = stripped_model.layers[2]
+    for weight in layer_conv2d.weights:
+      if "kernel" in weight.name:
+        nr_unique_weights = len(np.unique(weight.numpy()))
+        assert nr_unique_weights == self.nr_of_clusters*self.num_channels
+
 if __name__ == "__main__":
   test.main()
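
For intuition about the assertion at the end of testPerChannel (the stripped Conv2D kernel holds at most number_of_clusters * num_channels unique values), here is a NumPy-only sketch of the arithmetic. It is not from this commit: evenly spaced centroids stand in for the library's configured CentroidInitialization and training-time centroid updates.

# Illustrative sketch: clustering each of the C output channels of a Conv2D
# kernel to k centroids independently leaves at most k unique values per
# channel, i.e. at most k * C unique values in the whole kernel.
import numpy as np

rng = np.random.default_rng(0)
k, C = 4, 12                              # clusters per channel, channels
kernel = rng.normal(size=(3, 3, 1, C))    # HWIO Conv2D kernel layout

clustered = np.empty_like(kernel)
for c in range(C):
    channel = kernel[..., c].ravel()
    # Crude stand-in for per-channel centroid fitting: k evenly spaced
    # centroids between the channel's min and max.
    centroids = np.linspace(channel.min(), channel.max(), k)
    nearest = np.abs(channel[:, None] - centroids[None, :]).argmin(axis=1)
    clustered[..., c] = centroids[nearest].reshape(kernel[..., c].shape)

print(len(np.unique(clustered)))          # <= k * C == 48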
