[SPARK-50995][ML][PYTHON][CONNECT] Support clusterCenters for KMeans and BisectingKMeans

zhengruifeng · zhengruifeng · commit 2c32f935d341 · 2025-01-27T09:23:11.000+08:00
### What changes were proposed in this pull request? Support `clusterCenters` for KMeans and BisectingKMeans. To simplify the serde of `Array[Vector]`, the cluster centers are combined into a single `Matrix`. ### Why are the changes needed? For feature parity on Spark Connect. ### Does this PR introduce _any_ user-facing change? Yes, a new API is supported on Connect. ### How was this patch tested? Added tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #49680 from zhengruifeng/ml_connect_km_cluster. Authored-by: Ruifeng Zheng <ruifengz@apache.org> Signed-off-by: Ruifeng Zheng <ruifengz@apache.org> (cherry picked from commit 66c2920) Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
@@ -21,7 +21,7 @@ import org.apache.hadoop.fs.Path
 
 import org.apache.spark.annotation.Since
 import org.apache.spark.ml.{Estimator, Model}
-import org.apache.spark.ml.linalg.Vector
+import org.apache.spark.ml.linalg._
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util._
@@ -142,6 +142,9 @@ class BisectingKMeansModel private[ml] (
   @Since("2.0.0")
   def clusterCenters: Array[Vector] = parentModel.clusterCenters.map(_.asML)
 
+  private[ml] def clusterCenterMatrix: Matrix =
+    Matrices.fromVectors(clusterCenters.toSeq)
+
   /**
    * Computes the sum of squared distances between the input points and their corresponding cluster
    * centers.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -187,6 +187,9 @@ class KMeansModel private[ml] (
   @Since("2.0.0")
   def clusterCenters: Array[Vector] = parentModel.clusterCenters.map(_.asML)
 
+  private[ml] def clusterCenterMatrix: Matrix =
+    Matrices.fromVectors(clusterCenters.toSeq)
+
   /**
    * Returns a [[org.apache.spark.ml.util.GeneralMLWriter]] instance for this ML instance.
    *
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
@@ -686,7 +686,8 @@ def setPredictionCol(self, value: str) -> "KMeansModel":
     @since("1.5.0")
     def clusterCenters(self) -> List[np.ndarray]:
         """Get the cluster centers, represented as a list of NumPy arrays."""
-        return [c.toArray() for c in self._call_java("clusterCenters")]
+        matrix = self._call_java("clusterCenterMatrix")
+        return [vec for vec in matrix.toArray()]
 
     @property
     @since("2.1.0")
@@ -1006,7 +1007,8 @@ def setPredictionCol(self, value: str) -> "BisectingKMeansModel":
     @since("2.0.0")
     def clusterCenters(self) -> List[np.ndarray]:
         """Get the cluster centers, represented as a list of NumPy arrays."""
-        return [c.toArray() for c in self._call_java("clusterCenters")]
+        matrix = self._call_java("clusterCenterMatrix")
+        return [vec for vec in matrix.toArray()]
 
     @since("2.0.0")
     def computeCost(self, dataset: DataFrame) -> float:
diff --git a/python/pyspark/ml/tests/test_clustering.py b/python/pyspark/ml/tests/test_clustering.py
@@ -69,6 +69,12 @@ def test_kmeans(self):
 
         model = km.fit(df)
         self.assertEqual(km.uid, model.uid)
+
+        centers = model.clusterCenters()
+        self.assertEqual(len(centers), 2)
+        self.assertTrue(np.allclose(centers[0], [-0.372, -0.338], atol=1e-3), centers[0])
+        self.assertTrue(np.allclose(centers[1], [0.8625, 0.83375], atol=1e-3), centers[1])
+
         # TODO: support KMeansModel.numFeatures in Python
         # self.assertEqual(model.numFeatures, 2)
 
@@ -138,6 +144,12 @@ def test_bisecting_kmeans(self):
 
         model = bkm.fit(df)
         self.assertEqual(bkm.uid, model.uid)
+
+        centers = model.clusterCenters()
+        self.assertEqual(len(centers), 2)
+        self.assertTrue(np.allclose(centers[0], [-0.372, -0.338], atol=1e-3), centers[0])
+        self.assertTrue(np.allclose(centers[1], [0.8625, 0.83375], atol=1e-3), centers[1])
+
         # TODO: support KMeansModel.numFeatures in Python
         # self.assertEqual(model.numFeatures, 2)
 
diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala
@@ -584,11 +584,11 @@ private[ml] object MLUtils {
     (classOf[LinearRegressionTrainingSummary], Set("objectiveHistory", "totalIterations")),
 
     // Clustering Models
-    (classOf[KMeansModel], Set("predict", "numFeatures", "clusterCenters")),
+    (classOf[KMeansModel], Set("predict", "numFeatures", "clusterCenterMatrix")),
     (classOf[KMeansSummary], Set("trainingCost")),
     (
       classOf[BisectingKMeansModel],
-      Set("predict", "numFeatures", "clusterCenters", "computeCost")),
+      Set("predict", "numFeatures", "clusterCenterMatrix", "computeCost")),
     (classOf[BisectingKMeansSummary], Set("trainingCost")),
     (
       classOf[GaussianMixtureModel],

Original file line number	Diff line number	Diff line change
`@@ -187,6 +187,9 @@ class KMeansModel private[ml] (`
`187`	`187`	`@Since("2.0.0")`
`188`	`188`	`def clusterCenters: Array[Vector] = parentModel.clusterCenters.map(_.asML)`
`189`	`189`
	`190`	`+ private[ml] def clusterCenterMatrix: Matrix =`
	`191`	`+ Matrices.fromVectors(clusterCenters.toSeq)`
	`192`	`+`
`190`	`193`	`/**`
`191`	`194`	`* Returns a [[org.apache.spark.ml.util.GeneralMLWriter]] instance for this ML instance.`
`192`	`195`	`*`
Original file line number	Diff line number	Diff line change
`@@ -584,11 +584,11 @@ private[ml] object MLUtils {`
`584`	`584`	`(classOf[LinearRegressionTrainingSummary], Set("objectiveHistory", "totalIterations")),`
`585`	`585`
`586`	`586`	`// Clustering Models`
`587`		`- (classOf[KMeansModel], Set("predict", "numFeatures", "clusterCenters")),`
	`587`	`+ (classOf[KMeansModel], Set("predict", "numFeatures", "clusterCenterMatrix")),`
`588`	`588`	`(classOf[KMeansSummary], Set("trainingCost")),`
`589`	`589`	`(`
`590`	`590`	`classOf[BisectingKMeansModel],`
`591`		`- Set("predict", "numFeatures", "clusterCenters", "computeCost")),`
	`591`	`+ Set("predict", "numFeatures", "clusterCenterMatrix", "computeCost")),`
`592`	`592`	`(classOf[BisectingKMeansSummary], Set("trainingCost")),`
`593`	`593`	`(`
`594`	`594`	`classOf[GaussianMixtureModel],`