
Commit dd8c179

mgaido91 authored and srowen committed
[SPARK-25867][ML] Remove KMeans computeCost
## What changes were proposed in this pull request?

The PR removes the deprecated method `computeCost` of `KMeans`.

## How was this patch tested?

NA

Closes apache#22875 from mgaido91/SPARK-25867.

Authored-by: Marco Gaido <[email protected]>
Signed-off-by: Sean Owen <[email protected]>
1 parent aeda76e commit dd8c179
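
For users migrating off the removed API, the deprecation notice already named both replacements: the cost on the training data lives on the model summary, and `ClusteringEvaluator` handles evaluation in general. A minimal migration sketch, assuming a `DataFrame` named `dataset` with a `features` column:

```scala
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.evaluation.ClusteringEvaluator

val model = new KMeans().setK(2).setSeed(1L).fit(dataset)

// The K-means cost on the training data (sum of squared distances of points
// to their nearest center) is recorded during fitting, so no extra pass is needed.
val trainingCost: Double = model.summary.trainingCost

// ClusteringEvaluator scores predictions on any data; note that it computes
// the silhouette measure, not the squared-distance cost computeCost returned.
val silhouette: Double = new ClusteringEvaluator().evaluate(model.transform(dataset))
```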

4 files changed: +8 −39 lines


mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala

Lines changed: 0 additions & 16 deletions

```diff
@@ -143,22 +143,6 @@ class KMeansModel private[ml] (
   @Since("2.0.0")
   def clusterCenters: Array[Vector] = parentModel.clusterCenters.map(_.asML)

-  /**
-   * Return the K-means cost (sum of squared distances of points to their nearest center) for this
-   * model on the given data.
-   *
-   * @deprecated This method is deprecated and will be removed in 3.0.0. Use ClusteringEvaluator
-   *             instead. You can also get the cost on the training dataset in the summary.
-   */
-  @deprecated("This method is deprecated and will be removed in 3.0.0. Use ClusteringEvaluator " +
-    "instead. You can also get the cost on the training dataset in the summary.", "2.4.0")
-  @Since("2.0.0")
-  def computeCost(dataset: Dataset[_]): Double = {
-    SchemaUtils.validateVectorCompatibleColumn(dataset.schema, getFeaturesCol)
-    val data = DatasetUtils.columnToOldVector(dataset, getFeaturesCol)
-    parentModel.computeCost(data)
-  }
-
   /**
    * Returns a [[org.apache.spark.ml.util.GeneralMLWriter]] instance for this ML instance.
    *
```
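
The removed method simply validated the features column, converted it to the old `mllib` vector type, and delegated to the `mllib` model. Callers that still need the squared-distance cost on data other than the training set can recompute it from the public API; a minimal sketch, assuming a fitted `KMeansModel` named `model`, a `DataFrame` named `data` with a `features` column, and an active `SparkSession` named `spark`:

```scala
import org.apache.spark.ml.linalg.{Vector, Vectors}
import spark.implicits._

// Sum of squared distances from each point to its assigned cluster center,
// i.e. the quantity computeCost used to return.
val centers = model.clusterCenters
val cost = model.transform(data)
  .select("features", "prediction")
  .as[(Vector, Int)]
  .map { case (features, cluster) => Vectors.sqdist(features, centers(cluster)) }
  .reduce(_ + _)
```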

mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala

Lines changed: 5 additions & 7 deletions

```diff
@@ -117,7 +117,6 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
       assert(clusters === Set(0, 1, 2, 3, 4))
     }

-    assert(model.computeCost(dataset) < 0.1)
     assert(model.hasParent)

     // Check validity of model summary
@@ -132,7 +131,6 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
     }
     assert(summary.cluster.columns === Array(predictionColName))
     assert(summary.trainingCost < 0.1)
-    assert(model.computeCost(dataset) == summary.trainingCost)
     val clusterSizes = summary.clusterSizes
     assert(clusterSizes.length === k)
     assert(clusterSizes.sum === numRows)
@@ -201,15 +199,15 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
   }

   test("KMean with Array input") {
-    def trainAndComputeCost(dataset: Dataset[_]): Double = {
+    def trainAndGetCost(dataset: Dataset[_]): Double = {
       val model = new KMeans().setK(k).setMaxIter(1).setSeed(1).fit(dataset)
-      model.computeCost(dataset)
+      model.summary.trainingCost
     }

     val (newDataset, newDatasetD, newDatasetF) = MLTestingUtils.generateArrayFeatureDataset(dataset)
-    val trueCost = trainAndComputeCost(newDataset)
-    val doubleArrayCost = trainAndComputeCost(newDatasetD)
-    val floatArrayCost = trainAndComputeCost(newDatasetF)
+    val trueCost = trainAndGetCost(newDataset)
+    val doubleArrayCost = trainAndGetCost(newDatasetD)
+    val floatArrayCost = trainAndGetCost(newDatasetF)

     // checking the cost is fine enough as a sanity check
     assert(trueCost ~== doubleArrayCost absTol 1e-6)
```
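
The `~==` in the last assertion comes from Spark's internal `TestingUtils` helpers, which add approximate floating-point comparison to plain `assert`. A small sketch of how the operator reads, assuming `org.apache.spark.ml.util.TestingUtils` is available on the test classpath:

```scala
import org.apache.spark.ml.util.TestingUtils._

// Passes: the absolute difference (5e-7) is within the 1e-6 tolerance.
assert(1.0000005 ~== 1.0 absTol 1e-6)
// relTol bounds the difference relative to the operands' magnitude instead.
assert(1000.0001 ~== 1000.0 relTol 1e-6)
```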

project/MimaExcludes.scala

Lines changed: 3 additions & 0 deletions

```diff
@@ -36,6 +36,9 @@ object MimaExcludes {

   // Exclude rules for 3.0.x
   lazy val v30excludes = v24excludes ++ Seq(
+    // [SPARK-25867] Remove KMeans computeCost
+    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.KMeansModel.computeCost"),
+
     // [SPARK-26127] Remove deprecated setters from tree regression and classification models
     ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.DecisionTreeClassificationModel.setSeed"),
     ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.DecisionTreeClassificationModel.setMinInfoGain"),
```
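
Removing a public method is a binary-incompatible change, so MiMa (the Migration Manager) would otherwise fail Spark's compatibility check; the new entry marks the breakage as intentional. A generic sketch of what such a rule looks like, with a deliberately hypothetical class and method name:

```scala
import com.typesafe.tools.mima.core._

// Hypothetical example: tell MiMa that this method was removed on purpose.
val exampleExcludes = Seq(
  ProblemFilters.exclude[DirectMissingMethodProblem](
    "org.example.SomeClass.removedMethod")
)
```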

python/pyspark/ml/clustering.py

Lines changed: 0 additions & 16 deletions

```diff
@@ -335,20 +335,6 @@ def clusterCenters(self):
         """Get the cluster centers, represented as a list of NumPy arrays."""
         return [c.toArray() for c in self._call_java("clusterCenters")]

-    @since("2.0.0")
-    def computeCost(self, dataset):
-        """
-        Return the K-means cost (sum of squared distances of points to their nearest center)
-        for this model on the given data.
-
-        ..note:: Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator instead.
-            You can also get the cost on the training dataset in the summary.
-        """
-        warnings.warn("Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator "
-                      "instead. You can also get the cost on the training dataset in the summary.",
-                      DeprecationWarning)
-        return self._call_java("computeCost", dataset)
-
     @property
     @since("2.1.0")
     def hasSummary(self):
@@ -387,8 +373,6 @@ class KMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol
     >>> centers = model.clusterCenters()
     >>> len(centers)
     2
-    >>> model.computeCost(df)
-    2.0
     >>> transformed = model.transform(df).select("features", "prediction")
     >>> rows = transformed.collect()
     >>> rows[0].prediction == rows[1].prediction
```
