[SPARK-14516][FOLLOWUP] Adding ClusteringEvaluator to examples

mgaido91 · srowen · commit ec873a4fd20a · 2017-12-11T06:35:31.000-06:00
## What changes were proposed in this pull request? SPARK-14516 introduced ClusteringEvaluator, but no reference to it was added to the documentation, and the examples still relied on the sum of squared errors to show how to evaluate a clustering model. This PR adds ClusteringEvaluator to the examples. ## How was this patch tested? Manual runs of the examples. Author: Marco Gaido <mgaido@hortonworks.com> Closes #19676 from mgaido91/SPARK-14516_examples.
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java
@@ -20,6 +20,7 @@
 // $example on$
 import org.apache.spark.ml.clustering.KMeansModel;
 import org.apache.spark.ml.clustering.KMeans;
+import org.apache.spark.ml.evaluation.ClusteringEvaluator;
 import org.apache.spark.ml.linalg.Vector;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
@@ -51,9 +52,14 @@ public static void main(String[] args) {
     KMeans kmeans = new KMeans().setK(2).setSeed(1L);
     KMeansModel model = kmeans.fit(dataset);
 
-    // Evaluate clustering by computing Within Set Sum of Squared Errors.
-    double WSSSE = model.computeCost(dataset);
-    System.out.println("Within Set Sum of Squared Errors = " + WSSSE);
+    // Make predictions
+    Dataset<Row> predictions = model.transform(dataset);
+
+    // Evaluate clustering by computing Silhouette score
+    ClusteringEvaluator evaluator = new ClusteringEvaluator();
+
+    double silhouette = evaluator.evaluate(predictions);
+    System.out.println("Silhouette with squared euclidean distance = " + silhouette);
 
     // Shows the result.
     Vector[] centers = model.clusterCenters();
diff --git a/examples/src/main/python/ml/kmeans_example.py b/examples/src/main/python/ml/kmeans_example.py
@@ -19,6 +19,7 @@
 
 # $example on$
 from pyspark.ml.clustering import KMeans
+from pyspark.ml.evaluation import ClusteringEvaluator
 # $example off$
 
 from pyspark.sql import SparkSession
@@ -45,9 +46,14 @@
     kmeans = KMeans().setK(2).setSeed(1)
     model = kmeans.fit(dataset)
 
-    # Evaluate clustering by computing Within Set Sum of Squared Errors.
-    wssse = model.computeCost(dataset)
-    print("Within Set Sum of Squared Errors = " + str(wssse))
+    # Make predictions
+    predictions = model.transform(dataset)
+
+    # Evaluate clustering by computing Silhouette score
+    evaluator = ClusteringEvaluator()
+
+    silhouette = evaluator.evaluate(predictions)
+    print("Silhouette with squared euclidean distance = " + str(silhouette))
 
     # Shows the result.
     centers = model.clusterCenters()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala
@@ -21,6 +21,7 @@ package org.apache.spark.examples.ml
 
 // $example on$
 import org.apache.spark.ml.clustering.KMeans
+import org.apache.spark.ml.evaluation.ClusteringEvaluator
 // $example off$
 import org.apache.spark.sql.SparkSession
 
@@ -47,9 +48,14 @@ object KMeansExample {
     val kmeans = new KMeans().setK(2).setSeed(1L)
     val model = kmeans.fit(dataset)
 
-    // Evaluate clustering by computing Within Set Sum of Squared Errors.
-    val WSSSE = model.computeCost(dataset)
-    println(s"Within Set Sum of Squared Errors = $WSSSE")
+    // Make predictions
+    val predictions = model.transform(dataset)
+
+    // Evaluate clustering by computing Silhouette score
+    val evaluator = new ClusteringEvaluator()
+
+    val silhouette = evaluator.evaluate(predictions)
+    println(s"Silhouette with squared euclidean distance = $silhouette")
 
     // Shows the result.
     println("Cluster Centers: ")