Skip to content

Commit 83b96f6

Browse files
zhengruifeng authored and srowen committed
[SPARK-28117][ML] LDA and BisectingKMeans cache the input dataset if necessary
## What changes were proposed in this pull request?

Cache the dataset in BisectingKMeans; cache the dataset in LDA if the Online solver is chosen.

## How was this patch tested?

Existing tests.

Closes apache#24920 from zhengruifeng/bikm_cache. Authored-by: zhengruifeng <[email protected]> Signed-off-by: Sean Owen <[email protected]>
1 parent c397b06 commit 83b96f6

File tree

4 files changed

+36
-7
lines changed

4 files changed

+36
-7
lines changed

mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import org.apache.spark.mllib.linalg.VectorImplicits._
3232
import org.apache.spark.sql.{DataFrame, Dataset}
3333
import org.apache.spark.sql.functions.udf
3434
import org.apache.spark.sql.types.{IntegerType, StructType}
35+
import org.apache.spark.storage.StorageLevel
3536

3637

3738
/**
@@ -248,7 +249,12 @@ class BisectingKMeans @Since("2.0.0") (
248249
@Since("2.0.0")
249250
override def fit(dataset: Dataset[_]): BisectingKMeansModel = instrumented { instr =>
250251
transformSchema(dataset.schema, logging = true)
252+
253+
val handlePersistence = dataset.storageLevel == StorageLevel.NONE
251254
val rdd = DatasetUtils.columnToOldVector(dataset, getFeaturesCol)
255+
if (handlePersistence) {
256+
rdd.persist(StorageLevel.MEMORY_AND_DISK)
257+
}
252258

253259
instr.logPipelineStage(this)
254260
instr.logDataset(dataset)
@@ -263,6 +269,10 @@ class BisectingKMeans @Since("2.0.0") (
263269
.setDistanceMeasure($(distanceMeasure))
264270
val parentModel = bkm.run(rdd, Some(instr))
265271
val model = copyValues(new BisectingKMeansModel(uid, parentModel).setParent(this))
272+
if (handlePersistence) {
273+
rdd.unpersist()
274+
}
275+
266276
val summary = new BisectingKMeansSummary(
267277
model.transform(dataset),
268278
$(predictionCol),

mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ import org.apache.spark.mllib.util.MLUtils
4444
import org.apache.spark.rdd.RDD
4545
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
4646
import org.apache.spark.sql.functions.{col, monotonically_increasing_id, udf}
47-
import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, StructType}
47+
import org.apache.spark.sql.types.StructType
48+
import org.apache.spark.storage.StorageLevel
4849
import org.apache.spark.util.PeriodicCheckpointer
4950
import org.apache.spark.util.VersionUtils
5051

@@ -904,6 +905,18 @@ class LDA @Since("1.6.0") (
904905
checkpointInterval, keepLastCheckpoint, optimizeDocConcentration, topicConcentration,
905906
learningDecay, optimizer, learningOffset, seed)
906907

908+
val oldData = LDA.getOldDataset(dataset, $(featuresCol))
909+
910+
// The EM solver will transform this oldData to a graph, and use an internal graphCheckpointer
911+
// to update and cache the graph, so we do not need to cache it.
912+
// The Online solver directly performs sampling on the oldData and updates the model.
913+
// However, Online solver will not cache the dataset internally.
914+
val handlePersistence = dataset.storageLevel == StorageLevel.NONE &&
915+
getOptimizer.toLowerCase(Locale.ROOT) == "online"
916+
if (handlePersistence) {
917+
oldData.persist(StorageLevel.MEMORY_AND_DISK)
918+
}
919+
907920
val oldLDA = new OldLDA()
908921
.setK($(k))
909922
.setDocConcentration(getOldDocConcentration)
@@ -912,15 +925,17 @@ class LDA @Since("1.6.0") (
912925
.setSeed($(seed))
913926
.setCheckpointInterval($(checkpointInterval))
914927
.setOptimizer(getOldOptimizer)
915-
// TODO: persist here, or in old LDA?
916-
val oldData = LDA.getOldDataset(dataset, $(featuresCol))
928+
917929
val oldModel = oldLDA.run(oldData)
918930
val newModel = oldModel match {
919931
case m: OldLocalLDAModel =>
920932
new LocalLDAModel(uid, m.vocabSize, m, dataset.sparkSession)
921933
case m: OldDistributedLDAModel =>
922934
new DistributedLDAModel(uid, m.vocabSize, m, dataset.sparkSession, None)
923935
}
936+
if (handlePersistence) {
937+
oldData.unpersist()
938+
}
924939

925940
instr.logNumFeatures(newModel.vocabSize)
926941
copyValues(newModel).setParent(this)
@@ -940,8 +955,8 @@ object LDA extends MLReadable[LDA] {
940955
dataset: Dataset[_],
941956
featuresCol: String): RDD[(Long, OldVector)] = {
942957
dataset
943-
.withColumn("docId", monotonically_increasing_id())
944-
.select(col("docId"), DatasetUtils.columnToVector(dataset, featuresCol))
958+
.select(monotonically_increasing_id(),
959+
DatasetUtils.columnToVector(dataset, featuresCol))
945960
.rdd
946961
.map { case Row(docId: Long, features: Vector) =>
947962
(docId, OldVectors.fromML(features))

mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ import org.apache.spark.internal.Logging
3030
import org.apache.spark.mllib.linalg.{DenseVector, Matrices, SparseVector, Vector, Vectors}
3131
import org.apache.spark.rdd.RDD
3232
import org.apache.spark.storage.StorageLevel
33-
import org.apache.spark.util.Utils
33+
3434

3535
/**
3636
* :: DeveloperApi ::
@@ -437,6 +437,10 @@ final class OnlineLDAOptimizer extends LDAOptimizer with Logging {
437437
this.randomGenerator = new Random(lda.getSeed)
438438

439439
this.docs = docs
440+
if (this.docs.getStorageLevel == StorageLevel.NONE) {
441+
logWarning("The input data is not directly cached, which may hurt performance if its"
442+
+ " parent RDDs are also uncached.")
443+
}
440444

441445
// Initialize the variational distribution q(beta|lambda)
442446
this.lambda = getGammaMatrix(k, vocabSize)

mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest {
7171
rows =>
7272
val numClusters = rows.distinct.length
7373
// Verify we hit the edge case
74-
assert(numClusters < k && numClusters > 1)
74+
assert(numClusters > 1)
7575
}
7676
}
7777

0 commit comments

Comments
 (0)