Skip to content

Commit 568f920

Browse files
committed
[SPARK-52051][ML][CONNECT] Enable model summary when memory control is enabled
### What changes were proposed in this pull request? Enable model summary in Spark Connect when memory control is enabled. ### Why are the changes needed? Motivation: model summary is necessary in many use-cases. Although offloading is not yet supported for summaries, we can still enable them; users can use the summary object within the offloading timeout. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes apache#50843 from WeichenXu123/spark-connect-enable-summary. Lead-authored-by: Weichen Xu <[email protected]> Co-authored-by: WeichenXu <[email protected]> Signed-off-by: Weichen Xu <[email protected]>
1 parent 207e296 commit 568f920

File tree

18 files changed

+116
-232
lines changed

18 files changed

+116
-232
lines changed

common/utils/src/main/resources/error/error-conditions.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -826,15 +826,17 @@
826826
},
827827
"CACHE_INVALID" : {
828828
"message" : [
829-
"Cannot retrieve <objectName> from the ML cache. It is probably because the entry has been evicted."
829+
"Cannot retrieve Summary object <objectName> from the ML cache.",
830+
"The Summary object is evicted if it hasn't been used for a specified period of time.",
831+
"You can configure the timeout by setting the Spark cluster config 'spark.connect.session.connectML.mlCache.memoryControl.offloadingTimeout'."
830832
]
831833
},
832834
"ML_CACHE_SIZE_OVERFLOW_EXCEPTION" : {
833835
"message" : [
834836
"The model cache size in current session is about to exceed",
835837
"<mlCacheMaxSize> bytes.",
836838
"Please delete existing cached model by executing 'del model' in python client before fitting new model or loading new model,",
837-
"or increase Spark config 'spark.connect.session.connectML.mlCache.memoryControl.maxSize'."
839+
"or increase Spark config 'spark.connect.session.connectML.mlCache.memoryControl.maxStorageSize'."
838840
]
839841
},
840842
"MODEL_SIZE_OVERFLOW_EXCEPTION" : {

mllib/src/main/scala/org/apache/spark/ml/classification/FMClassifier.scala

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -224,18 +224,15 @@ class FMClassifier @Since("3.0.0") (
224224
val model = copyValues(new FMClassificationModel(uid, intercept, linear, factors))
225225
val weightColName = if (!isDefined(weightCol)) "weightCol" else $(weightCol)
226226

227-
if (SummaryUtils.enableTrainingSummary) {
228-
val (summaryModel, probabilityColName, predictionColName) = model.findSummaryModel()
229-
val summary = new FMClassificationTrainingSummaryImpl(
230-
summaryModel.transform(dataset),
231-
probabilityColName,
232-
predictionColName,
233-
$(labelCol),
234-
weightColName,
235-
objectiveHistory)
236-
model.setSummary(Some(summary))
237-
}
238-
model
227+
val (summaryModel, probabilityColName, predictionColName) = model.findSummaryModel()
228+
val summary = new FMClassificationTrainingSummaryImpl(
229+
summaryModel.transform(dataset),
230+
probabilityColName,
231+
predictionColName,
232+
$(labelCol),
233+
weightColName,
234+
objectiveHistory)
235+
model.setSummary(Some(summary))
239236
}
240237

241238
@Since("3.0.0")

mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -277,18 +277,15 @@ class LinearSVC @Since("2.2.0") (
277277
val model = copyValues(new LinearSVCModel(uid, coefficients, intercept))
278278
val weightColName = if (!isDefined(weightCol)) "weightCol" else $(weightCol)
279279

280-
if (SummaryUtils.enableTrainingSummary) {
281-
val (summaryModel, rawPredictionColName, predictionColName) = model.findSummaryModel()
282-
val summary = new LinearSVCTrainingSummaryImpl(
283-
summaryModel.transform(dataset),
284-
rawPredictionColName,
285-
predictionColName,
286-
$(labelCol),
287-
weightColName,
288-
objectiveHistory)
289-
model.setSummary(Some(summary))
290-
}
291-
model
280+
val (summaryModel, rawPredictionColName, predictionColName) = model.findSummaryModel()
281+
val summary = new LinearSVCTrainingSummaryImpl(
282+
summaryModel.transform(dataset),
283+
rawPredictionColName,
284+
predictionColName,
285+
$(labelCol),
286+
weightColName,
287+
objectiveHistory)
288+
model.setSummary(Some(summary))
292289
}
293290

294291
private def trainImpl(

mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala

Lines changed: 20 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -711,30 +711,27 @@ class LogisticRegression @Since("1.2.0") (
711711
numClasses, checkMultinomial(numClasses)))
712712
val weightColName = if (!isDefined(weightCol)) "weightCol" else $(weightCol)
713713

714-
if (SummaryUtils.enableTrainingSummary) {
715-
val (summaryModel, probabilityColName, predictionColName) = model.findSummaryModel()
716-
val logRegSummary = if (numClasses <= 2) {
717-
new BinaryLogisticRegressionTrainingSummaryImpl(
718-
summaryModel.transform(dataset),
719-
probabilityColName,
720-
predictionColName,
721-
$(labelCol),
722-
$(featuresCol),
723-
weightColName,
724-
objectiveHistory)
725-
} else {
726-
new LogisticRegressionTrainingSummaryImpl(
727-
summaryModel.transform(dataset),
728-
probabilityColName,
729-
predictionColName,
730-
$(labelCol),
731-
$(featuresCol),
732-
weightColName,
733-
objectiveHistory)
734-
}
735-
model.setSummary(Some(logRegSummary))
714+
val (summaryModel, probabilityColName, predictionColName) = model.findSummaryModel()
715+
val logRegSummary = if (numClasses <= 2) {
716+
new BinaryLogisticRegressionTrainingSummaryImpl(
717+
summaryModel.transform(dataset),
718+
probabilityColName,
719+
predictionColName,
720+
$(labelCol),
721+
$(featuresCol),
722+
weightColName,
723+
objectiveHistory)
724+
} else {
725+
new LogisticRegressionTrainingSummaryImpl(
726+
summaryModel.transform(dataset),
727+
probabilityColName,
728+
predictionColName,
729+
$(labelCol),
730+
$(featuresCol),
731+
weightColName,
732+
objectiveHistory)
736733
}
737-
model
734+
model.setSummary(Some(logRegSummary))
738735
}
739736

740737
private def createBounds(

mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -249,17 +249,14 @@ class MultilayerPerceptronClassifier @Since("1.5.0") (
249249
objectiveHistory: Array[Double]): MultilayerPerceptronClassificationModel = {
250250
val model = copyValues(new MultilayerPerceptronClassificationModel(uid, weights))
251251

252-
if (SummaryUtils.enableTrainingSummary) {
253-
val (summaryModel, _, predictionColName) = model.findSummaryModel()
254-
val summary = new MultilayerPerceptronClassificationTrainingSummaryImpl(
255-
summaryModel.transform(dataset),
256-
predictionColName,
257-
$(labelCol),
258-
"",
259-
objectiveHistory)
260-
model.setSummary(Some(summary))
261-
}
262-
model
252+
val (summaryModel, _, predictionColName) = model.findSummaryModel()
253+
val summary = new MultilayerPerceptronClassificationTrainingSummaryImpl(
254+
summaryModel.transform(dataset),
255+
predictionColName,
256+
$(labelCol),
257+
"",
258+
objectiveHistory)
259+
model.setSummary(Some(summary))
263260
}
264261
}
265262

mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala

Lines changed: 16 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -185,26 +185,23 @@ class RandomForestClassifier @Since("1.4.0") (
185185
val weightColName = if (!isDefined(weightCol)) "weightCol" else $(weightCol)
186186

187187
val (summaryModel, probabilityColName, predictionColName) = model.findSummaryModel()
188-
if (SummaryUtils.enableTrainingSummary) {
189-
val rfSummary = if (numClasses <= 2) {
190-
new BinaryRandomForestClassificationTrainingSummaryImpl(
191-
summaryModel.transform(dataset),
192-
probabilityColName,
193-
predictionColName,
194-
$(labelCol),
195-
weightColName,
196-
Array(0.0))
197-
} else {
198-
new RandomForestClassificationTrainingSummaryImpl(
199-
summaryModel.transform(dataset),
200-
predictionColName,
201-
$(labelCol),
202-
weightColName,
203-
Array(0.0))
204-
}
205-
model.setSummary(Some(rfSummary))
188+
val rfSummary = if (numClasses <= 2) {
189+
new BinaryRandomForestClassificationTrainingSummaryImpl(
190+
summaryModel.transform(dataset),
191+
probabilityColName,
192+
predictionColName,
193+
$(labelCol),
194+
weightColName,
195+
Array(0.0))
196+
} else {
197+
new RandomForestClassificationTrainingSummaryImpl(
198+
summaryModel.transform(dataset),
199+
predictionColName,
200+
$(labelCol),
201+
weightColName,
202+
Array(0.0))
206203
}
207-
model
204+
model.setSummary(Some(rfSummary))
208205
}
209206

210207
@Since("1.4.1")

mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -303,19 +303,16 @@ class BisectingKMeans @Since("2.0.0") (
303303
val parentModel = bkm.runWithWeight(instances, handlePersistence, Some(instr))
304304
val model = copyValues(new BisectingKMeansModel(uid, parentModel).setParent(this))
305305

306-
if (SummaryUtils.enableTrainingSummary) {
307-
val summary = new BisectingKMeansSummary(
308-
model.transform(dataset),
309-
$(predictionCol),
310-
$(featuresCol),
311-
$(k),
312-
$(maxIter),
313-
parentModel.trainingCost)
314-
instr.logNamedValue("clusterSizes", summary.clusterSizes)
315-
instr.logNumFeatures(model.clusterCenters.head.size)
316-
model.setSummary(Some(summary))
317-
}
318-
model
306+
val summary = new BisectingKMeansSummary(
307+
model.transform(dataset),
308+
$(predictionCol),
309+
$(featuresCol),
310+
$(k),
311+
$(maxIter),
312+
parentModel.trainingCost)
313+
instr.logNamedValue("clusterSizes", summary.clusterSizes)
314+
instr.logNumFeatures(model.clusterCenters.head.size)
315+
model.setSummary(Some(summary))
319316
}
320317

321318
@Since("2.0.0")

mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -430,14 +430,11 @@ class GaussianMixture @Since("2.0.0") (
430430

431431
val model = copyValues(new GaussianMixtureModel(uid, weights, gaussianDists))
432432
.setParent(this)
433-
if (SummaryUtils.enableTrainingSummary) {
434-
val summary = new GaussianMixtureSummary(model.transform(dataset),
435-
$(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood, iteration)
436-
instr.logNamedValue("logLikelihood", logLikelihood)
437-
instr.logNamedValue("clusterSizes", summary.clusterSizes)
438-
model.setSummary(Some(summary))
439-
}
440-
model
433+
val summary = new GaussianMixtureSummary(model.transform(dataset),
434+
$(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood, iteration)
435+
instr.logNamedValue("logLikelihood", logLikelihood)
436+
instr.logNamedValue("clusterSizes", summary.clusterSizes)
437+
model.setSummary(Some(summary))
441438
}
442439

443440
private def trainImpl(

mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -391,18 +391,16 @@ class KMeans @Since("1.5.0") (
391391
}
392392

393393
val model = copyValues(new KMeansModel(uid, oldModel).setParent(this))
394-
if (SummaryUtils.enableTrainingSummary) {
395-
val summary = new KMeansSummary(
396-
model.transform(dataset),
397-
$(predictionCol),
398-
$(featuresCol),
399-
$(k),
400-
oldModel.numIter,
401-
oldModel.trainingCost)
402-
403-
model.setSummary(Some(summary))
404-
instr.logNamedValue("clusterSizes", summary.clusterSizes)
405-
}
394+
val summary = new KMeansSummary(
395+
model.transform(dataset),
396+
$(predictionCol),
397+
$(featuresCol),
398+
$(k),
399+
oldModel.numIter,
400+
oldModel.trainingCost)
401+
402+
model.setSummary(Some(summary))
403+
instr.logNamedValue("clusterSizes", summary.clusterSizes)
406404
model
407405
}
408406

mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -418,12 +418,9 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
418418
val model = copyValues(
419419
new GeneralizedLinearRegressionModel(uid, wlsModel.coefficients, wlsModel.intercept)
420420
.setParent(this))
421-
if (SummaryUtils.enableTrainingSummary) {
422-
val trainingSummary = new GeneralizedLinearRegressionTrainingSummary(dataset, model,
423-
wlsModel.diagInvAtWA.toArray, 1, getSolver)
424-
model.setSummary(Some(trainingSummary))
425-
}
426-
model
421+
val trainingSummary = new GeneralizedLinearRegressionTrainingSummary(dataset, model,
422+
wlsModel.diagInvAtWA.toArray, 1, getSolver)
423+
model.setSummary(Some(trainingSummary))
427424
} else {
428425
val instances = validated.rdd.map {
429426
case Row(label: Double, weight: Double, offset: Double, features: Vector) =>
@@ -438,12 +435,9 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
438435
val model = copyValues(
439436
new GeneralizedLinearRegressionModel(uid, irlsModel.coefficients, irlsModel.intercept)
440437
.setParent(this))
441-
if (SummaryUtils.enableTrainingSummary) {
442-
val trainingSummary = new GeneralizedLinearRegressionTrainingSummary(dataset, model,
443-
irlsModel.diagInvAtWA.toArray, irlsModel.numIterations, getSolver)
444-
model.setSummary(Some(trainingSummary))
445-
}
446-
model
438+
val trainingSummary = new GeneralizedLinearRegressionTrainingSummary(dataset, model,
439+
irlsModel.diagInvAtWA.toArray, irlsModel.numIterations, getSolver)
440+
model.setSummary(Some(trainingSummary))
447441
}
448442

449443
model

0 commit comments

Comments
 (0)