[SPARK-50920][ML][PYTHON][CONNECT] Support NaiveBayes on Connect

zhengruifeng · zhengruifeng · commit e0437e0021ef · 2025-01-26T12:06:00.000+08:00
### What changes were proposed in this pull request?
Support NaiveBayes on Connect

### Why are the changes needed?
feature parity

### Does this PR introduce _any_ user-facing change?
yes, new algorithm supported on connect

### How was this patch tested?
added tests

### Was this patch authored or co-authored using generative AI tooling?
no

Closes #49672 from zhengruifeng/ml_connect_nb.

Authored-by: Ruifeng Zheng <ruifengz@apache.org>
Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
@@ -1048,6 +1048,8 @@ object SparseMatrix {
 @Since("2.0.0")
 object Matrices {
 
+  private[ml] val empty = new DenseMatrix(0, 0, Array.emptyDoubleArray)
+
   private[ml] def fromVectors(vectors: Seq[Vector]): Matrix = {
     val numRows = vectors.length
     val numCols = vectors.head.size
diff --git a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator
@@ -19,6 +19,7 @@
 # So register the supported estimator here if you're trying to add a new one.
 
 # classification
+org.apache.spark.ml.classification.NaiveBayes
 org.apache.spark.ml.classification.LinearSVC
 org.apache.spark.ml.classification.LogisticRegression
 org.apache.spark.ml.classification.DecisionTreeClassifier
diff --git a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer
@@ -35,6 +35,7 @@ org.apache.spark.ml.feature.HashingTF
 
 ########### Model for loading
 # classification
+org.apache.spark.ml.classification.NaiveBayesModel
 org.apache.spark.ml.classification.LinearSVCModel
 org.apache.spark.ml.classification.LogisticRegressionModel
 org.apache.spark.ml.classification.DecisionTreeClassificationModel
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -401,6 +401,9 @@ class NaiveBayesModel private[ml] (
 
   import NaiveBayes._
 
+  private[ml] def this() = this(Identifiable.randomUID("nb"),
+    Vectors.empty, Matrices.empty, Matrices.empty)
+
   /**
    * mllib NaiveBayes is a wrapper of ml implementation currently.
    * Input labels of mllib could be {-1, +1} and mllib NaiveBayesModel exposes labels,
diff --git a/python/pyspark/ml/tests/test_classification.py b/python/pyspark/ml/tests/test_classification.py
@@ -22,8 +22,10 @@
 import numpy as np
 
 from pyspark.ml.linalg import Vectors, Matrices
-from pyspark.sql import SparkSession, DataFrame
+from pyspark.sql import SparkSession, DataFrame, Row
 from pyspark.ml.classification import (
+    NaiveBayes,
+    NaiveBayesModel,
     LinearSVC,
     LinearSVCModel,
     LinearSVCSummary,
@@ -46,6 +48,66 @@
 
 
 class ClassificationTestsMixin:
+    def test_naive_bayes(self):
+        spark = self.spark
+        df = spark.createDataFrame(
+            [
+                Row(label=0.0, weight=0.1, features=Vectors.dense([0.0, 0.0])),
+                Row(label=0.0, weight=0.5, features=Vectors.dense([0.0, 1.0])),
+                Row(label=1.0, weight=1.0, features=Vectors.dense([1.0, 0.0])),
+            ]
+        )
+
+        nb = NaiveBayes(smoothing=1.0, modelType="multinomial", weightCol="weight")
+        self.assertEqual(nb.getSmoothing(), 1.0)
+        self.assertEqual(nb.getModelType(), "multinomial")
+        self.assertEqual(nb.getWeightCol(), "weight")
+
+        model = nb.fit(df)
+        self.assertEqual(model.numClasses, 2)
+        self.assertEqual(model.numFeatures, 2)
+        self.assertTrue(
+            np.allclose(model.pi.toArray(), [-0.81093022, -0.58778666], atol=1e-4), model.pi
+        )
+        self.assertTrue(
+            np.allclose(
+                model.theta.toArray(),
+                [[-0.91629073, -0.51082562], [-0.40546511, -1.09861229]],
+                atol=1e-4,
+            ),
+            model.theta,
+        )
+        self.assertTrue(np.allclose(model.sigma.toArray(), [], atol=1e-4), model.sigma)
+
+        vec = Vectors.dense(0.0, 5.0)
+        self.assertEqual(model.predict(vec), 0.0)
+        pred = model.predictRaw(vec)
+        self.assertTrue(np.allclose(pred.toArray(), [-3.36505834, -6.08084811], atol=1e-4), pred)
+        pred = model.predictProbability(vec)
+        self.assertTrue(np.allclose(pred.toArray(), [0.93795196, 0.06204804], atol=1e-4), pred)
+
+        output = model.transform(df)
+        expected_cols = [
+            "label",
+            "weight",
+            "features",
+            "rawPrediction",
+            "probability",
+            "prediction",
+        ]
+        self.assertEqual(output.columns, expected_cols)
+        self.assertEqual(output.count(), 3)
+
+        # Model save & load
+        with tempfile.TemporaryDirectory(prefix="naive_bayes") as d:
+            nb.write().overwrite().save(d)
+            nb2 = NaiveBayes.load(d)
+            self.assertEqual(str(nb), str(nb2))
+
+            model.write().overwrite().save(d)
+            model2 = NaiveBayesModel.load(d)
+            self.assertEqual(str(model), str(model2))
+
     def test_binomial_logistic_regression_with_bound(self):
         df = self.spark.createDataFrame(
             [
diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala
@@ -523,6 +523,7 @@ private[ml] object MLUtils {
     (classOf[GBTRegressionModel], Set("featureImportances", "evaluateEachIteration")),
 
     // Classification Models
+    (classOf[NaiveBayesModel], Set("pi", "theta", "sigma")),
     (classOf[LinearSVCModel], Set("intercept", "coefficients", "evaluate")),
     (
       classOf[LogisticRegressionModel],

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -523,6 +523,7 @@ private[ml] object MLUtils {` |
| `523` | `523` | `(classOf[GBTRegressionModel], Set("featureImportances", "evaluateEachIteration")),` |
| `524` | `524` | |
| `525` | `525` | `// Classification Models` |
| | `526` | `+ (classOf[NaiveBayesModel], Set("pi", "theta", "sigma")),` |
| `526` | `527` | `(classOf[LinearSVCModel], Set("intercept", "coefficients", "evaluate")),` |
| `527` | `528` | `(` |
| `528` | `529` | `classOf[LogisticRegressionModel],` |