[SPARK-30044][ML] MNB/CNB/BNB use empty sigma matrix instead of null

zhengruifeng · zhengruifeng · commit 4021354b73dd · 2019-12-03T10:02:23.000+08:00
### What changes were proposed in this pull request? MNB/CNB/BNB (the multinomial, complement, and Bernoulli Naive Bayes models) now use an empty (0 x 0) sigma matrix instead of `null`. ### Why are the changes needed? 1. Using an empty sigma matrix simplifies the implementation. 2. I have been reviewing the FM implementation recently; FM models have an optional bias and an optional linear part. It seems more reasonable to represent an optional part as an empty vector/matrix or a zero value than as `null`. ### Does this PR introduce any user-facing change? Yes: for these model types, `sigma` changes from `null` to an empty matrix. ### How was this patch tested? Updated the existing test suites. Closes apache#26679 from zhengruifeng/nb_use_empty_sigma. Authored-by: zhengruifeng <ruifengz@foxmail.com> Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -19,7 +19,6 @@ package org.apache.spark.ml.classification
 
 import org.apache.hadoop.fs.Path
 import org.json4s.DefaultFormats
-import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.annotation.Since
 import org.apache.spark.ml.PredictorParams
@@ -243,12 +242,12 @@ class NaiveBayes @Since("1.5.0") (
     $(modelType) match {
       case Multinomial | Bernoulli =>
         val theta = new DenseMatrix(numLabels, numFeatures, thetaArray, true)
-        new NaiveBayesModel(uid, pi.compressed, theta.compressed, null)
+        new NaiveBayesModel(uid, pi.compressed, theta.compressed, Matrices.zeros(0, 0))
           .setOldLabels(labelArray)
       case Complement =>
         // Since the CNB compute the coefficient in a complement way.
         val theta = new DenseMatrix(numLabels, numFeatures, thetaArray.map(v => -v), true)
-        new NaiveBayesModel(uid, pi.compressed, theta.compressed, null)
+        new NaiveBayesModel(uid, pi.compressed, theta.compressed, Matrices.zeros(0, 0))
     }
   }
 
@@ -575,8 +574,7 @@ object NaiveBayesModel extends MLReadable[NaiveBayesModel] {
   private[NaiveBayesModel] class NaiveBayesModelWriter(instance: NaiveBayesModel) extends MLWriter {
     import NaiveBayes._
 
-    private case class Data(pi: Vector, theta: Matrix)
-    private case class GaussianData(pi: Vector, theta: Matrix, sigma: Matrix)
+    private case class Data(pi: Vector, theta: Matrix, sigma: Matrix)
 
     override protected def saveImpl(path: String): Unit = {
       // Save metadata and Params
@@ -585,21 +583,17 @@ object NaiveBayesModel extends MLReadable[NaiveBayesModel] {
 
       instance.getModelType match {
         case Multinomial | Bernoulli | Complement =>
-          // Save model data: pi, theta
-          require(instance.sigma == null)
-          val data = Data(instance.pi, instance.theta)
-          sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath)
-
+          require(instance.sigma.numRows == 0 && instance.sigma.numCols == 0)
         case Gaussian =>
-          require(instance.sigma != null)
-          val data = GaussianData(instance.pi, instance.theta, instance.sigma)
-          sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath)
+          require(instance.sigma.numRows != 0 && instance.sigma.numCols != 0)
       }
+
+      val data = Data(instance.pi, instance.theta, instance.sigma)
+      sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath)
     }
   }
 
   private class NaiveBayesModelReader extends MLReader[NaiveBayesModel] {
-    import NaiveBayes._
 
     /** Checked against metadata when loading model */
     private val className = classOf[NaiveBayesModel].getName
@@ -608,19 +602,17 @@ object NaiveBayesModel extends MLReadable[NaiveBayesModel] {
       implicit val format = DefaultFormats
       val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
       val (major, minor) = VersionUtils.majorMinorVersion(metadata.sparkVersion)
-      val modelTypeJson = metadata.getParamValue("modelType")
-      val modelType = Param.jsonDecode[String](compact(render(modelTypeJson)))
 
       val dataPath = new Path(path, "data").toString
       val data = sparkSession.read.parquet(dataPath)
       val vecConverted = MLUtils.convertVectorColumnsToML(data, "pi")
 
-      val model = if (major.toInt < 3 || modelType != Gaussian) {
+      val model = if (major.toInt < 3) {
         val Row(pi: Vector, theta: Matrix) =
           MLUtils.convertMatrixColumnsToML(vecConverted, "theta")
             .select("pi", "theta")
             .head()
-        new NaiveBayesModel(metadata.uid, pi, theta, null)
+        new NaiveBayesModel(metadata.uid, pi, theta, Matrices.zeros(0, 0))
       } else {
         val Row(pi: Vector, theta: Matrix, sigma: Matrix) =
           MLUtils.convertMatrixColumnsToML(vecConverted, "theta", "sigma")
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
@@ -96,8 +96,8 @@ class NaiveBayesSuite extends MLTest with DefaultReadWriteTest {
     assert(Vectors.dense(model.pi.toArray.map(math.exp)) ~==
       Vectors.dense(piData.toArray.map(math.exp)) absTol 0.05, "pi mismatch")
     assert(model.theta.map(math.exp) ~== thetaData.map(math.exp) absTol 0.05, "theta mismatch")
-    if (sigmaData == null) {
-      assert(model.sigma == null, "sigma mismatch")
+    if (sigmaData === Matrices.zeros(0, 0)) {
+      assert(model.sigma === Matrices.zeros(0, 0), "sigma mismatch")
     } else {
       assert(model.sigma.map(math.exp) ~== sigmaData.map(math.exp) absTol 0.05,
         "sigma mismatch")
@@ -166,7 +166,7 @@ class NaiveBayesSuite extends MLTest with DefaultReadWriteTest {
     ParamsSuite.checkParams(new NaiveBayes)
     val model = new NaiveBayesModel("nb", pi = Vectors.dense(Array(0.2, 0.8)),
       theta = new DenseMatrix(2, 3, Array(0.1, 0.2, 0.3, 0.4, 0.6, 0.4)),
-      sigma = null)
+      sigma = Matrices.zeros(0, 0))
     ParamsSuite.checkParams(model)
   }
 
@@ -195,7 +195,7 @@ class NaiveBayesSuite extends MLTest with DefaultReadWriteTest {
     val nb = new NaiveBayes().setSmoothing(1.0).setModelType("multinomial")
     val model = nb.fit(testDataset)
 
-    validateModelFit(pi, theta, null, model)
+    validateModelFit(pi, theta, Matrices.zeros(0, 0), model)
     assert(model.hasParent)
     MLTestingUtils.checkCopyAndUids(nb, model)
 
@@ -281,7 +281,7 @@ class NaiveBayesSuite extends MLTest with DefaultReadWriteTest {
     val nb = new NaiveBayes().setSmoothing(1.0).setModelType("bernoulli")
     val model = nb.fit(testDataset)
 
-    validateModelFit(pi, theta, null, model)
+    validateModelFit(pi, theta, Matrices.zeros(0, 0), model)
     assert(model.hasParent)
 
     val validationDataset =
@@ -512,7 +512,7 @@ class NaiveBayesSuite extends MLTest with DefaultReadWriteTest {
       if (model.getModelType == "gaussian") {
         assert(model.sigma === model2.sigma)
       } else {
-        assert(model.sigma === null && model2.sigma === null)
+        assert(model.sigma === Matrices.zeros(0, 0) && model2.sigma === Matrices.zeros(0, 0))
       }
     }
     val nb = new NaiveBayes()
@@ -531,7 +531,7 @@ class NaiveBayesSuite extends MLTest with DefaultReadWriteTest {
       nb, spark) { (expected, actual) =>
         assert(expected.pi === actual.pi)
         assert(expected.theta === actual.theta)
-        assert(expected.sigma === null && actual.sigma === null)
+        assert(expected.sigma === Matrices.zeros(0, 0) && actual.sigma === Matrices.zeros(0, 0))
       }
   }
 }
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
@@ -1934,8 +1934,8 @@ class NaiveBayes(JavaProbabilisticClassifier, _NaiveBayesParams, HasThresholds,
     DenseVector([-0.81..., -0.58...])
     >>> model.theta
     DenseMatrix(2, 2, [-0.91..., -0.51..., -0.40..., -1.09...], 1)
-    >>> model.sigma == None
-    True
+    >>> model.sigma
+    DenseMatrix(0, 0, [...], ...)
     >>> test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF()
     >>> model.predict(test0.head().features)
     1.0
@@ -1978,8 +1978,8 @@ class NaiveBayes(JavaProbabilisticClassifier, _NaiveBayesParams, HasThresholds,
     'complement'
     >>> model5.theta
     DenseMatrix(2, 2, [...], 1)
-    >>> model5.sigma == None
-    True
+    >>> model5.sigma
+    DenseMatrix(0, 0, [...], ...)
 
     .. versionadded:: 1.5.0
     """