Naive Bayes: Ignore absent classes in Laplacian smoothing

janezd · janezd · commit de273b100871 · 2019-02-02T17:52:45.000+01:00
diff --git a/Orange/classification/naive_bayes.py b/Orange/classification/naive_bayes.py
@@ -33,11 +33,23 @@ def fit_storage(self, table):
         cont = contingency.get_contingencies(table)
         class_freq = np.array(np.diag(
             contingency.get_contingency(table, table.domain.class_var)))
-        class_prob = (class_freq + 1) / (np.sum(class_freq) + len(class_freq))
+        nclss = (class_freq != 0).sum()
+        if not nclss:
+            raise ValueError("Data has no defined target values")
+
+        # Laplacian smoothing considers only classes that appear in the data,
+        # in part to avoid cases where the probabilities are affected by empty
+        # (or completely spurious) classes that appear because of Orange's reuse
+        # of variables. See GH-2943.
+        # The corresponding elements of class_prob are set to zero only after
+        # mock non-zero values are used in computation of log_cont_prob to
+        # prevent division by zero.
+        class_prob = (class_freq + 1) / (np.sum(class_freq) + nclss)
         log_cont_prob = [np.log(
-            (np.array(c) + 1) / (np.sum(np.array(c), axis=0)[None, :] +
-                                 c.shape[0]) / class_prob[:, None])
+            (np.array(c) + 1) / (np.sum(np.array(c), axis=0)[None, :] + nclss)
+            / class_prob[:, None])
                          for c in cont]
+        class_prob[class_freq == 0] = 0
         return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
 
 
@@ -58,35 +70,30 @@ def predict_storage(self, data):
         else:
             isnan = np.isnan
             zeros = np.zeros_like(self.class_prob)
-            probs = np.atleast_2d(np.exp(
-                np.log(self.class_prob) +
-                np.array([
-                    zeros if isnan(ins.x).all() else
-                    sum(attr_prob[:, int(attr_val)]
-                        for attr_val, attr_prob in zip(ins, self.log_cont_prob)
-                        if not isnan(attr_val))
-                    for ins in data])))
+            probs = self.class_prob * np.exp(np.array([
+                zeros if isnan(ins.x).all() else
+                sum(attr_prob[:, int(attr_val)]
+                    for attr_val, attr_prob in zip(ins, self.log_cont_prob)
+                    if not isnan(attr_val))
+                for ins in data]))
         probs /= probs.sum(axis=1)[:, None]
         values = probs.argmax(axis=1)
         return values, probs
 
     def predict(self, X):
-        if not self.log_cont_prob:
-            probs = self._priors(X)
-        elif sp.issparse(X):
-            probs = self._sparse_probs(X)
-        else:
-            probs = self._dense_probs(X)
-        probs = np.exp(probs)
+        probs = np.zeros((X.shape[0], self.class_prob.shape[0]))
+        if self.log_cont_prob is not None:
+            if sp.issparse(X):
+                self._sparse_probs(X, probs)
+            else:
+                self._dense_probs(X, probs)
+        np.exp(probs, probs)
+        probs *= self.class_prob
         probs /= probs.sum(axis=1)[:, None]
         values = probs.argmax(axis=1)
         return values, probs
 
-    def _priors(self, data):
-        return np.tile(np.log(self.class_prob), (data.shape[0], 1))
-
-    def _dense_probs(self, data):
-        probs = self._priors(data)
+    def _dense_probs(self, data, probs):
         zeros = np.zeros((1, probs.shape[1]))
         for col, attr_prob in zip(data.T, self.log_cont_prob):
             col = col.copy()
@@ -96,9 +103,7 @@ def _dense_probs(self, data):
             probs += probs0[col]
         return probs
 
-    def _sparse_probs(self, data):
-        probs = self._priors(data)
-
+    def _sparse_probs(self, data, probs):
         n_vals = max(p.shape[1] for p in self.log_cont_prob) + 1
         log_prob = np.zeros((len(self.log_cont_prob),
                              n_vals,
diff --git a/Orange/tests/test_naive_bayes.py b/Orange/tests/test_naive_bayes.py
@@ -2,8 +2,8 @@
 # pylint: disable=missing-docstring
 
 import unittest
-import warnings
 from unittest.mock import Mock
+import warnings
 
 import numpy as np
 import scipy.sparse as sp
@@ -92,18 +92,21 @@ def test_compare_results_of_predict_and_predict_storage(self):
 
     def test_predictions(self):
         self._test_predictions(sparse=None)
+        self._test_predictions_with_absent_class(sparse=None)
 
     def test_predictions_csr_matrix(self):
         with warnings.catch_warnings():
             warnings.filterwarnings(
                 "ignore", ".*the matrix subclass.*", PendingDeprecationWarning)
             self._test_predictions(sparse=sp.csr_matrix)
+            self._test_predictions_with_absent_class(sparse=sp.csr_matrix)
 
     def test_predictions_csc_matrix(self):
         with warnings.catch_warnings():
             warnings.filterwarnings(
                 "ignore", ".*the matrix subclass.*", PendingDeprecationWarning)
             self._test_predictions(sparse=sp.csc_matrix)
+            self._test_predictions_with_absent_class(sparse=sp.csc_matrix)
 
     def _test_predictions(self, sparse):
         x = np.array([
@@ -205,6 +208,107 @@ def _test_predictions(self, sparse):
         np.testing.assert_almost_equal(exp_probs, probs)
         np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
 
+    def _test_predictions_with_absent_class(self, sparse):
+        """Empty classes should not affect predictions"""
+        x = np.array([
+            [1, 0, 0],
+            [0, np.nan, 0],
+            [0, 1, 0],
+            [0, 0, 0],
+            [1, 2, 0],
+            [1, 1, 0],
+            [1, 2, 0],
+            [0, 1, 0]])
+        if sparse is not None:
+            x = sparse(x)
+
+        y = np.array([0, 0, 0, 2, 2, 2, 3, 3])
+        domain = Domain(
+            [DiscreteVariable("a", values="ab"),
+             DiscreteVariable("b", values="abc"),
+             DiscreteVariable("c", values="a")],
+            DiscreteVariable("y", values="abcd"))
+        data = Table.from_numpy(domain, x, y)
+
+        model = self.learner(data)
+        np.testing.assert_almost_equal(
+            model.class_prob,
+            [4/11, 0, 4/11, 3/11]
+        )
+        np.testing.assert_almost_equal(
+            np.exp(model.log_cont_prob[0]) * model.class_prob[:, None],
+            [[3/7, 2/7], [0, 0], [2/7, 3/7], [2/7, 2/7]])
+        np.testing.assert_almost_equal(
+            np.exp(model.log_cont_prob[1]) * model.class_prob[:, None],
+            [[2/5, 1/3, 1/5], [0, 0, 0], [2/5, 1/3, 2/5], [1/5, 1/3, 2/5]])
+        np.testing.assert_almost_equal(
+            np.exp(model.log_cont_prob[2]) * model.class_prob[:, None],
+            [[4/11], [0], [4/11], [3/11]])
+
+        test_x = np.array([[a, b, 0] for a in [0, 1] for b in [0, 1, 2]])
+        # Classifiers reject csc matrices in the base class
+        # Naive bayesian classifier supports them if predict is
+        # called directly, which we do below
+        if sparse is not None and sparse is not sp.csc_matrix:
+            test_x = sparse(test_x)
+        test_y = np.full((6, ), np.nan)
+        # The following was computed manually, too
+        exp_probs = np.array([
+            [0.47368421052632, 0, 0.31578947368421, 0.21052631578947],
+            [0.39130434782609, 0, 0.26086956521739, 0.34782608695652],
+            [0.24324324324324, 0, 0.32432432432432, 0.43243243243243],
+            [0.31578947368421, 0, 0.47368421052632, 0.21052631578947],
+            [0.26086956521739, 0, 0.39130434782609, 0.34782608695652],
+            [0.15000000000000, 0, 0.45000000000000, 0.40000000000000]
+        ])
+
+        # Test the faster algorithm for Table (numpy matrices)
+        test_data = Table.from_numpy(domain, test_x, test_y)
+        probs = model(test_data, ret=model.Probs)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        values = model(test_data)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+        values, probs = model(test_data, ret=model.ValueProbs)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+
+        # Test the slower algorithm for non-Table data (iteration in Python)
+        test_data = NotATable.from_numpy(domain, test_x, test_y)
+        probs = model(test_data, ret=model.Probs)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        values = model(test_data)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+        values, probs = model(test_data, ret=model.ValueProbs)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+
+        # Test prediction directly on numpy
+        probs = model(test_x, ret=model.Probs)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        values = model(test_x)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+        values, probs = model(test_x, ret=model.ValueProbs)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+
+        # Test prediction on instances
+        for inst, exp_prob in zip(test_data, exp_probs):
+            np.testing.assert_almost_equal(
+                model(inst, ret=model.Probs)[0],
+                exp_prob)
+            self.assertEqual(model(inst), np.argmax(exp_prob))
+            value, prob = model(inst, ret=model.ValueProbs)
+            np.testing.assert_almost_equal(prob[0], exp_prob)
+            self.assertEqual(value, np.argmax(exp_prob))
+
+        # Test prediction by directly calling predict. This is needed to test
+        # csc_matrix, but doesn't hurt others
+        if sparse is sp.csc_matrix:
+            test_x = sparse(test_x)
+        values, probs = model.predict(test_x)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+
     def test_no_attributes(self):
         y = np.array([0, 0, 0, 1, 1, 1, 2, 2])
         domain = Domain([], DiscreteVariable("y", values="abc"))
@@ -215,6 +319,5 @@ def test_no_attributes(self):
             [[4/11, 4/11, 3/11]] * 5
         )
 
-
 if __name__ == "__main__":
     unittest.main()