Merge pull request #3540 from janezd/reimplement-nb-predict-storage

janezd · web-flow · commit c9e60c9a8a23 · 2019-01-28T11:59:52.000+01:00
[ENH] Naive Bayes: Implement predict, fix predict_storage
diff --git a/Orange/classification/naive_bayes.py b/Orange/classification/naive_bayes.py
@@ -1,7 +1,8 @@
 import numpy as np
+import scipy.sparse as sp
 
 from Orange.classification import Learner, Model
-from Orange.data import Instance, Storage
+from Orange.data import Instance, Storage, Table
 from Orange.statistics import contingency
 from Orange.preprocess import Discretize, RemoveNaNColumns
 
@@ -48,22 +49,80 @@ def __init__(self, log_cont_prob, class_prob, domain):
 
     def predict_storage(self, data):
         if isinstance(data, Instance):
-            data = [data]
-        if len(data.domain.attributes) == 0:
+            data = Table(np.atleast_2d(data.x))
+        if type(data) is Table:  # pylint: disable=unidiomatic-typecheck
+            return self.predict(data.X)
+
+        if not len(data) or not len(data[0]):
             probs = np.tile(self.class_prob, (len(data), 1))
         else:
             isnan = np.isnan
-            probs = np.exp(
+            zeros = np.zeros_like(self.class_prob)
+            probs = np.atleast_2d(np.exp(
                 np.log(self.class_prob) +
-                np.array([np.zeros_like(self.class_prob)
-                          if isnan(ins.x).all() else
-                          np.sum(attr_prob[:, int(attr_val)]
-                                 for attr_val, attr_prob in zip(ins, self.log_cont_prob)
-                                 if not isnan(attr_val))
-                          for ins in data]))
+                np.array([
+                    zeros if isnan(ins.x).all() else
+                    sum(attr_prob[:, int(attr_val)]
+                        for attr_val, attr_prob in zip(ins, self.log_cont_prob)
+                        if not isnan(attr_val))
+                    for ins in data])))
         probs /= probs.sum(axis=1)[:, None]
         values = probs.argmax(axis=1)
         return values, probs
 
+    def predict(self, X):
+        if not self.log_cont_prob:
+            probs = self._priors(X)
+        elif sp.issparse(X):
+            probs = self._sparse_probs(X)
+        else:
+            probs = self._dense_probs(X)
+        probs = np.exp(probs)
+        probs /= probs.sum(axis=1)[:, None]
+        values = probs.argmax(axis=1)
+        return values, probs
+
+    def _priors(self, data):
+        return np.tile(np.log(self.class_prob), (data.shape[0], 1))
+
+    def _dense_probs(self, data):
+        probs = self._priors(data)
+        zeros = np.zeros((1, probs.shape[1]))
+        for col, attr_prob in zip(data.T, self.log_cont_prob):
+            col = col.copy()
+            col[np.isnan(col)] = attr_prob.shape[1] - 1
+            col = col.astype(int)
+            probs0 = np.vstack((attr_prob.T, zeros))
+            probs += probs0[col]
+        return probs
+
+    def _sparse_probs(self, data):
+        probs = self._priors(data)
+
+        n_vals = max(p.shape[1] for p in self.log_cont_prob) + 1
+        log_prob = np.zeros((len(self.log_cont_prob),
+                             n_vals,
+                             self.log_cont_prob[0].shape[0]))
+        for i, p in enumerate(self.log_cont_prob):
+            p0 = p.T[0].copy()
+            probs[:] += p0
+            log_prob[i, :p.shape[1]] = p.T - p0
+
+        dat = data.data.copy()
+        dat[np.isnan(dat)] = n_vals - 1
+        dat = dat.astype(int)
+
+        if sp.isspmatrix_csr(data):
+            for row, start, end in zip(probs, data.indptr, data.indptr[1:]):
+                row += log_prob[data.indices[start:end],
+                                dat[start:end]].sum(axis=0)
+        else:
+            csc = data.tocsc()
+            for start, end, attr_prob in zip(csc.indptr, csc.indptr[1:],
+                                             log_prob):
+                probs[csc.indices[start:end]] += attr_prob[dat[start:end]]
+
+        return probs
+
 
 NaiveBayesLearner.__returns__ = NaiveBayesModel
diff --git a/Orange/tests/test_naive_bayes.py b/Orange/tests/test_naive_bayes.py
@@ -2,39 +2,42 @@
 # pylint: disable=missing-docstring
 
 import unittest
+import warnings
+from unittest.mock import Mock
+
+import numpy as np
+import scipy.sparse as sp
 
 from Orange.classification import NaiveBayesLearner
 from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable
 from Orange.evaluation import CrossValidation, CA
 
 
+# This class is used to force predict_storage to fall back to the slower
+# procedure instead of calling `predict`
+class NotATable(Table):  # pylint: disable=too-many-ancestors,abstract-method
+    pass
+
+
 class TestNaiveBayesLearner(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        data = Table('titanic')
+        cls.data = data = Table('titanic')
         cls.learner = NaiveBayesLearner()
-        cls.model = cls.learner(data)
         cls.table = data[::20]
 
+    def setUp(self):
+        self.model = self.learner(self.data)
+
     def test_NaiveBayes(self):
         results = CrossValidation(self.table, [self.learner], k=10)
         ca = CA(results)
         self.assertGreater(ca, 0.7)
         self.assertLess(ca, 0.9)
 
-    def test_predict_single_instance(self):
-        for ins in self.table:
-            self.model(ins)
-            val, prob = self.model(ins, self.model.ValueProbs)
-
-    def test_predict_table(self):
-        self.model(self.table)
-        vals, probs = self.model(self.table, self.model.ValueProbs)
-
-    def test_predict_numpy(self):
-        X = self.table.X[::20]
-        self.model(X)
-        vals, probs = self.model(X, self.model.ValueProbs)
+        results = CrossValidation(Table("iris"), [self.learner], k=10)
+        ca = CA(results)
+        self.assertGreater(ca, 0.7)
 
     def test_degenerate(self):
         d = Domain((ContinuousVariable(name="A"),
@@ -53,3 +56,165 @@ def test_allnan_cv(self):
         data = Table('voting')
         results = CrossValidation(data, [self.learner])
         self.assertFalse(any(results.failed))
+
+    def test_prediction_routing(self):
+        data = self.data
+        predict = self.model.predict = Mock(return_value=(data.Y, None))
+
+        self.model(data)
+        predict.assert_called()
+        predict.reset_mock()
+
+        self.model(data.X)
+        predict.assert_called()
+        predict.reset_mock()
+
+        self.model.predict_storage(data)
+        predict.assert_called()
+        predict.reset_mock()
+
+        self.model.predict_storage(data[0])
+        predict.assert_called()
+
+    def test_compare_results_of_predict_and_predict_storage(self):
+        data2 = NotATable("titanic")
+
+        self.model = self.learner(self.data[:50])
+        predict = self.model.predict = Mock(side_effect=self.model.predict)
+        values, probs = self.model.predict_storage(self.data[50:])
+        predict.assert_called()
+        predict.reset_mock()
+        values2, probs2 = self.model.predict_storage(data2[50:])
+        predict.assert_not_called()
+
+        np.testing.assert_equal(values, values2)
+        np.testing.assert_equal(probs, probs2)
+
+    def test_predictions(self):
+        self._test_predictions(sparse=None)
+
+    def test_predictions_csr_matrix(self):
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore", ".*the matrix subclass.*", PendingDeprecationWarning)
+            self._test_predictions(sparse=sp.csr_matrix)
+
+    def test_predictions_csc_matrix(self):
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore", ".*the matrix subclass.*", PendingDeprecationWarning)
+            self._test_predictions(sparse=sp.csc_matrix)
+
+    def _test_predictions(self, sparse):
+        x = np.array([
+            [1, 0, 0],
+            [0, np.nan, 0],
+            [0, 1, 0],
+            [0, 0, 0],
+            [1, 2, 0],
+            [1, 1, 0],
+            [1, 2, 0],
+            [0, 1, 0]])
+        if sparse is not None:
+            x = sparse(x)
+
+        y = np.array([0, 0, 0, 1, 1, 1, 2, 2])
+        domain = Domain(
+            [DiscreteVariable("a", values="ab"),
+             DiscreteVariable("b", values="abc"),
+             DiscreteVariable("c", values="a")],
+            DiscreteVariable("y", values="abc"))
+        data = Table.from_numpy(domain, x, y)
+
+        model = self.learner(data)
+        np.testing.assert_almost_equal(
+            model.class_prob,
+            [4/11, 4/11, 3/11]
+        )
+        np.testing.assert_almost_equal(
+            np.exp(model.log_cont_prob[0]) * model.class_prob[:, None],
+            [[3/7, 2/7], [2/7, 3/7], [2/7, 2/7]])
+        np.testing.assert_almost_equal(
+            np.exp(model.log_cont_prob[1]) * model.class_prob[:, None],
+            [[2/5, 1/3, 1/5], [2/5, 1/3, 2/5], [1/5, 1/3, 2/5]])
+        np.testing.assert_almost_equal(
+            np.exp(model.log_cont_prob[2]) * model.class_prob[:, None],
+            [[4/11], [4/11], [3/11]])
+
+        test_x = np.array([[a, b, 0] for a in [0, 1] for b in [0, 1, 2]])
+        # Classifiers reject csc matrices in the base class.
+        # The naive Bayes classifier supports them if `predict` is
+        # called directly, which we do below.
+        if sparse is not None and sparse is not sp.csc_matrix:
+            test_x = sparse(test_x)
+        test_y = np.full((6, ), np.nan)
+        # The following was computed manually, too
+        exp_probs = np.array([
+            [0.47368421052632, 0.31578947368421, 0.21052631578947],
+            [0.39130434782609, 0.26086956521739, 0.34782608695652],
+            [0.24324324324324, 0.32432432432432, 0.43243243243243],
+            [0.31578947368421, 0.47368421052632, 0.21052631578947],
+            [0.26086956521739, 0.39130434782609, 0.34782608695652],
+            [0.15000000000000, 0.45000000000000, 0.40000000000000]
+        ])
+
+        # Test the faster algorithm for Table (numpy matrices)
+        test_data = Table.from_numpy(domain, test_x, test_y)
+        probs = model(test_data, ret=model.Probs)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        values = model(test_data)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+        values, probs = model(test_data, ret=model.ValueProbs)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+
+        # Test the slower algorithm for non-Table data (iteration in Python)
+        test_data = NotATable.from_numpy(domain, test_x, test_y)
+        probs = model(test_data, ret=model.Probs)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        values = model(test_data)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+        values, probs = model(test_data, ret=model.ValueProbs)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+
+        # Test prediction directly on numpy
+        probs = model(test_x, ret=model.Probs)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        values = model(test_x)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+        values, probs = model(test_x, ret=model.ValueProbs)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+
+        # Test prediction on instances
+        for inst, exp_prob in zip(test_data, exp_probs):
+            np.testing.assert_almost_equal(
+                model(inst, ret=model.Probs)[0],
+                exp_prob)
+            self.assertEqual(model(inst), np.argmax(exp_prob))
+            value, prob = model(inst, ret=model.ValueProbs)
+            np.testing.assert_almost_equal(prob[0], exp_prob)
+            self.assertEqual(value, np.argmax(exp_prob))
+
+        # Test prediction by directly calling predict. This is needed to test
+        # csc_matrix, but doesn't hurt others
+        if sparse is sp.csc_matrix:
+            test_x = sparse(test_x)
+        values, probs = model.predict(test_x)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+
+    def test_no_attributes(self):
+        y = np.array([0, 0, 0, 1, 1, 1, 2, 2])
+        domain = Domain([], DiscreteVariable("y", values="abc"))
+        data = Table.from_numpy(domain, np.zeros((len(y), 0)), y.T)
+        model = self.learner(data)
+        np.testing.assert_almost_equal(
+            model.predict_storage(np.zeros((5, 0)))[1],
+            [[4/11, 4/11, 3/11]] * 5
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()