Naive Bayes: Add tests for prediction

janezd · janezd · commit 33b6e55d2d45 · 2019-01-25T22:43:59.000+01:00
diff --git a/Orange/tests/test_naive_bayes.py b/Orange/tests/test_naive_bayes.py
@@ -2,6 +2,7 @@
 # pylint: disable=missing-docstring
 
 import unittest
+import warnings
 from unittest.mock import Mock
 
 import numpy as np
@@ -12,6 +13,12 @@
 from Orange.evaluation import CrossValidation, CA
 
 
+# Table subclass used to force predict_storage to fall back to the slower
+# procedure instead of calling `predict`.
+class NotATable(Table):  # pylint: disable=too-many-ancestors,abstract-method
+    pass
+
+
 class TestNaiveBayesLearner(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -32,20 +39,6 @@ def test_NaiveBayes(self):
         ca = CA(results)
         self.assertGreater(ca, 0.7)
 
-    def test_predict_single_instance(self):
-        for ins in self.table:
-            self.model(ins)
-            val, prob = self.model(ins, self.model.ValueProbs)
-
-    def test_predict_table(self):
-        self.model(self.table)
-        vals, probs = self.model(self.table, self.model.ValueProbs)
-
-    def test_predict_numpy(self):
-        X = self.table.X[::20]
-        self.model(X)
-        vals, probs = self.model(X, self.model.ValueProbs)
-
     def test_degenerate(self):
         d = Domain((ContinuousVariable(name="A"),
                     ContinuousVariable(name="B"),
@@ -64,15 +57,6 @@ def test_allnan_cv(self):
         results = CrossValidation(data, [self.learner])
         self.assertFalse(any(results.failed))
 
-    def test_sparse(self):
-        _, dense_p = self.model.predict(self.data.X)
-
-        _, csc_p = self.model.predict(sp.csc_matrix(self.data.X))
-        np.testing.assert_almost_equal(dense_p, csc_p)
-
-        _, csr_p = self.model.predict(sp.csr_matrix(self.data.X))
-        np.testing.assert_almost_equal(dense_p, csr_p)
-
     def test_prediction_routing(self):
         data = self.data
         predict = self.model.predict = Mock(return_value=(data.Y, None))
@@ -92,6 +76,132 @@ def test_prediction_routing(self):
         self.model.predict_storage(data[0])
         predict.assert_not_called()
 
+    def test_compare_results_of_storage_and_predict_storage(self):
+        data2 = NotATable("titanic")
+
+        self.model = self.learner(self.data[:50])
+        values, probs = self.model.predict_storage(self.data[50:])
+        values2, probs2 = self.model.predict_storage(data2[50:])
+        np.testing.assert_equal(values, values2)
+        np.testing.assert_equal(probs, probs2)
+
+    def test_predictions(self):
+        self._test_predictions(sparse=None)
+
+    def test_predictions_csr_matrix(self):
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore", ".*the matrix subclass.*", PendingDeprecationWarning)
+            self._test_predictions(sparse=sp.csr_matrix)
+
+    def test_predictions_csc_matrix(self):
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore", ".*the matrix subclass.*", PendingDeprecationWarning)
+            self._test_predictions(sparse=sp.csc_matrix)
+
+    def _test_predictions(self, sparse):
+        x = np.array([
+            [1, 0, 0],
+            [0, np.nan, 0],
+            [0, 1, 0],
+            [0, 0, 0],
+            [1, 2, 0],
+            [1, 1, 0],
+            [1, 2, 0],
+            [0, 1, 0]])
+        if sparse is not None:
+            x = sparse(x)
+
+        y = np.array([0, 0, 0, 1, 1, 1, 2, 2])
+        domain = Domain(
+            [DiscreteVariable("a", values="ab"),
+             DiscreteVariable("b", values="abc"),
+             DiscreteVariable("c", values="a")],
+            DiscreteVariable("y", values="abc"))
+        data = Table.from_numpy(domain, x, y)
+
+        model = self.learner(data)
+        np.testing.assert_almost_equal(
+            model.class_prob,
+            [4/11, 4/11, 3/11]
+        )
+        np.testing.assert_almost_equal(
+            np.exp(model.log_cont_prob[0]) * model.class_prob[:, None],
+            [[3/7, 2/7], [2/7, 3/7], [2/7, 2/7]])
+        np.testing.assert_almost_equal(
+            np.exp(model.log_cont_prob[1]) * model.class_prob[:, None],
+            [[2/5, 1/3, 1/5], [2/5, 1/3, 2/5], [1/5, 1/3, 2/5]])
+        np.testing.assert_almost_equal(
+            np.exp(model.log_cont_prob[2]) * model.class_prob[:, None],
+            [[4/11], [4/11], [3/11]])
+
+        test_x = np.array([[a, b, 0] for a in [0, 1] for b in [0, 1, 2]])
+        # Model.__call__ does not accept csc matrices, so test_x stays dense
+        # for csc; in that case only the training data is a csc_matrix.
+        if sparse is not None and sparse is not sp.csc_matrix:
+            test_x = sparse(test_x)
+        test_y = np.full((6, ), np.nan)
+        # The expected probabilities below were also computed manually.
+        exp_probs = np.array([
+            [0.47368421052632, 0.31578947368421, 0.21052631578947],
+            [0.39130434782609, 0.26086956521739, 0.34782608695652],
+            [0.24324324324324, 0.32432432432432, 0.43243243243243],
+            [0.31578947368421, 0.47368421052632, 0.21052631578947],
+            [0.26086956521739, 0.39130434782609, 0.34782608695652],
+            [0.15000000000000, 0.45000000000000, 0.40000000000000]
+        ])
+
+        # Test the faster algorithm for Table (numpy matrices)
+        test_data = Table.from_numpy(domain, test_x, test_y)
+        probs = model(test_data, ret=model.Probs)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        values = model(test_data)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+        values, probs = model(test_data, ret=model.ValueProbs)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+
+        # Test the slower algorithm for non-Table data (iteration in Python)
+        test_data = NotATable.from_numpy(domain, test_x, test_y)
+        probs = model(test_data, ret=model.Probs)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        values = model(test_data)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+        values, probs = model(test_data, ret=model.ValueProbs)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+
+        # Test prediction directly on numpy
+        probs = model(test_x, ret=model.Probs)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        values = model(test_x)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+        values, probs = model(test_x, ret=model.ValueProbs)
+        np.testing.assert_almost_equal(exp_probs, probs)
+        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+
+        # Test prediction on instances
+        for inst, exp_prob in zip(test_data, exp_probs):
+            np.testing.assert_almost_equal(
+                model(inst, ret=model.Probs)[0],
+                exp_prob)
+            self.assertEqual(model(inst), np.argmax(exp_prob))
+            value, prob = model(inst, ret=model.ValueProbs)
+            np.testing.assert_almost_equal(prob[0], exp_prob)
+            self.assertEqual(value, np.argmax(exp_prob))
+
+    def test_no_attributes(self):
+        y = np.array([0, 0, 0, 1, 1, 1, 2, 2])
+        domain = Domain([], DiscreteVariable("y", values="abc"))
+        data = Table.from_numpy(domain, np.zeros((len(y), 0)), y.T)
+        test_data = Table.from_numpy(domain, np.zeros((5, 0)), np.zeros((5, 1)))
+        model = self.learner(data)
+        np.testing.assert_almost_equal(
+            model.predict_storage(test_data)[1],
+            [[4/11, 4/11, 3/11]] * 5
+        )
+
 
 if __name__ == "__main__":
     unittest.main()