test naive bayes

noahnovsak · noahnovsak · commit 67fc41cf255a · 2023-09-14T11:35:19.000+02:00
diff --git a/Orange/tests/test_naive_bayes.py b/Orange/tests/test_naive_bayes.py
@@ -10,20 +10,42 @@
 from Orange.classification import NaiveBayesLearner
 from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable
 from Orange.evaluation import CrossValidation, CA
+from Orange.tests import test_filename
 
 
 # This class is used to force predict_storage to fall back to the slower
 # procedure instead of calling `predict`
-from Orange.tests import test_filename
-
-
 class NotATable(Table):  # pylint: disable=too-many-ancestors,abstract-method
     @classmethod
     def from_file(cls, *args, **kwargs):
         table = super().from_file(*args, **kwargs)
         return cls(table)
 
 
+def assert_predictions_equal(data, model, exp_probs):
+    exp_vals = np.argmax(np.atleast_2d(exp_probs), axis=1)
+    np.testing.assert_almost_equal(model(data, ret=model.Probs), exp_probs)
+    np.testing.assert_equal(model(data), exp_vals)
+    values, probs = model(data, ret=model.ValueProbs)
+    np.testing.assert_almost_equal(probs, exp_probs)
+    np.testing.assert_equal(values, exp_vals)
+
+
+def assert_model_equal(model, results):
+    np.testing.assert_almost_equal(
+        model.class_prob,
+        results[0])
+    np.testing.assert_almost_equal(
+        np.exp(model.log_cont_prob[0]) * model.class_prob[:, None],
+        results[1])
+    np.testing.assert_almost_equal(
+        np.exp(model.log_cont_prob[1]) * model.class_prob[:, None],
+        results[2])
+    np.testing.assert_almost_equal(
+        np.exp(model.log_cont_prob[2]) * model.class_prob[:, None],
+        results[3])
+
+
 class TestNaiveBayesLearner(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -100,20 +122,22 @@ def test_compare_results_of_predict_and_predict_storage(self):
 
     def test_predictions(self):
         self._test_predictions(sparse=None)
-        self._test_predictions_with_absent_class(sparse=None)
+        self._test_predictions(sparse=None, absent_class=True)
         self._test_predict_missing_attributes(sparse=None)
 
     def test_predictions_csr_matrix(self):
         self._test_predictions(sparse=sp.csr_matrix)
-        self._test_predictions_with_absent_class(sparse=sp.csr_matrix)
+        self._test_predictions(sparse=sp.csr_matrix, absent_class=True)
         self._test_predict_missing_attributes(sparse=sp.csr_matrix)
 
     def test_predictions_csc_matrix(self):
         self._test_predictions(sparse=sp.csc_matrix)
-        self._test_predictions_with_absent_class(sparse=sp.csc_matrix)
+        self._test_predictions(sparse=sp.csc_matrix, absent_class=True)
         self._test_predict_missing_attributes(sparse=sp.csc_matrix)
 
-    def _test_predictions(self, sparse):
+    @staticmethod
+    def _create_prediction_data(sparse, absent_class=False):
+        """ The following was computed manually """
         x = np.array([
             [1, 0, 0],
             [0, np.nan, 0],
@@ -127,27 +151,13 @@ def _test_predictions(self, sparse):
             x = sparse(x)
 
         y = np.array([0, 0, 0, 1, 1, 1, 2, 2])
-        domain = Domain(
-            [DiscreteVariable("a", values="ab"),
-             DiscreteVariable("b", values="abc"),
-             DiscreteVariable("c", values="a")],
-            DiscreteVariable("y", values="abc"))
-        data = Table.from_numpy(domain, x, y)
-
-        model = self.learner(data)
-        np.testing.assert_almost_equal(
-            model.class_prob,
-            [4/11, 4/11, 3/11]
-        )
-        np.testing.assert_almost_equal(
-            np.exp(model.log_cont_prob[0]) * model.class_prob[:, None],
-            [[3/7, 2/7], [2/7, 3/7], [2/7, 2/7]])
-        np.testing.assert_almost_equal(
-            np.exp(model.log_cont_prob[1]) * model.class_prob[:, None],
-            [[2/5, 1/3, 1/5], [2/5, 1/3, 2/5], [1/5, 1/3, 2/5]])
-        np.testing.assert_almost_equal(
-            np.exp(model.log_cont_prob[2]) * model.class_prob[:, None],
-            [[4/11], [4/11], [3/11]])
+        class_var = DiscreteVariable("y", values="abc")
+        results = [
+            [4/11, 4/11, 3/11],
+            [[3/7, 2/7], [2/7, 3/7], [2/7, 2/7]],
+            [[2/5, 1/3, 1/5], [2/5, 1/3, 2/5], [1/5, 1/3, 2/5]],
+            [[4/11], [4/11], [3/11]]
+        ]
 
         test_x = np.array([[a, b, 0] for a in [0, 1] for b in [0, 1, 2]])
         # Classifiers reject csc matrices in the base class
@@ -156,7 +166,7 @@ def _test_predictions(self, sparse):
         if sparse is not None and sparse is not sp.csc_matrix:
             test_x = sparse(test_x)
         test_y = np.full((6, ), np.nan)
-        # The following was computed manually, too
+
         exp_probs = np.array([
             [0.47368421052632, 0.31578947368421, 0.21052631578947],
             [0.39130434782609, 0.26086956521739, 0.34782608695652],
@@ -166,155 +176,54 @@ def _test_predictions(self, sparse):
             [0.15000000000000, 0.45000000000000, 0.40000000000000]
         ])
 
-        # Test the faster algorithm for Table (numpy matrices)
-        test_data = Table.from_numpy(domain, test_x, test_y)
-        probs = model(test_data, ret=model.Probs)
-        np.testing.assert_almost_equal(exp_probs, probs)
-        values = model(test_data)
-        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
-        values, probs = model(test_data, ret=model.ValueProbs)
-        np.testing.assert_almost_equal(exp_probs, probs)
-        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
-
-        # Test the slower algorithm for non-Table data (iteration in Python)
-        test_data = NotATable.from_numpy(domain, test_x, test_y)
-        probs = model(test_data, ret=model.Probs)
-        np.testing.assert_almost_equal(exp_probs, probs)
-        values = model(test_data)
-        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
-        values, probs = model(test_data, ret=model.ValueProbs)
-        np.testing.assert_almost_equal(exp_probs, probs)
-        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
-
-        # Test prediction directly on numpy
-        probs = model(test_x, ret=model.Probs)
-        np.testing.assert_almost_equal(exp_probs, probs)
-        values = model(test_x)
-        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
-        values, probs = model(test_x, ret=model.ValueProbs)
-        np.testing.assert_almost_equal(exp_probs, probs)
-        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
-
-        # Test prediction on instances
-        for inst, exp_prob in zip(test_data, exp_probs):
-            np.testing.assert_almost_equal(
-                model(inst, ret=model.Probs),
-                exp_prob)
-            self.assertEqual(model(inst), np.argmax(exp_prob))
-            value, prob = model(inst, ret=model.ValueProbs)
-            np.testing.assert_almost_equal(prob, exp_prob)
-            self.assertEqual(value, np.argmax(exp_prob))
-
-        # Test prediction by directly calling predict. This is needed to test
-        # csc_matrix, but doesn't hurt others
-        if sparse is sp.csc_matrix:
-            test_x = sparse(test_x)
-        values, probs = model.predict(test_x)
-        np.testing.assert_almost_equal(exp_probs, probs)
-        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
-
-    def _test_predictions_with_absent_class(self, sparse):
-        """Empty classes should not affect predictions"""
-        x = np.array([
-            [1, 0, 0],
-            [0, np.nan, 0],
-            [0, 1, 0],
-            [0, 0, 0],
-            [1, 2, 0],
-            [1, 1, 0],
-            [1, 2, 0],
-            [0, 1, 0]])
-        if sparse is not None:
-            x = sparse(x)
+        if absent_class:
+            y = np.array([0, 0, 0, 2, 2, 2, 3, 3])
+            class_var = DiscreteVariable("y", values="abcd")
+            for i, row in enumerate(results):
+                row.insert(1, i and [0]*len(row[0]))
+            exp_probs = np.insert(exp_probs, 1, 0, axis=1)
 
-        y = np.array([0, 0, 0, 2, 2, 2, 3, 3])
         domain = Domain(
             [DiscreteVariable("a", values="ab"),
              DiscreteVariable("b", values="abc"),
              DiscreteVariable("c", values="a")],
-            DiscreteVariable("y", values="abcd"))
+            class_var)
         data = Table.from_numpy(domain, x, y)
 
-        model = self.learner(data)
-        np.testing.assert_almost_equal(
-            model.class_prob,
-            [4/11, 0, 4/11, 3/11]
-        )
-        np.testing.assert_almost_equal(
-            np.exp(model.log_cont_prob[0]) * model.class_prob[:, None],
-            [[3/7, 2/7], [0, 0], [2/7, 3/7], [2/7, 2/7]])
-        np.testing.assert_almost_equal(
-            np.exp(model.log_cont_prob[1]) * model.class_prob[:, None],
-            [[2/5, 1/3, 1/5], [0, 0, 0], [2/5, 1/3, 2/5], [1/5, 1/3, 2/5]])
-        np.testing.assert_almost_equal(
-            np.exp(model.log_cont_prob[2]) * model.class_prob[:, None],
-            [[4/11], [0], [4/11], [3/11]])
+        return data, domain, results, test_x, test_y, exp_probs
 
-        test_x = np.array([[a, b, 0] for a in [0, 1] for b in [0, 1, 2]])
-        # Classifiers reject csc matrices in the base class
-        # Naive bayesian classifier supports them if predict_storage is
-        # called directly, which we do below
-        if sparse is not None and sparse is not sp.csc_matrix:
-            test_x = sparse(test_x)
-        test_y = np.full((6, ), np.nan)
-        # The following was computed manually, too
-        exp_probs = np.array([
-            [0.47368421052632, 0, 0.31578947368421, 0.21052631578947],
-            [0.39130434782609, 0, 0.26086956521739, 0.34782608695652],
-            [0.24324324324324, 0, 0.32432432432432, 0.43243243243243],
-            [0.31578947368421, 0, 0.47368421052632, 0.21052631578947],
-            [0.26086956521739, 0, 0.39130434782609, 0.34782608695652],
-            [0.15000000000000, 0, 0.45000000000000, 0.40000000000000]
-        ])
+    def _test_predictions(self, sparse, absent_class=False):
+        (data, domain, results,
+         test_x, test_y, exp_probs) = self._create_prediction_data(sparse, absent_class)
+
+        model = self.learner(data)
+        assert_model_equal(model, results)
 
         # Test the faster algorithm for Table (numpy matrices)
         test_data = Table.from_numpy(domain, test_x, test_y)
-        probs = model(test_data, ret=model.Probs)
-        np.testing.assert_almost_equal(exp_probs, probs)
-        values = model(test_data)
-        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
-        values, probs = model(test_data, ret=model.ValueProbs)
-        np.testing.assert_almost_equal(exp_probs, probs)
-        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+        assert_predictions_equal(test_data, model, exp_probs)
 
         # Test the slower algorithm for non-Table data (iteration in Python)
         test_data = NotATable.from_numpy(domain, test_x, test_y)
-        probs = model(test_data, ret=model.Probs)
-        np.testing.assert_almost_equal(exp_probs, probs)
-        values = model(test_data)
-        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
-        values, probs = model(test_data, ret=model.ValueProbs)
-        np.testing.assert_almost_equal(exp_probs, probs)
-        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+        assert_predictions_equal(test_data, model, exp_probs)
 
         # Test prediction directly on numpy
-        probs = model(test_x, ret=model.Probs)
-        np.testing.assert_almost_equal(exp_probs, probs)
-        values = model(test_x)
-        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
-        values, probs = model(test_x, ret=model.ValueProbs)
-        np.testing.assert_almost_equal(exp_probs, probs)
-        np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
+        assert_predictions_equal(test_x, model, exp_probs)
 
         # Test prediction on instances
         for inst, exp_prob in zip(test_data, exp_probs):
-            np.testing.assert_almost_equal(
-                model(inst, ret=model.Probs),
-                exp_prob)
-            self.assertEqual(model(inst), np.argmax(exp_prob))
-            value, prob = model(inst, ret=model.ValueProbs)
-            np.testing.assert_almost_equal(prob, exp_prob)
-            self.assertEqual(value, np.argmax(exp_prob))
+            assert_predictions_equal(inst, model, exp_prob)
 
         # Test prediction by directly calling predict. This is needed to test
         # csc_matrix, but doesn't hurt others
         if sparse is sp.csc_matrix:
             test_x = sparse(test_x)
         values, probs = model.predict(test_x)
-        np.testing.assert_almost_equal(exp_probs, probs)
+        np.testing.assert_almost_equal(probs, exp_probs)
         np.testing.assert_equal(values, np.argmax(exp_probs, axis=1))
 
-    def _test_predict_missing_attributes(self, sparse):
+    @staticmethod
+    def _create_missing_attributes(sparse):
         x = np.array([
             [1, 0, 0],
             [0, 1, 0],
@@ -325,24 +234,29 @@ def _test_predict_missing_attributes(self, sparse):
             [1, 2, np.nan]])
         if sparse is not None:
             x = sparse(x)
-        y = np.array([1,0,0,0,1,1,1])
+        y = np.array([1, 0, 0, 0, 1, 1, 1])
+
+        test_x = np.array([[np.nan, np.nan, np.nan],
+                           [np.nan, 0, np.nan],
+                           [0, np.nan, np.nan]])
+        if sparse is not None and sparse is not sp.csc_matrix:
+            test_x = sparse(test_x)
+        exp_probs = np.array([[(3 + 1) / (7 + 2), (4 + 1) / (7 + 2)],
+                              [(1 + 1) / (2 + 2), (1 + 1) / (2 + 2)],
+                              [(3 + 1) / (3 + 2), (0 + 1) / (3 + 2)]])
+
         domain = Domain(
             [DiscreteVariable("a", values="ab"),
              DiscreteVariable("b", values="abc"),
              DiscreteVariable("c", values="a")],
             DiscreteVariable("y", values="AB"))
-        data = Table.from_numpy(domain, x, y)
+        return Table.from_numpy(domain, x, y), test_x, exp_probs
 
+    def _test_predict_missing_attributes(self, sparse):
+        data, test_x, exp_probs = self._create_missing_attributes(sparse)
         model = self.learner(data)
-        test_x = np.array([[np.nan, np.nan, np.nan],
-                           [np.nan, 0, np.nan],
-                           [0, np.nan, np.nan]])
-        if sparse is not None and sparse is not sp.csc_matrix:
-            test_x = sparse(test_x)
         probs = model(test_x, ret=model.Probs)
-        np.testing.assert_almost_equal(probs, [[(3+1)/(7+2), (4+1)/(7+2)],
-                                               [(1+1)/(2+2), (1+1)/(2+2)],
-                                               [(3+1)/(3+2), (0+1)/(3+2)]])
+        np.testing.assert_almost_equal(probs, exp_probs)
 
     def test_no_attributes(self):
         y = np.array([0, 0, 0, 1, 1, 1, 2, 2])