diff --git a/Orange/classification/__init__.py b/Orange/classification/__init__.py index f0489b4cb74..842518fca31 100644 --- a/Orange/classification/__init__.py +++ b/Orange/classification/__init__.py @@ -19,3 +19,4 @@ from .rules import * from .sgd import * from .neural_network import * +from .calibration import * diff --git a/Orange/classification/calibration.py b/Orange/classification/calibration.py new file mode 100644 index 00000000000..46bf2e8f242 --- /dev/null +++ b/Orange/classification/calibration.py @@ -0,0 +1,176 @@ +import numpy as np +from sklearn.isotonic import IsotonicRegression +from sklearn.calibration import _SigmoidCalibration + +from Orange.classification import Model, Learner +from Orange.evaluation import TestOnTrainingData +from Orange.evaluation.performance_curves import Curves + +__all__ = ["ThresholdClassifier", "ThresholdLearner", + "CalibratedLearner", "CalibratedClassifier"] + + +class ThresholdClassifier(Model): + """ + A model that wraps a binary model and sets a different threshold. + + The target class is the class with index 1. 
 A data instance is classified + to class 1 if the probability of this class equals or exceeds the threshold + + Attributes: + base_model (Orange.classification.Model): base model + threshold (float): decision threshold + """ + def __init__(self, base_model, threshold): + if not base_model.domain.class_var.is_discrete \ + or len(base_model.domain.class_var.values) != 2: + raise ValueError("ThresholdClassifier requires a binary class") + + super().__init__(base_model.domain, base_model.original_domain) + self.name = f"{base_model.name}, thresh={threshold:.2f}" + self.base_model = base_model + self.threshold = threshold + + def __call__(self, data, ret=Model.Value): + probs = self.base_model(data, ret=Model.Probs) + if ret == Model.Probs: + return probs + class_probs = probs[:, 1].ravel() + with np.errstate(invalid="ignore"): # we fix nans below + vals = (class_probs >= self.threshold).astype(float) + vals[np.isnan(class_probs)] = np.nan + if ret == Model.Value: + return vals + else: + return vals, probs + + +class ThresholdLearner(Learner): + """ + A learner that runs another learner and then finds the optimal threshold + for CA or F1 on the training data. + + Attributes: + base_learner (Learner): base learner + threshold_criterion (int): + `ThresholdLearner.OptimizeCA` or `ThresholdLearner.OptimizeF1` + """ + __returns__ = ThresholdClassifier + + OptimizeCA, OptimizeF1 = range(2) + + def __init__(self, base_learner, threshold_criterion=OptimizeCA): + super().__init__() + self.base_learner = base_learner + self.threshold_criterion = threshold_criterion + + def fit_storage(self, data): + """ + Induce a model using the provided `base_learner`, compute probabilities + on training data and then find the optimal decision threshold. In case + of ties, select the threshold that is closest to 0.5. 
+ """ + if not data.domain.class_var.is_discrete \ + or len(data.domain.class_var.values) != 2: + raise ValueError("ThresholdLearner requires a binary class") + + res = TestOnTrainingData(data, [self.base_learner], store_models=True) + model = res.models[0, 0] + curves = Curves.from_results(res) + curve = [curves.ca, curves.f1][self.threshold_criterion]() + # In case of ties, we want the optimal threshold that is closest to 0.5 + best_threshs = curves.probs[curve == np.max(curve)] + threshold = best_threshs[min(np.searchsorted(best_threshs, 0.5), + len(best_threshs) - 1)] + return ThresholdClassifier(model, threshold) + + +class CalibratedClassifier(Model): + """ + A model that wraps another model and recalibrates probabilities + + Attributes: + base_model (Mode): base mode + calibrators (list of callable): + list of functions that get a vector of probabilities and return + calibrated probabilities + """ + def __init__(self, base_model, calibrators): + if not base_model.domain.class_var.is_discrete: + raise ValueError("CalibratedClassifier requires a discrete target") + + super().__init__(base_model.domain, base_model.original_domain) + self.base_model = base_model + self.calibrators = calibrators + self.name = f"{base_model.name}, calibrated" + + def __call__(self, data, ret=Model.Value): + probs = self.base_model(data, Model.Probs) + cal_probs = self.calibrated_probs(probs) + if ret == Model.Probs: + return cal_probs + vals = np.argmax(cal_probs, axis=1) + if ret == Model.Value: + return vals + else: + return vals, cal_probs + + def calibrated_probs(self, probs): + if self.calibrators: + ps = np.hstack( + tuple( + calibr.predict(cls_probs).reshape(-1, 1) + for calibr, cls_probs in zip(self.calibrators, probs.T))) + else: + ps = probs.copy() + sums = np.sum(ps, axis=1) + zero_sums = sums == 0 + with np.errstate(invalid="ignore"): # handled below + ps /= sums[:, None] + if zero_sums.any(): + ps[zero_sums] = 1 / ps.shape[1] + return ps + + +class 
CalibratedLearner(Learner): + """ + Probability calibration for learning algorithms + + This learner that wraps another learner, so that after training, it predicts + the probabilities on training data and calibrates them using sigmoid or + isotonic calibration. It then returns a :obj:`CalibratedClassifier`. + + Attributes: + base_learner (Learner): base learner + calibration_method (int): + `CalibratedLearner.Sigmoid` or `CalibratedLearner.Isotonic` + """ + __returns__ = CalibratedClassifier + + Sigmoid, Isotonic = range(2) + + def __init__(self, base_learner, calibration_method=Sigmoid): + super().__init__() + self.base_learner = base_learner + self.calibration_method = calibration_method + + def fit_storage(self, data): + """ + Induce a model using the provided `base_learner`, compute probabilities + on training data and use scipy's `_SigmoidCalibration` or + `IsotonicRegression` to prepare calibrators. + """ + res = TestOnTrainingData(data, [self.base_learner], store_models=True) + model = res.models[0, 0] + probabilities = res.probabilities[0] + return self.get_model(model, res.actual, probabilities) + + def get_model(self, model, ytrue, probabilities): + if self.calibration_method == CalibratedLearner.Sigmoid: + fitter = _SigmoidCalibration() + else: + fitter = IsotonicRegression(out_of_bounds='clip') + probabilities[np.isinf(probabilities)] = 1 + calibrators = [fitter.fit(cls_probs, ytrue) + for cls_idx, cls_probs in enumerate(probabilities.T)] + return CalibratedClassifier(model, calibrators) diff --git a/Orange/classification/tests/test_calibration.py b/Orange/classification/tests/test_calibration.py new file mode 100644 index 00000000000..a538a3b1870 --- /dev/null +++ b/Orange/classification/tests/test_calibration.py @@ -0,0 +1,203 @@ +import unittest +from unittest.mock import Mock, patch + +import numpy as np + +from Orange.base import Model +from Orange.classification.calibration import \ + ThresholdLearner, ThresholdClassifier, \ + CalibratedLearner, 
CalibratedClassifier +from Orange.data import Table + + +class TestThresholdClassifier(unittest.TestCase): + def setUp(self): + probs1 = np.array([0.3, 0.5, 0.2, 0.8, 0.9, 0]).reshape(-1, 1) + self.probs = np.hstack((1 - probs1, probs1)) + base_model = Mock(return_value=self.probs) + base_model.domain.class_var.is_discrete = True + base_model.domain.class_var.values = ["a", "b"] + self.model = ThresholdClassifier(base_model, 0.5) + self.data = Mock() + + def test_threshold(self): + vals = self.model(self.data) + np.testing.assert_equal(vals, [0, 1, 0, 1, 1, 0]) + + self.model.threshold = 0.8 + vals = self.model(self.data) + np.testing.assert_equal(vals, [0, 0, 0, 1, 1, 0]) + + self.model.threshold = 0 + vals = self.model(self.data) + np.testing.assert_equal(vals, [1] * 6) + + def test_return_types(self): + vals = self.model(self.data, ret=Model.Value) + np.testing.assert_equal(vals, [0, 1, 0, 1, 1, 0]) + + vals = self.model(self.data) + np.testing.assert_equal(vals, [0, 1, 0, 1, 1, 0]) + + probs = self.model(self.data, ret=Model.Probs) + np.testing.assert_equal(probs, self.probs) + + vals, probs = self.model(self.data, ret=Model.ValueProbs) + np.testing.assert_equal(vals, [0, 1, 0, 1, 1, 0]) + np.testing.assert_equal(probs, self.probs) + + def test_nans(self): + self.probs[1, :] = np.nan + vals, probs = self.model(self.data, ret=Model.ValueProbs) + np.testing.assert_equal(vals, [0, np.nan, 0, 1, 1, 0]) + np.testing.assert_equal(probs, self.probs) + + def test_non_binary_base(self): + base_model = Mock() + base_model.domain.class_var.is_discrete = True + base_model.domain.class_var.values = ["a"] + self.assertRaises(ValueError, ThresholdClassifier, base_model, 0.5) + + base_model.domain.class_var.values = ["a", "b", "c"] + self.assertRaises(ValueError, ThresholdClassifier, base_model, 0.5) + + base_model.domain.class_var = Mock() + base_model.domain.class_var.is_discrete = False + self.assertRaises(ValueError, ThresholdClassifier, base_model, 0.5) + + +class 
TestThresholdLearner(unittest.TestCase): + @patch("Orange.evaluation.performance_curves.Curves.from_results") + @patch("Orange.classification.calibration.TestOnTrainingData") + def test_fit_storage(self, test_on_training, curves_from_results): + curves_from_results.return_value = curves = Mock() + curves.probs = np.array([0.1, 0.15, 0.3, 0.45, 0.6, 0.8]) + curves.ca = lambda: np.array([0.1, 0.7, 0.4, 0.4, 0.3, 0.1]) + curves.f1 = lambda: np.array([0.1, 0.2, 0.4, 0.4, 0.3, 0.1]) + model = Mock() + model.domain.class_var.is_discrete = True + model.domain.class_var.values = ("a", "b") + data = Table("heart_disease") + learner = Mock() + test_on_training.return_value = res = Mock() + res.models = np.array([[model]]) + test_on_training.return_value = res + + thresh_learner = ThresholdLearner( + base_learner=learner, + threshold_criterion=ThresholdLearner.OptimizeCA) + thresh_model = thresh_learner(data) + self.assertEqual(thresh_model.threshold, 0.15) + args, kwargs = test_on_training.call_args + self.assertEqual(len(args), 2) + self.assertIs(args[0], data) + self.assertIs(args[1][0], learner) + self.assertEqual(len(args[1]), 1) + self.assertEqual(kwargs, {"store_models": 1}) + + thresh_learner = ThresholdLearner( + base_learner=learner, + threshold_criterion=ThresholdLearner.OptimizeF1) + thresh_model = thresh_learner(data) + self.assertEqual(thresh_model.threshold, 0.45) + + def test_non_binary_class(self): + thresh_learner = ThresholdLearner( + base_learner=Mock(), + threshold_criterion=ThresholdLearner.OptimizeF1) + + data = Mock() + data.domain.class_var.is_discrete = True + data.domain.class_var.values = ["a"] + self.assertRaises(ValueError, thresh_learner.fit_storage, data) + + data.domain.class_var.values = ["a", "b", "c"] + self.assertRaises(ValueError, thresh_learner.fit_storage, data) + + data.domain.class_var = Mock() + data.domain.class_var.is_discrete = False + self.assertRaises(ValueError, thresh_learner.fit_storage, data) + + +class 
TestCalibratedClassifier(unittest.TestCase): + def setUp(self): + probs1 = np.array([0.3, 0.5, 0.2, 0.8, 0.9, 0]).reshape(-1, 1) + self.probs = np.hstack((1 - probs1, probs1)) + base_model = Mock(return_value=self.probs) + base_model.domain.class_var.is_discrete = True + base_model.domain.class_var.values = ["a", "b"] + self.model = CalibratedClassifier(base_model, None) + self.data = Mock() + + def test_call(self): + calprobs = np.arange(self.probs.size).reshape(self.probs.shape) + calprobs = calprobs / np.sum(calprobs, axis=1)[:, None] + calprobs[-1] = [0.7, 0.3] + self.model.calibrated_probs = Mock(return_value=calprobs) + + probs = self.model(self.data, ret=Model.Probs) + self.model.calibrated_probs.assert_called_with(self.probs) + np.testing.assert_almost_equal(probs, calprobs) + + vals = self.model(self.data, ret=Model.Value) + np.testing.assert_almost_equal(vals, [1, 1, 1, 1, 1, 0]) + + vals, probs = self.model(self.data, ret=Model.ValueProbs) + np.testing.assert_almost_equal(probs, calprobs) + np.testing.assert_almost_equal(vals, [1, 1, 1, 1, 1, 0]) + + def test_calibrated_probs(self): + self.model.calibrators = None + calprobs = self.model.calibrated_probs(self.probs) + np.testing.assert_equal(calprobs, self.probs) + self.assertIsNot(calprobs, self.probs) + + calibrator = Mock() + calibrator.predict = lambda x: x**2 + self.model.calibrators = [calibrator] * 2 + calprobs = self.model.calibrated_probs(self.probs) + expprobs = self.probs ** 2 / np.sum(self.probs ** 2, axis=1)[:, None] + np.testing.assert_almost_equal(calprobs, expprobs) + + self.probs[1] = 0 + self.probs[2] = np.nan + expprobs[1] = 0.5 + expprobs[2] = np.nan + calprobs = self.model.calibrated_probs(self.probs) + np.testing.assert_almost_equal(calprobs, expprobs) + + +class TestCalibratedLearner(unittest.TestCase): + @patch("Orange.classification.calibration._SigmoidCalibration.fit") + @patch("Orange.classification.calibration.TestOnTrainingData") + def test_fit_storage(self, test_on_training, 
sigmoid_fit): + data = Table("heart_disease") + learner = Mock() + + model = Mock() + model.domain.class_var.is_discrete = True + model.domain.class_var.values = ("a", "b") + + test_on_training.return_value = res = Mock() + res.models = np.array([[model]]) + res.probabilities = np.arange(20, dtype=float).reshape(1, 5, 4) + test_on_training.return_value = res + + sigmoid_fit.return_value = Mock() + + cal_learner = CalibratedLearner( + base_learner=learner, calibration_method=CalibratedLearner.Sigmoid) + cal_model = cal_learner(data) + + self.assertIs(cal_model.base_model, model) + self.assertEqual(cal_model.calibrators, [sigmoid_fit.return_value] * 4) + args, kwargs = test_on_training.call_args + self.assertEqual(len(args), 2) + self.assertIs(args[0], data) + self.assertIs(args[1][0], learner) + self.assertEqual(len(args[1]), 1) + self.assertEqual(kwargs, {"store_models": 1}) + + for call, cls_probs in zip(sigmoid_fit.call_args_list, + res.probabilities[0].T): + np.testing.assert_equal(call[0][0], cls_probs) diff --git a/Orange/evaluation/performance_curves.py b/Orange/evaluation/performance_curves.py new file mode 100644 index 00000000000..c7dee568e53 --- /dev/null +++ b/Orange/evaluation/performance_curves.py @@ -0,0 +1,150 @@ +import numpy as np + + +class Curves: + # names of scores are standard acronyms, pylint: disable=invalid-name + """ + Computation of performance curves (ca, f1, precision, recall and the rest + of the zoo) from test results. + + The class works with binary classes. Attribute `probs` contains ordered + probabilities and all curves represent performance statistics if an + instance is classified as positive if it equals or exceeds the threshold + in `probs`, that is, `sensitivity[i]` is the sensitivity of the classifier + that classifies an instances as positive if the probability of being + positive is at least `probs[i]`. + + Class can be constructed by giving `probs` and `ytrue`, or from test + results (see :obj:`Curves.from_results`). 
 The latter removes instances + with missing class values or predicted probabilities. + + The class treats all results as obtained from a single run instead of + computing separate curves and fancy averaging. + + Arguments: + ytrue (np.ndarray): true classes (1 for positive, 0 for negative) + probs (np.ndarray): corresponding predicted probabilities + + Attributes: + probs (np.ndarray): ordered vector of predicted probabilities + ytrue (np.ndarray): corresponding true classes + tot (int): total number of data instances + p (int): number of real positive instances + n (int): number of real negative instances + tp (np.ndarray): number of true positives (property computed from `fn`) + fp (np.ndarray): number of false positives (property computed from `tn`) + tn (np.ndarray): number of true negatives (property computed from `fn`) + fn (np.ndarray): number of false negatives (precomputed, not a property) + """ + def __init__(self, ytrue, probs): + sortind = np.argsort(probs) + self.probs = np.hstack((probs[sortind], [1])) + self.ytrue = ytrue[sortind] + self.fn = np.hstack(([0], np.cumsum(self.ytrue))) + self.tot = len(probs) + self.p = self.fn[-1] + self.n = self.tot - self.p + + @classmethod + def from_results(cls, results, target_class=None, model_index=None): + """ + Construct an instance of `Curves` from test results. 
+ + Args: + results (:obj:`Orange.evaluation.testing.Results`): test results + target_class (int): target class index; if the class is binary, + this defaults to `1`, otherwise it must be given + model_index (int): model index; if there is only one model, this + argument can be omitted + + Returns: + curves (:obj:`Curves`) + """ + if model_index is None: + if results.probabilities.shape[0] != 1: + raise ValueError("Argument 'model_index' is required when " + "there are multiple models") + model_index = 0 + if target_class is None: + if results.probabilities.shape[2] != 2: + raise ValueError("Argument 'target_class' is required when the " + "class is not binary") + target_class = 1 + actual = results.actual + probs = results.probabilities[model_index, :, target_class] + nans = np.isnan(actual) + np.isnan(probs) + if nans.any(): + actual = actual[~nans] + probs = probs[~nans] + return cls(actual == target_class, probs) + + @property + def tn(self): + return np.arange(self.tot + 1) - self.fn + + @property + def tp(self): + return self.p - self.fn + + @property + def fp(self): + return self.n - self.tn + + def ca(self): + """Classification accuracy curve""" + return (self.tp + self.tn) / self.tot + + def f1(self): + """F1 curve""" + return 2 * self.tp / (2 * self.tp + self.fp + self.fn) + + def sensitivity(self): + """Sensitivity curve""" + return self.tp / self.p + + def specificity(self): + """Specificity curve""" + return self.tn / self.n + + def precision(self): + """ + Precision curve + + The last element represents precision at threshold 1. Unless such + a probability appears in the data, the precision at this point is + undefined. To avoid this, we copy the previous value to the last. 
+ """ + tp_fp = np.arange(self.tot, -1, -1) + tp_fp[-1] = 1 # avoid division by zero + prec = self.tp / tp_fp + prec[-1] = prec[-2] + return prec + + def recall(self): + """Recall curve""" + return self.sensitivity() + + def ppv(self): + """PPV curve; see the comment at :obj:`precision`""" + return self.precision() + + def npv(self): + """ + NPV curve + + The first value is undefined (no negative instances). To avoid this, + we copy the second value into the first. + """ + tn_fn = np.arange(self.tot + 1) + tn_fn[0] = 1 # avoid division by zero + npv = self.tn / tn_fn + npv[0] = npv[1] + return npv + + def fpr(self): + """FPR curve""" + return self.fp / self.n + + def tpr(self): + """TPR curve""" + return self.sensitivity() diff --git a/Orange/evaluation/testing.py b/Orange/evaluation/testing.py index 92c68d1c13f..93c0d563238 100644 --- a/Orange/evaluation/testing.py +++ b/Orange/evaluation/testing.py @@ -171,7 +171,7 @@ def set_or_raise(value, exp_values, msg): "mismatching number of class values") nmethods = set_or_raise( nmethods, [learners is not None and len(learners), - models is not None and len(models), + models is not None and models.shape[1], failed is not None and len(failed), predicted is not None and predicted.shape[0], probabilities is not None and probabilities.shape[0]], @@ -317,7 +317,7 @@ def split_by_model(self): res.probabilities = self.probabilities[(i,), :, :] if self.models is not None: - res.models = self.models[:, i] + res.models = self.models[:, i:i + 1] res.failed = [self.failed[i]] yield res @@ -365,7 +365,7 @@ def __new__(cls, "and train_data are omitted") return self - warn("calling Validation's constructor with data and learners" + warn("calling Validation's constructor with data and learners " "is deprecated;\nconstruct an instance and call it", DeprecationWarning, stacklevel=2) diff --git a/Orange/evaluation/tests/__init__.py b/Orange/evaluation/tests/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git 
a/Orange/evaluation/tests/test_performance_curves.py b/Orange/evaluation/tests/test_performance_curves.py new file mode 100644 index 00000000000..a73d7165557 --- /dev/null +++ b/Orange/evaluation/tests/test_performance_curves.py @@ -0,0 +1,125 @@ +import unittest +from unittest.mock import patch + +import numpy as np + +from Orange.evaluation.testing import Results +from Orange.evaluation.performance_curves import Curves + + +# Test data and sensitivity/specificity are taken from +# Tom Fawcett: An introduction to ROC analysis, with one true positive instance +# removed, so that the number of positive and negative does not match + +class TestCurves(unittest.TestCase): + def setUp(self): + n, p = (0, 1) + self.data = np.array([ + (p, .8), (n, .7), (p, .6), (p, .55), (p, .54), (n, .53), + (n, .52), (p, .51), (n, .505), (p, .4), (n, .39), (p, .38), + (n, .37), (n, .36), (n, .35), (p, .34), (n, .33), (p, .30), (n, .1) + ]) + + def test_curves(self): + np.random.shuffle(self.data) + ytrue, probs = self.data.T + curves = Curves(ytrue, probs) + + tn = np.array( + [0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 9, 9, 10, 10]) + np.testing.assert_equal(curves.tn, tn) + np.testing.assert_equal(curves.fp, 10 - tn) + np.testing.assert_almost_equal(curves.specificity(), tn / 10) + + tp = np.array( + [9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 5, 5, 4, 4, 4, 3, 2, 1, 1, 0]) + np.testing.assert_equal(curves.tp, tp) + np.testing.assert_equal(curves.fn, 9 - tp) + np.testing.assert_almost_equal(curves.sensitivity(), tp / 9) + + np.testing.assert_almost_equal( + curves.ca(), + np.array([9, 10, 9, 10, 9, 10, 11, 12, 11, 12, 11, 12, 11, 12, + 13, 12, 11, 10, 11, 10]) / 19) + + precision = np.array( + [9 / 19, 9 / 18, 8 / 17, 8 / 16, 7 / 15, 7 / 14, 7 / 13, + 7 / 12, 6 / 11, 6 / 10, 5 / 9, 5 / 8, 4 / 7, 4 / 6, + 4 / 5, 3 / 4, 2 / 3, 1 / 2, 1 / 1, 1]) + np.testing.assert_almost_equal(curves.precision(), precision) + np.testing.assert_almost_equal(curves.recall(), tp / 9) + + 
np.testing.assert_almost_equal(curves.ppv(), precision) + np.testing.assert_almost_equal( + curves.npv(), + np.array([1, 1 / 1, 1 / 2, 2 / 3, 2 / 4, 3 / 5, 4 / 6, 5 / 7, + 5 / 8, 6 / 9, 6 / 10, 7 / 11, 7 / 12, 8 / 13, 9 / 14, + 9 / 15, 9 / 16, 9 / 17, 10 / 18, 10 / 19])) + + np.testing.assert_almost_equal(curves.tpr(), tp / 9) + np.testing.assert_almost_equal(curves.fpr(), (10 - tn) / 10) + + @patch("Orange.evaluation.performance_curves.Curves.__init__", + return_value=None) + def test_curves_from_results(self, init): + res = Results() + ytrue, probs = self.data.T + res.actual = ytrue.astype(float) + res.probabilities = np.vstack((1 - probs, probs)).T.reshape(1, -1, 2) + Curves.from_results(res) + cytrue, cprobs = init.call_args[0] + np.testing.assert_equal(cytrue, ytrue) + np.testing.assert_equal(cprobs, probs) + + Curves.from_results(res, target_class=0) + cytrue, cprobs = init.call_args[0] + np.testing.assert_equal(cytrue, 1 - ytrue) + np.testing.assert_equal(cprobs, 1 - probs) + + res.actual = ytrue.astype(float) + res.probabilities = np.random.random((2, 19, 2)) + res.probabilities[1] = np.vstack((1 - probs, probs)).T + + Curves.from_results(res, model_index=1) + cytrue, cprobs = init.call_args[0] + np.testing.assert_equal(cytrue, ytrue) + np.testing.assert_equal(cprobs, probs) + + self.assertRaises(ValueError, Curves.from_results, res) + + ytrue[ytrue == 0] = 2 * (np.arange(10) % 2) + res.actual = ytrue.astype(float) + res.probabilities = np.random.random((2, 19, 3)) + res.probabilities[1] = np.vstack( + ((1 - probs) / 3, probs, (1 - probs) * 2 / 3)).T + + Curves.from_results(res, model_index=1, target_class=1) + cytrue, cprobs = init.call_args[0] + np.testing.assert_equal(cytrue, ytrue == 1) + np.testing.assert_equal(cprobs, probs) + + Curves.from_results(res, model_index=1, target_class=0) + cytrue, cprobs = init.call_args[0] + np.testing.assert_equal(cytrue, ytrue == 0) + np.testing.assert_equal(cprobs, (1 - probs) / 3) + + Curves.from_results(res, 
model_index=1, target_class=2) + cytrue, cprobs = init.call_args[0] + np.testing.assert_equal(cytrue, ytrue == 2) + np.testing.assert_equal(cprobs, (1 - probs) * 2 / 3) + + self.assertRaises(ValueError, Curves.from_results, res, model_index=1) + + @patch("Orange.evaluation.performance_curves.Curves.__init__", + return_value=None) + def test_curves_from_results_nans(self, init): + res = Results() + ytrue, probs = self.data.T + ytrue[0] = np.nan + probs[-1] = np.nan + res.actual = ytrue.astype(float) + res.probabilities = np.vstack((1 - probs, probs)).T.reshape(1, -1, 2) + Curves.from_results(res) + cytrue, cprobs = init.call_args[0] + np.testing.assert_equal(cytrue, ytrue[1:-1]) + np.testing.assert_equal(cprobs, probs[1:-1]) diff --git a/Orange/tests/test_evaluation_testing.py b/Orange/tests/test_evaluation_testing.py index a57910eb971..a5f78cb2972 100644 --- a/Orange/tests/test_evaluation_testing.py +++ b/Orange/tests/test_evaluation_testing.py @@ -233,7 +233,7 @@ def test_split_by_model(self): self.assertTrue((result.predicted == res.predicted[i]).all()) self.assertTrue((result.probabilities == res.probabilities[i]).all()) self.assertEqual(len(result.models), 5) - for model in result.models: + for model in result.models[0]: self.assertIsInstance(model, learners[i].__returns__) self.assertSequenceEqual(result.learners, [res.learners[i]]) @@ -756,7 +756,7 @@ def setUp(self): self.row_indices = np.arange(100) self.folds = (range(50), range(10, 60)), (range(50, 100), range(50)) self.learners = [MajorityLearner(), MajorityLearner()] - self.models = [Mock(), Mock()] + self.models = np.array([[Mock(), Mock()]]) self.predicted = np.zeros((2, 100)) self.probabilities = np.zeros((2, 100, 3)) self.failed = [False, True] diff --git a/Orange/widgets/evaluate/contexthandlers.py b/Orange/widgets/evaluate/contexthandlers.py index d79def2ca60..3ad2796698d 100644 --- a/Orange/widgets/evaluate/contexthandlers.py +++ b/Orange/widgets/evaluate/contexthandlers.py @@ -1,47 +1,30 @@ 
+from Orange.data import Variable from Orange.widgets import settings -from Orange.widgets.utils import getdeepattr class EvaluationResultsContextHandler(settings.ContextHandler): - def __init__(self, targetAttr, selectedAttr): - super().__init__() - self.targetAttr, self.selectedAttr = targetAttr, selectedAttr + """Context handler for evaluation results""" - #noinspection PyMethodOverriding - def match(self, context, cnames, cvalues): - return (cnames, cvalues) == ( - context.classifierNames, context.classValues) and 2 + def open_context(self, widget, classes, classifier_names): + if isinstance(classes, Variable): + if classes.is_discrete: + classes = classes.values + else: + classes = None + super().open_context(widget, classes, classifier_names) - def fast_save(self, widget, name, value): - context = widget.current_context - if name == self.targetAttr: - context.targetClass = value - elif name == self.selectedAttr: - context.selectedClassifiers = list(value) + def new_context(self, classes, classifier_names): + context = super().new_context() + context.classes = classes + context.classifier_names = classifier_names + return context - def settings_from_widget(self, widget, *args): - super().settings_from_widget(widget, *args) - context = widget.current_context - context.targetClass = getdeepattr(widget, self.targetAttr) - context.selectedClassifiers = list(getdeepattr(self.selectedAttr)) - - def settings_to_widget(self, widget, *args): - super().settings_to_widget(widget, *args) - context = widget.current_context - if context.targetClass is not None: - setattr(widget, self.targetAttr, context.targetClass) - if context.selectedClassifiers is not None: - setattr(widget, self.selectedAttr, context.selectedClassifiers) - - #noinspection PyMethodOverriding - def find_or_create_context(self, widget, results): - cnames = [c.name for c in results.classifiers] - cvalues = results.classValues - context, isNew = super().find_or_create_context( - widget, 
results.classifierNames, results.classValues) - if isNew: - context.classifierNames = results.classifierNames - context.classValues = results.classValues - context.selectedClassifiers = None - context.targetClass = None - return context, isNew + def match(self, context, classes, classifier_names): + if classifier_names != context.classifier_names: + return self.NO_MATCH + elif isinstance(classes, Variable) and classes.is_continuous: + return (self.PERFECT_MATCH if context.classes is None + else self.NO_MATCH) + else: + return (self.PERFECT_MATCH if context.classes == classes + else self.NO_MATCH) diff --git a/Orange/widgets/evaluate/owcalibrationplot.py b/Orange/widgets/evaluate/owcalibrationplot.py index c757932adea..562c3d5aa01 100644 --- a/Orange/widgets/evaluate/owcalibrationplot.py +++ b/Orange/widgets/evaluate/owcalibrationplot.py @@ -1,37 +1,61 @@ -""" -Calibration Plot Widget ------------------------ - -""" from collections import namedtuple import numpy as np -from AnyQt.QtWidgets import QListWidget +from AnyQt.QtCore import Qt, QSize +from AnyQt.QtWidgets import QListWidget, QSizePolicy import pyqtgraph as pg -import Orange +from Orange.base import Model +from Orange.classification import ThresholdClassifier, CalibratedLearner +from Orange.evaluation import Results +from Orange.evaluation.performance_curves import Curves from Orange.widgets import widget, gui, settings -from Orange.widgets.evaluate.utils import \ - check_results_adequacy, results_for_preview +from Orange.widgets.evaluate.contexthandlers import \ + EvaluationResultsContextHandler +from Orange.widgets.evaluate.utils import results_for_preview from Orange.widgets.utils import colorpalette, colorbrewer from Orange.widgets.utils.widgetpreview import WidgetPreview -from Orange.widgets.widget import Input +from Orange.widgets.widget import Input, Output, Msg from Orange.widgets import report -Curve = namedtuple( - "Curve", - ["x", "y"] -) - -PlotCurve = namedtuple( - "PlotCurve", - ["curve", - 
"curve_item", - "rug_item"] -) +MetricDefinition = namedtuple( + "metric_definition", + ("name", "functions", "short_names", "explanation")) + +Metrics = [MetricDefinition(*args) for args in ( + ("Calibration curve", None, (), ""), + ("Classification accuracy", (Curves.ca, ), (), ""), + ("F1", (Curves.f1, ), (), ""), + ("Sensitivity and specificity", + (Curves.sensitivity, Curves.specificity), + ("sens", "spec"), + "

Sensitivity (falling) is the proportion of correctly " + "detected positive instances (TP / P).

" + "

 Specificity (rising) is the proportion of detected " + "negative instances (TN / N).

"), + ("Precision and recall", + (Curves.precision, Curves.recall), + ("prec", "recall"), + "

Precision (rising) is the fraction of retrieved instances " + "that are relevant, TP / (TP + FP).

" + "

 Recall (falling) is the proportion of discovered relevant " + "instances, TP / P.

 "), + ("Pos and neg predictive value", + (Curves.ppv, Curves.npv), + ("PPV", "NPV"), + "

Positive predictive value (rising) is the proportion of " + "correct positives, TP / (TP + FP).

" + "

Negative predictive value is the proportion of correct " + "negatives, TN / (TN + FN).

"), + ("True and false positive rate", + (Curves.tpr, Curves.fpr), + ("TPR", "FPR"), + "

 True and false positive rate are proportions of positive " + "and negative instances classified as positive

 "), +)] class OWCalibrationPlot(widget.OWWidget): @@ -42,15 +66,41 @@ class OWCalibrationPlot(widget.OWWidget): keywords = [] class Inputs: - evaluation_results = Input("Evaluation Results", Orange.evaluation.Results) + evaluation_results = Input("Evaluation Results", Results) - class Warning(widget.OWWidget.Warning): - empty_input = widget.Msg( - "Empty result on input. Nothing to display.") + class Outputs: + calibrated_model = Output("Calibrated Model", Model) + + class Error(widget.OWWidget.Error): + non_discrete_target = Msg("Calibration plot requires a discrete target") + empty_input = widget.Msg("Empty result on input. Nothing to display.") + nan_classes = \ + widget.Msg("Remove test data instances with unknown classes") + all_target_class = widget.Msg( + "All data instances belong to target class") + no_target_class = widget.Msg( + "No data instances belong to target class") - target_index = settings.Setting(0) - selected_classifiers = settings.Setting([]) + class Warning(widget.OWWidget.Warning): + omitted_folds = widget.Msg( + "Test folds where all data belongs to (non)-target are not shown") + omitted_nan_prob_points = widget.Msg( + "Instances for which the model couldn't compute probabilities " + "are skipped") + no_valid_data = widget.Msg("No valid data for model(s) {}") + + class Information(widget.OWWidget.Information): + no_output = Msg("Can't output a model: {}") + + settingsHandler = EvaluationResultsContextHandler() + target_index = settings.ContextSetting(0) + selected_classifiers = settings.ContextSetting([]) + score = settings.Setting(0) + output_calibration = settings.Setting(0) + fold_curves = settings.Setting(False) display_rug = settings.Setting(True) + threshold = settings.Setting(0.5) + auto_commit = settings.Setting(True) graph_name = "plot" @@ -58,56 +108,100 @@ def __init__(self): super().__init__() self.results = None + self.scores = None self.classifier_names = [] self.colors = [] - self._curve_data = {} + self.line = None - box = 
gui.vBox(self.controlArea, "Plot") - tbox = gui.vBox(box, "Target Class") - tbox.setFlat(True) + self._last_score_value = -1 + box = gui.vBox(self.controlArea, box="Settings") self.target_cb = gui.comboBox( - tbox, self, "target_index", callback=self._replot, + box, self, "target_index", label="Target:", + orientation=Qt.Horizontal, callback=self.target_index_changed, contentsLength=8) - - cbox = gui.vBox(box, "Classifier") - cbox.setFlat(True) - - self.classifiers_list_box = gui.listBox( - box, self, "selected_classifiers", "classifier_names", - selectionMode=QListWidget.MultiSelection, + gui.checkBox( + box, self, "display_rug", "Show rug", + callback=self._on_display_rug_changed) + gui.checkBox( + box, self, "fold_curves", "Curves for individual folds", callback=self._replot) - gui.checkBox(box, self, "display_rug", "Show rug", - callback=self._on_display_rug_changed) + self.classifiers_list_box = gui.listBox( + self.controlArea, self, "selected_classifiers", "classifier_names", + box="Classifier", selectionMode=QListWidget.ExtendedSelection, + sizePolicy=(QSizePolicy.Preferred, QSizePolicy.Preferred), + sizeHint=QSize(150, 40), + callback=self._on_selection_changed) + + box = gui.vBox(self.controlArea, "Metrics") + combo = gui.comboBox( + box, self, "score", items=(metric.name for metric in Metrics), + callback=self.score_changed) + + self.explanation = gui.widgetLabel( + box, wordWrap=True, fixedWidth=combo.sizeHint().width()) + self.explanation.setContentsMargins(8, 8, 0, 0) + font = self.explanation.font() + font.setPointSizeF(0.85 * font.pointSizeF()) + self.explanation.setFont(font) + + gui.radioButtons( + box, self, value="output_calibration", + btnLabels=("Sigmoid calibration", "Isotonic calibration"), + label="Output model calibration", callback=self.apply) + + self.info_box = gui.widgetBox(self.controlArea, "Info") + self.info_label = gui.widgetLabel(self.info_box) + + gui.auto_commit( + self.controlArea, self, "auto_commit", "Apply", 
commit=self.apply) self.plotview = pg.GraphicsView(background="w") self.plot = pg.PlotItem(enableMenu=False) self.plot.setMouseEnabled(False, False) self.plot.hideButtons() - axis = self.plot.getAxis("bottom") - axis.setLabel("Predicted Probability") - - axis = self.plot.getAxis("left") - axis.setLabel("Observed Average") + for axis_name in ("bottom", "left"): + axis = self.plot.getAxis(axis_name) + axis.setPen(pg.mkPen(color=0.0)) + # Remove the condition (that is, allow setting this for bottom + # axis) when pyqtgraph is fixed + # Issue: https://github.com/pyqtgraph/pyqtgraph/issues/930 + # Pull request: https://github.com/pyqtgraph/pyqtgraph/pull/932 + if axis_name != "bottom": # remove if when pyqtgraph is fixed + axis.setStyle(stopAxisAtTick=(True, True)) self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0), padding=0.05) self.plotview.setCentralItem(self.plot) + self.mainArea.layout().addWidget(self.plotview) + self._set_explanation() @Inputs.evaluation_results def set_results(self, results): + self.closeContext() self.clear() - results = check_results_adequacy(results, self.Error) - if results is not None and not results.actual.size: - self.Warning.empty_input() - else: - self.Warning.empty_input.clear() - self.results = results - if self.results is not None: - self._initialize(results) - self._replot() + self.Error.clear() + self.Information.clear() + + self.results = None + if results is not None: + if not results.domain.has_discrete_class: + self.Error.non_discrete_target() + elif not results.actual.size: + self.Error.empty_input() + elif np.any(np.isnan(results.actual)): + self.Error.nan_classes() + else: + self.results = results + self._initialize(results) + class_var = self.results.domain.class_var + self.target_index = int(len(class_var.values) == 2) + self.openContext(class_var, self.classifier_names) + self._replot() + + self.apply() def clear(self): self.plot.clear() @@ -117,106 +211,296 @@ def clear(self): self.target_cb.clear() 
self.target_index = 0 self.colors = [] - self._curve_data = {} + + def target_index_changed(self): + if len(self.results.domain.class_var.values) == 2: + self.threshold = 1 - self.threshold + self._set_explanation() + self._replot() + self.apply() + + def score_changed(self): + self._set_explanation() + self._replot() + if self._last_score_value != self.score: + self.apply() + self._last_score_value = self.score + + def _set_explanation(self): + explanation = Metrics[self.score].explanation + if explanation: + self.explanation.setText(explanation) + self.explanation.show() + else: + self.explanation.hide() + + if self.score == 0: + self.controls.output_calibration.show() + self.info_box.hide() + else: + self.controls.output_calibration.hide() + self.info_box.show() + + axis = self.plot.getAxis("bottom") + axis.setLabel("Predicted probability" if self.score == 0 + else "Threshold probability to classify as positive") + + axis = self.plot.getAxis("left") + axis.setLabel(Metrics[self.score].name) def _initialize(self, results): - N = len(results.predicted) + n = len(results.predicted) names = getattr(results, "learner_names", None) if names is None: - names = ["#{}".format(i + 1) for i in range(N)] + names = ["#{}".format(i + 1) for i in range(n)] self.classifier_names = names scheme = colorbrewer.colorSchemes["qualitative"]["Dark2"] - if N > len(scheme): + if n > len(scheme): scheme = colorpalette.DefaultRGBColors - self.colors = colorpalette.ColorPaletteGenerator(N, scheme) + self.colors = colorpalette.ColorPaletteGenerator(n, scheme) - for i in range(N): + for i in range(n): item = self.classifiers_list_box.item(i) item.setIcon(colorpalette.ColorPixmap(self.colors[i])) - self.selected_classifiers = list(range(N)) - self.target_cb.addItems(results.data.domain.class_var.values) - - def plot_curve(self, clf_idx, target): - if (clf_idx, target) in self._curve_data: - return self._curve_data[clf_idx, target] - - ytrue = self.results.actual == target - probs = 
self.results.probabilities[clf_idx, :, target] - sortind = np.argsort(probs) - probs = probs[sortind] - ytrue = ytrue[sortind] - if probs.size: - xmin, xmax = probs.min(), probs.max() - x = np.linspace(xmin, xmax, 100) - if xmax != xmin: - f = gaussian_smoother(probs, ytrue, sigma=0.15 * (xmax - xmin)) - observed = f(x) - else: - observed = np.full(100, xmax) - else: - x = np.array([]) - observed = np.array([]) - - curve = Curve(x, observed) - curve_item = pg.PlotDataItem( - x, observed, pen=pg.mkPen(self.colors[clf_idx], width=1), - shadowPen=pg.mkPen(self.colors[clf_idx].lighter(160), width=2), - symbol="+", symbolSize=4, - antialias=True - ) + self.selected_classifiers = list(range(n)) + self.target_cb.addItems(results.domain.class_var.values) + def _rug(self, data, pen_args): + color = pen_args["pen"].color() rh = 0.025 - rug_x = np.c_[probs, probs] - rug_x_true = rug_x[ytrue].ravel() - rug_x_false = rug_x[~ytrue].ravel() + rug_x = np.c_[data.probs[:-1], data.probs[:-1]] + rug_x_true = rug_x[data.ytrue].ravel() + rug_x_false = rug_x[~data.ytrue].ravel() rug_y_true = np.ones_like(rug_x_true) rug_y_true[1::2] = 1 - rh rug_y_false = np.zeros_like(rug_x_false) rug_y_false[1::2] = rh - rug1 = pg.PlotDataItem( - rug_x_false, rug_y_false, pen=self.colors[clf_idx], - connect="pairs", antialias=True - ) - rug2 = pg.PlotDataItem( - rug_x_true, rug_y_true, pen=self.colors[clf_idx], - connect="pairs", antialias=True - ) - self._curve_data[clf_idx, target] = PlotCurve(curve, curve_item, (rug1, rug2)) - return self._curve_data[clf_idx, target] + self.plot.plot( + rug_x_false, rug_y_false, + pen=color, connect="pairs", antialias=True) + self.plot.plot( + rug_x_true, rug_y_true, + pen=color, connect="pairs", antialias=True) + + def plot_metrics(self, data, metrics, pen_args): + if metrics is None: + return self._prob_curve(data.ytrue, data.probs[:-1], pen_args) + ys = [metric(data) for metric in metrics] + for y in ys: + self.plot.plot(data.probs, y, **pen_args) + return 
data.probs, ys + + def _prob_curve(self, ytrue, probs, pen_args): + xmin, xmax = probs.min(), probs.max() + x = np.linspace(xmin, xmax, 100) + if xmax != xmin: + f = gaussian_smoother(probs, ytrue, sigma=0.15 * (xmax - xmin)) + y = f(x) + else: + y = np.full(100, xmax) + + self.plot.plot(x, y, symbol="+", symbolSize=4, **pen_args) + return x, (y, ) def _setup_plot(self): target = self.target_index - selected = self.selected_classifiers - curves = [self.plot_curve(i, target) for i in selected] + results = self.results + metrics = Metrics[self.score].functions + plot_folds = self.fold_curves and results.folds is not None + self.scores = [] - for curve in curves: - self.plot.addItem(curve.curve_item) - if self.display_rug: - self.plot.addItem(curve.rug_item[0]) - self.plot.addItem(curve.rug_item[1]) + if not self._check_class_presence(results.actual == target): + return - self.plot.plot([0, 1], [0, 1], antialias=True) + self.Warning.omitted_folds.clear() + self.Warning.omitted_nan_prob_points.clear() + no_valid_models = [] + shadow_width = 4 + 4 * plot_folds + for clsf in self.selected_classifiers: + data = Curves.from_results(results, target, clsf) + if data.tot == 0: # all probabilities are nan + no_valid_models.append(clsf) + continue + if data.tot != results.probabilities.shape[1]: # some are nan + self.Warning.omitted_nan_prob_points() + + color = self.colors[clsf] + pen_args = dict( + pen=pg.mkPen(color, width=1), antiAlias=True, + shadowPen=pg.mkPen(color.lighter(160), width=shadow_width)) + self.scores.append( + (self.classifier_names[clsf], + self.plot_metrics(data, metrics, pen_args))) + + if self.display_rug: + self._rug(data, pen_args) + + if plot_folds: + pen_args = dict( + pen=pg.mkPen(color, width=1, style=Qt.DashLine), + antiAlias=True) + for fold in range(len(results.folds)): + fold_results = results.get_fold(fold) + fold_curve = Curves.from_results(fold_results, target, clsf) + # Can't check this before: p and n can be 0 because of + # nan 
probabilities + if fold_curve.p * fold_curve.n == 0: + self.Warning.omitted_folds() + self.plot_metrics(fold_curve, metrics, pen_args) + + if no_valid_models: + self.Warning.no_valid_data( + ", ".join(self.classifier_names[i] for i in no_valid_models)) + + if self.score == 0: + self.plot.plot([0, 1], [0, 1], antialias=True) + else: + self.line = pg.InfiniteLine( + pos=self.threshold, movable=True, + pen=pg.mkPen(color="k", style=Qt.DashLine, width=2), + hoverPen=pg.mkPen(color="k", style=Qt.DashLine, width=3), + bounds=(0, 1), + ) + self.line.sigPositionChanged.connect(self.threshold_change) + self.line.sigPositionChangeFinished.connect( + self.threshold_change_done) + self.plot.addItem(self.line) + + def _check_class_presence(self, ytrue): + self.Error.all_target_class.clear() + self.Error.no_target_class.clear() + if np.max(ytrue) == 0: + self.Error.no_target_class() + return False + if np.min(ytrue) == 1: + self.Error.all_target_class() + return False + return True def _replot(self): self.plot.clear() if self.results is not None: self._setup_plot() + self._update_info() def _on_display_rug_changed(self): self._replot() + def _on_selection_changed(self): + self._replot() + self.apply() + + def threshold_change(self): + self.threshold = round(self.line.pos().x(), 2) + self.line.setPos(self.threshold) + self._update_info() + + def get_info_text(self, short): + if short: + def elided(s): + return s[:17] + "..." if len(s) > 20 else s + + text = f""" + + + + """ + + else: + def elided(s): + return s + + text = f"""
Threshold: p={self.threshold:.2f}
+ + + + + """ + + if self.scores is not None: + short_names = Metrics[self.score].short_names + if short_names: + text += f""" + + {"".join(f"" + for n in short_names)} + """ + for name, (probs, curves) in self.scores: + ind = min(np.searchsorted(probs, self.threshold), + len(probs) - 1) + text += f"" + text += "".join(f'' + for curve in curves) + text += "" + text += "
Threshold:p = {self.threshold:.2f}
+
{n}
{elided(name)}:/{curve[ind]:.3f}
" + return text + + def _update_info(self): + self.info_label.setText(self.get_info_text(short=True)) + + def threshold_change_done(self): + self.apply() + + def apply(self): + self.Information.no_output.clear() + wrapped = None + results = self.results + if results is not None: + problems = [ + msg for condition, msg in ( + (len(results.folds) > 1, + "each training data sample produces a different model"), + (results.models is None, + "test results do not contain stored models - try testing " + "on separate data or on training data"), + (len(self.selected_classifiers) != 1, + "select a single model - the widget can output only one"), + (self.score != 0 and len(results.domain.class_var.values) != 2, + "cannot calibrate non-binary classes")) + if condition] + if len(problems) == 1: + self.Information.no_output(problems[0]) + elif problems: + self.Information.no_output( + "".join(f"\n - {problem}" for problem in problems)) + else: + clsf_idx = self.selected_classifiers[0] + model = results.models[0, clsf_idx] + if self.score == 0: + cal_learner = CalibratedLearner( + None, self.output_calibration) + wrapped = cal_learner.get_model( + model, results.actual, results.probabilities[clsf_idx]) + else: + threshold = [1 - self.threshold, + self.threshold][self.target_index] + wrapped = ThresholdClassifier(model, threshold) + + self.Outputs.calibrated_model.send(wrapped) + def send_report(self): if self.results is None: return + self.report_items(( + ("Target class", self.target_cb.currentText()), + ("Output model calibration", + self.score == 0 + and ("Sigmoid calibration", + "Isotonic calibration")[self.output_calibration]) + )) caption = report.list_legend(self.classifiers_list_box, self.selected_classifiers) - self.report_items((("Target class", self.target_cb.currentText()),)) self.report_plot() self.report_caption(caption) + self.report_caption(self.controls.score.currentText()) + + if self.score != 0: + self.report_raw(self.get_info_text(short=False)) def 
gaussian_smoother(x, y, sigma=1.0): diff --git a/Orange/widgets/evaluate/owtestlearners.py b/Orange/widgets/evaluate/owtestlearners.py index 0577b448950..d534bbe6a32 100644 --- a/Orange/widgets/evaluate/owtestlearners.py +++ b/Orange/widgets/evaluate/owtestlearners.py @@ -315,7 +315,7 @@ def set_learner(self, learner, key): # Removed self._invalidate([key]) del self.learners[key] - else: + elif learner is not None: self.learners[key] = InputLearner(learner, None, None) self._invalidate([key]) @@ -735,7 +735,8 @@ def __update(self): if self.resampling == OWTestLearners.TestOnTest: test_f = partial( - Orange.evaluation.TestOnTestData(store_data=True), + Orange.evaluation.TestOnTestData( + store_data=True, store_models=True), self.data, self.test_data, learners_c, self.preprocessor ) else: @@ -756,7 +757,8 @@ def __update(self): stratified=self.shuffle_stratified, random_state=rstate) elif self.resampling == OWTestLearners.TestOnTrain: - sampler = Orange.evaluation.TestOnTrainingData() + sampler = Orange.evaluation.TestOnTrainingData( + store_models=True) else: assert False, "self.resampling %s" % self.resampling @@ -916,7 +918,7 @@ def is_empty(res): res.probabilities = np.vstack((x.probabilities, y.probabilities)) if x.models is not None: - res.models = [xm + ym for xm, ym in zip(x.models, y.models)] + res.models = np.hstack((x.models, y.models)) return res diff --git a/Orange/widgets/evaluate/tests/base.py b/Orange/widgets/evaluate/tests/base.py index 3100f1e1905..93fafea1e51 100644 --- a/Orange/widgets/evaluate/tests/base.py +++ b/Orange/widgets/evaluate/tests/base.py @@ -17,6 +17,6 @@ def test_many_evaluation_results(self): classification.NaiveBayesLearner(), classification.SGDClassificationLearner() ] - res = evaluation.CrossValidation(data, learners, k=2, store_data=True) + res = evaluation.CrossValidation(k=2, store_data=True)(data, learners) # this is a mixin; pylint: disable=no-member self.send_signal("Evaluation Results", res) diff --git 
a/Orange/widgets/evaluate/tests/test_owcalibrationplot.py b/Orange/widgets/evaluate/tests/test_owcalibrationplot.py index 0575e03e8d1..e4f18231686 100644 --- a/Orange/widgets/evaluate/tests/test_owcalibrationplot.py +++ b/Orange/widgets/evaluate/tests/test_owcalibrationplot.py @@ -1,12 +1,18 @@ import copy import warnings +from unittest.mock import Mock, patch import numpy as np +from AnyQt.QtCore import QItemSelection +from pyqtgraph import InfiniteLine + from sklearn.exceptions import ConvergenceWarning -from Orange.data import Table +from Orange.data import Table, DiscreteVariable, Domain, ContinuousVariable import Orange.evaluation import Orange.classification +from Orange.evaluation import Results +from Orange.evaluation.performance_curves import Curves from Orange.widgets.evaluate.tests.base import EvaluateTest from Orange.widgets.evaluate.owcalibrationplot import OWCalibrationPlot @@ -15,42 +21,620 @@ class TestOWCalibrationPlot(WidgetTest, EvaluateTest): - @classmethod - def setUpClass(cls): - super().setUpClass() - cls.lenses = data = Table(test_filename("datasets/lenses.tab")) - cls.res = Orange.evaluation.TestOnTestData( - train_data=data[::2], test_data=data[1::2], - learners=[Orange.classification.MajorityLearner(), - Orange.classification.KNNLearner()], - store_data=True, - ) - def setUp(self): super().setUp() + + n, p = (0, 1) + actual, probs = np.array([ + (p, .8), (n, .7), (p, .6), (p, .55), (p, .54), (n, .53), (n, .52), + (p, .51), (n, .505), (p, .4), (n, .39), (p, .38), (n, .37), + (n, .36), (n, .35), (p, .34), (n, .33), (p, .30), (n, .1)]).T + self.curves = Curves(actual, probs) + probs2 = (probs + 0.5) / 2 + 1 + self.curves2 = Curves(actual, probs2) + pred = probs > 0.5 + pred2 = probs2 > 0.5 + probs = np.vstack((1 - probs, probs)).T + probs2 = np.vstack((1 - probs2, probs2)).T + domain = Domain([], DiscreteVariable("y", values=("a", "b"))) + self.results = Results( + domain=domain, + actual=actual, + folds=np.array([Ellipsis]), + 
models=np.array([[Mock(), Mock()]]), + row_indices=np.arange(19), + predicted=np.array((pred, pred2)), + probabilities=np.array([probs, probs2])) + + self.lenses = data = Table(test_filename("datasets/lenses.tab")) + majority = Orange.classification.MajorityLearner() + majority.name = "majority" + knn3 = Orange.classification.KNNLearner(n_neighbors=3) + knn3.name = "knn-3" + knn1 = Orange.classification.KNNLearner(n_neighbors=1) + knn1.name = "knn-1" + self.lenses_results = Orange.evaluation.TestOnTestData( + store_data=True, store_models=True)( + data=data[::2], test_data=data[1::2], + learners=[majority, knn3, knn1]) + self.lenses_results.learner_names = ["majority", "knn-3", "knn-1"] + self.widget = self.create_widget(OWCalibrationPlot) # type: OWCalibrationPlot warnings.filterwarnings("ignore", ".*", ConvergenceWarning) - def test_basic(self): - self.send_signal(self.widget.Inputs.evaluation_results, self.res) - self.widget.controls.display_rug.click() + def test_initialization(self): + """Test initialization of lists and combos""" + def check_clsfr_names(names): + self.assertEqual(widget.classifier_names, names) + clsf_list = widget.controls.selected_classifiers + self.assertEqual( + [clsf_list.item(i).text() for i in range(clsf_list.count())], + names) + + widget = self.widget + tcomb = widget.controls.target_index + + self.send_signal(widget.Inputs.evaluation_results, self.lenses_results) + check_clsfr_names(["majority", "knn-3", "knn-1"]) + self.assertEqual(widget.selected_classifiers, [0, 1, 2]) + self.assertEqual( + [tcomb.itemText(i) for i in range(tcomb.count())], + self.lenses.domain.class_var.values) + self.assertEqual(widget.target_index, 0) + + self.send_signal(widget.Inputs.evaluation_results, self.results) + check_clsfr_names(["#1", "#2"]) + self.assertEqual(widget.selected_classifiers, [0, 1]) + self.assertEqual( + [tcomb.itemText(i) for i in range(tcomb.count())], ["a", "b"]) + self.assertEqual(widget.target_index, 1) + + 
self.send_signal(widget.Inputs.evaluation_results, None) + check_clsfr_names([]) + self.assertEqual(widget.selected_classifiers, []) + self.assertEqual(widget.controls.target_index.count(), 0) + + def test_empty_input_error(self): + """Show an error when data is present but empty""" + widget = self.widget + + res = copy.copy(self.results) + res.row_indices = res.row_indices[:0] + res.actual = res.actual[:0] + res.predicted = res.predicted[:, 0] + res.probabilities = res.probabilities[:, :0, :] + self.send_signal(widget.Inputs.evaluation_results, self.results) + self.assertFalse(widget.Error.empty_input.is_shown()) + self.assertTrue(bool(widget.plot.items)) + + self.send_signal(widget.Inputs.evaluation_results, res) + self.assertTrue(widget.Error.empty_input.is_shown()) + self.assertIsNone(widget.results) + self.assertFalse(bool(widget.plot.items)) + + self.send_signal(widget.Inputs.evaluation_results, self.results) + self.assertFalse(widget.Error.empty_input.is_shown()) + self.assertTrue(bool(widget.plot.items)) + + def test_regression_input_error(self): + """Show an error for regression data""" + widget = self.widget - def test_empty(self): - res = copy.copy(self.res) + res = copy.copy(self.results) + res.domain = Domain([], ContinuousVariable("y")) res.row_indices = res.row_indices[:0] res.actual = res.actual[:0] res.predicted = res.predicted[:, 0] res.probabilities = res.probabilities[:, :0, :] - self.send_signal(self.widget.Inputs.evaluation_results, res) - - def test_nan_input(self): - res = copy.copy(self.res) - res.actual = res.actual.copy() - res.probabilities = res.probabilities.copy() - - res.actual[0] = np.nan - res.probabilities[:, [0, 3], :] = np.nan - self.send_signal(self.widget.Inputs.evaluation_results, res) - self.assertTrue(self.widget.Error.invalid_results.is_shown()) - self.send_signal(self.widget.Inputs.evaluation_results, None) - self.assertFalse(self.widget.Error.invalid_results.is_shown()) + 
self.send_signal(widget.Inputs.evaluation_results, self.results) + self.assertFalse(widget.Error.non_discrete_target.is_shown()) + self.assertTrue(bool(widget.plot.items)) + + self.send_signal(widget.Inputs.evaluation_results, res) + self.assertTrue(widget.Error.non_discrete_target.is_shown()) + self.assertIsNone(widget.results) + self.assertFalse(bool(widget.plot.items)) + + self.send_signal(widget.Inputs.evaluation_results, self.results) + self.assertFalse(widget.Error.non_discrete_target.is_shown()) + self.assertTrue(bool(widget.plot.items)) + + @staticmethod + def _set_combo(combo, val): + combo.setCurrentIndex(val) + combo.activated[int].emit(val) + combo.activated[str].emit(combo.currentText()) + + @staticmethod + def _set_radio_buttons(radios, val): + radios.buttons[val].click() + + @staticmethod + def _set_list_selection(listview, selection): + model = listview.model() + selectionmodel = listview.selectionModel() + itemselection = QItemSelection() + for item in selection: + itemselection.select(model.index(item, 0), model.index(item, 0)) + selectionmodel.select(itemselection, selectionmodel.ClearAndSelect) + + def _set_threshold(self, pos, done): + _, line = self._get_curves() + line.setPos(pos) + if done: + line.sigPositionChangeFinished.emit(line) + else: + line.sigPositionChanged.emit(line) + + def _get_curves(self): + plot_items = self.widget.plot.items[:] + for i, item in enumerate(plot_items): + if isinstance(item, InfiniteLine): + del plot_items[i] + return plot_items, item + return plot_items, None + + @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier") + @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner") + def test_plotting_curves(self, *_): + """Curve coordinates match those computed by `Curves`""" + widget = self.widget + widget.display_rug = False + self.send_signal(widget.Inputs.evaluation_results, self.results) + widget.selected_classifiers = [0] + combo = widget.controls.score + + c = self.curves + 
combinations = ([c.ca()], + [c.f1()], + [c.sensitivity(), c.specificity()], + [c.precision(), c.recall()], + [c.ppv(), c.npv()], + [c.tpr(), c.fpr()]) + for idx, curves_data in enumerate(combinations, start=1): + self._set_combo(combo, idx) + curves, line = self._get_curves() + self.assertEqual(len(curves), len(curves_data)) + self.assertIsNotNone(line) + for curve in curves: + x, y = curve.getData() + np.testing.assert_almost_equal(x, self.curves.probs) + for i, curve_data in enumerate(curves_data): + if np.max(curve_data - y) < 1e-6: + del curves_data[i] + break + else: + self.fail(f"invalid curve for {combo.currentText()}") + + @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier") + @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner") + def test_multiple_fold_curves(self, *_): + widget = self.widget + widget.display_rug = False + widget.fold_curves = False + self.send_signal(widget.Inputs.evaluation_results, self.results) + self._set_list_selection(widget.controls.selected_classifiers, [0]) + self._set_combo(widget.controls.score, 1) # CA + + self.results.folds = [slice(1, 5), slice(5, 19)] + self.results.models = np.array([[Mock(), Mock()]] * 2) + curves, _ = self._get_curves() + self.assertEqual(len(curves), 1) + + widget.controls.fold_curves.click() + curves, _ = self._get_curves() + self.assertEqual(len(curves), 3) + + widget.controls.fold_curves.click() + curves, _ = self._get_curves() + self.assertEqual(len(curves), 1) + + @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier") + @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner") + def test_change_target_class(self, *_): + """Changing target combo changes the curves""" + widget = self.widget + widget.display_rug = False + self.send_signal(widget.Inputs.evaluation_results, self.results) + widget.selected_classifiers = [0] + score_combo = widget.controls.score + target_combo = widget.controls.target_index + + self._set_combo(score_combo, 
1) # ca + self._set_combo(target_combo, 1) + (ca, ), _ = self._get_curves() + np.testing.assert_almost_equal(ca.getData()[1], self.curves.ca()) + + self._set_combo(target_combo, 0) + (ca, ), _ = self._get_curves() + curves = Curves(1 - self.curves.ytrue, 1 - self.curves.probs[:-1]) + np.testing.assert_almost_equal(ca.getData()[1], curves.ca()) + + def test_changing_score_explanation(self): + """Changing score hides/shows explanation and options for calibration""" + widget = self.widget + score_combo = widget.controls.score + explanation = widget.explanation + calibrations = widget.controls.output_calibration + + self._set_combo(score_combo, 1) # ca + self.assertTrue(explanation.isHidden()) + self.assertTrue(calibrations.isHidden()) + + self._set_combo(score_combo, 0) # calibration + self.assertTrue(explanation.isHidden()) + self.assertFalse(calibrations.isHidden()) + + self._set_combo(score_combo, 3) # sens/spec + self.assertFalse(explanation.isHidden()) + self.assertTrue(calibrations.isHidden()) + + def test_rug(self): + """Test rug appearance and positions""" + def get_rugs(): + rugs = [None, None] + for item in widget.plot.items: + if item.curve.opts.get("connect", "") == "pairs": + x, y = item.getData() + np.testing.assert_almost_equal(x[::2], x[1::2]) + rugs[int(y[0] == 1)] = x[::2] + return rugs + + widget = self.widget + widget.display_rug = True + model_list = widget.controls.selected_classifiers + self.send_signal(widget.Inputs.evaluation_results, self.results) + + self._set_list_selection(model_list, [0]) + probs = self.curves.probs[:-1] + truex = probs[self.curves.ytrue == 1] + falsex = probs[self.curves.ytrue == 0] + bottom, top = get_rugs() + np.testing.assert_almost_equal(bottom, falsex) + np.testing.assert_almost_equal(top, truex) + + # Switching targets should switch rugs and takes other probabilities + self._set_combo(widget.controls.target_index, 0) + bottom, top = get_rugs() + np.testing.assert_almost_equal(bottom, (1 - truex)[::-1]) + 
np.testing.assert_almost_equal(top, (1 - falsex)[::-1]) + self._set_combo(widget.controls.target_index, 1) + + # Changing models gives a different rug + self._set_list_selection(model_list, [1]) + probs2 = self.curves2.probs[:-1] + truex2 = probs2[self.curves2.ytrue == 1] + falsex2 = probs2[self.curves2.ytrue == 0] + bottom, top = get_rugs() + np.testing.assert_almost_equal(bottom, falsex2) + np.testing.assert_almost_equal(top, truex2) + + # Two models - two rugs - four rug items + self._set_list_selection(model_list, [0, 1]) + self.assertEqual(sum(item.curve.opts.get("connect", "") == "pairs" + for item in widget.plot.items), 4) + + # No models - no rugs + self._set_list_selection(model_list, []) + self.assertEqual(get_rugs(), [None, None]) + + # Bring the rug back + self._set_list_selection(model_list, [1]) + self.assertIsNotNone(get_rugs()[0]) + + # Disable it with checkbox + widget.controls.display_rug.click() + self.assertEqual(get_rugs(), [None, None]) + + def test_calibration_curve(self): + """Test the correct number of calibration curves""" + widget = self.widget + model_list = widget.controls.selected_classifiers + widget.display_rug = False + + self.send_signal(widget.Inputs.evaluation_results, self.results) + self.assertEqual(len(widget.plot.items), 3) # 2 + diagonal + + self._set_list_selection(model_list, [1]) + self.assertEqual(len(widget.plot.items), 2) + + self._set_list_selection(model_list, []) + self.assertEqual(len(widget.plot.items), 1) + + def test_threshold_change_updates_info(self): + """Changing the threshold updates info label""" + widget = self.widget + self.send_signal(widget.Inputs.evaluation_results, self.results) + self._set_combo(widget.controls.score, 1) + + original_text = widget.info_label.text() + self._set_threshold(0.3, False) + self.assertNotEqual(widget.info_label.text(), original_text) + + def test_threshold_rounding(self): + """Threshold is rounded to two decimals""" + widget = self.widget + 
self.send_signal(widget.Inputs.evaluation_results, self.results) + self._set_combo(widget.controls.score, 1) + self._set_threshold(0.367, False) + self.assertAlmostEqual(widget.threshold, 0.37) + + def test_threshold_flips_on_two_classes(self): + """Threshold changes to 1 - threshold if *binary* class is switched""" + widget = self.widget + self.send_signal(widget.Inputs.evaluation_results, self.results) + self._set_combo(widget.controls.target_index, 0) + self._set_combo(widget.controls.score, 1) # CA + self._set_threshold(0.25, False) + self.assertEqual(widget.threshold, 0.25) + self._set_combo(widget.controls.target_index, 1) + self.assertEqual(widget.threshold, 0.75) + + self.send_signal(widget.Inputs.evaluation_results, self.lenses_results) + self._set_combo(widget.controls.target_index, 0) + self._set_combo(widget.controls.score, 1) # CA + self._set_threshold(0.25, False) + self.assertEqual(widget.threshold, 0.25) + self._set_combo(widget.controls.target_index, 1) + self.assertEqual(widget.threshold, 0.25) + + + @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier") + @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner") + def test_apply_no_output(self, *_): + """Test no output warnings""" + widget = self.widget + model_list = widget.controls.selected_classifiers + + multiple_folds, multiple_selected, no_models, non_binary_class = "abcd" + messages = { + multiple_folds: + "each training data sample produces a different model", + no_models: + "test results do not contain stored models - try testing on " + "separate data or on training data", + multiple_selected: + "select a single model - the widget can output only one", + non_binary_class: + "cannot calibrate non-binary classes"} + + def test_shown(shown): + widget_msg = widget.Information.no_output + output = self.get_output(widget.Outputs.calibrated_model) + if not shown: + self.assertFalse(widget_msg.is_shown()) + self.assertIsNotNone(output) + else: + 
self.assertTrue(widget_msg.is_shown()) + self.assertIsNone(output) + for msg_id in shown: + msg = messages[msg_id] + self.assertIn(msg, widget_msg.formatted, + f"{msg} not included in the message") + + self.send_signal(widget.Inputs.evaluation_results, self.results) + self._set_combo(widget.controls.score, 1) # CA + test_shown({multiple_selected}) + + self._set_list_selection(model_list, [0]) + test_shown(()) + self._set_list_selection(model_list, [0, 1]) + + self.results.models = None + self.send_signal(widget.Inputs.evaluation_results, self.results) + test_shown({multiple_selected, no_models}) + + self.send_signal(widget.Inputs.evaluation_results, self.lenses_results) + test_shown({multiple_selected, non_binary_class}) + + self._set_list_selection(model_list, [0]) + test_shown({non_binary_class}) + + self.results.folds = [slice(0, 5), slice(5, 10), slice(10, 19)] + self.results.models = np.array([[Mock(), Mock()]] * 3) + + self.send_signal(widget.Inputs.evaluation_results, self.results) + test_shown({multiple_selected, multiple_folds}) + + self._set_list_selection(model_list, [0]) + test_shown({multiple_folds}) + + self._set_combo(widget.controls.score, 0) # calibration + self.send_signal(widget.Inputs.evaluation_results, self.lenses_results) + self._set_list_selection(model_list, [0, 1]) + test_shown({multiple_selected}) + self._set_list_selection(model_list, [0]) + test_shown(()) + + @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier") + def test_output_threshold_classifier(self, threshold_classifier): + """Test threshold classifier on output""" + widget = self.widget + model_list = widget.controls.selected_classifiers + models = self.results.models.ravel() + target_combo = widget.controls.target_index + self.send_signal(widget.Inputs.evaluation_results, self.results) + self._set_list_selection(model_list, [0]) + widget.target_index = 1 + + widget.threshold = 0.3 + self._set_combo(widget.controls.score, 1) # CA + model = 
self.get_output(widget.Outputs.calibrated_model) + threshold_classifier.assert_called_with(models[0], 0.3) + self.assertIs(model, threshold_classifier.return_value) + threshold_classifier.reset_mock() + + widget.auto_commit = True + self._set_threshold(0.4, False) + threshold_classifier.assert_not_called() + + widget.auto_commit = False + self._set_threshold(0.35, True) + threshold_classifier.assert_not_called() + + widget.auto_commit = True + self._set_threshold(0.4, True) + threshold_classifier.assert_called_with(models[0], 0.4) + self.assertIs(model, threshold_classifier.return_value) + threshold_classifier.reset_mock() + + self._set_combo(target_combo, 0) + threshold_classifier.assert_called_with(models[0], 0.4) + self.assertIs(model, threshold_classifier.return_value) + threshold_classifier.reset_mock() + + self._set_combo(target_combo, 1) + threshold_classifier.assert_called_with(models[0], 0.4) + self.assertIs(model, threshold_classifier.return_value) + threshold_classifier.reset_mock() + + self._set_list_selection(model_list, [1]) + threshold_classifier.assert_called_with(models[1], 0.4) + self.assertIs(model, threshold_classifier.return_value) + threshold_classifier.reset_mock() + + @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner") + def test_output_calibrated_classifier(self, calibrated_learner): + """Test calibrated classifier on output""" + calibrated_instance = calibrated_learner.return_value + get_model = calibrated_instance.get_model + + widget = self.widget + model_list = widget.controls.selected_classifiers + models = self.lenses_results.models.ravel() + results = self.lenses_results + self.send_signal(widget.Inputs.evaluation_results, results) + self._set_combo(widget.controls.score, 0) + + self._set_list_selection(model_list, [1]) + + self._set_radio_buttons(widget.controls.output_calibration, 0) + calibrated_learner.assert_called_with(None, 0) + model, actual, probabilities = get_model.call_args[0] + self.assertIs(model, 
models[1]) + np.testing.assert_equal(actual, results.actual) + np.testing.assert_equal(probabilities, results.probabilities[1]) + self.assertIs(self.get_output(widget.Outputs.calibrated_model), + get_model.return_value) + calibrated_learner.reset_mock() + get_model.reset_mock() + + self._set_radio_buttons(widget.controls.output_calibration, 1) + calibrated_learner.assert_called_with(None, 1) + model, actual, probabilities = get_model.call_args[0] + self.assertIs(model, models[1]) + np.testing.assert_equal(actual, results.actual) + np.testing.assert_equal(probabilities, results.probabilities[1]) + self.assertIs(self.get_output(widget.Outputs.calibrated_model), + get_model.return_value) + calibrated_learner.reset_mock() + get_model.reset_mock() + + self._set_list_selection(model_list, [0]) + self._set_radio_buttons(widget.controls.output_calibration, 1) + calibrated_learner.assert_called_with(None, 1) + model, actual, probabilities = get_model.call_args[0] + self.assertIs(model, models[0]) + np.testing.assert_equal(actual, results.actual) + np.testing.assert_equal(probabilities, results.probabilities[0]) + self.assertIs(self.get_output(widget.Outputs.calibrated_model), + get_model.return_value) + calibrated_learner.reset_mock() + get_model.reset_mock() + + def test_contexts(self): + """Test storing and retrieving context settings""" + widget = self.widget + model_list = widget.controls.selected_classifiers + target_combo = widget.controls.target_index + self.send_signal(widget.Inputs.evaluation_results, self.lenses_results) + self._set_list_selection(model_list, [0, 2]) + self._set_combo(target_combo, 2) + self.send_signal(widget.Inputs.evaluation_results, self.results) + self._set_list_selection(model_list, [0]) + self._set_combo(target_combo, 0) + self.send_signal(widget.Inputs.evaluation_results, self.lenses_results) + self.assertEqual(widget.selected_classifiers, [0, 2]) + self.assertEqual(widget.target_index, 2) + + def test_report(self): + """Test that report 
does not crash""" + widget = self.widget + self.send_signal(widget.Inputs.evaluation_results, self.lenses_results) + widget.send_report() + + @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier") + @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner") + def test_single_class(self, *_): + """Curves are not plotted if all data belongs to (non)-target""" + def check_error(shown): + for error in (errors.no_target_class, errors.all_target_class, + errors.nan_classes): + self.assertEqual(error.is_shown(), error is shown, + f"{error} is unexpectedly" + f"{'' if error.is_shown() else ' not'} shown") + if shown is not None: + self.assertEqual(len(widget.plot.items), 0) + else: + self.assertGreater(len(widget.plot.items), 0) + + widget = self.widget + errors = widget.Error + widget.display_rug = True + combo = widget.controls.score + + original_actual = self.results.actual.copy() + self.send_signal(widget.Inputs.evaluation_results, self.results) + widget.selected_classifiers = [0] + for idx in range(combo.count()): + self._set_combo(combo, idx) + self.results.actual[:] = 0 + self.send_signal(widget.Inputs.evaluation_results, self.results) + check_error(errors.no_target_class) + + self.results.actual[:] = 1 + self.send_signal(widget.Inputs.evaluation_results, self.results) + check_error(errors.all_target_class) + + self.results.actual[:] = original_actual + self.results.actual[3] = np.nan + self.send_signal(widget.Inputs.evaluation_results, self.results) + check_error(errors.nan_classes) + + self.results.actual[:] = original_actual + self.send_signal(widget.Inputs.evaluation_results, self.results) + check_error(None) + + @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier") + @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner") + def test_single_class_folds(self, *_): + """Curves for single-class folds are not plotted""" + widget = self.widget + widget.display_rug = False + widget.fold_curves = False + + 
results = self.lenses_results + results.folds = [slice(0, 5), slice(5, 19)] + results.models = results.models.repeat(2, axis=0) + results.actual[:3] = 0 + results.probabilities[1, 3:5] = np.nan + # after this, model 1 has just negative instances in fold 0 + self.send_signal(widget.Inputs.evaluation_results, results) + self._set_combo(widget.controls.score, 1) # CA + self.assertFalse(widget.Warning.omitted_folds.is_shown()) + widget.controls.fold_curves.click() + self.assertTrue(widget.Warning.omitted_folds.is_shown()) + + @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier") + @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner") + def test_warn_nan_probabilities(self, *_): + """Warn about omitted points with nan probabilities""" + widget = self.widget + widget.display_rug = False + widget.fold_curves = False + + self.results.probabilities[1, 3] = np.nan + self.send_signal(widget.Inputs.evaluation_results, self.results) + self.assertTrue(widget.Warning.omitted_nan_prob_points.is_shown()) + self._set_list_selection(widget.controls.selected_classifiers, [0, 2]) + self.assertFalse(widget.Warning.omitted_folds.is_shown()) diff --git a/Orange/widgets/evaluate/utils.py b/Orange/widgets/evaluate/utils.py index 9e2f579dfae..ebe06032777 100644 --- a/Orange/widgets/evaluate/utils.py +++ b/Orange/widgets/evaluate/utils.py @@ -47,7 +47,7 @@ def results_for_preview(data_name=""): from Orange.classification import \ LogisticRegressionLearner, SVMLearner, NuSVMLearner - data = Table(data_name or "ionosphere") + data = Table(data_name or "heart_disease") results = CrossValidation( data, [LogisticRegressionLearner(penalty="l2"), diff --git a/Orange/widgets/gui.py b/Orange/widgets/gui.py index 683b8be2f73..b6a8d84552b 100644 --- a/Orange/widgets/gui.py +++ b/Orange/widgets/gui.py @@ -1783,6 +1783,9 @@ def __init__(self, master, enableDragDrop=False, dragDropCallback=None, def sizeHint(self): return self.size_hint + def minimumSizeHint(self): + return 
self.size_hint + def dragEnterEvent(self, event): super().dragEnterEvent(event) if self.valid_data_callback: diff --git a/Orange/widgets/model/icons/CalibratedLearner.svg b/Orange/widgets/model/icons/CalibratedLearner.svg new file mode 100644 index 00000000000..360a0d188ba --- /dev/null +++ b/Orange/widgets/model/icons/CalibratedLearner.svg @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + diff --git a/Orange/widgets/model/owcalibratedlearner.py b/Orange/widgets/model/owcalibratedlearner.py new file mode 100644 index 00000000000..0edf3184797 --- /dev/null +++ b/Orange/widgets/model/owcalibratedlearner.py @@ -0,0 +1,111 @@ +from Orange.classification import CalibratedLearner, ThresholdLearner, \ + NaiveBayesLearner +from Orange.data import Table +from Orange.modelling import Learner +from Orange.widgets import gui +from Orange.widgets.widget import Input +from Orange.widgets.settings import Setting +from Orange.widgets.utils.owlearnerwidget import OWBaseLearner +from Orange.widgets.utils.widgetpreview import WidgetPreview + + +class OWCalibratedLearner(OWBaseLearner): + name = "Calibrated Learner" + description = "Wraps another learner with probability calibration and " \ + "decision threshold optimization" + icon = "icons/CalibratedLearner.svg" + priority = 20 + keywords = ["calibration", "threshold"] + + LEARNER = CalibratedLearner + + SigmoidCalibration, IsotonicCalibration, NoCalibration = range(3) + CalibrationOptions = ("Sigmoid calibration", + "Isotonic calibration", + "No calibration") + CalibrationShort = ("Sigmoid", "Isotonic", "") + CalibrationMap = { + SigmoidCalibration: CalibratedLearner.Sigmoid, + IsotonicCalibration: CalibratedLearner.Isotonic} + + OptimizeCA, OptimizeF1, NoThresholdOptimization = range(3) + ThresholdOptions = ("Optimize classification accuracy", + "Optimize F1 score", + "No threshold optimization") + ThresholdShort = ("CA", "F1", "") + ThresholdMap = { + OptimizeCA: ThresholdLearner.OptimizeCA, + OptimizeF1: 
ThresholdLearner.OptimizeF1} + + learner_name = Setting("", schema_only=True) + calibration = Setting(SigmoidCalibration) + threshold = Setting(OptimizeCA) + + class Inputs(OWBaseLearner.Inputs): + base_learner = Input("Base Learner", Learner) + + def __init__(self): + super().__init__() + self.base_learner = None + + def add_main_layout(self): + gui.radioButtons( + self.controlArea, self, "calibration", self.CalibrationOptions, + box="Probability calibration", + callback=self.calibration_options_changed) + gui.radioButtons( + self.controlArea, self, "threshold", self.ThresholdOptions, + box="Decision threshold optimization", + callback=self.calibration_options_changed) + + @Inputs.base_learner + def set_learner(self, learner): + self.base_learner = learner + self._set_default_name() + self.unconditional_apply() + + def _set_default_name(self): + if self.base_learner is None: + self.name = "Calibrated learner" + else: + self.name = " + ".join(part for part in ( + self.base_learner.name.title(), + self.CalibrationShort[self.calibration], + self.ThresholdShort[self.threshold]) if part) + self.controls.learner_name.setPlaceholderText(self.name) + + def calibration_options_changed(self): + self._set_default_name() + self.apply() + + def create_learner(self): + class IdentityWrapper(Learner): + def fit_storage(self, data, base_learner=self.base_learner): + return base_learner.fit_storage(data) + + if self.base_learner is None: + return None + learner = self.base_learner + if self.calibration != self.NoCalibration: + learner = CalibratedLearner(learner, + self.CalibrationMap[self.calibration]) + if self.threshold != self.NoThresholdOptimization: + learner = ThresholdLearner(learner, + self.ThresholdMap[self.threshold]) + if self.preprocessors: + if learner is self.base_learner: + learner = IdentityWrapper() + learner.preprocessors = (self.preprocessors, ) + return learner + + def get_learner_parameters(self): + return (("Calibrate probabilities", + self.CalibrationOptions[self.calibration]), + 
("Threshold optimization", + self.ThresholdOptions[self.threshold])) + + +if __name__ == "__main__": # pragma: no cover + WidgetPreview(OWCalibratedLearner).run( + Table("heart_disease"), + set_learner=NaiveBayesLearner()) diff --git a/Orange/widgets/model/tests/test_owcalibratedlearner.py b/Orange/widgets/model/tests/test_owcalibratedlearner.py new file mode 100644 index 00000000000..400d483a592 --- /dev/null +++ b/Orange/widgets/model/tests/test_owcalibratedlearner.py @@ -0,0 +1,158 @@ +from unittest.mock import Mock + +from Orange.classification import ThresholdLearner, CalibratedLearner, \ + NaiveBayesLearner, ThresholdClassifier, CalibratedClassifier +from Orange.classification.base_classification import ModelClassification, \ + LearnerClassification +from Orange.classification.naive_bayes import NaiveBayesModel +from Orange.data import Table +from Orange.widgets.model.owcalibratedlearner import OWCalibratedLearner +from Orange.widgets.tests.base import WidgetTest, WidgetLearnerTestMixin, \ + datasets + + +class TestOWCalibratedLearner(WidgetTest, WidgetLearnerTestMixin): + def setUp(self): + self.widget = self.create_widget( + OWCalibratedLearner, stored_settings={"auto_apply": False}) + self.send_signal(self.widget.Inputs.base_learner, NaiveBayesLearner()) + + self.data = Table("heart_disease") + self.valid_datasets = (self.data,) + self.inadequate_dataset = (Table(datasets.path("testing_dataset_reg")),) + self.learner_class = LearnerClassification + self.model_class = ModelClassification + self.model_name = 'Calibrated classifier' + self.parameters = [] + + def test_output_learner(self): + """Check if learner is on output after apply""" + # Overridden to change the output type in the last test + initial = self.get_output("Learner") + self.assertIsNotNone(initial, "Does not initialize the learner output") + self.widget.apply_button.button.click() + newlearner = self.get_output("Learner") + self.assertIsNot(initial, newlearner, + "Does not send a new learner 
instance on `Apply`.") + self.assertIsNotNone(newlearner) + self.assertIsInstance( + newlearner, + (CalibratedLearner, ThresholdLearner, NaiveBayesLearner)) + + def test_output_model(self): + """Check if model is on output after sending data and apply""" + # Overridden to change the output type in the last two tests + self.assertIsNone(self.get_output(self.widget.Outputs.model)) + self.widget.apply_button.button.click() + self.assertIsNone(self.get_output(self.widget.Outputs.model)) + self.send_signal('Data', self.data) + self.widget.apply_button.button.click() + self.wait_until_stop_blocking() + model = self.get_output(self.widget.Outputs.model) + self.assertIsNotNone(model) + self.assertIsInstance( + model, (CalibratedClassifier, ThresholdClassifier, NaiveBayesModel)) + + def test_create_learner(self): + widget = self.widget #: OWCalibratedLearner + self.widget.base_learner = Mock() + + widget.calibration = widget.SigmoidCalibration + widget.threshold = widget.OptimizeF1 + learner = self.widget.create_learner() + self.assertIsInstance(learner, ThresholdLearner) + self.assertEqual(learner.threshold_criterion, learner.OptimizeF1) + cal_learner = learner.base_learner + self.assertIsInstance(cal_learner, CalibratedLearner) + self.assertEqual(cal_learner.calibration_method, cal_learner.Sigmoid) + self.assertIs(cal_learner.base_learner, self.widget.base_learner) + + widget.calibration = widget.IsotonicCalibration + widget.threshold = widget.OptimizeCA + learner = self.widget.create_learner() + self.assertIsInstance(learner, ThresholdLearner) + self.assertEqual(learner.threshold_criterion, learner.OptimizeCA) + cal_learner = learner.base_learner + self.assertIsInstance(cal_learner, CalibratedLearner) + self.assertEqual(cal_learner.calibration_method, cal_learner.Isotonic) + self.assertIs(cal_learner.base_learner, self.widget.base_learner) + + widget.calibration = widget.NoCalibration + widget.threshold = widget.OptimizeCA + learner = self.widget.create_learner() + 
self.assertIsInstance(learner, ThresholdLearner) + self.assertEqual(learner.threshold_criterion, learner.OptimizeCA) + self.assertIs(learner.base_learner, self.widget.base_learner) + + widget.calibration = widget.IsotonicCalibration + widget.threshold = widget.NoThresholdOptimization + learner = self.widget.create_learner() + self.assertIsInstance(learner, CalibratedLearner) + self.assertEqual(learner.calibration_method, cal_learner.Isotonic) + self.assertIs(learner.base_learner, self.widget.base_learner) + + widget.calibration = widget.NoCalibration + widget.threshold = widget.NoThresholdOptimization + learner = self.widget.create_learner() + self.assertIs(learner, self.widget.base_learner) + + widget.calibration = widget.SigmoidCalibration + widget.threshold = widget.OptimizeF1 + widget.base_learner = None + learner = self.widget.create_learner() + self.assertIsNone(learner) + + def test_preprocessors(self): + widget = self.widget #: OWCalibratedLearner + self.widget.base_learner = Mock() + self.widget.base_learner.preprocessors = () + + widget.calibration = widget.SigmoidCalibration + widget.threshold = widget.OptimizeF1 + widget.preprocessors = Mock() + learner = self.widget.create_learner() + self.assertEqual(learner.preprocessors, (widget.preprocessors, )) + self.assertEqual(learner.base_learner.preprocessors, ()) + self.assertEqual(learner.base_learner.base_learner.preprocessors, ()) + + widget.calibration = widget.NoCalibration + widget.threshold = widget.NoThresholdOptimization + learner = self.widget.create_learner() + self.assertIsNot(learner, self.widget.base_learner) + self.assertFalse( + isinstance(learner, (CalibratedLearner, ThresholdLearner))) + self.assertEqual(learner.preprocessors, (widget.preprocessors, )) + + def test_set_learner_calls_unconditional_apply(self): + widget = self.widget + self.assertIsNotNone(self.get_output(widget.Outputs.learner)) + + widget.auto_apply = False + self.send_signal(widget.Inputs.base_learner, None) + 
self.assertIsNone(self.get_output(widget.Outputs.learner)) + + def test_name_changes(self): + widget = self.widget + widget.auto_apply = True + learner = NaiveBayesLearner() + learner.name = "foo" + self.send_signal(widget.Inputs.base_learner, learner) + + widget.calibration = widget.IsotonicCalibration + widget.threshold = widget.OptimizeCA + widget.controls.calibration.group.buttonClicked[int].emit( + widget.IsotonicCalibration) + + learner = self.get_output(widget.Outputs.learner) + self.assertEqual(learner.name, "Foo + Isotonic + CA") + + widget.calibration = widget.NoCalibration + widget.threshold = widget.OptimizeCA + widget.controls.calibration.group.buttonClicked[int].emit( + widget.NoCalibration) + learner = self.get_output(widget.Outputs.learner) + self.assertEqual(learner.name, "Foo + CA") + + self.send_signal(widget.Inputs.base_learner, None) + self.assertEqual(widget.controls.learner_name.placeholderText(), + "Calibrated learner") diff --git a/Orange/widgets/tests/base.py b/Orange/widgets/tests/base.py index 635dd2e5fd8..1204e1c6ed5 100644 --- a/Orange/widgets/tests/base.py +++ b/Orange/widgets/tests/base.py @@ -672,7 +672,8 @@ def test_output_learner_name(self): new_name = "Learner Name" self.widget.apply_button.button.click() self.assertEqual(self.widget.learner.name, - self.widget.name_line_edit.text()) + self.widget.name_line_edit.text() + or self.widget.name_line_edit.placeholderText()) self.widget.name_line_edit.setText(new_name) self.widget.apply_button.button.click() self.wait_until_stop_blocking() diff --git a/Orange/widgets/utils/owlearnerwidget.py b/Orange/widgets/utils/owlearnerwidget.py index 3c6ee6ea65f..63b2795c78e 100644 --- a/Orange/widgets/utils/owlearnerwidget.py +++ b/Orange/widgets/utils/owlearnerwidget.py @@ -65,7 +65,7 @@ class OWBaseLearner(OWWidget, metaclass=OWBaseLearnerMeta, openclass=True): LEARNER = None supports_sparse = True - learner_name = Setting(None, schema_only=True) + learner_name = Setting("", schema_only=True) 
want_main_area = False resizing_enabled = False auto_apply = Setting(True) @@ -95,8 +95,6 @@ def __init__(self): self.data = None self.valid_data = False self.learner = None - if self.learner_name is None: - self.learner_name = self.name self.model = None self.preprocessors = None self.outdated_settings = False @@ -149,7 +147,7 @@ def update_learner(self): if self.learner and issubclass(self.LEARNER, Fitter): self.learner.use_default_preprocessors = True if self.learner is not None: - self.learner.name = self.learner_name + self.learner.name = self.learner_name or self.name self.Outputs.learner.send(self.learner) self.outdated_settings = False self.Warning.outdated_learner.clear() @@ -168,7 +166,7 @@ def update_model(self): except BaseException as exc: self.show_fitting_failed(exc) else: - self.model.name = self.learner_name + self.model.name = self.learner_name or self.name self.model.instances = self.data self.Outputs.model.send(self.model) @@ -198,7 +196,7 @@ def settings_changed(self, *args, **kwargs): def _change_name(self, instance, output): if instance: - instance.name = self.learner_name + instance.name = self.learner_name or self.name if self.auto_apply: output.send(instance) @@ -207,7 +205,7 @@ def learner_name_changed(self): self._change_name(self.model, self.Outputs.model) def send_report(self): - self.report_items((("Name", self.learner_name),)) + self.report_items((("Name", self.learner_name or self.name),)) model_parameters = self.get_learner_parameters() if model_parameters: @@ -264,6 +262,7 @@ def add_regression_layout(self, box): def add_learner_name_widget(self): self.name_line_edit = gui.lineEdit( self.controlArea, self, 'learner_name', box='Name', + placeholderText=self.name, tooltip='The name will identify this model in other widgets', orientation=Qt.Horizontal, callback=self.learner_name_changed) diff --git a/Orange/widgets/utils/tests/test_owlearnerwidget.py b/Orange/widgets/utils/tests/test_owlearnerwidget.py index 99f792196b6..9a43365a473 
100644 --- a/Orange/widgets/utils/tests/test_owlearnerwidget.py +++ b/Orange/widgets/utils/tests/test_owlearnerwidget.py @@ -105,7 +105,6 @@ class WidgetA(OWBaseLearner): LEARNER = KNNLearner w1 = self.create_widget(WidgetA) - self.assertEqual(w1.learner_name, "A") w1.learner_name = "MyWidget" settings = w1.settingsHandler.pack_data(w1) diff --git a/doc/data-mining-library/source/reference/classification.rst b/doc/data-mining-library/source/reference/classification.rst index 5095e147f2a..55792fa340f 100644 --- a/doc/data-mining-library/source/reference/classification.rst +++ b/doc/data-mining-library/source/reference/classification.rst @@ -196,3 +196,21 @@ CN2 Rule Induction .. autoclass:: CN2SDUnorderedLearner :members: + + +Calibration and threshold optimization +-------------------------------------- + +.. automodule:: Orange.classification.calibration + +.. autoclass:: ThresholdClassifier + :members: + +.. autoclass:: ThresholdLearner + :members: + +.. autoclass:: CalibratedClassifier + :members: + +.. autoclass:: CalibratedLearner + :members: diff --git a/doc/data-mining-library/source/reference/evaluation.performance_curves.rst b/doc/data-mining-library/source/reference/evaluation.performance_curves.rst new file mode 100644 index 00000000000..d9eaa515c0f --- /dev/null +++ b/doc/data-mining-library/source/reference/evaluation.performance_curves.rst @@ -0,0 +1,8 @@ +.. py:currentmodule:: Orange.evaluation.performance_curves + +################## +Performance curves +################## + +.. autoclass:: Orange.evaluation.performance_curves.Curves + :members: diff --git a/doc/data-mining-library/source/reference/evaluation.rst b/doc/data-mining-library/source/reference/evaluation.rst index 422371a41eb..a07c99ae44f 100644 --- a/doc/data-mining-library/source/reference/evaluation.rst +++ b/doc/data-mining-library/source/reference/evaluation.rst @@ -9,3 +9,4 @@ Evaluation (``evaluation``) evaluation.testing evaluation.cd + evaluation.performance_curves