diff --git a/Orange/classification/__init__.py b/Orange/classification/__init__.py
index f0489b4cb74..842518fca31 100644
--- a/Orange/classification/__init__.py
+++ b/Orange/classification/__init__.py
@@ -19,3 +19,4 @@
 from .rules import *
 from .sgd import *
 from .neural_network import *
+from .calibration import *
diff --git a/Orange/classification/calibration.py b/Orange/classification/calibration.py
new file mode 100644
index 00000000000..46bf2e8f242
--- /dev/null
+++ b/Orange/classification/calibration.py
@@ -0,0 +1,176 @@
+import numpy as np
+from sklearn.isotonic import IsotonicRegression
+from sklearn.calibration import _SigmoidCalibration
+
+from Orange.classification import Model, Learner
+from Orange.evaluation import TestOnTrainingData
+from Orange.evaluation.performance_curves import Curves
+
+__all__ = ["ThresholdClassifier", "ThresholdLearner",
+           "CalibratedLearner", "CalibratedClassifier"]
+
+
+class ThresholdClassifier(Model):
+    """
+    A model that wraps a binary model and sets a different threshold.
+
+    The target class is the class with index 1. A data instance is classified
+    to class 1 if the probability of this class equals or exceeds the threshold
+
+    Attributes:
+        base_model (Orange.classification.Model): base model
+        threshold (float): decision threshold
+    """
+    def __init__(self, base_model, threshold):
+        if not base_model.domain.class_var.is_discrete \
+                or len(base_model.domain.class_var.values) != 2:
+            raise ValueError("ThresholdClassifier requires a binary class")
+
+        super().__init__(base_model.domain, base_model.original_domain)
+        self.name = f"{base_model.name}, thresh={threshold:.2f}"
+        self.base_model = base_model
+        self.threshold = threshold
+
+    def __call__(self, data, ret=Model.Value):
+        probs = self.base_model(data, ret=Model.Probs)
+        if ret == Model.Probs:  # probabilities are returned unchanged
+            return probs
+        class_probs = probs[:, 1].ravel()  # probabilities of the target class
+        with np.errstate(invalid="ignore"):  # we fix nans below
+            vals = (class_probs >= self.threshold).astype(float)
+        vals[np.isnan(class_probs)] = np.nan  # unknown probability -> unknown
+        if ret == Model.Value:
+            return vals
+        else:
+            return vals, probs
+
+
+class ThresholdLearner(Learner):
+    """
+    A learner that runs another learner and then finds the optimal threshold
+    for CA or F1 on the training data.
+
+    Attributes:
+        base_learner (Learner): base learner
+        threshold_criterion (int):
+            `ThresholdLearner.OptimizeCA` or `ThresholdLearner.OptimizeF1`
+    """
+    __returns__ = ThresholdClassifier
+
+    OptimizeCA, OptimizeF1 = range(2)
+
+    def __init__(self, base_learner, threshold_criterion=OptimizeCA):
+        super().__init__()
+        self.base_learner = base_learner
+        self.threshold_criterion = threshold_criterion
+
+    def fit_storage(self, data):
+        """
+        Induce a model using the provided `base_learner`, compute probabilities
+        on training data and then find the optimal decision threshold. In case
+        of ties, select the threshold that is closest to 0.5.
+        """
+        if not data.domain.class_var.is_discrete \
+                or len(data.domain.class_var.values) != 2:
+            raise ValueError("ThresholdLearner requires a binary class")
+
+        res = TestOnTrainingData(data, [self.base_learner], store_models=True)
+        model = res.models[0, 0]
+        curves = Curves.from_results(res)
+        curve = [curves.ca, curves.f1][self.threshold_criterion]()
+        # In case of ties, take the optimal threshold that is closest to 0.5
+        # (if two are equally close, `argmin` picks the first, lower one)
+        best_threshs = curves.probs[curve == np.max(curve)]
+        threshold = best_threshs[np.argmin(np.abs(best_threshs - 0.5))]
+        return ThresholdClassifier(model, threshold)
+
+
+class CalibratedClassifier(Model):
+    """
+    A model that wraps another model and recalibrates probabilities
+
+    Attributes:
+        base_model (Model): base model
+        calibrators (list of fitted calibrators):
+            list of objects whose `predict` gets a vector of probabilities
+            and returns calibrated probabilities, one object per class
+    """
+    def __init__(self, base_model, calibrators):
+        if not base_model.domain.class_var.is_discrete:
+            raise ValueError("CalibratedClassifier requires a discrete target")
+
+        super().__init__(base_model.domain, base_model.original_domain)
+        self.base_model = base_model
+        self.calibrators = calibrators
+        self.name = f"{base_model.name}, calibrated"
+
+    def __call__(self, data, ret=Model.Value):
+        probs = self.base_model(data, Model.Probs)
+        cal_probs = self.calibrated_probs(probs)
+        if ret == Model.Probs:
+            return cal_probs
+        vals = np.argmax(cal_probs, axis=1)  # most probable class per row
+        if ret == Model.Value:
+            return vals
+        else:
+            return vals, cal_probs
+
+    def calibrated_probs(self, probs):
+        if self.calibrators:
+            ps = np.hstack(
+                tuple(
+                    calibr.predict(cls_probs).reshape(-1, 1)
+                    for calibr, cls_probs in zip(self.calibrators, probs.T)))
+        else:  # no calibrators: only normalize a copy of the input
+            ps = probs.copy()
+        sums = np.sum(ps, axis=1)
+        zero_sums = sums == 0  # rows that cannot be normalized
+        with np.errstate(invalid="ignore"):  # handled below
+            ps /= sums[:, None]
+        if zero_sums.any():
+            ps[zero_sums] = 1 / ps.shape[1]  # fall back to uniform
+        return ps
+
+
+class CalibratedLearner(Learner):
+    """
+    Probability calibration for learning algorithms
+
+    This learner wraps another learner: after training, it predicts the
+    probabilities on training data and calibrates them using sigmoid or
+    isotonic calibration. It then returns a :obj:`CalibratedClassifier`.
+
+    Attributes:
+        base_learner (Learner): base learner
+        calibration_method (int):
+            `CalibratedLearner.Sigmoid` or `CalibratedLearner.Isotonic`
+    """
+    __returns__ = CalibratedClassifier
+
+    Sigmoid, Isotonic = range(2)
+
+    def __init__(self, base_learner, calibration_method=Sigmoid):
+        super().__init__()
+        self.base_learner = base_learner
+        self.calibration_method = calibration_method
+
+    def fit_storage(self, data):
+        """
+        Induce a model using the provided `base_learner`, compute probabilities
+        on training data and use scikit-learn's `_SigmoidCalibration` or
+        `IsotonicRegression` to prepare calibrators.
+        """
+        res = TestOnTrainingData(data, [self.base_learner], store_models=True)
+        return self.get_model(
+            res.models[0, 0], res.actual, res.probabilities[0])
+
+    def get_model(self, model, ytrue, probabilities):
+        """Wrap `model` with one calibrator per class, fit one-vs-rest."""
+        sigmoid = self.calibration_method == CalibratedLearner.Sigmoid
+        probabilities[np.isinf(probabilities)] = 1
+        calibrators = [  # `fit` returns self, so each class needs a new fitter
+            (_SigmoidCalibration() if sigmoid
+             else IsotonicRegression(out_of_bounds='clip')
+             ).fit(cls_probs, np.asarray(ytrue == cls_idx, dtype=float))
+            for cls_idx, cls_probs in enumerate(probabilities.T)]
+        return CalibratedClassifier(model, calibrators)
diff --git a/Orange/classification/tests/test_calibration.py b/Orange/classification/tests/test_calibration.py
new file mode 100644
index 00000000000..a538a3b1870
--- /dev/null
+++ b/Orange/classification/tests/test_calibration.py
@@ -0,0 +1,203 @@
+import unittest
+from unittest.mock import Mock, patch
+
+import numpy as np
+
+from Orange.base import Model
+from Orange.classification.calibration import \
+    ThresholdLearner, ThresholdClassifier, \
+    CalibratedLearner, CalibratedClassifier
+from Orange.data import Table
+
+
+class TestThresholdClassifier(unittest.TestCase):
+    def setUp(self):
+        probs1 = np.array([0.3, 0.5, 0.2, 0.8, 0.9, 0]).reshape(-1, 1)
+        self.probs = np.hstack((1 - probs1, probs1))
+        base_model = Mock(return_value=self.probs)  # always returns `probs`
+        base_model.domain.class_var.is_discrete = True
+        base_model.domain.class_var.values = ["a", "b"]
+        self.model = ThresholdClassifier(base_model, 0.5)
+        self.data = Mock()  # ignored by the mocked base model
+
+    def test_threshold(self):
+        vals = self.model(self.data)
+        np.testing.assert_equal(vals, [0, 1, 0, 1, 1, 0])
+
+        self.model.threshold = 0.8
+        vals = self.model(self.data)
+        np.testing.assert_equal(vals, [0, 0, 0, 1, 1, 0])
+
+        self.model.threshold = 0  # no probability is below 0
+        vals = self.model(self.data)
+        np.testing.assert_equal(vals, [1] * 6)
+
+    def test_return_types(self):
+        vals = self.model(self.data, ret=Model.Value)
+        np.testing.assert_equal(vals, [0, 1, 0, 1, 1, 0])
+
+        vals = self.model(self.data)  # `Model.Value` is the default
+        np.testing.assert_equal(vals, [0, 1, 0, 1, 1, 0])
+
+        probs = self.model(self.data, ret=Model.Probs)
+        np.testing.assert_equal(probs, self.probs)
+
+        vals, probs = self.model(self.data, ret=Model.ValueProbs)
+        np.testing.assert_equal(vals, [0, 1, 0, 1, 1, 0])
+        np.testing.assert_equal(probs, self.probs)
+
+    def test_nans(self):
+        self.probs[1, :] = np.nan  # changes what the mocked model returns
+        vals, probs = self.model(self.data, ret=Model.ValueProbs)
+        np.testing.assert_equal(vals, [0, np.nan, 0, 1, 1, 0])
+        np.testing.assert_equal(probs, self.probs)
+
+    def test_non_binary_base(self):
+        base_model = Mock()
+        base_model.domain.class_var.is_discrete = True
+        base_model.domain.class_var.values = ["a"]  # too few values
+        self.assertRaises(ValueError, ThresholdClassifier, base_model, 0.5)
+
+        base_model.domain.class_var.values = ["a", "b", "c"]  # too many
+        self.assertRaises(ValueError, ThresholdClassifier, base_model, 0.5)
+
+        base_model.domain.class_var = Mock()
+        base_model.domain.class_var.is_discrete = False
+        self.assertRaises(ValueError, ThresholdClassifier, base_model, 0.5)
+
+
+class TestThresholdLearner(unittest.TestCase):
+    @patch("Orange.evaluation.performance_curves.Curves.from_results")
+    @patch("Orange.classification.calibration.TestOnTrainingData")
+    def test_fit_storage(self, test_on_training, curves_from_results):
+        curves_from_results.return_value = curves = Mock()
+        curves.probs = np.array([0.1, 0.15, 0.3, 0.45, 0.6, 0.8])
+        curves.ca = lambda: np.array([0.1, 0.7, 0.4, 0.4, 0.3, 0.1])
+        curves.f1 = lambda: np.array([0.1, 0.2, 0.4, 0.4, 0.3, 0.1])
+        model = Mock()
+        model.domain.class_var.is_discrete = True
+        model.domain.class_var.values = ("a", "b")
+        data = Table("heart_disease")
+        learner = Mock()
+        test_on_training.return_value = res = Mock()
+        res.models = np.array([[model]])
+        test_on_training.return_value = res  # (already set above)
+
+        thresh_learner = ThresholdLearner(
+            base_learner=learner,
+            threshold_criterion=ThresholdLearner.OptimizeCA)
+        thresh_model = thresh_learner(data)
+        self.assertEqual(thresh_model.threshold, 0.15)  # the only CA maximum
+        args, kwargs = test_on_training.call_args
+        self.assertEqual(len(args), 2)
+        self.assertIs(args[0], data)
+        self.assertIs(args[1][0], learner)
+        self.assertEqual(len(args[1]), 1)
+        self.assertEqual(kwargs, {"store_models": 1})  # True == 1
+
+        thresh_learner = ThresholdLearner(
+            base_learner=learner,
+            threshold_criterion=ThresholdLearner.OptimizeF1)
+        thresh_model = thresh_learner(data)
+        self.assertEqual(thresh_model.threshold, 0.45)  # F1 ties: 0.3, 0.45
+
+    def test_non_binary_class(self):
+        thresh_learner = ThresholdLearner(
+            base_learner=Mock(),
+            threshold_criterion=ThresholdLearner.OptimizeF1)
+
+        data = Mock()
+        data.domain.class_var.is_discrete = True
+        data.domain.class_var.values = ["a"]  # too few values
+        self.assertRaises(ValueError, thresh_learner.fit_storage, data)
+
+        data.domain.class_var.values = ["a", "b", "c"]  # too many
+        self.assertRaises(ValueError, thresh_learner.fit_storage, data)
+
+        data.domain.class_var = Mock()
+        data.domain.class_var.is_discrete = False
+        self.assertRaises(ValueError, thresh_learner.fit_storage, data)
+
+
+class TestCalibratedClassifier(unittest.TestCase):
+    def setUp(self):
+        probs1 = np.array([0.3, 0.5, 0.2, 0.8, 0.9, 0]).reshape(-1, 1)
+        self.probs = np.hstack((1 - probs1, probs1))
+        base_model = Mock(return_value=self.probs)  # always returns `probs`
+        base_model.domain.class_var.is_discrete = True
+        base_model.domain.class_var.values = ["a", "b"]
+        self.model = CalibratedClassifier(base_model, None)
+        self.data = Mock()  # ignored by the mocked base model
+
+    def test_call(self):
+        calprobs = np.arange(self.probs.size).reshape(self.probs.shape)
+        calprobs = calprobs / np.sum(calprobs, axis=1)[:, None]
+        calprobs[-1] = [0.7, 0.3]  # make the last row favour class 0
+        self.model.calibrated_probs = Mock(return_value=calprobs)  # stub
+
+        probs = self.model(self.data, ret=Model.Probs)
+        self.model.calibrated_probs.assert_called_with(self.probs)
+        np.testing.assert_almost_equal(probs, calprobs)
+
+        vals = self.model(self.data, ret=Model.Value)
+        np.testing.assert_almost_equal(vals, [1, 1, 1, 1, 1, 0])
+
+        vals, probs = self.model(self.data, ret=Model.ValueProbs)
+        np.testing.assert_almost_equal(probs, calprobs)
+        np.testing.assert_almost_equal(vals, [1, 1, 1, 1, 1, 0])
+
+    def test_calibrated_probs(self):
+        self.model.calibrators = None
+        calprobs = self.model.calibrated_probs(self.probs)
+        np.testing.assert_equal(calprobs, self.probs)
+        self.assertIsNot(calprobs, self.probs)  # must be a copy
+
+        calibrator = Mock()
+        calibrator.predict = lambda x: x**2
+        self.model.calibrators = [calibrator] * 2
+        calprobs = self.model.calibrated_probs(self.probs)
+        expprobs = self.probs ** 2 / np.sum(self.probs ** 2, axis=1)[:, None]
+        np.testing.assert_almost_equal(calprobs, expprobs)
+
+        self.probs[1] = 0  # a zero row must become uniform
+        self.probs[2] = np.nan  # nans must stay nans
+        expprobs[1] = 0.5
+        expprobs[2] = np.nan
+        calprobs = self.model.calibrated_probs(self.probs)
+        np.testing.assert_almost_equal(calprobs, expprobs)
+
+
+class TestCalibratedLearner(unittest.TestCase):
+    @patch("Orange.classification.calibration._SigmoidCalibration.fit")
+    @patch("Orange.classification.calibration.TestOnTrainingData")
+    def test_fit_storage(self, test_on_training, sigmoid_fit):
+        data = Table("heart_disease")
+        learner = Mock()
+
+        model = Mock()
+        model.domain.class_var.is_discrete = True
+        model.domain.class_var.values = ("a", "b")
+
+        test_on_training.return_value = res = Mock()
+        res.models = np.array([[model]])
+        res.probabilities = np.arange(20, dtype=float).reshape(1, 5, 4)
+        test_on_training.return_value = res  # (already set above)
+
+        sigmoid_fit.return_value = Mock()  # stands for a fitted calibrator
+
+        cal_learner = CalibratedLearner(
+            base_learner=learner, calibration_method=CalibratedLearner.Sigmoid)
+        cal_model = cal_learner(data)
+
+        self.assertIs(cal_model.base_model, model)
+        self.assertEqual(cal_model.calibrators, [sigmoid_fit.return_value] * 4)
+        args, kwargs = test_on_training.call_args
+        self.assertEqual(len(args), 2)
+        self.assertIs(args[0], data)
+        self.assertIs(args[1][0], learner)
+        self.assertEqual(len(args[1]), 1)
+        self.assertEqual(kwargs, {"store_models": 1})  # True == 1
+
+        for call, cls_probs in zip(sigmoid_fit.call_args_list,
+                                   res.probabilities[0].T):
+            np.testing.assert_equal(call[0][0], cls_probs)
diff --git a/Orange/evaluation/performance_curves.py b/Orange/evaluation/performance_curves.py
new file mode 100644
index 00000000000..c7dee568e53
--- /dev/null
+++ b/Orange/evaluation/performance_curves.py
@@ -0,0 +1,150 @@
+import numpy as np
+
+
+class Curves:
+    # names of scores are standard acronyms, pylint: disable=invalid-name
+    """
+    Computation of performance curves (ca, f1, precision, recall and the rest
+    of the zoo) from test results.
+
+    The class works with binary classes. Attribute `probs` contains ordered
+    probabilities, and all curves show performance statistics for classifying
+    an instance as positive if its probability equals or exceeds the threshold
+    in `probs`, that is, `sensitivity()[i]` is the sensitivity of a classifier
+    that classifies an instance as positive if the probability of being
+    positive is at least `probs[i]`.
+
+    Class can be constructed by giving `ytrue` and `probs`, or from test
+    results (see :obj:`Curves.from_results`). The latter removes instances
+    with missing class values or predicted probabilities.
+
+    The class treats all results as obtained from a single run instead of
+    computing separate curves and fancy averaging.
+
+    Arguments:
+        ytrue (np.ndarray): vector of true classes
+        probs (np.ndarray): corresponding predicted probabilities
+
+    Attributes:
+        probs (np.ndarray): ordered predicted probabilities, with 1 appended
+        ytrue (np.ndarray): corresponding true classes
+        tot (int): total number of data instances
+        p (int): number of real positive instances
+        n (int): number of real negative instances
+        tp (np.ndarray): number of true positives (property computed from `fn`)
+        fp (np.ndarray): number of false positives (property computed from `tn`)
+        tn (np.ndarray): number of true negatives (property computed from `fn`)
+        fn (np.ndarray): number of false negatives (precomputed, not a property)
+    """
+    def __init__(self, ytrue, probs):
+        sortind = np.argsort(probs)
+        self.probs = np.hstack((probs[sortind], [1]))  # extra threshold: 1
+        self.ytrue = ytrue[sortind]
+        self.fn = np.hstack(([0], np.cumsum(self.ytrue)))
+        self.tot = len(probs)
+        self.p = self.fn[-1]  # the last cumulative sum counts all positives
+        self.n = self.tot - self.p
+
+    @classmethod
+    def from_results(cls, results, target_class=None, model_index=None):
+        """
+        Construct an instance of `Curves` from test results.
+
+        Args:
+            results (:obj:`Orange.evaluation.testing.Results`): test results
+            target_class (int): target class index; if the class is binary,
+                this defaults to `1`, otherwise it must be given
+            model_index (int): model index; if there is only one model, this
+                argument can be omitted
+
+        Returns:
+            curves (:obj:`Curves`)
+        """
+        if model_index is None:
+            if results.probabilities.shape[0] != 1:
+                raise ValueError("Argument 'model_index' is required when "
+                                 "there are multiple models")
+            model_index = 0
+        if target_class is None:
+            if results.probabilities.shape[2] != 2:
+                raise ValueError("Argument 'target_class' is required when the "
+                                 "class is not binary")
+            target_class = 1
+        actual = results.actual
+        probs = results.probabilities[model_index, :, target_class]
+        nans = np.isnan(actual) + np.isnan(probs)  # i.e. elementwise `or`
+        if nans.any():
+            actual = actual[~nans]
+            probs = probs[~nans]
+        return cls(actual == target_class, probs)
+
+    @property
+    def tn(self):
+        return np.arange(self.tot + 1) - self.fn
+
+    @property
+    def tp(self):
+        return self.p - self.fn
+
+    @property
+    def fp(self):
+        return self.n - self.tn
+
+    def ca(self):
+        """Classification accuracy curve"""
+        return (self.tp + self.tn) / self.tot
+
+    def f1(self):
+        """F1 curve"""
+        return 2 * self.tp / (2 * self.tp + self.fp + self.fn)
+
+    def sensitivity(self):
+        """Sensitivity curve"""
+        return self.tp / self.p
+
+    def specificity(self):
+        """Specificity curve"""
+        return self.tn / self.n
+
+    def precision(self):
+        """
+        Precision curve
+
+        The last element represents precision at threshold 1. Unless such
+        a probability appears in the data, the precision at this point is
+        undefined. To avoid this, we copy the previous value to the last.
+        """
+        tp_fp = np.arange(self.tot, -1, -1)
+        tp_fp[-1] = 1  # avoid division by zero
+        prec = self.tp / tp_fp
+        prec[-1] = prec[-2]
+        return prec
+
+    def recall(self):
+        """Recall curve"""
+        return self.sensitivity()
+
+    def ppv(self):
+        """PPV curve; see the comment at :obj:`precision`"""
+        return self.precision()
+
+    def npv(self):
+        """
+        NPV curve
+
+        The first value is undefined (no negative instances). To avoid this,
+        we copy the second value into the first.
+        """
+        tn_fn = np.arange(self.tot + 1)
+        tn_fn[0] = 1  # avoid division by zero
+        npv = self.tn / tn_fn
+        npv[0] = npv[1]
+        return npv
+
+    def fpr(self):
+        """FPR curve"""
+        return self.fp / self.n
+
+    def tpr(self):
+        """TPR curve"""
+        return self.sensitivity()
diff --git a/Orange/evaluation/testing.py b/Orange/evaluation/testing.py
index 92c68d1c13f..93c0d563238 100644
--- a/Orange/evaluation/testing.py
+++ b/Orange/evaluation/testing.py
@@ -171,7 +171,7 @@ def set_or_raise(value, exp_values, msg):
             "mismatching number of class values")
         nmethods = set_or_raise(
             nmethods, [learners is not None and len(learners),
-                       models is not None and len(models),
+                       models is not None and models.shape[1],
                        failed is not None and len(failed),
                        predicted is not None and predicted.shape[0],
                        probabilities is not None and probabilities.shape[0]],
@@ -317,7 +317,7 @@ def split_by_model(self):
                 res.probabilities = self.probabilities[(i,), :, :]
 
             if self.models is not None:
-                res.models = self.models[:, i]
+                res.models = self.models[:, i:i + 1]
 
             res.failed = [self.failed[i]]
             yield res
@@ -365,7 +365,7 @@ def __new__(cls,
                                  "and train_data are omitted")
             return self
 
-        warn("calling Validation's constructor with data and learners"
+        warn("calling Validation's constructor with data and learners "
              "is deprecated;\nconstruct an instance and call it",
              DeprecationWarning, stacklevel=2)
 
diff --git a/Orange/evaluation/tests/__init__.py b/Orange/evaluation/tests/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/Orange/evaluation/tests/test_performance_curves.py b/Orange/evaluation/tests/test_performance_curves.py
new file mode 100644
index 00000000000..a73d7165557
--- /dev/null
+++ b/Orange/evaluation/tests/test_performance_curves.py
@@ -0,0 +1,125 @@
+import unittest
+from unittest.mock import patch
+
+import numpy as np
+
+from Orange.evaluation.testing import Results
+from Orange.evaluation.performance_curves import Curves
+
+
+# Test data and sensitivity/specificity are taken from
+# Tom Fawcett: An introduction to ROC analysis, with one true positive instance
+# removed, so that the number of positive and negative does not match
+
+class TestCurves(unittest.TestCase):
+    def setUp(self):
+        n, p = (0, 1)  # codes of the negative and the positive class
+        self.data = np.array([
+            (p, .8), (n, .7), (p, .6), (p, .55), (p, .54), (n, .53),
+            (n, .52), (p, .51), (n, .505), (p, .4), (n, .39), (p, .38),
+            (n, .37), (n, .36), (n, .35), (p, .34), (n, .33), (p, .30), (n, .1)
+        ])
+
+    def test_curves(self):
+        np.random.shuffle(self.data)  # Curves must do the sorting itself
+        ytrue, probs = self.data.T
+        curves = Curves(ytrue, probs)
+
+        tn = np.array(
+            [0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 9, 9, 10, 10])
+        np.testing.assert_equal(curves.tn, tn)
+        np.testing.assert_equal(curves.fp, 10 - tn)
+        np.testing.assert_almost_equal(curves.specificity(), tn / 10)
+
+        tp = np.array(
+            [9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 5, 5, 4, 4, 4, 3, 2, 1, 1, 0])
+        np.testing.assert_equal(curves.tp, tp)
+        np.testing.assert_equal(curves.fn, 9 - tp)
+        np.testing.assert_almost_equal(curves.sensitivity(), tp / 9)
+
+        np.testing.assert_almost_equal(
+            curves.ca(),
+            np.array([9, 10, 9, 10, 9, 10, 11, 12, 11, 12, 11, 12, 11, 12,
+                      13, 12, 11, 10, 11, 10]) / 19)
+
+        precision = np.array(
+            [9 / 19, 9 / 18, 8 / 17, 8 / 16, 7 / 15, 7 / 14, 7 / 13,
+             7 / 12, 6 / 11, 6 / 10, 5 / 9, 5 / 8, 4 / 7, 4 / 6,
+             4 / 5, 3 / 4, 2 / 3, 1 / 2, 1 / 1, 1])
+        np.testing.assert_almost_equal(curves.precision(), precision)
+        np.testing.assert_almost_equal(curves.recall(), tp / 9)
+
+        np.testing.assert_almost_equal(curves.ppv(), precision)
+        np.testing.assert_almost_equal(
+            curves.npv(),
+            np.array([1, 1 / 1, 1 / 2, 2 / 3, 2 / 4, 3 / 5, 4 / 6, 5 / 7,
+                      5 / 8, 6 / 9, 6 / 10, 7 / 11, 7 / 12, 8 / 13, 9 / 14,
+                      9 / 15, 9 / 16, 9 / 17, 10 / 18, 10 / 19]))
+
+        np.testing.assert_almost_equal(curves.tpr(), tp / 9)
+        np.testing.assert_almost_equal(curves.fpr(), (10 - tn) / 10)
+
+    @patch("Orange.evaluation.performance_curves.Curves.__init__",
+           return_value=None)
+    def test_curves_from_results(self, init):
+        res = Results()
+        ytrue, probs = self.data.T
+        res.actual = ytrue.astype(float)
+        res.probabilities = np.vstack((1 - probs, probs)).T.reshape(1, -1, 2)
+        Curves.from_results(res)
+        cytrue, cprobs = init.call_args[0]
+        np.testing.assert_equal(cytrue, ytrue)
+        np.testing.assert_equal(cprobs, probs)
+
+        Curves.from_results(res, target_class=0)
+        cytrue, cprobs = init.call_args[0]
+        np.testing.assert_equal(cytrue, 1 - ytrue)
+        np.testing.assert_equal(cprobs, 1 - probs)
+
+        res.actual = ytrue.astype(float)
+        res.probabilities = np.random.random((2, 19, 2))  # two models
+        res.probabilities[1] = np.vstack((1 - probs, probs)).T
+
+        Curves.from_results(res, model_index=1)
+        cytrue, cprobs = init.call_args[0]
+        np.testing.assert_equal(cytrue, ytrue)
+        np.testing.assert_equal(cprobs, probs)
+
+        self.assertRaises(ValueError, Curves.from_results, res)
+
+        ytrue[ytrue == 0] = 2 * (np.arange(10) % 2)  # negatives -> 0 or 2
+        res.actual = ytrue.astype(float)
+        res.probabilities = np.random.random((2, 19, 3))  # three classes
+        res.probabilities[1] = np.vstack(
+            ((1 - probs) / 3, probs, (1 - probs) * 2 / 3)).T
+
+        Curves.from_results(res, model_index=1, target_class=1)
+        cytrue, cprobs = init.call_args[0]
+        np.testing.assert_equal(cytrue, ytrue == 1)
+        np.testing.assert_equal(cprobs, probs)
+
+        Curves.from_results(res, model_index=1, target_class=0)
+        cytrue, cprobs = init.call_args[0]
+        np.testing.assert_equal(cytrue, ytrue == 0)
+        np.testing.assert_equal(cprobs, (1 - probs) / 3)
+
+        Curves.from_results(res, model_index=1, target_class=2)
+        cytrue, cprobs = init.call_args[0]
+        np.testing.assert_equal(cytrue, ytrue == 2)
+        np.testing.assert_equal(cprobs, (1 - probs) * 2 / 3)
+
+        self.assertRaises(ValueError, Curves.from_results, res, model_index=1)
+
+    @patch("Orange.evaluation.performance_curves.Curves.__init__",
+           return_value=None)
+    def test_curves_from_results_nans(self, init):
+        res = Results()
+        ytrue, probs = self.data.T
+        ytrue[0] = np.nan  # missing class value
+        probs[-1] = np.nan  # missing probability
+        res.actual = ytrue.astype(float)
+        res.probabilities = np.vstack((1 - probs, probs)).T.reshape(1, -1, 2)
+        Curves.from_results(res)
+        cytrue, cprobs = init.call_args[0]
+        np.testing.assert_equal(cytrue, ytrue[1:-1])
+        np.testing.assert_equal(cprobs, probs[1:-1])
diff --git a/Orange/tests/test_evaluation_testing.py b/Orange/tests/test_evaluation_testing.py
index a57910eb971..a5f78cb2972 100644
--- a/Orange/tests/test_evaluation_testing.py
+++ b/Orange/tests/test_evaluation_testing.py
@@ -233,7 +233,7 @@ def test_split_by_model(self):
             self.assertTrue((result.predicted == res.predicted[i]).all())
             self.assertTrue((result.probabilities == res.probabilities[i]).all())
             self.assertEqual(len(result.models), 5)
-            for model in result.models:
+            for model in result.models[0]:
                 self.assertIsInstance(model, learners[i].__returns__)
             self.assertSequenceEqual(result.learners, [res.learners[i]])
 
@@ -756,7 +756,7 @@ def setUp(self):
         self.row_indices = np.arange(100)
         self.folds = (range(50), range(10, 60)), (range(50, 100), range(50))
         self.learners = [MajorityLearner(), MajorityLearner()]
-        self.models = [Mock(), Mock()]
+        self.models = np.array([[Mock(), Mock()]])
         self.predicted = np.zeros((2, 100))
         self.probabilities = np.zeros((2, 100, 3))
         self.failed = [False, True]
diff --git a/Orange/widgets/evaluate/contexthandlers.py b/Orange/widgets/evaluate/contexthandlers.py
index d79def2ca60..3ad2796698d 100644
--- a/Orange/widgets/evaluate/contexthandlers.py
+++ b/Orange/widgets/evaluate/contexthandlers.py
@@ -1,47 +1,30 @@
+from Orange.data import Variable
 from Orange.widgets import settings
-from Orange.widgets.utils import getdeepattr
 
 
 class EvaluationResultsContextHandler(settings.ContextHandler):
-    def __init__(self, targetAttr, selectedAttr):
-        super().__init__()
-        self.targetAttr, self.selectedAttr = targetAttr, selectedAttr
+    """Context handler matching on class values and classifier names"""
 
-    #noinspection PyMethodOverriding
-    def match(self, context, cnames, cvalues):
-        return (cnames, cvalues) == (
-            context.classifierNames, context.classValues) and 2
+    def open_context(self, widget, classes, classifier_names):
+        if isinstance(classes, Variable):  # accept the class variable itself
+            if classes.is_discrete:
+                classes = classes.values
+            else:
+                classes = None  # non-discrete target: no class values
+        super().open_context(widget, classes, classifier_names)
 
-    def fast_save(self, widget, name, value):
-        context = widget.current_context
-        if name == self.targetAttr:
-            context.targetClass = value
-        elif name == self.selectedAttr:
-            context.selectedClassifiers = list(value)
+    def new_context(self, classes, classifier_names):
+        context = super().new_context()
+        context.classes = classes  # remembered for `match`
+        context.classifier_names = classifier_names
+        return context
 
-    def settings_from_widget(self, widget, *args):
-        super().settings_from_widget(widget, *args)
-        context = widget.current_context
-        context.targetClass = getdeepattr(widget, self.targetAttr)
-        context.selectedClassifiers = list(getdeepattr(self.selectedAttr))
-
-    def settings_to_widget(self, widget, *args):
-        super().settings_to_widget(widget, *args)
-        context = widget.current_context
-        if context.targetClass is not None:
-            setattr(widget, self.targetAttr, context.targetClass)
-        if context.selectedClassifiers is not None:
-            setattr(widget, self.selectedAttr, context.selectedClassifiers)
-
-    #noinspection PyMethodOverriding
-    def find_or_create_context(self, widget, results):
-        cnames = [c.name for c in results.classifiers]
-        cvalues = results.classValues
-        context, isNew = super().find_or_create_context(
-            widget, results.classifierNames, results.classValues)
-        if isNew:
-            context.classifierNames = results.classifierNames
-            context.classValues = results.classValues
-            context.selectedClassifiers = None
-            context.targetClass = None
-        return context, isNew
+    def match(self, context, classes, classifier_names):
+        if classifier_names != context.classifier_names:
+            return self.NO_MATCH
+        elif isinstance(classes, Variable) and classes.is_continuous:
+            return (self.PERFECT_MATCH if context.classes is None
+                    else self.NO_MATCH)
+        else:  # compare the class values themselves
+            return (self.PERFECT_MATCH if context.classes == classes
+                    else self.NO_MATCH)
diff --git a/Orange/widgets/evaluate/owcalibrationplot.py b/Orange/widgets/evaluate/owcalibrationplot.py
index c757932adea..562c3d5aa01 100644
--- a/Orange/widgets/evaluate/owcalibrationplot.py
+++ b/Orange/widgets/evaluate/owcalibrationplot.py
@@ -1,37 +1,61 @@
-"""
-Calibration Plot Widget
------------------------
-
-"""
 from collections import namedtuple
 
 import numpy as np
 
-from AnyQt.QtWidgets import QListWidget
+from AnyQt.QtCore import Qt, QSize
+from AnyQt.QtWidgets import QListWidget, QSizePolicy
 
 import pyqtgraph as pg
 
-import Orange
+from Orange.base import Model
+from Orange.classification import ThresholdClassifier, CalibratedLearner
+from Orange.evaluation import Results
+from Orange.evaluation.performance_curves import Curves
 from Orange.widgets import widget, gui, settings
-from Orange.widgets.evaluate.utils import \
-    check_results_adequacy, results_for_preview
+from Orange.widgets.evaluate.contexthandlers import \
+    EvaluationResultsContextHandler
+from Orange.widgets.evaluate.utils import results_for_preview
 from Orange.widgets.utils import colorpalette, colorbrewer
 from Orange.widgets.utils.widgetpreview import WidgetPreview
-from Orange.widgets.widget import Input
+from Orange.widgets.widget import Input, Output, Msg
 from Orange.widgets import report
 
 
-Curve = namedtuple(
-    "Curve",
-    ["x", "y"]
-)
-
-PlotCurve = namedtuple(
-    "PlotCurve",
-    ["curve",
-     "curve_item",
-     "rug_item"]
-)
+MetricDefinition = namedtuple(
+    "metric_definition",
+    ("name", "functions", "short_names", "explanation"))
+
+Metrics = [MetricDefinition(*args) for args in (
+    ("Calibration curve", None, (), ""),
+    ("Classification accuracy", (Curves.ca, ), (), ""),
+    ("F1", (Curves.f1, ), (), ""),
+    ("Sensitivity and specificity",
+     (Curves.sensitivity, Curves.specificity),
+     ("sens", "spec"),
+     "<p><b>Sensitivity</b> (falling) is the proportion of correctly "
+     "detected positive instances (TP&nbsp;/&nbsp;P).</p>"
+     "<p><b>Specificity</b> (rising) is the proportion of detected "
+     "negative instances (TN&nbsp;/&nbsp;N).</p>"),
+    ("Precision and recall",
+     (Curves.precision, Curves.recall),
+     ("prec", "recall"),
+     "<p><b>Precision</b> (rising) is the fraction of retrieved instances "
+     "that are relevant, TP&nbsp;/&nbsp;(TP&nbsp;+&nbsp;FP).</p>"
+     "<p><b>Recall</b> (falling) is the proportion of discovered relevant "
+     "instances, TP&nbsp;/&nbsp;P.</p>"),
+    ("Pos and neg predictive value",
+     (Curves.ppv, Curves.npv),
+     ("PPV", "NPV"),
+     "<p><b>Positive predictive value</b> (rising) is the proportion of "
+     "correct positives, TP&nbsp;/&nbsp;(TP&nbsp;+&nbsp;FP).</p>"
+     "<p><b>Negative predictive value</b> is the proportion of correct "
+     "negatives, TN&nbsp;/&nbsp;(TN&nbsp;+&nbsp;FN).</p>"),
+    ("True and false positive rate",
+     (Curves.tpr, Curves.fpr),
+     ("TPR", "FPR"),
+     "<p><b>True and false positive rate</b> are proportions of positive "
+     "and of negative instances classified as positive</p>"),
+)]
 
 
 class OWCalibrationPlot(widget.OWWidget):
@@ -42,15 +66,41 @@ class OWCalibrationPlot(widget.OWWidget):
     keywords = []
 
     class Inputs:
-        evaluation_results = Input("Evaluation Results", Orange.evaluation.Results)
+        evaluation_results = Input("Evaluation Results", Results)
 
-    class Warning(widget.OWWidget.Warning):
-        empty_input = widget.Msg(
-            "Empty result on input. Nothing to display.")
+    class Outputs:
+        calibrated_model = Output("Calibrated Model", Model)
+
+    class Error(widget.OWWidget.Error):
+        non_discrete_target = Msg("Calibration plot requires a discrete target")
+        empty_input = widget.Msg("Empty result on input. Nothing to display.")
+        nan_classes = \
+            widget.Msg("Remove test data instances with unknown classes")
+        all_target_class = widget.Msg(
+            "All data instances belong to target class")
+        no_target_class = widget.Msg(
+            "No data instances belong to target class")
 
-    target_index = settings.Setting(0)
-    selected_classifiers = settings.Setting([])
+    class Warning(widget.OWWidget.Warning):
+        omitted_folds = widget.Msg(
+            "Test folds where all data belongs to (non)-target are not shown")
+        omitted_nan_prob_points = widget.Msg(
+            "Instance for which the model couldn't compute probabilities are "
+            "skipped")
+        no_valid_data = widget.Msg("No valid data for model(s) {}")
+
+    class Information(widget.OWWidget.Information):
+        no_output = Msg("Can't output a model: {}")
+
+    settingsHandler = EvaluationResultsContextHandler()
+    target_index = settings.ContextSetting(0)
+    selected_classifiers = settings.ContextSetting([])
+    score = settings.Setting(0)
+    output_calibration = settings.Setting(0)
+    fold_curves = settings.Setting(False)
     display_rug = settings.Setting(True)
+    threshold = settings.Setting(0.5)
+    auto_commit = settings.Setting(True)
 
     graph_name = "plot"
 
@@ -58,56 +108,100 @@ def __init__(self):
         super().__init__()
 
         self.results = None
+        self.scores = None
         self.classifier_names = []
         self.colors = []
-        self._curve_data = {}
+        self.line = None
 
-        box = gui.vBox(self.controlArea, "Plot")
-        tbox = gui.vBox(box, "Target Class")
-        tbox.setFlat(True)
+        self._last_score_value = -1
 
+        box = gui.vBox(self.controlArea, box="Settings")
         self.target_cb = gui.comboBox(
-            tbox, self, "target_index", callback=self._replot,
+            box, self, "target_index", label="Target:",
+            orientation=Qt.Horizontal, callback=self.target_index_changed,
             contentsLength=8)
-
-        cbox = gui.vBox(box, "Classifier")
-        cbox.setFlat(True)
-
-        self.classifiers_list_box = gui.listBox(
-            box, self, "selected_classifiers", "classifier_names",
-            selectionMode=QListWidget.MultiSelection,
+        gui.checkBox(
+            box, self, "display_rug", "Show rug",
+            callback=self._on_display_rug_changed)
+        gui.checkBox(
+            box, self, "fold_curves", "Curves for individual folds",
             callback=self._replot)
 
-        gui.checkBox(box, self, "display_rug", "Show rug",
-                     callback=self._on_display_rug_changed)
+        self.classifiers_list_box = gui.listBox(
+            self.controlArea, self, "selected_classifiers", "classifier_names",
+            box="Classifier", selectionMode=QListWidget.ExtendedSelection,
+            sizePolicy=(QSizePolicy.Preferred, QSizePolicy.Preferred),
+            sizeHint=QSize(150, 40),
+            callback=self._on_selection_changed)
+
+        box = gui.vBox(self.controlArea, "Metrics")
+        combo = gui.comboBox(
+            box, self, "score", items=(metric.name for metric in Metrics),
+            callback=self.score_changed)
+
+        self.explanation = gui.widgetLabel(
+            box, wordWrap=True, fixedWidth=combo.sizeHint().width())
+        self.explanation.setContentsMargins(8, 8, 0, 0)
+        font = self.explanation.font()
+        font.setPointSizeF(0.85 * font.pointSizeF())
+        self.explanation.setFont(font)
+
+        gui.radioButtons(
+            box, self, value="output_calibration",
+            btnLabels=("Sigmoid calibration", "Isotonic calibration"),
+            label="Output model calibration", callback=self.apply)
+
+        self.info_box = gui.widgetBox(self.controlArea, "Info")
+        self.info_label = gui.widgetLabel(self.info_box)
+
+        gui.auto_commit(
+            self.controlArea, self, "auto_commit", "Apply", commit=self.apply)
 
         self.plotview = pg.GraphicsView(background="w")
         self.plot = pg.PlotItem(enableMenu=False)
         self.plot.setMouseEnabled(False, False)
         self.plot.hideButtons()
 
-        axis = self.plot.getAxis("bottom")
-        axis.setLabel("Predicted Probability")
-
-        axis = self.plot.getAxis("left")
-        axis.setLabel("Observed Average")
+        for axis_name in ("bottom", "left"):
+            axis = self.plot.getAxis(axis_name)
+            axis.setPen(pg.mkPen(color=0.0))
+            # Remove the condition (that is, allow setting this for bottom
+            # axis) when pyqtgraph is fixed
+            # Issue: https://github.com/pyqtgraph/pyqtgraph/issues/930
+            # Pull request: https://github.com/pyqtgraph/pyqtgraph/pull/932
+            if axis_name != "bottom":  # remove if when pyqtgraph is fixed
+                axis.setStyle(stopAxisAtTick=(True, True))
 
         self.plot.setRange(xRange=(0.0, 1.0), yRange=(0.0, 1.0), padding=0.05)
         self.plotview.setCentralItem(self.plot)
+
         self.mainArea.layout().addWidget(self.plotview)
+        self._set_explanation()
 
     @Inputs.evaluation_results
     def set_results(self, results):
+        self.closeContext()
         self.clear()
-        results = check_results_adequacy(results, self.Error)
-        if results is not None and not results.actual.size:
-            self.Warning.empty_input()
-        else:
-            self.Warning.empty_input.clear()
-        self.results = results
-        if self.results is not None:
-            self._initialize(results)
-            self._replot()
+        self.Error.clear()
+        self.Information.clear()
+
+        self.results = None
+        if results is not None:
+            if not results.domain.has_discrete_class:
+                self.Error.non_discrete_target()
+            elif not results.actual.size:
+                self.Error.empty_input()
+            elif np.any(np.isnan(results.actual)):
+                self.Error.nan_classes()
+            else:
+                self.results = results
+                self._initialize(results)
+                class_var = self.results.domain.class_var
+                self.target_index = int(len(class_var.values) == 2)
+                self.openContext(class_var, self.classifier_names)
+                self._replot()
+
+        self.apply()
 
     def clear(self):
         self.plot.clear()
@@ -117,106 +211,296 @@ def clear(self):
         self.target_cb.clear()
         self.target_index = 0
         self.colors = []
-        self._curve_data = {}
+
+    def target_index_changed(self):
+        if len(self.results.domain.class_var.values) == 2:
+            self.threshold = 1 - self.threshold
+        self._set_explanation()
+        self._replot()
+        self.apply()
+
+    def score_changed(self):
+        self._set_explanation()
+        self._replot()
+        if self._last_score_value != self.score:
+            self.apply()
+            self._last_score_value = self.score
+
+    def _set_explanation(self):
+        explanation = Metrics[self.score].explanation
+        if explanation:
+            self.explanation.setText(explanation)
+            self.explanation.show()
+        else:
+            self.explanation.hide()
+
+        if self.score == 0:
+            self.controls.output_calibration.show()
+            self.info_box.hide()
+        else:
+            self.controls.output_calibration.hide()
+            self.info_box.show()
+
+        axis = self.plot.getAxis("bottom")
+        axis.setLabel("Predicted probability" if self.score == 0
+                      else "Threshold probability to classify as positive")
+
+        axis = self.plot.getAxis("left")
+        axis.setLabel(Metrics[self.score].name)
 
     def _initialize(self, results):
-        N = len(results.predicted)
+        n = len(results.predicted)
         names = getattr(results, "learner_names", None)
         if names is None:
-            names = ["#{}".format(i + 1) for i in range(N)]
+            names = ["#{}".format(i + 1) for i in range(n)]
 
         self.classifier_names = names
         scheme = colorbrewer.colorSchemes["qualitative"]["Dark2"]
-        if N > len(scheme):
+        if n > len(scheme):
             scheme = colorpalette.DefaultRGBColors
-        self.colors = colorpalette.ColorPaletteGenerator(N, scheme)
+        self.colors = colorpalette.ColorPaletteGenerator(n, scheme)
 
-        for i in range(N):
+        for i in range(n):
             item = self.classifiers_list_box.item(i)
             item.setIcon(colorpalette.ColorPixmap(self.colors[i]))
 
-        self.selected_classifiers = list(range(N))
-        self.target_cb.addItems(results.data.domain.class_var.values)
-
-    def plot_curve(self, clf_idx, target):
-        if (clf_idx, target) in self._curve_data:
-            return self._curve_data[clf_idx, target]
-
-        ytrue = self.results.actual == target
-        probs = self.results.probabilities[clf_idx, :, target]
-        sortind = np.argsort(probs)
-        probs = probs[sortind]
-        ytrue = ytrue[sortind]
-        if probs.size:
-            xmin, xmax = probs.min(), probs.max()
-            x = np.linspace(xmin, xmax, 100)
-            if xmax != xmin:
-                f = gaussian_smoother(probs, ytrue, sigma=0.15 * (xmax - xmin))
-                observed = f(x)
-            else:
-                observed = np.full(100, xmax)
-        else:
-            x = np.array([])
-            observed = np.array([])
-
-        curve = Curve(x, observed)
-        curve_item = pg.PlotDataItem(
-            x, observed, pen=pg.mkPen(self.colors[clf_idx], width=1),
-            shadowPen=pg.mkPen(self.colors[clf_idx].lighter(160), width=2),
-            symbol="+", symbolSize=4,
-            antialias=True
-        )
+        self.selected_classifiers = list(range(n))
+        self.target_cb.addItems(results.domain.class_var.values)
 
+    def _rug(self, data, pen_args):
+        color = pen_args["pen"].color()
         rh = 0.025
-        rug_x = np.c_[probs, probs]
-        rug_x_true = rug_x[ytrue].ravel()
-        rug_x_false = rug_x[~ytrue].ravel()
+        rug_x = np.c_[data.probs[:-1], data.probs[:-1]]
+        rug_x_true = rug_x[data.ytrue].ravel()
+        rug_x_false = rug_x[~data.ytrue].ravel()
 
         rug_y_true = np.ones_like(rug_x_true)
         rug_y_true[1::2] = 1 - rh
         rug_y_false = np.zeros_like(rug_x_false)
         rug_y_false[1::2] = rh
 
-        rug1 = pg.PlotDataItem(
-            rug_x_false, rug_y_false, pen=self.colors[clf_idx],
-            connect="pairs", antialias=True
-        )
-        rug2 = pg.PlotDataItem(
-            rug_x_true, rug_y_true, pen=self.colors[clf_idx],
-            connect="pairs", antialias=True
-        )
-        self._curve_data[clf_idx, target] = PlotCurve(curve, curve_item, (rug1, rug2))
-        return self._curve_data[clf_idx, target]
+        self.plot.plot(
+            rug_x_false, rug_y_false,
+            pen=color, connect="pairs", antialias=True)
+        self.plot.plot(
+            rug_x_true, rug_y_true,
+            pen=color, connect="pairs", antialias=True)
+
+    def plot_metrics(self, data, metrics, pen_args):
+        if metrics is None:
+            return self._prob_curve(data.ytrue, data.probs[:-1], pen_args)
+        ys = [metric(data) for metric in metrics]
+        for y in ys:
+            self.plot.plot(data.probs, y, **pen_args)
+        return data.probs, ys
+
+    def _prob_curve(self, ytrue, probs, pen_args):
+        xmin, xmax = probs.min(), probs.max()
+        x = np.linspace(xmin, xmax, 100)
+        if xmax != xmin:
+            f = gaussian_smoother(probs, ytrue, sigma=0.15 * (xmax - xmin))
+            y = f(x)
+        else:
+            y = np.full(100, xmax)
+
+        self.plot.plot(x, y, symbol="+", symbolSize=4, **pen_args)
+        return x, (y, )
 
     def _setup_plot(self):
         target = self.target_index
-        selected = self.selected_classifiers
-        curves = [self.plot_curve(i, target) for i in selected]
+        results = self.results
+        metrics = Metrics[self.score].functions
+        plot_folds = self.fold_curves and results.folds is not None
+        self.scores = []
 
-        for curve in curves:
-            self.plot.addItem(curve.curve_item)
-            if self.display_rug:
-                self.plot.addItem(curve.rug_item[0])
-                self.plot.addItem(curve.rug_item[1])
+        if not self._check_class_presence(results.actual == target):
+            return
 
-        self.plot.plot([0, 1], [0, 1], antialias=True)
+        self.Warning.omitted_folds.clear()
+        self.Warning.omitted_nan_prob_points.clear()
+        no_valid_models = []
+        shadow_width = 4 + 4 * plot_folds
+        for clsf in self.selected_classifiers:
+            data = Curves.from_results(results, target, clsf)
+            if data.tot == 0:  # all probabilities are nan
+                no_valid_models.append(clsf)
+                continue
+            if data.tot != results.probabilities.shape[1]:  # some are nan
+                self.Warning.omitted_nan_prob_points()
+
+            color = self.colors[clsf]
+            pen_args = dict(
+                pen=pg.mkPen(color, width=1), antialias=True,
+                shadowPen=pg.mkPen(color.lighter(160), width=shadow_width))
+            self.scores.append(
+                (self.classifier_names[clsf],
+                 self.plot_metrics(data, metrics, pen_args)))
+
+            if self.display_rug:
+                self._rug(data, pen_args)
+
+            if plot_folds:
+                pen_args = dict(
+                    pen=pg.mkPen(color, width=1, style=Qt.DashLine),
+                    antialias=True)
+                for fold in range(len(results.folds)):
+                    fold_results = results.get_fold(fold)
+                    fold_curve = Curves.from_results(fold_results, target, clsf)
+                    # Can't check this before: p and n can be 0 because of
+                    # nan probabilities
+                    if fold_curve.p * fold_curve.n == 0:
+                        self.Warning.omitted_folds()
+                    self.plot_metrics(fold_curve, metrics, pen_args)
+
+        if no_valid_models:
+            self.Warning.no_valid_data(
+                ", ".join(self.classifier_names[i] for i in no_valid_models))
+
+        if self.score == 0:
+            self.plot.plot([0, 1], [0, 1], antialias=True)
+        else:
+            self.line = pg.InfiniteLine(
+                pos=self.threshold, movable=True,
+                pen=pg.mkPen(color="k", style=Qt.DashLine, width=2),
+                hoverPen=pg.mkPen(color="k", style=Qt.DashLine, width=3),
+                bounds=(0, 1),
+            )
+            self.line.sigPositionChanged.connect(self.threshold_change)
+            self.line.sigPositionChangeFinished.connect(
+                self.threshold_change_done)
+            self.plot.addItem(self.line)
+
+    def _check_class_presence(self, ytrue):
+        self.Error.all_target_class.clear()
+        self.Error.no_target_class.clear()
+        if np.max(ytrue) == 0:
+            self.Error.no_target_class()
+            return False
+        if np.min(ytrue) == 1:
+            self.Error.all_target_class()
+            return False
+        return True
 
     def _replot(self):
         self.plot.clear()
         if self.results is not None:
             self._setup_plot()
+        self._update_info()
 
     def _on_display_rug_changed(self):
         self._replot()
 
+    def _on_selection_changed(self):
+        self._replot()
+        self.apply()
+
+    def threshold_change(self):
+        self.threshold = round(self.line.pos().x(), 2)
+        self.line.setPos(self.threshold)
+        self._update_info()
+
+    def get_info_text(self, short):
+        if short:
+            def elided(s):
+                return s[:17] + "..." if len(s) > 20 else s
+
+            text = f"""<table>
+                            <tr>
+                                <th align='right'>Threshold: p=</th>
+                                <td colspan='4'>{self.threshold:.2f}<br/></td>
+                            </tr>"""
+
+        else:
+            def elided(s):
+                return s
+
+            text = f"""<table>
+                            <tr>
+                                <th align='right'>Threshold:</th>
+                                <td colspan='4'>p = {self.threshold:.2f}<br/>
+                                </td>
+                                <tr/>
+                            </tr>"""
+
+        if self.scores is not None:
+            short_names = Metrics[self.score].short_names
+            if short_names:
+                text += f"""<tr>
+                                <th></th>
+                                {"<td></td>".join(f"<td align='right'>{n}</td>"
+                                                  for n in short_names)}
+                            </tr>"""
+            for name, (probs, curves) in self.scores:
+                ind = min(np.searchsorted(probs, self.threshold),
+                          len(probs) - 1)
+                text += f"<tr><th align='right'>{elided(name)}:</th>"
+                text += "<td>/</td>".join(f'<td>{curve[ind]:.3f}</td>'
+                                          for curve in curves)
+                text += "</tr>"
+        text += "</table>"  # close the table even when there are no scores
+        return text  # always a str, so QLabel.setText never receives None
+
+    def _update_info(self):
+        self.info_label.setText(self.get_info_text(short=True))
+
+    def threshold_change_done(self):
+        self.apply()
+
+    def apply(self):
+        self.Information.no_output.clear()
+        wrapped = None  # sent unchanged when no model can be produced
+        results = self.results
+        if results is not None:
+            problems = [
+                msg for condition, msg in (
+                    (results.folds is not None and len(results.folds) > 1,
+                     "each training data sample produces a different model"),
+                    (results.models is None,
+                     "test results do not contain stored models - try testing "
+                     "on separate data or on training data"),
+                    (len(self.selected_classifiers) != 1,
+                     "select a single model - the widget can output only one"),
+                    (self.score != 0 and len(results.domain.class_var.values) != 2,
+                     "cannot calibrate non-binary classes"))
+                if condition]
+            if len(problems) == 1:
+                self.Information.no_output(problems[0])
+            elif problems:
+                self.Information.no_output(
+                    "".join(f"\n - {problem}" for problem in problems))
+            else:
+                clsf_idx = self.selected_classifiers[0]
+                model = results.models[0, clsf_idx]
+                if self.score == 0:
+                    cal_learner = CalibratedLearner(
+                        None, self.output_calibration)
+                    wrapped = cal_learner.get_model(
+                        model, results.actual, results.probabilities[clsf_idx])
+                else:
+                    threshold = [1 - self.threshold,
+                                 self.threshold][self.target_index]
+                    wrapped = ThresholdClassifier(model, threshold)
+
+        self.Outputs.calibrated_model.send(wrapped)
+
     def send_report(self):
         if self.results is None:
             return
+        self.report_items((
+            ("Target class", self.target_cb.currentText()),
+            ("Output model calibration",
+             self.score == 0
+             and ("Sigmoid calibration",
+                  "Isotonic calibration")[self.output_calibration])
+        ))
         caption = report.list_legend(self.classifiers_list_box,
                                      self.selected_classifiers)
-        self.report_items((("Target class", self.target_cb.currentText()),))
         self.report_plot()
         self.report_caption(caption)
+        self.report_caption(self.controls.score.currentText())
+
+        if self.score != 0:
+            self.report_raw(self.get_info_text(short=False))
 
 
 def gaussian_smoother(x, y, sigma=1.0):
diff --git a/Orange/widgets/evaluate/owtestlearners.py b/Orange/widgets/evaluate/owtestlearners.py
index 0577b448950..d534bbe6a32 100644
--- a/Orange/widgets/evaluate/owtestlearners.py
+++ b/Orange/widgets/evaluate/owtestlearners.py
@@ -315,7 +315,7 @@ def set_learner(self, learner, key):
             # Removed
             self._invalidate([key])
             del self.learners[key]
-        else:
+        elif learner is not None:
             self.learners[key] = InputLearner(learner, None, None)
             self._invalidate([key])
 
@@ -735,7 +735,8 @@ def __update(self):
 
         if self.resampling == OWTestLearners.TestOnTest:
             test_f = partial(
-                Orange.evaluation.TestOnTestData(store_data=True),
+                Orange.evaluation.TestOnTestData(
+                    store_data=True, store_models=True),
                 self.data, self.test_data, learners_c, self.preprocessor
             )
         else:
@@ -756,7 +757,8 @@ def __update(self):
                     stratified=self.shuffle_stratified,
                     random_state=rstate)
             elif self.resampling == OWTestLearners.TestOnTrain:
-                sampler = Orange.evaluation.TestOnTrainingData()
+                sampler = Orange.evaluation.TestOnTrainingData(
+                    store_models=True)
             else:
                 assert False, "self.resampling %s" % self.resampling
 
@@ -916,7 +918,7 @@ def is_empty(res):
         res.probabilities = np.vstack((x.probabilities, y.probabilities))
 
     if x.models is not None:
-        res.models = [xm + ym for xm, ym in zip(x.models, y.models)]
+        res.models = np.hstack((x.models, y.models))
     return res
 
 
diff --git a/Orange/widgets/evaluate/tests/base.py b/Orange/widgets/evaluate/tests/base.py
index 3100f1e1905..93fafea1e51 100644
--- a/Orange/widgets/evaluate/tests/base.py
+++ b/Orange/widgets/evaluate/tests/base.py
@@ -17,6 +17,6 @@ def test_many_evaluation_results(self):
             classification.NaiveBayesLearner(),
             classification.SGDClassificationLearner()
         ]
-        res = evaluation.CrossValidation(data, learners, k=2, store_data=True)
+        res = evaluation.CrossValidation(k=2, store_data=True)(data, learners)
         # this is a mixin; pylint: disable=no-member
         self.send_signal("Evaluation Results", res)
diff --git a/Orange/widgets/evaluate/tests/test_owcalibrationplot.py b/Orange/widgets/evaluate/tests/test_owcalibrationplot.py
index 0575e03e8d1..e4f18231686 100644
--- a/Orange/widgets/evaluate/tests/test_owcalibrationplot.py
+++ b/Orange/widgets/evaluate/tests/test_owcalibrationplot.py
@@ -1,12 +1,18 @@
 import copy
 import warnings
+from unittest.mock import Mock, patch
 
 import numpy as np
+from AnyQt.QtCore import QItemSelection
+from pyqtgraph import InfiniteLine
+
 from sklearn.exceptions import ConvergenceWarning
 
-from Orange.data import Table
+from Orange.data import Table, DiscreteVariable, Domain, ContinuousVariable
 import Orange.evaluation
 import Orange.classification
+from Orange.evaluation import Results
+from Orange.evaluation.performance_curves import Curves
 
 from Orange.widgets.evaluate.tests.base import EvaluateTest
 from Orange.widgets.evaluate.owcalibrationplot import OWCalibrationPlot
@@ -15,42 +21,620 @@
 
 
 class TestOWCalibrationPlot(WidgetTest, EvaluateTest):
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-        cls.lenses = data = Table(test_filename("datasets/lenses.tab"))
-        cls.res = Orange.evaluation.TestOnTestData(
-            train_data=data[::2], test_data=data[1::2],
-            learners=[Orange.classification.MajorityLearner(),
-                      Orange.classification.KNNLearner()],
-            store_data=True,
-        )
-
     def setUp(self):
         super().setUp()
+
+        n, p = (0, 1)
+        actual, probs = np.array([
+            (p, .8), (n, .7), (p, .6), (p, .55), (p, .54), (n, .53), (n, .52),
+            (p, .51), (n, .505), (p, .4), (n, .39), (p, .38), (n, .37),
+            (n, .36), (n, .35), (p, .34), (n, .33), (p, .30), (n, .1)]).T
+        self.curves = Curves(actual, probs)
+        probs2 = (probs + 0.5) / 2 + 1
+        self.curves2 = Curves(actual, probs2)
+        pred = probs > 0.5
+        pred2 = probs2 > 0.5
+        probs = np.vstack((1 - probs, probs)).T
+        probs2 = np.vstack((1 - probs2, probs2)).T
+        domain = Domain([], DiscreteVariable("y", values=("a", "b")))
+        self.results = Results(
+            domain=domain,
+            actual=actual,
+            folds=np.array([Ellipsis]),
+            models=np.array([[Mock(), Mock()]]),
+            row_indices=np.arange(19),
+            predicted=np.array((pred, pred2)),
+            probabilities=np.array([probs, probs2]))
+
+        self.lenses = data = Table(test_filename("datasets/lenses.tab"))
+        majority = Orange.classification.MajorityLearner()
+        majority.name = "majority"
+        knn3 = Orange.classification.KNNLearner(n_neighbors=3)
+        knn3.name = "knn-3"
+        knn1 = Orange.classification.KNNLearner(n_neighbors=1)
+        knn1.name = "knn-1"
+        self.lenses_results = Orange.evaluation.TestOnTestData(
+            store_data=True, store_models=True)(
+                data=data[::2], test_data=data[1::2],
+                learners=[majority, knn3, knn1])
+        self.lenses_results.learner_names = ["majority", "knn-3", "knn-1"]
+
         self.widget = self.create_widget(OWCalibrationPlot)  # type: OWCalibrationPlot
         warnings.filterwarnings("ignore", ".*", ConvergenceWarning)
 
-    def test_basic(self):
-        self.send_signal(self.widget.Inputs.evaluation_results, self.res)
-        self.widget.controls.display_rug.click()
+    def test_initialization(self):
+        """Test initialization of lists and combos"""
+        def check_clsfr_names(names):
+            self.assertEqual(widget.classifier_names, names)
+            clsf_list = widget.controls.selected_classifiers
+            self.assertEqual(
+                [clsf_list.item(i).text() for i in range(clsf_list.count())],
+                names)
+
+        widget = self.widget
+        tcomb = widget.controls.target_index
+
+        self.send_signal(widget.Inputs.evaluation_results, self.lenses_results)
+        check_clsfr_names(["majority", "knn-3", "knn-1"])
+        self.assertEqual(widget.selected_classifiers, [0, 1, 2])
+        self.assertEqual(
+            [tcomb.itemText(i) for i in range(tcomb.count())],
+            self.lenses.domain.class_var.values)
+        self.assertEqual(widget.target_index, 0)
+
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        check_clsfr_names(["#1", "#2"])
+        self.assertEqual(widget.selected_classifiers, [0, 1])
+        self.assertEqual(
+            [tcomb.itemText(i) for i in range(tcomb.count())], ["a", "b"])
+        self.assertEqual(widget.target_index, 1)
+
+        self.send_signal(widget.Inputs.evaluation_results, None)
+        check_clsfr_names([])
+        self.assertEqual(widget.selected_classifiers, [])
+        self.assertEqual(widget.controls.target_index.count(), 0)
+
+    def test_empty_input_error(self):
+        """Show an error when data is present but empty"""
+        widget = self.widget
+
+        res = copy.copy(self.results)
+        res.row_indices = res.row_indices[:0]
+        res.actual = res.actual[:0]
+        res.predicted = res.predicted[:, :0]
+        res.probabilities = res.probabilities[:, :0, :]
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        self.assertFalse(widget.Error.empty_input.is_shown())
+        self.assertTrue(bool(widget.plot.items))
+
+        self.send_signal(widget.Inputs.evaluation_results, res)
+        self.assertTrue(widget.Error.empty_input.is_shown())
+        self.assertIsNone(widget.results)
+        self.assertFalse(bool(widget.plot.items))
+
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        self.assertFalse(widget.Error.empty_input.is_shown())
+        self.assertTrue(bool(widget.plot.items))
+
+    def test_regression_input_error(self):
+        """Show an error for regression data"""
+        widget = self.widget
 
-    def test_empty(self):
-        res = copy.copy(self.res)
+        res = copy.copy(self.results)
+        res.domain = Domain([], ContinuousVariable("y"))
         res.row_indices = res.row_indices[:0]
         res.actual = res.actual[:0]
         res.predicted = res.predicted[:, 0]
         res.probabilities = res.probabilities[:, :0, :]
-        self.send_signal(self.widget.Inputs.evaluation_results, res)
-
-    def test_nan_input(self):
-        res = copy.copy(self.res)
-        res.actual = res.actual.copy()
-        res.probabilities = res.probabilities.copy()
-
-        res.actual[0] = np.nan
-        res.probabilities[:, [0, 3], :] = np.nan
-        self.send_signal(self.widget.Inputs.evaluation_results, res)
-        self.assertTrue(self.widget.Error.invalid_results.is_shown())
-        self.send_signal(self.widget.Inputs.evaluation_results, None)
-        self.assertFalse(self.widget.Error.invalid_results.is_shown())
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        self.assertFalse(widget.Error.non_discrete_target.is_shown())
+        self.assertTrue(bool(widget.plot.items))
+
+        self.send_signal(widget.Inputs.evaluation_results, res)
+        self.assertTrue(widget.Error.non_discrete_target.is_shown())
+        self.assertIsNone(widget.results)
+        self.assertFalse(bool(widget.plot.items))
+
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        self.assertFalse(widget.Error.non_discrete_target.is_shown())
+        self.assertTrue(bool(widget.plot.items))
+
+    @staticmethod
+    def _set_combo(combo, val):
+        combo.setCurrentIndex(val)
+        combo.activated[int].emit(val)
+        combo.activated[str].emit(combo.currentText())
+
+    @staticmethod
+    def _set_radio_buttons(radios, val):
+        radios.buttons[val].click()
+
+    @staticmethod
+    def _set_list_selection(listview, selection):
+        model = listview.model()
+        selectionmodel = listview.selectionModel()
+        itemselection = QItemSelection()
+        for item in selection:
+            itemselection.select(model.index(item, 0), model.index(item, 0))
+        selectionmodel.select(itemselection, selectionmodel.ClearAndSelect)
+
+    def _set_threshold(self, pos, done):
+        _, line = self._get_curves()
+        line.setPos(pos)
+        if done:
+            line.sigPositionChangeFinished.emit(line)
+        else:
+            line.sigPositionChanged.emit(line)
+
+    def _get_curves(self):
+        plot_items = self.widget.plot.items[:]
+        for i, item in enumerate(plot_items):
+            if isinstance(item, InfiniteLine):
+                del plot_items[i]
+                return plot_items, item
+        return plot_items, None
+
+    @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier")
+    @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner")
+    def test_plotting_curves(self, *_):
+        """Curve coordinates match those computed by `Curves`"""
+        widget = self.widget
+        widget.display_rug = False
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        widget.selected_classifiers = [0]
+        combo = widget.controls.score
+
+        c = self.curves
+        combinations = ([c.ca()],
+                        [c.f1()],
+                        [c.sensitivity(), c.specificity()],
+                        [c.precision(), c.recall()],
+                        [c.ppv(), c.npv()],
+                        [c.tpr(), c.fpr()])
+        for idx, curves_data in enumerate(combinations, start=1):
+            self._set_combo(combo, idx)
+            curves, line = self._get_curves()
+            self.assertEqual(len(curves), len(curves_data))
+            self.assertIsNotNone(line)
+            for curve in curves:
+                x, y = curve.getData()
+                np.testing.assert_almost_equal(x, self.curves.probs)
+                for i, curve_data in enumerate(curves_data):
+                    if np.max(np.abs(curve_data - y)) < 1e-6:
+                        del curves_data[i]
+                        break
+                else:
+                    self.fail(f"invalid curve for {combo.currentText()}")
+
+    @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier")
+    @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner")
+    def test_multiple_fold_curves(self, *_):
+        widget = self.widget
+        widget.display_rug = False
+        widget.fold_curves = False
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        self._set_list_selection(widget.controls.selected_classifiers, [0])
+        self._set_combo(widget.controls.score, 1)  # CA
+
+        self.results.folds = [slice(1, 5), slice(5, 19)]
+        self.results.models = np.array([[Mock(), Mock()]] * 2)
+        curves, _ = self._get_curves()
+        self.assertEqual(len(curves), 1)
+
+        widget.controls.fold_curves.click()
+        curves, _ = self._get_curves()
+        self.assertEqual(len(curves), 3)
+
+        widget.controls.fold_curves.click()
+        curves, _ = self._get_curves()
+        self.assertEqual(len(curves), 1)
+
+    @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier")
+    @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner")
+    def test_change_target_class(self, *_):
+        """Changing target combo changes the curves"""
+        widget = self.widget
+        widget.display_rug = False
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        widget.selected_classifiers = [0]
+        score_combo = widget.controls.score
+        target_combo = widget.controls.target_index
+
+        self._set_combo(score_combo, 1)  # ca
+        self._set_combo(target_combo, 1)
+        (ca, ), _ = self._get_curves()
+        np.testing.assert_almost_equal(ca.getData()[1], self.curves.ca())
+
+        self._set_combo(target_combo, 0)
+        (ca, ), _ = self._get_curves()
+        curves = Curves(1 - self.curves.ytrue, 1 - self.curves.probs[:-1])
+        np.testing.assert_almost_equal(ca.getData()[1], curves.ca())
+
+    def test_changing_score_explanation(self):
+        """Changing score hides/shows explanation and options for calibration"""
+        widget = self.widget
+        score_combo = widget.controls.score
+        explanation = widget.explanation
+        calibrations = widget.controls.output_calibration
+
+        self._set_combo(score_combo, 1)  # ca
+        self.assertTrue(explanation.isHidden())
+        self.assertTrue(calibrations.isHidden())
+
+        self._set_combo(score_combo, 0)  # calibration
+        self.assertTrue(explanation.isHidden())
+        self.assertFalse(calibrations.isHidden())
+
+        self._set_combo(score_combo, 3)  # sens/spec
+        self.assertFalse(explanation.isHidden())
+        self.assertTrue(calibrations.isHidden())
+
+    def test_rug(self):
+        """Test rug appearance and positions"""
+        def get_rugs():
+            rugs = [None, None]
+            for item in widget.plot.items:
+                if item.curve.opts.get("connect", "") == "pairs":
+                    x, y = item.getData()
+                    np.testing.assert_almost_equal(x[::2], x[1::2])
+                    rugs[int(y[0] == 1)] = x[::2]
+            return rugs
+
+        widget = self.widget
+        widget.display_rug = True
+        model_list = widget.controls.selected_classifiers
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+
+        self._set_list_selection(model_list, [0])
+        probs = self.curves.probs[:-1]
+        truex = probs[self.curves.ytrue == 1]
+        falsex = probs[self.curves.ytrue == 0]
+        bottom, top = get_rugs()
+        np.testing.assert_almost_equal(bottom, falsex)
+        np.testing.assert_almost_equal(top, truex)
+
+        # Switching targets should switch rugs and take other probabilities
+        self._set_combo(widget.controls.target_index, 0)
+        bottom, top = get_rugs()
+        np.testing.assert_almost_equal(bottom, (1 - truex)[::-1])
+        np.testing.assert_almost_equal(top, (1 - falsex)[::-1])
+        self._set_combo(widget.controls.target_index, 1)
+
+        # Changing models gives a different rug
+        self._set_list_selection(model_list, [1])
+        probs2 = self.curves2.probs[:-1]
+        truex2 = probs2[self.curves2.ytrue == 1]
+        falsex2 = probs2[self.curves2.ytrue == 0]
+        bottom, top = get_rugs()
+        np.testing.assert_almost_equal(bottom, falsex2)
+        np.testing.assert_almost_equal(top, truex2)
+
+        # Two models - two rugs - four rug items
+        self._set_list_selection(model_list, [0, 1])
+        self.assertEqual(sum(item.curve.opts.get("connect", "") == "pairs"
+                             for item in widget.plot.items), 4)
+
+        # No models - no rugs
+        self._set_list_selection(model_list, [])
+        self.assertEqual(get_rugs(), [None, None])
+
+        # Bring the rug back
+        self._set_list_selection(model_list, [1])
+        self.assertIsNotNone(get_rugs()[0])
+
+        # Disable it with checkbox
+        widget.controls.display_rug.click()
+        self.assertEqual(get_rugs(), [None, None])
+
+    def test_calibration_curve(self):
+        """Test the correct number of calibration curves"""
+        widget = self.widget
+        model_list = widget.controls.selected_classifiers
+        widget.display_rug = False
+
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        self.assertEqual(len(widget.plot.items), 3)  # 2 + diagonal
+
+        self._set_list_selection(model_list, [1])
+        self.assertEqual(len(widget.plot.items), 2)
+
+        self._set_list_selection(model_list, [])
+        self.assertEqual(len(widget.plot.items), 1)
+
+    def test_threshold_change_updates_info(self):
+        """Changing the threshold updates info label"""
+        widget = self.widget
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        self._set_combo(widget.controls.score, 1)
+
+        original_text = widget.info_label.text()
+        self._set_threshold(0.3, False)
+        self.assertNotEqual(widget.info_label.text(), original_text)
+
+    def test_threshold_rounding(self):
+        """Threshold is rounded to two decimals"""
+        widget = self.widget
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        self._set_combo(widget.controls.score, 1)
+        self._set_threshold(0.367, False)
+        self.assertAlmostEqual(widget.threshold, 0.37)
+
+    def test_threshold_flips_on_two_classes(self):
+        """Threshold changes to 1 - threshold if *binary* class is switched"""
+        widget = self.widget
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        self._set_combo(widget.controls.target_index, 0)
+        self._set_combo(widget.controls.score, 1) # CA
+        self._set_threshold(0.25, False)
+        self.assertEqual(widget.threshold, 0.25)
+        self._set_combo(widget.controls.target_index, 1)
+        self.assertEqual(widget.threshold, 0.75)
+
+        self.send_signal(widget.Inputs.evaluation_results, self.lenses_results)
+        self._set_combo(widget.controls.target_index, 0)
+        self._set_combo(widget.controls.score, 1) # CA
+        self._set_threshold(0.25, False)
+        self.assertEqual(widget.threshold, 0.25)
+        self._set_combo(widget.controls.target_index, 1)
+        self.assertEqual(widget.threshold, 0.25)
+
+
+    @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier")
+    @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner")
+    def test_apply_no_output(self, *_):
+        """Test no output warnings"""
+        widget = self.widget
+        model_list = widget.controls.selected_classifiers
+
+        multiple_folds, multiple_selected, no_models, non_binary_class = "abcd"
+        messages = {
+            multiple_folds:
+                "each training data sample produces a different model",
+            no_models:
+                "test results do not contain stored models - try testing on "
+                "separate data or on training data",
+            multiple_selected:
+                "select a single model - the widget can output only one",
+            non_binary_class:
+                "cannot calibrate non-binary classes"}
+
+        def test_shown(shown):
+            widget_msg = widget.Information.no_output
+            output = self.get_output(widget.Outputs.calibrated_model)
+            if not shown:
+                self.assertFalse(widget_msg.is_shown())
+                self.assertIsNotNone(output)
+            else:
+                self.assertTrue(widget_msg.is_shown())
+                self.assertIsNone(output)
+                for msg_id in shown:
+                    msg = messages[msg_id]
+                    self.assertIn(msg, widget_msg.formatted,
+                                  f"{msg} not included in the message")
+
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        self._set_combo(widget.controls.score, 1)  # CA
+        test_shown({multiple_selected})
+
+        self._set_list_selection(model_list, [0])
+        test_shown(())
+        self._set_list_selection(model_list, [0, 1])
+
+        self.results.models = None
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        test_shown({multiple_selected, no_models})
+
+        self.send_signal(widget.Inputs.evaluation_results, self.lenses_results)
+        test_shown({multiple_selected, non_binary_class})
+
+        self._set_list_selection(model_list, [0])
+        test_shown({non_binary_class})
+
+        self.results.folds = [slice(0, 5), slice(5, 10), slice(10, 19)]
+        self.results.models = np.array([[Mock(), Mock()]] * 3)
+
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        test_shown({multiple_selected, multiple_folds})
+
+        self._set_list_selection(model_list, [0])
+        test_shown({multiple_folds})
+
+        self._set_combo(widget.controls.score, 0)  # calibration
+        self.send_signal(widget.Inputs.evaluation_results, self.lenses_results)
+        self._set_list_selection(model_list, [0, 1])
+        test_shown({multiple_selected})
+        self._set_list_selection(model_list, [0])
+        test_shown(())
+
+    @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier")
+    def test_output_threshold_classifier(self, threshold_classifier):
+        """Test threshold classifier on output"""
+        widget = self.widget
+        model_list = widget.controls.selected_classifiers
+        models = self.results.models.ravel()
+        target_combo = widget.controls.target_index
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        self._set_list_selection(model_list, [0])
+        widget.target_index = 1
+
+        widget.threshold = 0.3
+        self._set_combo(widget.controls.score, 1)  # CA
+        model = self.get_output(widget.Outputs.calibrated_model)
+        threshold_classifier.assert_called_with(models[0], 0.3)
+        self.assertIs(model, threshold_classifier.return_value)
+        threshold_classifier.reset_mock()
+
+        widget.auto_commit = True
+        self._set_threshold(0.4, False)
+        threshold_classifier.assert_not_called()
+
+        widget.auto_commit = False
+        self._set_threshold(0.35, True)
+        threshold_classifier.assert_not_called()
+
+        widget.auto_commit = True
+        self._set_threshold(0.4, True)
+        threshold_classifier.assert_called_with(models[0], 0.4)
+        self.assertIs(model, threshold_classifier.return_value)
+        threshold_classifier.reset_mock()
+
+        self._set_combo(target_combo, 0)
+        threshold_classifier.assert_called_with(models[0], 0.4)
+        self.assertIs(model, threshold_classifier.return_value)
+        threshold_classifier.reset_mock()
+
+        self._set_combo(target_combo, 1)
+        threshold_classifier.assert_called_with(models[0], 0.4)
+        self.assertIs(model, threshold_classifier.return_value)
+        threshold_classifier.reset_mock()
+
+        self._set_list_selection(model_list, [1])
+        threshold_classifier.assert_called_with(models[1], 0.4)
+        self.assertIs(model, threshold_classifier.return_value)
+        threshold_classifier.reset_mock()
+
+    @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner")
+    def test_output_calibrated_classifier(self, calibrated_learner):
+        """Test calibrated classifier on output"""
+        calibrated_instance = calibrated_learner.return_value
+        get_model = calibrated_instance.get_model
+
+        widget = self.widget
+        model_list = widget.controls.selected_classifiers
+        models = self.lenses_results.models.ravel()
+        results = self.lenses_results
+        self.send_signal(widget.Inputs.evaluation_results, results)
+        self._set_combo(widget.controls.score, 0)
+
+        self._set_list_selection(model_list, [1])
+
+        self._set_radio_buttons(widget.controls.output_calibration, 0)
+        calibrated_learner.assert_called_with(None, 0)
+        model, actual, probabilities = get_model.call_args[0]
+        self.assertIs(model, models[1])
+        np.testing.assert_equal(actual, results.actual)
+        np.testing.assert_equal(probabilities, results.probabilities[1])
+        self.assertIs(self.get_output(widget.Outputs.calibrated_model),
+                      get_model.return_value)
+        calibrated_learner.reset_mock()
+        get_model.reset_mock()
+
+        self._set_radio_buttons(widget.controls.output_calibration, 1)
+        calibrated_learner.assert_called_with(None, 1)
+        model, actual, probabilities = get_model.call_args[0]
+        self.assertIs(model, models[1])
+        np.testing.assert_equal(actual, results.actual)
+        np.testing.assert_equal(probabilities, results.probabilities[1])
+        self.assertIs(self.get_output(widget.Outputs.calibrated_model),
+                      get_model.return_value)
+        calibrated_learner.reset_mock()
+        get_model.reset_mock()
+
+        self._set_list_selection(model_list, [0])
+        self._set_radio_buttons(widget.controls.output_calibration, 1)
+        calibrated_learner.assert_called_with(None, 1)
+        model, actual, probabilities = get_model.call_args[0]
+        self.assertIs(model, models[0])
+        np.testing.assert_equal(actual, results.actual)
+        np.testing.assert_equal(probabilities, results.probabilities[0])
+        self.assertIs(self.get_output(widget.Outputs.calibrated_model),
+                      get_model.return_value)
+        calibrated_learner.reset_mock()
+        get_model.reset_mock()
+
+    def test_contexts(self):
+        """Test storing and retrieving context settings"""
+        widget = self.widget
+        model_list = widget.controls.selected_classifiers
+        target_combo = widget.controls.target_index
+        self.send_signal(widget.Inputs.evaluation_results, self.lenses_results)
+        self._set_list_selection(model_list, [0, 2])
+        self._set_combo(target_combo, 2)
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        self._set_list_selection(model_list, [0])
+        self._set_combo(target_combo, 0)
+        self.send_signal(widget.Inputs.evaluation_results, self.lenses_results)
+        self.assertEqual(widget.selected_classifiers, [0, 2])
+        self.assertEqual(widget.target_index, 2)
+
+    def test_report(self):
+        """Test that report does not crash"""
+        widget = self.widget
+        self.send_signal(widget.Inputs.evaluation_results, self.lenses_results)
+        widget.send_report()
+
+    @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier")
+    @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner")
+    def test_single_class(self, *_):
+        """Curves are not plotted if all data belongs to (non)-target"""
+        def check_error(shown):
+            for error in (errors.no_target_class, errors.all_target_class,
+                          errors.nan_classes):
+                self.assertEqual(error.is_shown(), error is shown,
+                                 f"{error} is unexpectedly"
+                                 f"{'' if error.is_shown() else ' not'} shown")
+            if shown is not None:
+                self.assertEqual(len(widget.plot.items), 0)
+            else:
+                self.assertGreater(len(widget.plot.items), 0)
+
+        widget = self.widget
+        errors = widget.Error
+        widget.display_rug = True
+        combo = widget.controls.score
+
+        original_actual = self.results.actual.copy()
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        widget.selected_classifiers = [0]
+        for idx in range(combo.count()):
+            self._set_combo(combo, idx)
+            self.results.actual[:] = 0
+            self.send_signal(widget.Inputs.evaluation_results, self.results)
+            check_error(errors.no_target_class)
+
+            self.results.actual[:] = 1
+            self.send_signal(widget.Inputs.evaluation_results, self.results)
+            check_error(errors.all_target_class)
+
+            self.results.actual[:] = original_actual
+            self.results.actual[3] = np.nan
+            self.send_signal(widget.Inputs.evaluation_results, self.results)
+            check_error(errors.nan_classes)
+
+            self.results.actual[:] = original_actual
+            self.send_signal(widget.Inputs.evaluation_results, self.results)
+            check_error(None)
+
+    @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier")
+    @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner")
+    def test_single_class_folds(self, *_):
+        """Curves for single-class folds are not plotted"""
+        widget = self.widget
+        widget.display_rug = False
+        widget.fold_curves = False
+
+        results = self.lenses_results
+        results.folds = [slice(0, 5), slice(5, 19)]
+        results.models = results.models.repeat(2, axis=0)
+        results.actual[:3] = 0
+        results.probabilities[1, 3:5] = np.nan
+        # after this, model 1 has just negative instances in fold 0
+        self.send_signal(widget.Inputs.evaluation_results, results)
+        self._set_combo(widget.controls.score, 1)  # CA
+        self.assertFalse(widget.Warning.omitted_folds.is_shown())
+        widget.controls.fold_curves.click()
+        self.assertTrue(widget.Warning.omitted_folds.is_shown())
+
+    @patch("Orange.widgets.evaluate.owcalibrationplot.ThresholdClassifier")
+    @patch("Orange.widgets.evaluate.owcalibrationplot.CalibratedLearner")
+    def test_warn_nan_probabilities(self, *_):
+        """Warn about omitted points with nan probabilities"""
+        widget = self.widget
+        widget.display_rug = False
+        widget.fold_curves = False
+
+        self.results.probabilities[1, 3] = np.nan
+        self.send_signal(widget.Inputs.evaluation_results, self.results)
+        self.assertTrue(widget.Warning.omitted_nan_prob_points.is_shown())
+        self._set_list_selection(widget.controls.selected_classifiers, [0])
+        self.assertFalse(widget.Warning.omitted_nan_prob_points.is_shown())
diff --git a/Orange/widgets/evaluate/utils.py b/Orange/widgets/evaluate/utils.py
index 9e2f579dfae..ebe06032777 100644
--- a/Orange/widgets/evaluate/utils.py
+++ b/Orange/widgets/evaluate/utils.py
@@ -47,7 +47,7 @@ def results_for_preview(data_name=""):
     from Orange.classification import \
         LogisticRegressionLearner, SVMLearner, NuSVMLearner
 
-    data = Table(data_name or "ionosphere")
+    data = Table(data_name or "heart_disease")
     results = CrossValidation(
         data,
         [LogisticRegressionLearner(penalty="l2"),
diff --git a/Orange/widgets/gui.py b/Orange/widgets/gui.py
index 683b8be2f73..b6a8d84552b 100644
--- a/Orange/widgets/gui.py
+++ b/Orange/widgets/gui.py
@@ -1783,6 +1783,9 @@ def __init__(self, master, enableDragDrop=False, dragDropCallback=None,
     def sizeHint(self):
         return self.size_hint
 
+    def minimumSizeHint(self):
+        return self.size_hint
+
     def dragEnterEvent(self, event):
         super().dragEnterEvent(event)
         if self.valid_data_callback:
diff --git a/Orange/widgets/model/icons/CalibratedLearner.svg b/Orange/widgets/model/icons/CalibratedLearner.svg
new file mode 100644
index 00000000000..360a0d188ba
--- /dev/null
+++ b/Orange/widgets/model/icons/CalibratedLearner.svg
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Generator: Adobe Illustrator 16.0.0, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
+	 width="48px" height="48px" viewBox="0 0 48 48" enable-background="new 0 0 48 48" xml:space="preserve">
+<rect x="28.113" y="18.581" transform="matrix(-0.6953 -0.7187 0.7187 -0.6953 34.1212 54.4932)" fill="#333333" width="0.999" height="2.865"/>
+<rect x="32.225" y="14.592" transform="matrix(-0.6958 -0.7183 0.7183 -0.6958 43.9836 50.6797)" fill="#333333" width="0.999" height="2.866"/>
+<rect x="36.339" y="10.605" transform="matrix(-0.696 -0.718 0.718 -0.696 53.8361 46.8655)" fill="#333333" width="0.999" height="2.865"/>
+<rect x="11.66" y="34.532" transform="matrix(0.6953 0.7187 -0.7187 0.6953 29.5543 2.2188)" fill="#333333" width="1" height="2.865"/>
+<rect x="15.774" y="30.545" transform="matrix(0.6963 0.7178 -0.7178 0.6963 27.8945 -1.9692)" fill="#333333" width="1" height="2.863"/>
+<rect x="19.886" y="26.556" transform="matrix(0.6953 0.7187 -0.7187 0.6953 26.3278 -6.1237)" fill="#333333" width="1" height="2.865"/>
+<path fill="#333333" d="M9.19,40c0.311-1.484,1.055-4.244,2.672-6.993c2.714-4.614,6.661-7.25,11.751-7.864l0.206,0.213l0.269-0.26
+	C24.71,25.036,25.346,25,26,25c4.834,0,8.686-1.485,11.447-4.414c4.982-5.284,4.57-13.306,4.551-13.645l-1.996,0.117
+	c0.001,0.018,0.023,0.477-0.005,1.218l-0.422,0.409l0.375,0.387c-0.193,2.545-0.974,6.976-3.958,10.141
+	C33.623,21.726,30.262,23,26,23c-0.157,0-0.305,0.012-0.46,0.015l-0.36-0.371l-0.408,0.397C14.065,23.625,9.693,31.754,8,36.836V40
+	H9.19z"/>
+<polygon fill="#B2B2B2" points="9.19,40 8,40 8,36.836 8,6 6,6 6,40 6,42 8,42 42,42 42,40 "/>
+<g>
+	<path fill="#333333" stroke="#FFFFFF" stroke-miterlimit="10" d="M40.724,28.729v-2.145l-3.217-1.072l-0.58-1.613l1.518-3.032
+		l-1.516-1.516l-3.033,1.516l-1.75-0.717l-1.072-3.217h-2.145l-1.072,3.217l-1.608,0.547l-3.033-1.518l-1.519,1.516l1.519,3.034
+		l-0.722,1.783l-3.217,1.072v2.145l3.217,1.074l0.724,1.775l-1.521,3.033l1.519,1.516l3.033-1.518l1.608,0.555l1.072,3.219h2.143
+		l1.074-3.219l1.75-0.723l3.033,1.52l1.516-1.518l-1.516-3.033l0.578-1.607L40.724,28.729z M30.001,30.875
+		c-1.777,0-3.217-1.441-3.217-3.219c0-1.775,1.438-3.219,3.217-3.219c1.777,0,3.217,1.441,3.217,3.219S31.778,30.875,30.001,30.875z
+		"/>
+</g>
+</svg>
diff --git a/Orange/widgets/model/owcalibratedlearner.py b/Orange/widgets/model/owcalibratedlearner.py
new file mode 100644
index 00000000000..0edf3184797
--- /dev/null
+++ b/Orange/widgets/model/owcalibratedlearner.py
@@ -0,0 +1,150 @@
+from Orange.classification import CalibratedLearner, ThresholdLearner, \
+    NaiveBayesLearner
+from Orange.data import Table
+from Orange.modelling import Learner
+from Orange.widgets import gui
+from Orange.widgets.widget import Input
+from Orange.widgets.settings import Setting
+from Orange.widgets.utils.owlearnerwidget import OWBaseLearner
+from Orange.widgets.utils.widgetpreview import WidgetPreview
+
+
+class OWCalibratedLearner(OWBaseLearner):
+    """
+    Widget that wraps the learner from its input with probability
+    calibration and/or decision threshold optimization.
+
+    Two radio-button groups select the calibration method and the
+    threshold criterion; each step can be switched off independently.
+    """
+    name = "Calibrated Learner"
+    description = "Wraps another learner with probability calibration and " \
+                  "decision threshold optimization"
+    icon = "icons/CalibratedLearner.svg"
+    priority = 20
+    keywords = ["calibration", "threshold"]
+
+    LEARNER = CalibratedLearner
+
+    # Radio-button indices, button labels, short labels used in the default
+    # learner name, and the matching CalibratedLearner constants (the
+    # "no calibration" option deliberately has no entry in the map).
+    SigmoidCalibration, IsotonicCalibration, NoCalibration = range(3)
+    CalibrationOptions = ("Sigmoid calibration",
+                          "Isotonic calibration",
+                          "No calibration")
+    CalibrationShort = ("Sigmoid", "Isotonic", "")
+    CalibrationMap = {
+        SigmoidCalibration: CalibratedLearner.Sigmoid,
+        IsotonicCalibration: CalibratedLearner.Isotonic}
+
+    # Same layout as above, for decision threshold optimization.
+    OptimizeCA, OptimizeF1, NoThresholdOptimization = range(3)
+    ThresholdOptions = ("Optimize classification accuracy",
+                        "Optimize F1 score",
+                        "No threshold optimization")
+    ThresholdShort = ("CA", "F1", "")
+    ThresholdMap = {
+        OptimizeCA: ThresholdLearner.OptimizeCA,
+        OptimizeF1: ThresholdLearner.OptimizeF1}
+
+    learner_name = Setting("", schema_only=True)
+    calibration = Setting(SigmoidCalibration)
+    threshold = Setting(OptimizeCA)
+
+    class Inputs(OWBaseLearner.Inputs):
+        base_learner = Input("Base Learner", Learner)
+
+    def __init__(self):
+        super().__init__()
+        self.base_learner = None
+
+    def add_main_layout(self):
+        """Add the radio-button groups for calibration and threshold."""
+        gui.radioButtons(
+            self.controlArea, self, "calibration", self.CalibrationOptions,
+            box="Probability calibration",
+            callback=self.calibration_options_changed)
+        gui.radioButtons(
+            self.controlArea, self, "threshold", self.ThresholdOptions,
+            box="Decision threshold optimization",
+            callback=self.calibration_options_changed)
+
+    @Inputs.base_learner
+    def set_learner(self, learner):
+        """Store the input learner, refresh the default name and apply."""
+        self.base_learner = learner
+        self._set_default_name()
+        self.unconditional_apply()
+
+    def _set_default_name(self):
+        """
+        Compose the default name from the base learner's name and the short
+        labels of the selected options, and show it as the placeholder text
+        of the name line edit.
+        """
+        if self.base_learner is None:
+            self.name = "Calibrated learner"
+        else:
+            self.name = " + ".join(part for part in (
+                self.base_learner.name.title(),
+                self.CalibrationShort[self.calibration],
+                self.ThresholdShort[self.threshold]) if part)
+        self.controls.learner_name.setPlaceholderText(self.name)
+
+    def calibration_options_changed(self):
+        """Callback of both radio groups: update default name and apply."""
+        self._set_default_name()
+        self.apply()
+
+    def create_learner(self):
+        """
+        Build the learner from the base learner and the selected options.
+
+        Returns:
+            None if there is no base learner; otherwise the base learner,
+            wrapped into `CalibratedLearner` and/or `ThresholdLearner` as
+            selected. If the widget has preprocessors, they are set on the
+            outermost wrapper; when no wrapping is selected, the base
+            learner is put into a pass-through wrapper so that its own
+            `preprocessors` attribute is left untouched.
+        """
+        class IdentityWrapper(Learner):
+            # Delegates fitting to the wrapped learner; its only purpose is
+            # to carry the widget's preprocessors.
+            def __init__(self, base_learner):
+                super().__init__()
+                # Must be stored on the wrapper: inside `fit_storage`,
+                # `self` is the wrapper, not the widget.
+                self.base_learner = base_learner
+
+            def fit_storage(self, data):
+                return self.base_learner.fit_storage(data)
+
+        if self.base_learner is None:
+            return None
+        learner = self.base_learner
+        if self.calibration != self.NoCalibration:
+            learner = CalibratedLearner(learner,
+                                        self.CalibrationMap[self.calibration])
+        if self.threshold != self.NoThresholdOptimization:
+            learner = ThresholdLearner(learner,
+                                       self.ThresholdMap[self.threshold])
+        if self.preprocessors:
+            if learner is self.base_learner:
+                learner = IdentityWrapper(self.base_learner)
+            learner.preprocessors = (self.preprocessors, )
+        return learner
+
+    def get_learner_parameters(self):
+        """Return (label, value) pairs describing the selected options."""
+        return (("Calibrate probabilities",
+                 self.CalibrationOptions[self.calibration]),
+                ("Threshold optimization",
+                 self.ThresholdOptions[self.threshold]))
+
+
+if __name__ == "__main__":  # pragma: no cover
+    WidgetPreview(OWCalibratedLearner).run(
+        Table("heart_disease"),
+        set_learner=NaiveBayesLearner())
diff --git a/Orange/widgets/model/tests/test_owcalibratedlearner.py b/Orange/widgets/model/tests/test_owcalibratedlearner.py
new file mode 100644
index 00000000000..400d483a592
--- /dev/null
+++ b/Orange/widgets/model/tests/test_owcalibratedlearner.py
@@ -0,0 +1,166 @@
+from unittest.mock import Mock
+
+from Orange.classification import ThresholdLearner, CalibratedLearner, \
+    NaiveBayesLearner, ThresholdClassifier, CalibratedClassifier
+from Orange.classification.base_classification import ModelClassification, \
+    LearnerClassification
+from Orange.classification.naive_bayes import NaiveBayesModel
+from Orange.data import Table
+from Orange.widgets.model.owcalibratedlearner import OWCalibratedLearner
+from Orange.widgets.tests.base import WidgetTest, WidgetLearnerTestMixin, \
+    datasets
+
+
+class TestOWCalibratedLearner(WidgetTest, WidgetLearnerTestMixin):
+    """Tests for the OWCalibratedLearner widget."""
+
+    def setUp(self):
+        self.widget = self.create_widget(
+            OWCalibratedLearner, stored_settings={"auto_apply": False})
+        # The widget has no output without a base learner, so every test
+        # starts with one on the input.
+        self.send_signal(self.widget.Inputs.base_learner, NaiveBayesLearner())
+
+        self.data = Table("heart_disease")
+        self.valid_datasets = (self.data,)
+        self.inadequate_dataset = (Table(datasets.path("testing_dataset_reg")),)
+        self.learner_class = LearnerClassification
+        self.model_class = ModelClassification
+        self.model_name = 'Calibrated classifier'
+        self.parameters = []
+
+    def test_output_learner(self):
+        """Check if learner is on output after apply"""
+        # Overridden to change the expected output type in the last assertion
+        initial = self.get_output("Learner")
+        self.assertIsNotNone(initial, "Does not initialize the learner output")
+        self.widget.apply_button.button.click()
+        newlearner = self.get_output("Learner")
+        self.assertIsNot(initial, newlearner,
+                         "Does not send a new learner instance on `Apply`.")
+        self.assertIsNotNone(newlearner)
+        self.assertIsInstance(
+            newlearner,
+            (CalibratedLearner, ThresholdLearner, NaiveBayesLearner))
+
+    def test_output_model(self):
+        """Check if model is on output after sending data and apply"""
+        # Overridden to change the expected output type in the last assertion
+        self.assertIsNone(self.get_output(self.widget.Outputs.model))
+        self.widget.apply_button.button.click()
+        self.assertIsNone(self.get_output(self.widget.Outputs.model))
+        self.send_signal('Data', self.data)
+        self.widget.apply_button.button.click()
+        self.wait_until_stop_blocking()
+        model = self.get_output(self.widget.Outputs.model)
+        self.assertIsNotNone(model)
+        self.assertIsInstance(
+            model, (CalibratedClassifier, ThresholdClassifier, NaiveBayesModel))
+
+    def test_create_learner(self):
+        """create_learner wraps the base learner according to the settings"""
+        widget = self.widget  #: OWCalibratedLearner
+        self.widget.base_learner = Mock()
+
+        widget.calibration = widget.SigmoidCalibration
+        widget.threshold = widget.OptimizeF1
+        learner = self.widget.create_learner()
+        self.assertIsInstance(learner, ThresholdLearner)
+        self.assertEqual(learner.threshold_criterion, learner.OptimizeF1)
+        cal_learner = learner.base_learner
+        self.assertIsInstance(cal_learner, CalibratedLearner)
+        self.assertEqual(cal_learner.calibration_method, cal_learner.Sigmoid)
+        self.assertIs(cal_learner.base_learner, self.widget.base_learner)
+
+        widget.calibration = widget.IsotonicCalibration
+        widget.threshold = widget.OptimizeCA
+        learner = self.widget.create_learner()
+        self.assertIsInstance(learner, ThresholdLearner)
+        self.assertEqual(learner.threshold_criterion, learner.OptimizeCA)
+        cal_learner = learner.base_learner
+        self.assertIsInstance(cal_learner, CalibratedLearner)
+        self.assertEqual(cal_learner.calibration_method, cal_learner.Isotonic)
+        self.assertIs(cal_learner.base_learner, self.widget.base_learner)
+
+        widget.calibration = widget.NoCalibration
+        widget.threshold = widget.OptimizeCA
+        learner = self.widget.create_learner()
+        self.assertIsInstance(learner, ThresholdLearner)
+        self.assertEqual(learner.threshold_criterion, learner.OptimizeCA)
+        self.assertIs(learner.base_learner, self.widget.base_learner)
+
+        widget.calibration = widget.IsotonicCalibration
+        widget.threshold = widget.NoThresholdOptimization
+        learner = self.widget.create_learner()
+        self.assertIsInstance(learner, CalibratedLearner)
+        self.assertEqual(learner.calibration_method, learner.Isotonic)
+        self.assertIs(learner.base_learner, self.widget.base_learner)
+
+        widget.calibration = widget.NoCalibration
+        widget.threshold = widget.NoThresholdOptimization
+        learner = self.widget.create_learner()
+        self.assertIs(learner, self.widget.base_learner)
+
+        widget.calibration = widget.SigmoidCalibration
+        widget.threshold = widget.OptimizeF1
+        widget.base_learner = None
+        learner = self.widget.create_learner()
+        self.assertIsNone(learner)
+
+    def test_preprocessors(self):
+        """Preprocessors are set on the outermost learner only"""
+        widget = self.widget  #: OWCalibratedLearner
+        self.widget.base_learner = Mock()
+        self.widget.base_learner.preprocessors = ()
+
+        widget.calibration = widget.SigmoidCalibration
+        widget.threshold = widget.OptimizeF1
+        widget.preprocessors = Mock()
+        learner = self.widget.create_learner()
+        self.assertEqual(learner.preprocessors, (widget.preprocessors, ))
+        self.assertEqual(learner.base_learner.preprocessors, ())
+        self.assertEqual(learner.base_learner.base_learner.preprocessors, ())
+
+        widget.calibration = widget.NoCalibration
+        widget.threshold = widget.NoThresholdOptimization
+        learner = self.widget.create_learner()
+        self.assertIsNot(learner, self.widget.base_learner)
+        self.assertFalse(
+            isinstance(learner, (CalibratedLearner, ThresholdLearner)))
+        self.assertEqual(learner.preprocessors, (widget.preprocessors, ))
+
+    def test_set_learner_calls_unconditional_apply(self):
+        """A new base learner is applied even if auto apply is off"""
+        widget = self.widget
+        self.assertIsNotNone(self.get_output(widget.Outputs.learner))
+
+        widget.auto_apply = False
+        self.send_signal(widget.Inputs.base_learner, None)
+        self.assertIsNone(self.get_output(widget.Outputs.learner))
+
+    def test_name_changes(self):
+        """The default name follows the base learner and the options"""
+        widget = self.widget
+        widget.auto_apply = True
+        learner = NaiveBayesLearner()
+        learner.name = "foo"
+        self.send_signal(widget.Inputs.base_learner, learner)
+
+        widget.calibration = widget.IsotonicCalibration
+        widget.threshold = widget.OptimizeCA
+        widget.controls.calibration.group.buttonClicked[int].emit(
+            widget.IsotonicCalibration)
+
+        learner = self.get_output(widget.Outputs.learner)
+        self.assertEqual(learner.name, "Foo + Isotonic + CA")
+
+        widget.calibration = widget.NoCalibration
+        widget.threshold = widget.OptimizeCA
+        widget.controls.calibration.group.buttonClicked[int].emit(
+            widget.NoCalibration)
+        learner = self.get_output(widget.Outputs.learner)
+        self.assertEqual(learner.name, "Foo + CA")
+
+        self.send_signal(widget.Inputs.base_learner, None)
+        self.assertEqual(widget.controls.learner_name.placeholderText(),
+                         "Calibrated learner")
diff --git a/Orange/widgets/tests/base.py b/Orange/widgets/tests/base.py
index 635dd2e5fd8..1204e1c6ed5 100644
--- a/Orange/widgets/tests/base.py
+++ b/Orange/widgets/tests/base.py
@@ -672,7 +672,8 @@ def test_output_learner_name(self):
         new_name = "Learner Name"
         self.widget.apply_button.button.click()
         self.assertEqual(self.widget.learner.name,
-                         self.widget.name_line_edit.text())
+                         self.widget.name_line_edit.text()
+                         or self.widget.name_line_edit.placeholderText())
         self.widget.name_line_edit.setText(new_name)
         self.widget.apply_button.button.click()
         self.wait_until_stop_blocking()
diff --git a/Orange/widgets/utils/owlearnerwidget.py b/Orange/widgets/utils/owlearnerwidget.py
index 3c6ee6ea65f..63b2795c78e 100644
--- a/Orange/widgets/utils/owlearnerwidget.py
+++ b/Orange/widgets/utils/owlearnerwidget.py
@@ -65,7 +65,7 @@ class OWBaseLearner(OWWidget, metaclass=OWBaseLearnerMeta, openclass=True):
     LEARNER = None
     supports_sparse = True
 
-    learner_name = Setting(None, schema_only=True)
+    learner_name = Setting("", schema_only=True)
     want_main_area = False
     resizing_enabled = False
     auto_apply = Setting(True)
@@ -95,8 +95,6 @@ def __init__(self):
         self.data = None
         self.valid_data = False
         self.learner = None
-        if self.learner_name is None:
-            self.learner_name = self.name
         self.model = None
         self.preprocessors = None
         self.outdated_settings = False
@@ -149,7 +147,7 @@ def update_learner(self):
         if self.learner and issubclass(self.LEARNER, Fitter):
             self.learner.use_default_preprocessors = True
         if self.learner is not None:
-            self.learner.name = self.learner_name
+            self.learner.name = self.learner_name or self.name
         self.Outputs.learner.send(self.learner)
         self.outdated_settings = False
         self.Warning.outdated_learner.clear()
@@ -168,7 +166,7 @@ def update_model(self):
             except BaseException as exc:
                 self.show_fitting_failed(exc)
             else:
-                self.model.name = self.learner_name
+                self.model.name = self.learner_name or self.name
                 self.model.instances = self.data
         self.Outputs.model.send(self.model)
 
@@ -198,7 +196,7 @@ def settings_changed(self, *args, **kwargs):
 
     def _change_name(self, instance, output):
         if instance:
-            instance.name = self.learner_name
+            instance.name = self.learner_name or self.name
             if self.auto_apply:
                 output.send(instance)
 
@@ -207,7 +205,7 @@ def learner_name_changed(self):
         self._change_name(self.model, self.Outputs.model)
 
     def send_report(self):
-        self.report_items((("Name", self.learner_name),))
+        self.report_items((("Name", self.learner_name or self.name),))
 
         model_parameters = self.get_learner_parameters()
         if model_parameters:
@@ -264,6 +262,7 @@ def add_regression_layout(self, box):
     def add_learner_name_widget(self):
         self.name_line_edit = gui.lineEdit(
             self.controlArea, self, 'learner_name', box='Name',
+            placeholderText=self.name,
             tooltip='The name will identify this model in other widgets',
             orientation=Qt.Horizontal, callback=self.learner_name_changed)
 
diff --git a/Orange/widgets/utils/tests/test_owlearnerwidget.py b/Orange/widgets/utils/tests/test_owlearnerwidget.py
index 99f792196b6..9a43365a473 100644
--- a/Orange/widgets/utils/tests/test_owlearnerwidget.py
+++ b/Orange/widgets/utils/tests/test_owlearnerwidget.py
@@ -105,7 +105,6 @@ class WidgetA(OWBaseLearner):
             LEARNER = KNNLearner
 
         w1 = self.create_widget(WidgetA)
-        self.assertEqual(w1.learner_name, "A")
         w1.learner_name = "MyWidget"
 
         settings = w1.settingsHandler.pack_data(w1)
diff --git a/doc/data-mining-library/source/reference/classification.rst b/doc/data-mining-library/source/reference/classification.rst
index 5095e147f2a..55792fa340f 100644
--- a/doc/data-mining-library/source/reference/classification.rst
+++ b/doc/data-mining-library/source/reference/classification.rst
@@ -196,3 +196,21 @@ CN2 Rule Induction
 
 .. autoclass:: CN2SDUnorderedLearner
    :members:
+
+
+Calibration and threshold optimization
+--------------------------------------
+
+.. automodule:: Orange.classification.calibration
+
+.. autoclass:: ThresholdClassifier
+   :members:
+
+.. autoclass:: ThresholdLearner
+   :members:
+
+.. autoclass:: CalibratedClassifier
+   :members:
+
+.. autoclass:: CalibratedLearner
+   :members:
diff --git a/doc/data-mining-library/source/reference/evaluation.performance_curves.rst b/doc/data-mining-library/source/reference/evaluation.performance_curves.rst
new file mode 100644
index 00000000000..d9eaa515c0f
--- /dev/null
+++ b/doc/data-mining-library/source/reference/evaluation.performance_curves.rst
@@ -0,0 +1,8 @@
+.. py:currentmodule:: Orange.evaluation.performance_curves
+
+##################
+Performance curves
+##################
+
+.. autoclass:: Orange.evaluation.performance_curves.Curves
+    :members:
diff --git a/doc/data-mining-library/source/reference/evaluation.rst b/doc/data-mining-library/source/reference/evaluation.rst
index 422371a41eb..a07c99ae44f 100644
--- a/doc/data-mining-library/source/reference/evaluation.rst
+++ b/doc/data-mining-library/source/reference/evaluation.rst
@@ -9,3 +9,4 @@ Evaluation (``evaluation``)
 
    evaluation.testing
    evaluation.cd
+   evaluation.performance_curves