Merge pull request #4810 from janezd/rename-cv-options

markotoplak · web-flow · commit 3a2788100933 · 2020-06-05T11:42:27.000+02:00
Sample Data: Swap output signals for cross-validation
diff --git a/Orange/widgets/data/owdatasampler.py b/Orange/widgets/data/owdatasampler.py
@@ -55,18 +55,33 @@ class Outputs:
     number_of_folds = Setting(10)
     selectedFold = Setting(1)
 
+    # Older versions of the widget had swapped outputs for cross validation
+    # Migrations set this to True for compability with older workflows
+    compatibility_mode = Setting(False, schema_only=True)
+
+    settings_version = 2
+
+    class Information(OWWidget.Information):
+        compatibility_mode = Msg(
+            "Compatibility mode\n"
+            "New versions of widget have swapped outputs for cross validation"
+        )
+
     class Warning(OWWidget.Warning):
         could_not_stratify = Msg("Stratification failed\n{}")
         bigger_sample = Msg('Sample is bigger than input')
 
     class Error(OWWidget.Error):
-        too_many_folds = Msg("Number of folds exceeds data size")
+        too_many_folds = Msg("Number of subsets exceeds data size")
         sample_larger_than_data = Msg("Sample can't be larger than data")
         not_enough_to_stratify = Msg("Data is too small to stratify")
         no_data = Msg("Dataset is empty")
 
     def __init__(self):
         super().__init__()
+        if self.compatibility_mode:
+            self.Information.compatibility_mode()
+
         self.data = None
         self.indices = None
         self.sampled_instances = self.remaining_instances = None
@@ -110,15 +125,16 @@ def set_sampling_type_i():
             labelAlignment=Qt.AlignLeft,
             fieldGrowthPolicy=QFormLayout.AllNonFixedFieldsGrow)
         ibox = gui.indentedBox(sampling, addSpace=True, orientation=form)
-        form.addRow("Number of folds:",
+        form.addRow("Number of subsets:",
                     gui.spin(
                         ibox, self, "number_of_folds", 2, 100,
                         addToLayout=False,
                         callback=self.number_of_folds_changed))
         self.selected_fold_spin = gui.spin(
             ibox, self, "selectedFold", 1, self.number_of_folds,
             addToLayout=False, callback=self.fold_changed)
-        form.addRow("Selected fold:", self.selected_fold_spin)
+        form.addRow("Unused subset:" if not self.compatibility_mode
+                    else "Selected subset:", self.selected_fold_spin)
 
         gui.appendRadioButton(sampling, "Bootstrap")
 
@@ -224,7 +240,10 @@ def commit(self):
                     self.FixedProportion, self.FixedSize, self.Bootstrap):
                 remaining, sample = self.indices
             elif self.sampling_type == self.CrossValidation:
-                remaining, sample = self.indices[self.selectedFold - 1]
+                if self.compatibility_mode:
+                    remaining, sample = self.indices[self.selectedFold - 1]
+                else:
+                    sample, remaining = self.indices[self.selectedFold - 1]
 
             sample = self.data[sample]
             other = self.data[remaining]
@@ -315,9 +334,11 @@ def send_report(self):
                 if self.replacement:
                     tpe += ", with replacement"
         elif self.sampling_type == self.CrossValidation:
-            tpe = "Fold {} of {}-fold cross-validation".format(
-                self.selectedFold, self.number_of_folds)
-        else:
+            tpe = f"{self.number_of_folds}-fold cross-validation " \
+                  f"without subset #{self.selectedFold}"
+        elif self.sampling_type == self.Bootstrap:
+            tpe = "Bootstrap"
+        else:  # pragma: no cover
             tpe = "Undefined"  # should not come here at all
         if self.stratify:
             tpe += ", stratified (if possible)"
@@ -332,6 +353,12 @@ def send_report(self):
             ]
         self.report_items(items)
 
+    @classmethod
+    def migrate_settings(cls, settings, version):
+        if not version or version < 2 \
+                and settings["sampling_type"] == cls.CrossValidation:
+            settings["compatibility_mode"] = True
+
 
 class SampleFoldIndices(Reprable):
     def __init__(self, folds=10, stratified=False, random_state=None):
diff --git a/Orange/widgets/data/tests/test_owdatasampler.py b/Orange/widgets/data/tests/test_owdatasampler.py
@@ -1,5 +1,6 @@
 # Test methods with long descriptive names can omit docstrings
 # pylint: disable=missing-docstring,unsubscriptable-object
+import unittest
 from unittest.mock import Mock
 
 from Orange.data import Table
@@ -154,3 +155,76 @@ def set_fixed_sample_size(self, sample_size, with_replacement=False):
 
     def assertNoIntersection(self, sample, other):
         self.assertFalse(bool(set(sample.ids) & set(other.ids)))
+
+    def test_cv_outputs(self):
+        w = self.widget
+        self.send_signal(w.Inputs.data, self.iris)
+
+        self.select_sampling_type(w.CrossValidation)
+        self.widget.commit()
+        self.assertEqual(len(self.get_output(w.Outputs.data_sample)), 135)
+        self.assertEqual(len(self.get_output(w.Outputs.remaining_data)), 15)
+
+    def test_cv_output_migration(self):
+        self.assertFalse(self.widget.compatibility_mode)
+
+        settings = {"sampling_type": OWDataSampler.CrossValidation}
+        OWDataSampler.migrate_settings(settings, version=2)
+        self.assertFalse(settings.get("compatibility_mode", False))
+
+        settings = {"sampling_type": OWDataSampler.FixedProportion}
+        OWDataSampler.migrate_settings(settings, version=1)
+        self.assertFalse(settings.get("compatibility_mode", False))
+
+        settings = {"sampling_type": OWDataSampler.CrossValidation}
+        OWDataSampler.migrate_settings(settings, version=1)
+        self.assertTrue(settings["compatibility_mode"])
+
+        w = self.create_widget(
+            OWDataSampler,
+            stored_settings={"sampling_type": OWDataSampler.CrossValidation,
+                             "__version__": 1})
+        self.assertTrue(w.compatibility_mode)
+
+        self.send_signal(w.Inputs.data, self.iris)
+        self.select_sampling_type(w.CrossValidation)
+        w.commit()
+        self.assertEqual(len(self.get_output(w.Outputs.data_sample)), 15)
+        self.assertEqual(len(self.get_output(w.Outputs.remaining_data)), 135)
+
+    def test_send_report(self):
+        w = self.widget
+        self.send_signal(w.Inputs.data, self.iris)
+
+        w.stratify = True
+        w.use_seed = True
+
+        self.select_sampling_type(0)
+        w.commit()
+        w.send_report()
+
+        self.select_sampling_type(1)
+        w.sampleSizeNumber = 1
+        w.commit()
+        w.send_report()
+
+        w.sampleSizeNumber = 10
+        w.replacement = False
+        w.commit()
+        w.send_report()
+
+        w.replacement = True
+        w.commit()
+        w.send_report()
+
+        self.select_sampling_type(2)
+        w.commit()
+        w.send_report()
+
+        self.select_sampling_type(3)
+        w.commit()
+        w.send_report()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/doc/visual-programming/source/widgets/data/datasampler.md b/doc/visual-programming/source/widgets/data/datasampler.md
@@ -20,7 +20,7 @@ The **Data Sampler** widget implements several data sampling methods. It outputs
 2. The desired sampling method:
    - **Fixed proportion of data** returns a selected percentage of the entire data (e.g. 70% of all the data)
    - **Fixed sample size** returns a selected number of data instances with a chance to set *Sample with replacement*, which always samples from the entire dataset (does not subtract instances already in the subset). With replacement, you can generate more instances than available in the input dataset.
-   - [Cross Validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics)) partitions data instances into complementary subsets, where you can select the number of folds (subsets) and which fold you want to use as a sample.
+   - [Cross Validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics)) partitions data instances into the specified number of complementary subsets. Following a typical validation schema, all subsets except the one selected by the user are output as Data Sample, and the selected subset goes to Remaining Data. (Note: In older versions, the outputs were swapped. If the widget is loaded from an older workflow, it switches to compatibility mode.)
    - [Bootstrap](https://en.wikipedia.org/wiki/Bootstrapping_(statistics)) infers the sample from the population statistic.
 3. *Replicable sampling* maintains sampling patterns that can be carried
    across users, while *stratify sample* mimics the composition of the