Skip to content

Commit 3a27881

Browse files
authored
Merge pull request #4810 from janezd/rename-cv-options
Sample Data: Swap output signals for cross-validation
2 parents 8ecbe68 + c49c8ac commit 3a27881

File tree

3 files changed

+109
-8
lines changed

3 files changed

+109
-8
lines changed

Orange/widgets/data/owdatasampler.py

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -55,18 +55,33 @@ class Outputs:
5555
number_of_folds = Setting(10)
5656
selectedFold = Setting(1)
5757

58+
# Older versions of the widget had swapped outputs for cross validation
59+
# Migrations set this to True for compatibility with older workflows
60+
compatibility_mode = Setting(False, schema_only=True)
61+
62+
settings_version = 2
63+
64+
class Information(OWWidget.Information):
65+
compatibility_mode = Msg(
66+
"Compatibility mode\n"
67+
"New versions of widget have swapped outputs for cross validation"
68+
)
69+
5870
class Warning(OWWidget.Warning):
5971
could_not_stratify = Msg("Stratification failed\n{}")
6072
bigger_sample = Msg('Sample is bigger than input')
6173

6274
class Error(OWWidget.Error):
63-
too_many_folds = Msg("Number of folds exceeds data size")
75+
too_many_folds = Msg("Number of subsets exceeds data size")
6476
sample_larger_than_data = Msg("Sample can't be larger than data")
6577
not_enough_to_stratify = Msg("Data is too small to stratify")
6678
no_data = Msg("Dataset is empty")
6779

6880
def __init__(self):
6981
super().__init__()
82+
if self.compatibility_mode:
83+
self.Information.compatibility_mode()
84+
7085
self.data = None
7186
self.indices = None
7287
self.sampled_instances = self.remaining_instances = None
@@ -110,15 +125,16 @@ def set_sampling_type_i():
110125
labelAlignment=Qt.AlignLeft,
111126
fieldGrowthPolicy=QFormLayout.AllNonFixedFieldsGrow)
112127
ibox = gui.indentedBox(sampling, addSpace=True, orientation=form)
113-
form.addRow("Number of folds:",
128+
form.addRow("Number of subsets:",
114129
gui.spin(
115130
ibox, self, "number_of_folds", 2, 100,
116131
addToLayout=False,
117132
callback=self.number_of_folds_changed))
118133
self.selected_fold_spin = gui.spin(
119134
ibox, self, "selectedFold", 1, self.number_of_folds,
120135
addToLayout=False, callback=self.fold_changed)
121-
form.addRow("Selected fold:", self.selected_fold_spin)
136+
form.addRow("Unused subset:" if not self.compatibility_mode
137+
else "Selected subset:", self.selected_fold_spin)
122138

123139
gui.appendRadioButton(sampling, "Bootstrap")
124140

@@ -224,7 +240,10 @@ def commit(self):
224240
self.FixedProportion, self.FixedSize, self.Bootstrap):
225241
remaining, sample = self.indices
226242
elif self.sampling_type == self.CrossValidation:
227-
remaining, sample = self.indices[self.selectedFold - 1]
243+
if self.compatibility_mode:
244+
remaining, sample = self.indices[self.selectedFold - 1]
245+
else:
246+
sample, remaining = self.indices[self.selectedFold - 1]
228247

229248
sample = self.data[sample]
230249
other = self.data[remaining]
@@ -315,9 +334,11 @@ def send_report(self):
315334
if self.replacement:
316335
tpe += ", with replacement"
317336
elif self.sampling_type == self.CrossValidation:
318-
tpe = "Fold {} of {}-fold cross-validation".format(
319-
self.selectedFold, self.number_of_folds)
320-
else:
337+
tpe = f"{self.number_of_folds}-fold cross-validation " \
338+
f"without subset #{self.selectedFold}"
339+
elif self.sampling_type == self.Bootstrap:
340+
tpe = "Bootstrap"
341+
else: # pragma: no cover
321342
tpe = "Undefined" # should not come here at all
322343
if self.stratify:
323344
tpe += ", stratified (if possible)"
@@ -332,6 +353,12 @@ def send_report(self):
332353
]
333354
self.report_items(items)
334355

356+
@classmethod
def migrate_settings(cls, settings, version):
    """Switch settings saved by older widget versions to compatibility mode.

    Older versions of the widget had swapped outputs for cross validation.
    Settings stored without a version, or with a version below 2, that
    select cross validation are marked with ``compatibility_mode = True``.

    Args:
        settings: Mapping of stored setting names to values; modified
            in place.
        version: Settings version the values were stored with; a falsy
            value (e.g. ``None``) is treated as older than version 2.
    """
    # Parenthesised on purpose: `and` binds tighter than `or`, so without
    # the grouping every unversioned workflow was put into compatibility
    # mode regardless of its sampling type.
    stored_by_old_version = not version or version < 2
    # `.get` keeps settings without a "sampling_type" entry from raising;
    # such settings do not select cross validation and need no migration.
    if stored_by_old_version \
            and settings.get("sampling_type") == cls.CrossValidation:
        settings["compatibility_mode"] = True
361+
335362

336363
class SampleFoldIndices(Reprable):
337364
def __init__(self, folds=10, stratified=False, random_state=None):

Orange/widgets/data/tests/test_owdatasampler.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Test methods with long descriptive names can omit docstrings
22
# pylint: disable=missing-docstring,unsubscriptable-object
3+
import unittest
34
from unittest.mock import Mock
45

56
from Orange.data import Table
@@ -154,3 +155,76 @@ def set_fixed_sample_size(self, sample_size, with_replacement=False):
154155

155156
def assertNoIntersection(self, sample, other):
156157
self.assertFalse(bool(set(sample.ids) & set(other.ids)))
158+
159+
def test_cv_outputs(self):
    """Cross validation puts 135 rows on Data Sample and 15 on Remaining Data."""
    widget = self.widget
    self.send_signal(widget.Inputs.data, self.iris)
    self.select_sampling_type(widget.CrossValidation)
    widget.commit()

    sample = self.get_output(widget.Outputs.data_sample)
    self.assertEqual(len(sample), 135)
    remaining = self.get_output(widget.Outputs.remaining_data)
    self.assertEqual(len(remaining), 15)
167+
168+
def test_cv_output_migration(self):
    """Check when migration enables compatibility mode and its effect on outputs."""
    sampler = OWDataSampler

    def migrated(sampling_type, version):
        # Run the migration on a minimal settings dict and hand it back.
        stored = {"sampling_type": sampling_type}
        sampler.migrate_settings(stored, version=version)
        return stored

    # A freshly created widget is not in compatibility mode.
    self.assertFalse(self.widget.compatibility_mode)

    # Current settings version: nothing to migrate.
    stored = migrated(sampler.CrossValidation, 2)
    self.assertFalse(stored.get("compatibility_mode", False))

    # Old settings, but not cross validation: nothing to migrate.
    stored = migrated(sampler.FixedProportion, 1)
    self.assertFalse(stored.get("compatibility_mode", False))

    # Old settings with cross validation: compatibility mode is enabled.
    stored = migrated(sampler.CrossValidation, 1)
    self.assertTrue(stored["compatibility_mode"])

    old_settings = {"sampling_type": sampler.CrossValidation,
                    "__version__": 1}
    old_widget = self.create_widget(sampler, stored_settings=old_settings)
    self.assertTrue(old_widget.compatibility_mode)

    # NOTE(review): the helpers below are called without an explicit widget
    # argument -- confirm they act on `old_widget` and not on `self.widget`.
    self.send_signal(old_widget.Inputs.data, self.iris)
    self.select_sampling_type(old_widget.CrossValidation)
    old_widget.commit()

    # In compatibility mode the two outputs are swapped.
    sample = self.get_output(old_widget.Outputs.data_sample)
    self.assertEqual(len(sample), 15)
    remaining = self.get_output(old_widget.Outputs.remaining_data)
    self.assertEqual(len(remaining), 135)
194+
195+
def test_send_report(self):
    """Smoke test: commit and report under several sampling configurations.

    No assertions are made; the test only checks that ``commit`` and
    ``send_report`` run without raising.
    """
    widget = self.widget
    self.send_signal(widget.Inputs.data, self.iris)

    widget.stratify = True
    widget.use_seed = True

    def commit_and_report():
        widget.commit()
        widget.send_report()

    # Sampling types are selected by index; presumably 0-3 cover all of the
    # widget's sampling types -- verify against the widget's constants.
    self.select_sampling_type(0)
    commit_and_report()

    self.select_sampling_type(1)
    widget.sampleSizeNumber = 1
    commit_and_report()

    widget.sampleSizeNumber = 10
    for with_replacement in (False, True):
        widget.replacement = with_replacement
        commit_and_report()

    for sampling_type in (2, 3):
        self.select_sampling_type(sampling_type)
        commit_and_report()
227+
228+
229+
if __name__ == "__main__":
230+
unittest.main()

doc/visual-programming/source/widgets/data/datasampler.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ The **Data Sampler** widget implements several data sampling methods. It outputs
2020
2. The desired sampling method:
2121
- **Fixed proportion of data** returns a selected percentage of the entire data (e.g. 70% of all the data)
2222
- **Fixed sample size** returns a selected number of data instances with a chance to set *Sample with replacement*, which always samples from the entire dataset (does not subtract instances already in the subset). With replacement, you can generate more instances than available in the input dataset.
23-
- [Cross Validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics)) partitions data instances into complementary subsets, where you can select the number of folds (subsets) and which fold you want to use as a sample.
23+
- [Cross Validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics)) partitions data instances into the specified number of complementary subsets. Following a typical validation schema, all subsets except the one selected by the user are output as Data Sample, and the selected subset goes to Remaining Data. (Note: In older versions, the outputs were swapped. If the widget is loaded from an older workflow, it switches to compatibility mode.)
2424
- [Bootstrap](https://en.wikipedia.org/wiki/Bootstrapping_(statistics)) infers the sample from the population statistic.
2525
3. *Replicable sampling* maintains sampling patterns that can be carried
2626
across users, while *stratify sample* mimics the composition of the

0 commit comments

Comments
 (0)