Skip to content

Commit a3d67c8

Browse files
lanzagar authored and astaric committed
Merge pull request #1952 from astaric/stratified-when-possible
[FIX] DataSampler: Fix crash when stratifying unbalanced datasets

(cherry picked from commit a8f71bc)

Conflicts:
    Orange/widgets/data/owdatasampler.py
1 parent f315b17 commit a3d67c8

File tree

2 files changed

+45
-2
lines changed

2 files changed

+45
-2
lines changed

Orange/widgets/data/owdatasampler.py

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,9 @@ class OWDataSampler(OWWidget):
4545
number_of_folds = Setting(10)
4646
selectedFold = Setting(1)
4747

48+
class Warning(OWWidget.Warning):
49+
could_not_stratify = Msg("Stratification failed\n{}")
50+
4851
class Error(OWWidget.Error):
4952
too_many_folds = Msg("Number of folds exceeds data size")
5053
sample_larger_than_data = Msg("Sample must be smaller than data")
@@ -251,10 +254,17 @@ def updateindices(self):
251254
self.indices = None
252255
return
253256

254-
rnd = self.RandomSeed if self.use_seed else None
255257
stratified = (self.stratify and
256258
type(self.data) == Table and
257259
self.data.domain.has_discrete_class)
260+
try:
261+
self.sample(data_length, size, stratified)
262+
except ValueError as ex:
263+
self.Warning.could_not_stratify(str(ex))
264+
self.sample(data_length, size, stratified=False)
265+
266+
def sample(self, data_length, size, stratified):
267+
rnd = self.RandomSeed if self.use_seed else None
258268
if self.sampling_type == self.FixedSize:
259269
self.indices = sample_random_n(
260270
self.data, size,

Orange/widgets/data/tests/test_owdatasampler.py

Lines changed: 34 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ def setUpClass(cls):
1212
cls.iris = Table("iris")
1313

1414
def setUp(self):
15-
self.widget = self.create_widget(OWDataSampler)
15+
self.widget = self.create_widget(OWDataSampler) # type: OWDataSampler
1616

1717
def test_error_message(self):
1818
""" Check if error message appears and then disappears when
@@ -26,3 +26,36 @@ def test_error_message(self):
2626
self.assertFalse(self.widget.Error.too_many_folds.is_shown())
2727
self.send_signal("Data", Table(self.iris.domain))
2828
self.assertTrue(self.widget.Error.no_data.is_shown())
29+
30+
def test_stratified_on_unbalanced_data(self):
31+
unbalanced_data = self.iris[:51]
32+
33+
self.widget.controls.stratify.setChecked(True)
34+
self.send_signal("Data", unbalanced_data)
35+
self.assertTrue(self.widget.Warning.could_not_stratify.is_shown())
36+
37+
def test_bootstrap(self):
38+
self.select_sampling_type(self.widget.Bootstrap)
39+
40+
self.send_signal("Data", self.iris)
41+
42+
in_input = set(self.iris.ids)
43+
sample = self.get_output("Data Sample")
44+
in_sample = set(sample.ids)
45+
in_remaining = set(self.get_output("Remaining Data").ids)
46+
47+
# Bootstrap should sample len(input) instances
48+
self.assertEqual(len(sample), len(self.iris))
49+
# Sample and remaining should cover all instances, while none
50+
# should be present in both
51+
self.assertEqual(len(in_sample | in_remaining), len(in_input))
52+
self.assertEqual(len(in_sample & in_remaining), 0)
53+
# Sampling with replacement will always produce at least one distinct
54+
# instance in sample, and at least one instance in remaining with
55+
# high probability (1-(1/150*2/150*...*150/150) ~= 1-2e-64)
56+
self.assertGreater(len(in_sample), 0)
57+
self.assertGreater(len(in_remaining), 0)
58+
59+
def select_sampling_type(self, sampling_type):
60+
buttons = self.widget.controls.sampling_type.group.buttons()
61+
buttons[sampling_type].click()

0 commit comments

Comments
 (0)