Skip to content

Commit bb801df

Browse files
fix typos, comments
1 parent 8d43115 commit bb801df

File tree

4 files changed

+53
-58
lines changed

4 files changed

+53
-58
lines changed

Orange/preprocess/preprocess.py

Lines changed: 21 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -572,44 +572,42 @@ def __call__(self, data):
572572

573573
class RemoveSparse(Preprocess):
574574
"""
575-
Filter out the features with too many nan's or 0. Threshold is user defined.
575+
Filter out the features with too many (>threshold) zeros or missing values. Threshold is user defined.
576576
577577
Parameters
578578
----------
579-
filter_0: bool
580-
filter out by zeros or nan's
581-
fixed_threshold: bool
582-
threshold is either a fixed number of elements or percentage
583-
threshold: int
584-
kept as is if fixed threshold or the percent is used to calculate
585-
the appropriate number of elements
579+
threshold: int or float
580+
if >= 1, the argument represents the allowed number of 0s or NaNs;
581+
if below 1, it represents the allowed proportion of 0s or NaNs
582+
filter0: bool
583+
if True (default), preprocessor counts 0s, otherwise NaNs
586584
"""
587-
588-
def __init__(self, filter_0=True, fixed_threshold=False, threshold=5):
589-
self.filter_0 = filter_0
590-
self.fixed_threshold = fixed_threshold
585+
def __init__(self, threshold=0.05, filter0=True):
586+
self.filter0 = filter0
591587
self.threshold = threshold
592588

593589
def __call__(self, data):
594-
if self.fixed_threshold:
595-
tailored_threshold = self.threshold
596-
else:
597-
tailored_threshold = np.ceil(self.threshold/100 * data.X.shape[0])
590+
threshold = self.threshold
591+
if self.threshold < 1:
592+
threshold *= data.X.shape[0]
598593

599-
if self.filter_0:
594+
if self.filter0:
600595
if sp.issparse(data.X):
601596
data_csc = sp.csc_matrix(data.X)
602597
h, w = data_csc.shape
603-
sparsness = [(h - data_csc[:, i].count_nonzero()) for i in range(w)]
598+
sparseness = [h - data_csc[:, i].count_nonzero()
599+
for i in range(w)]
604600
else:
605-
sparsness = data.X.shape[0] - np.count_nonzero(data.X, axis=0)
606-
else: # filter by nans
601+
sparseness = data.X.shape[0] - np.count_nonzero(data.X, axis=0)
602+
else: # filter by nans
607603
if sp.issparse(data.X):
608604
data_csc = sp.csc_matrix(data.X)
609-
sparsness = [np.sum(np.isnan(data.X[:, i].data)) for i in range(data_csc.shape[1])]
605+
sparseness = [np.sum(np.isnan(data.X[:, i].data))
606+
for i in range(data_csc.shape[1])]
610607
else:
611-
sparsness = np.sum(np.isnan(data.X), axis=0)
612-
att = [a for a, s in zip(data.domain.attributes, sparsness) if s <= tailored_threshold]
608+
sparseness = np.sum(np.isnan(data.X), axis=0)
609+
att = [a for a, s in zip(data.domain.attributes, sparseness)
610+
if s <= threshold]
613611
domain = Orange.data.Domain(att, data.domain.class_vars,
614612
data.domain.metas)
615613
return data.transform(domain)

Orange/tests/test_preprocess.py

Lines changed: 14 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
# Test methods with long descriptive names can omit docstrings
22
# pylint: disable=missing-docstring
33

4-
import os
54
import pickle
65
import unittest
76
from unittest.mock import Mock
@@ -15,7 +14,6 @@
1514
EqualWidth, SelectBestFeatures, RemoveNaNRows, Preprocess, Scale, \
1615
Randomize, Continuize, Discretize, Impute, SklImpute, Normalize, \
1716
ProjectCUR, ProjectPCA, RemoveConstant, AdaptiveNormalize, RemoveSparse
18-
from Orange.util import OrangeDeprecationWarning
1917

2018

2119
class TestPreprocess(unittest.TestCase):
@@ -54,7 +52,7 @@ def test_remove_columns(self):
5452
X[3, 1] = np.nan
5553
X[1, 1] = np.nan
5654
X[:, 4] = np.nan
57-
data = Table(X)
55+
data = Table.from_numpy(None, X)
5856
d = RemoveConstant()(data)
5957
self.assertEqual(len(d.domain.attributes), 2)
6058

@@ -103,10 +101,10 @@ def test_column_filtering_sparse(self):
103101
class TestScaling(unittest.TestCase):
104102
@classmethod
105103
def setUpClass(cls):
106-
cls.table = Table([[1, 2, 3],
107-
[2, 3, 4],
108-
[3, 4, 5],
109-
[4, 5, 6]])
104+
cls.table = Table.from_numpy(None, [[1, 2, 3],
105+
[2, 3, 4],
106+
[3, 4, 5],
107+
[4, 5, 6]])
110108

111109
def test_scaling_mean_span(self):
112110
table = Scale(center=Scale.Mean, scale=Scale.Span)(self.table)
@@ -188,32 +186,32 @@ def test_0_dense(self):
188186
self.data[1:, 1] = 7
189187
true_out = self.data[:, 1]
190188
true_out.X = true_out.X.reshape(-1, 1)
191-
out = RemoveSparse(True, False, 0.5)(self.data)
189+
out = RemoveSparse(0.5, True)(self.data)
192190
np.testing.assert_array_equal(out, true_out)
193191

194-
out = RemoveSparse(True, True, 2)(self.data)
192+
out = RemoveSparse(2, True)(self.data)
195193
np.testing.assert_array_equal(out, true_out)
196194

197195
def test_0_sparse(self):
198196
self.data[1:, 1] = 7
199197
true_out = self.data[:, 1]
200198
self.data.X = csr_matrix(self.data.X)
201199
true_out.X = csr_matrix(true_out.X)
202-
out = RemoveSparse(True, False, 0.5)(self.data).X
200+
out = RemoveSparse(0.5, True)(self.data).X
203201
np.testing.assert_array_equal(out, true_out)
204202

205-
out = RemoveSparse(True, True, 1)(self.data).X
203+
out = RemoveSparse(1, True)(self.data).X
206204
np.testing.assert_array_equal(out, true_out)
207205

208206
def test_nan_dense(self):
209207
self.data[1:, 1] = np.nan
210208
self.data.X[:, 0] = 7
211209
true_out = self.data[:, 0]
212210
true_out.X = true_out.X.reshape(-1, 1)
213-
out = RemoveSparse(False, False, 0.5)(self.data)
211+
out = RemoveSparse(0.5, False)(self.data)
214212
np.testing.assert_array_equal(out, true_out)
215213

216-
out = RemoveSparse(False, True, 1)(self.data)
214+
out = RemoveSparse(1, False)(self.data)
217215
np.testing.assert_array_equal(out, true_out)
218216

219217
def test_nan_sparse(self):
@@ -223,11 +221,12 @@ def test_nan_sparse(self):
223221
true_out.X = true_out.X.reshape(-1, 1)
224222
self.data.X = csr_matrix(self.data.X)
225223
true_out.X = csr_matrix(true_out.X)
226-
out = RemoveSparse(False, False, 0.5)(self.data)
224+
out = RemoveSparse(0.5, False)(self.data)
227225
np.testing.assert_array_equal(out, true_out)
228226

229-
out = RemoveSparse(False, True, 1)(self.data)
227+
out = RemoveSparse(1, False)(self.data)
230228
np.testing.assert_array_equal(out, true_out)
231229

230+
232231
if __name__ == '__main__':
233232
unittest.main()

Orange/widgets/data/owpreprocess.py

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -262,18 +262,19 @@ def __init__(self, parent=None, **kwargs):
262262
self.filter0 = True
263263
self.setLayout(QVBoxLayout())
264264

265-
choose_filts = QGroupBox(title='Filter out features with too many:')
266-
choose_filts.setLayout(QVBoxLayout())
265+
self.layout().addWidget(QLabel("Remove features with too many"))
266+
options = ["missing values",
267+
"zeros"]
267268
self.filter_buttons = QButtonGroup(exclusive=True)
268269
self.filter_buttons.buttonClicked.connect(self.filterByClicked)
269-
for option, idx in zip(self.options, range(len(self.options))):
270+
for idx, option, in enumerate(options):
270271
btn = QRadioButton(self, text=option, checked=idx == 0)
271272
self.filter_buttons.addButton(btn, id=idx)
272-
choose_filts.layout().addWidget(btn)
273-
self.layout().addWidget(choose_filts)
273+
self.layout().addWidget(btn)
274274

275+
self.layout().addSpacing(20)
275276

276-
filter_settings = QGroupBox(title='Threshold settings:', flat=True)
277+
filter_settings = QGroupBox(title='Threshold:', flat=True)
277278
filter_settings.setLayout(QFormLayout())
278279
self.settings_buttons = QButtonGroup(exclusive=True)
279280
self.settings_buttons.buttonClicked.connect(self.filterSettingsClicked)
@@ -347,8 +348,8 @@ def createinstance(params):
347348
if useFixedThreshold:
348349
threshold = params.pop('fixedThresh', 50)
349350
else:
350-
threshold = params.pop('percThresh', 5)
351-
return RemoveSparse(filter0, useFixedThreshold, threshold=threshold)
351+
threshold = params.pop('percThresh', 5) / 100
352+
return RemoveSparse(threshold, filter0)
352353

353354
class ImputeEditor(BaseEditor):
354355
(NoImputation, Constant, Average,

Orange/widgets/data/tests/test_owpreprocess.py

Lines changed: 9 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -284,25 +284,22 @@ class TestRemoveSparseEditor(WidgetTest):
284284

285285
def test_editor(self):
286286
widget = owpreprocess.RemoveSparseEditor()
287-
self.assertEqual(widget.parameters(), {"fixedThresh" : 50,
288-
"percThresh": 5,
289-
"filter0" : True,
290-
"useFixedThreshold": False})
287+
self.assertEqual(
288+
widget.parameters(),
289+
dict(fixedThresh=50, percThresh=5, filter0=True,
290+
useFixedThreshold=False))
291291

292292
p = widget.createinstance(widget.parameters())
293293
widget.filterSettingsClicked()
294294
self.assertTrue(widget.percSpin.isEnabled())
295295
self.assertFalse(widget.fixedSpin.isEnabled())
296296
self.assertIsInstance(p, RemoveSparse)
297-
self.assertEqual(p.fixed_threshold, False)
298-
self.assertEqual(p.filter_0, True)
299-
self.assertEqual(p.threshold, 5)
297+
self.assertEqual(p.filter0, True)
298+
self.assertEqual(p.threshold, 0.05)
300299

301-
widget.setParameters({"useFixedThreshold" : True,
302-
"fixedThresh" : 30,
303-
"filter0" : False})
300+
widget.setParameters(
301+
dict(useFixedThreshold=True, fixedThresh=30, filter0=False))
304302
p = widget.createinstance(widget.parameters())
305303
self.assertIsInstance(p, RemoveSparse)
306-
self.assertEqual(p.fixed_threshold, True)
307304
self.assertEqual(p.threshold, 30)
308-
self.assertFalse(p.filter_0)
305+
self.assertFalse(p.filter0)

0 commit comments

Comments
 (0)