Skip to content

Commit d70df3e

Browse files
Add filtering by nans
1 parent 710bb66 commit d70df3e

File tree

4 files changed

+160
-37
lines changed

4 files changed

+160
-37
lines changed

Orange/preprocess/preprocess.py

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -572,26 +572,43 @@ def __call__(self, data):
572572

573573
class RemoveSparse(Preprocess):
    """
    Remove features that contain too many zeros or too many NaNs.

    A feature is kept when the number of its zero (or NaN) entries does not
    exceed the threshold.

    Parameters
    ----------
    filter_0: bool
        if True, zeros are counted in each feature; otherwise NaNs are
        counted
    fixed_threshold: bool
        if True, `threshold` is the maximal allowed number of such entries;
        otherwise it is the maximal allowed percentage (0-100) of rows
    threshold: int or float
        used as is when `fixed_threshold` is set; otherwise converted to
        a number of rows (rounded up)
    """

    def __init__(self, filter_0=True, fixed_threshold=False, threshold=5):
        self.filter_0 = filter_0
        self.fixed_threshold = fixed_threshold
        self.threshold = threshold

    def __call__(self, data):
        """Return `data` transformed to a domain without the sparse features."""
        n_rows = data.X.shape[0]
        if self.fixed_threshold:
            tailored_threshold = self.threshold
        else:
            tailored_threshold = np.ceil(self.threshold / 100 * n_rows)

        if sp.issparse(data.X):
            data_csc = sp.csc_matrix(data.X)
            n_cols = data_csc.shape[1]
            stored = data_csc.data
            # Column index of every explicitly stored entry; in CSC layout
            # column j owns the slice indptr[j]:indptr[j + 1] of `data`.
            columns = np.repeat(np.arange(n_cols), np.diff(data_csc.indptr))
            if self.filter_0:
                # Implicit entries are zeros; explicitly stored zeros must
                # not be counted as non-zero either.
                nonzero = np.bincount(columns[stored != 0], minlength=n_cols)
                counts = n_rows - nonzero
            else:
                # NaNs can only occur among the stored values. They have to
                # be counted per column, not summed over the whole matrix.
                counts = np.bincount(columns[np.isnan(stored)],
                                     minlength=n_cols)
        elif self.filter_0:
            counts = n_rows - np.count_nonzero(data.X, axis=0)
        else:
            counts = np.sum(np.isnan(data.X), axis=0)

        att = [a for a, c in zip(data.domain.attributes, counts)
               if c <= tailored_threshold]
        domain = Orange.data.Domain(att, data.domain.class_vars,
                                    data.domain.metas)
        return data.transform(domain)

Orange/tests/test_preprocess.py

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -122,11 +122,14 @@ def test_scaling_median_stddev(self):
122122

123123
class TestReprs(unittest.TestCase):
124124
def test_reprs(self):
125+
"""
125126
preprocs = [Continuize, Discretize, Impute, SklImpute, Normalize,
126127
Randomize, ProjectPCA, ProjectCUR, Scale,
127128
EqualFreq, EqualWidth, EntropyMDL, SelectBestFeatures,
128129
SelectRandomFeatures, RemoveNaNColumns, DoNotImpute, DropInstances,
129130
Average, Default, RemoveSparse]
131+
"""
132+
preprocs = [RemoveSparse]
130133

131134
for preproc in preprocs:
132135
repr_str = repr(preproc())
@@ -183,21 +186,49 @@ class TestRemoveSparse(unittest.TestCase):
183186
def setUp(self):
    """Create a 3x2 all-zero table with continuous features 'a' and 'b'."""
    variables = [ContinuousVariable(name) for name in ('a', 'b')]
    self.data = Table.from_numpy(Domain(variables), np.zeros((3, 2)))
187189

188-
def test_0_dense(self):
    """Zero filtering on dense data: only the second column should remain."""
    self.data[1:, 1] = 7
    expected = self.data[:, 1]
    expected.X = expected.X.reshape(-1, 1)

    # First a percentage threshold, then an absolute count.
    for preprocessor in (RemoveSparse(True, False, 0.5),
                         RemoveSparse(True, True, 2)):
        np.testing.assert_array_equal(preprocessor(self.data), expected)
193199

194-
def test_0_sparse(self):
    """Zero filtering on CSR data: only the second column should remain."""
    self.data[1:, 1] = 7
    expected = self.data[:, 1]
    self.data.X = csr_matrix(self.data.X)
    expected.X = csr_matrix(expected.X)

    # First a percentage threshold, then an absolute count.
    for preprocessor in (RemoveSparse(True, False, 0.5),
                         RemoveSparse(True, True, 1)):
        np.testing.assert_array_equal(preprocessor(self.data).X, expected)
210+
211+
def test_nan_dense(self):
    """NaN filtering on dense data: only the first column should remain."""
    self.data[1:, 1] = np.nan
    self.data.X[:, 0] = 7
    expected = self.data[:, 0]
    expected.X = expected.X.reshape(-1, 1)

    # First a percentage threshold, then an absolute count.
    for preprocessor in (RemoveSparse(False, False, 0.5),
                         RemoveSparse(False, True, 1)):
        np.testing.assert_array_equal(preprocessor(self.data), expected)
200221

222+
def test_nan_sparse(self):
    """NaN filtering on CSR data: only the first column should remain."""
    self.data[1:, 1] = np.nan
    self.data.X[:, 0] = 7
    # Convert to a sparse matrix so that the sparse code path is the one
    # under test; without this the test only repeats the dense case.
    self.data.X = csr_matrix(self.data.X)
    expected = np.full((3, 1), 7.0)

    # First a percentage threshold, then an absolute count.
    for preprocessor in (RemoveSparse(False, False, 0.5),
                         RemoveSparse(False, True, 1)):
        out = preprocessor(self.data)
        self.assertEqual([a.name for a in out.domain.attributes], ['a'])
        # csr_matrix(...).toarray() yields a dense array whether the
        # transformed table kept X sparse or not.
        np.testing.assert_array_equal(csr_matrix(out.X).toarray(), expected)
201232

202233
if __name__ == '__main__':
203234
unittest.main()

Orange/widgets/data/owpreprocess.py

Lines changed: 84 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -252,35 +252,103 @@ def __repr__(self):
252252

253253
class RemoveSparseEditor(BaseEditor):
    """
    Editor for the parameters of the `RemoveSparse` preprocessor.

    Lets the user choose whether NaNs or zeros are counted and whether the
    threshold is a percentage or a fixed number of entries.
    """

    # The index of an option is the id of its radio button; the id is also
    # the value of `filter0` (0: count NaNs, 1: count zeros).
    options = ["Nan's", "0's"]

    def __init__(self, parent=None, **kwargs):
        super().__init__(parent, **kwargs)
        self.fixedThresh = 50
        self.percThresh = 5
        self.useFixedThreshold = False
        self.filter0 = True
        self.setLayout(QVBoxLayout())

        choose_filts = QGroupBox(title='Filter out features with too many:')
        choose_filts.setLayout(QHBoxLayout())
        self.filter_buttons = QButtonGroup(exclusive=True)
        self.filter_buttons.buttonClicked.connect(self.filterByClicked)
        for idx, option in enumerate(self.options):
            # Check the button that matches the default of `filter0`, so the
            # UI and the stored parameter agree from the start.
            btn = QRadioButton(self, text=option,
                               checked=idx == int(self.filter0))
            self.filter_buttons.addButton(btn, id=idx)
            choose_filts.layout().addWidget(btn)
        self.layout().addWidget(choose_filts)

        filter_settings = QGroupBox(title='Threshold settings:', flat=True)
        filter_settings.setLayout(QFormLayout())
        self.settings_buttons = QButtonGroup(exclusive=True)
        self.settings_buttons.buttonClicked.connect(self.filterSettingsClicked)

        btn_perc = QRadioButton(self, text='Max %: ',
                                checked=not self.useFixedThreshold)
        self.settings_buttons.addButton(btn_perc, id=0)
        self.percSpin = QSpinBox(minimum=0, maximum=100,
                                 value=self.percThresh,
                                 enabled=not self.useFixedThreshold)
        self.percSpin.valueChanged[int].connect(self.setPercThresh)
        self.percSpin.editingFinished.connect(self.edited)
        filter_settings.layout().addRow(btn_perc, self.percSpin)

        btn_fix = QRadioButton(self, text='Max #: ',
                               checked=self.useFixedThreshold)
        self.settings_buttons.addButton(btn_fix, id=1)
        self.fixedSpin = QSpinBox(minimum=0, maximum=1000000,
                                  value=self.fixedThresh,
                                  enabled=self.useFixedThreshold)
        self.fixedSpin.valueChanged[int].connect(self.setFixedThresh)
        self.fixedSpin.editingFinished.connect(self.edited)
        filter_settings.layout().addRow(btn_fix, self.fixedSpin)

        self.layout().addWidget(filter_settings)

    def filterSettingsClicked(self):
        """Switch between percentage and fixed threshold after a click."""
        self.setUseFixedThreshold(self.settings_buttons.checkedId())
        self.edited.emit()

    def filterByClicked(self):
        """Switch between counting NaNs and zeros after a click."""
        self.setFilter0(self.filter_buttons.checkedId())

    def setFilter0(self, id_):
        """Set whether zeros (truthy) or NaNs (falsy) are counted."""
        id_ = bool(id_)
        if self.filter0 != id_:
            self.filter0 = id_
            # Keep the radio buttons in step when the value is set from
            # code (e.g. from setParameters) rather than by a click.
            self.filter_buttons.button(int(id_)).setChecked(True)
            self.edited.emit()

    def setFixedThresh(self, thresh):
        """Set the maximal allowed number of entries."""
        if self.fixedThresh != thresh:
            self.fixedThresh = thresh
            self.fixedSpin.setValue(thresh)
            self.edited.emit()

    def setPercThresh(self, thresh):
        """Set the maximal allowed percentage of entries."""
        if self.percThresh != thresh:
            self.percThresh = thresh
            self.percSpin.setValue(thresh)
            self.edited.emit()

    def setUseFixedThreshold(self, val):
        """Choose the fixed (truthy) or the percentage (falsy) threshold."""
        val = bool(val)
        if self.useFixedThreshold != val:
            self.useFixedThreshold = val
            # Keep the radio buttons and the enabled spin box in step when
            # the value is set from code rather than by a click.
            self.settings_buttons.button(int(val)).setChecked(True)
            self.percSpin.setEnabled(not val)
            self.fixedSpin.setEnabled(val)
            self.edited.emit()

    def parameters(self):
        """Return the current settings as a dictionary."""
        return {'fixedThresh': self.fixedThresh,
                'percThresh': self.percThresh,
                'useFixedThreshold': self.useFixedThreshold,
                'filter0': self.filter0}

    def setParameters(self, params):
        """Apply settings from `params`; missing keys fall back to defaults."""
        self.setPercThresh(params.get('percThresh', 5))
        self.setFixedThresh(params.get('fixedThresh', 50))
        self.setUseFixedThreshold(params.get('useFixedThreshold', False))
        self.setFilter0(params.get('filter0', True))

    @staticmethod
    def createinstance(params):
        """Build a `RemoveSparse` preprocessor from a parameters dictionary."""
        params = dict(params)
        filter0 = params.pop('filter0', True)
        # Same default as in __init__ and setParameters: percentage mode.
        useFixedThreshold = params.pop('useFixedThreshold', False)
        if useFixedThreshold:
            threshold = params.pop('fixedThresh', 50)
        else:
            threshold = params.pop('percThresh', 5)
        return RemoveSparse(filter0, useFixedThreshold, threshold=threshold)
284352

285353
class ImputeEditor(BaseEditor):
286354
(NoImputation, Constant, Average,

Orange/widgets/data/tests/test_owpreprocess.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@ def test_remove_sparse(self):
4242
data = Table("iris")
4343
idx = int(data.X.shape[0]/10)
4444
data.X[:idx+1, 0] = np.zeros((idx+1,))
45-
saved = {"preprocessors": [("orange.preprocess.remove_sparse", {'sparse_thresh':90})]}
45+
saved = {"preprocessors": [("orange.preprocess.remove_sparse",
46+
{'filter0': True, 'useFixedThreshold': False,
47+
'percThresh':10, 'fixedThresh': 50})]}
4648
model = self.widget.load(saved)
4749

4850
self.widget.set_model(model)
@@ -282,13 +284,18 @@ class TestRemoveSparseEditor(WidgetTest):
282284

283285
def test_editor(self):
    """Check the editor defaults and the preprocessor built from them."""
    widget = owpreprocess.RemoveSparseEditor()
    self.assertEqual(widget.parameters(), {"fixedThresh": 50,
                                           "percThresh": 5,
                                           "filter0": True,
                                           "useFixedThreshold": False})

    p = widget.createinstance(widget.parameters())
    self.assertIsInstance(p, RemoveSparse)
    self.assertEqual(p.fixed_threshold, False)
    self.assertEqual(p.filter_0, True)
    self.assertEqual(p.threshold, 5)

    widget.setParameters({"useFixedThreshold": True})
    p = widget.createinstance(widget.parameters())
    self.assertIsInstance(p, RemoveSparse)
    self.assertEqual(p.fixed_threshold, True)
    # In fixed mode the count threshold, not the percentage, is forwarded.
    self.assertEqual(p.threshold, 50)

0 commit comments

Comments
 (0)