Skip to content

Commit 0f730b4

Browse files
authored
Merge pull request #4093 from AndrejaKovacic/remove_sparse_features
[ENH] Add remove sparse features preprocessor
2 parents 5042491 + eb9a6cd commit 0f730b4

File tree

5 files changed

+137
-16
lines changed

5 files changed

+137
-16
lines changed

Orange/preprocess/preprocess.py

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
__all__ = ["Continuize", "Discretize", "Impute", "RemoveNaNRows",
1818
"SklImpute", "Normalize", "Randomize", "Preprocess",
1919
"RemoveConstant", "RemoveNaNClasses", "RemoveNaNColumns",
20-
"ProjectPCA", "ProjectCUR", "Scale", "AdaptiveNormalize"]
20+
"ProjectPCA", "ProjectCUR", "Scale", "RemoveSparse",
21+
"AdaptiveNormalize"]
2122

2223

2324
class Preprocess(Reprable):
@@ -101,11 +102,11 @@ def transform(var):
101102
else:
102103
return var
103104

104-
def discretized(vars, do_discretize):
105+
def discretized(vars_, do_discretize):
105106
if do_discretize:
106-
vars = (transform(var) for var in vars)
107-
vars = [var for var in vars if var is not None]
108-
return vars
107+
vars_ = (transform(var) for var in vars_)
108+
vars_ = [var for var in vars_ if var is not None]
109+
return vars_
109110

110111
method = self.method or discretize.EqualFreq()
111112
domain = Orange.data.Domain(
@@ -421,7 +422,8 @@ def __call__(self, data):
421422
new_data.metas = self.randomize(new_data.metas, r3)
422423
return new_data
423424

424-
def randomize(self, table, rand_state=None):
425+
@staticmethod
426+
def randomize(table, rand_state=None):
425427
rstate = np.random.RandomState(rand_state)
426428
if sp.issparse(table):
427429
table = table.tocsc() # type: sp.spmatrix
@@ -568,6 +570,32 @@ def __call__(self, data):
568570
data = pp(data)
569571
return data
570572

573+
class RemoveSparse(Preprocess):
574+
"""
575+
Remove sparse features. Sparseness is determined according to
576+
user-defined threshold.
577+
578+
Parameters
579+
----------
580+
threshold : float
581+
Minimum proportion of non-zero entries a feature must have to be kept
582+
"""
583+
584+
def __init__(self, threshold=0.05):
585+
self.threshold = threshold
586+
587+
def __call__(self, data):
588+
if sp.issparse(data.X):
589+
data_csc = sp.csc_matrix(data.X)
590+
h, w = data_csc.shape
591+
sparsness = [data_csc[:, i].count_nonzero() / h for i in range(w)]
592+
else:
593+
sparsness = np.count_nonzero(data.X, axis=0) / data.X.shape[0]
594+
att = [a for a, s in zip(data.domain.attributes, sparsness) if s >= self.threshold]
595+
domain = Orange.data.Domain(att, data.domain.class_vars,
596+
data.domain.metas)
597+
return data.transform(domain)
598+
571599

572600
class AdaptiveNormalize(Preprocess):
573601
"""

Orange/tests/test_preprocess.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@
99
import numpy as np
1010
from scipy.sparse import csr_matrix
1111

12-
from Orange.data import Table
12+
from Orange.data import Table, Domain, ContinuousVariable
1313
from Orange.preprocess import EntropyMDL, DoNotImpute, Default, Average, \
1414
SelectRandomFeatures, EqualFreq, RemoveNaNColumns, DropInstances, \
1515
EqualWidth, SelectBestFeatures, RemoveNaNRows, Preprocess, Scale, \
1616
Randomize, Continuize, Discretize, Impute, SklImpute, Normalize, \
17-
ProjectCUR, ProjectPCA, RemoveConstant, AdaptiveNormalize
17+
ProjectCUR, ProjectPCA, RemoveConstant, AdaptiveNormalize, RemoveSparse
1818
from Orange.util import OrangeDeprecationWarning
1919

2020

@@ -126,7 +126,7 @@ def test_reprs(self):
126126
Randomize, ProjectPCA, ProjectCUR, Scale,
127127
EqualFreq, EqualWidth, EntropyMDL, SelectBestFeatures,
128128
SelectRandomFeatures, RemoveNaNColumns, DoNotImpute, DropInstances,
129-
Average, Default]
129+
Average, Default, RemoveSparse]
130130

131131
for preproc in preprocs:
132132
repr_str = repr(preproc())
@@ -176,3 +176,28 @@ def test_sparse_pps(self):
176176
true_out = Scale(center=Scale.NoCentering, scale=Scale.Span)(self.data)
177177
np.testing.assert_array_equal(out, true_out)
178178
self.data = self.data.X.toarray()
179+
180+
181+
class TestRemoveSparse(unittest.TestCase):
182+
183+
def setUp(self):
184+
domain = Domain([ContinuousVariable('a'), ContinuousVariable('b')])
185+
self.data = Table.from_numpy(domain, np.zeros((3, 2)))
186+
self.data[1:, 1] = 7
187+
188+
def test_dense(self):
189+
true_out = self.data[:, 1]
190+
true_out.X = true_out.X.reshape(-1, 1)
191+
out = RemoveSparse(0.5)(self.data)
192+
np.testing.assert_array_equal(out, true_out)
193+
194+
def test_sparse(self):
195+
true_out = self.data[:, 1]
196+
self.data.X = csr_matrix(self.data.X)
197+
true_out.X = csr_matrix(true_out.X)
198+
out = RemoveSparse(0.5)(self.data).X
199+
np.testing.assert_array_equal(out, true_out)
200+
201+
202+
if __name__ == '__main__':
203+
unittest.main()

Orange/widgets/data/owpreprocess.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
import Orange.data
2424
from Orange import preprocess
2525
from Orange.preprocess import Continuize, ProjectPCA, RemoveNaNRows, \
26-
ProjectCUR, Scale as _Scale, Randomize as _Randomize
26+
ProjectCUR, Scale as _Scale, Randomize as _Randomize, RemoveSparse
2727
from Orange.widgets import widget, gui
2828
from Orange.widgets.settings import Setting
2929
from Orange.widgets.utils.overlay import OverlayWidget
@@ -250,6 +250,37 @@ def createinstance(params):
250250
def __repr__(self):
251251
return self.Continuizers[self.__treatment]
252252

253+
class RemoveSparseEditor(BaseEditor):
254+
255+
def __init__(self, parent=None, **kwargs):
256+
super().__init__(parent, **kwargs)
257+
self.setLayout(QVBoxLayout())
258+
self.sparse_thresh = 5
259+
form = QFormLayout()
260+
self.cspin = QSpinBox(minimum=1, maximum=100, value=self.sparse_thresh)
261+
self.cspin.valueChanged[int].connect(self.setThresh)
262+
self.cspin.editingFinished.connect(self.edited)
263+
264+
form.addRow("Min % of nonzero values:", self.cspin)
265+
self.layout().addLayout(form)
266+
267+
def setThresh(self, thresh):
268+
if self.sparse_thresh != thresh:
269+
self.sparse_thresh = thresh
270+
self.cspin.setValue(thresh)
271+
self.changed.emit()
272+
273+
def parameters(self):
274+
return {'sparse_thresh': self.sparse_thresh}
275+
276+
def setParameters(self, params):
277+
self.setThresh(params.get('sparse_thresh', 5))
278+
279+
@staticmethod
280+
def createinstance(params):
281+
params = dict(params)
282+
threshold = params.pop('sparse_thresh', 5)
283+
return RemoveSparse(threshold=threshold / 100)
253284

254285
class ImputeEditor(BaseEditor):
255286
(NoImputation, Constant, Average,
@@ -922,6 +953,12 @@ def icon_path(basename):
922953
icon_path("Random.svg")),
923954
Randomize
924955
),
956+
PreprocessAction(
957+
"Remove Sparse", "orange.preprocess.remove_sparse", "Feature Selection",
958+
Description("Remove Sparse Features",
959+
icon_path("PurgeDomain.svg")),
960+
RemoveSparseEditor
961+
),
925962
PreprocessAction(
926963
"PCA", "orange.preprocess.pca", "PCA",
927964
Description("Principal Component Analysis",

Orange/widgets/data/tests/test_owpreprocess.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44

55
from Orange.data import Table
66
from Orange.preprocess import (
7-
Randomize, Scale, Discretize, Continuize, Impute, ProjectPCA, ProjectCUR
7+
Randomize, Scale, Discretize, Continuize, Impute, ProjectPCA, \
8+
ProjectCUR, RemoveSparse
89
)
910
from Orange.preprocess import discretize, impute, fss, score
1011
from Orange.widgets.data import owpreprocess
@@ -37,6 +38,20 @@ def test_randomize(self):
3738
np.testing.assert_array_equal(self.zoo.metas, output.metas)
3839
self.assertFalse(np.array_equal(self.zoo.Y, output.Y))
3940

41+
def test_remove_sparse(self):
42+
data = Table("iris")
43+
idx = int(data.X.shape[0]/10)
44+
data.X[:idx+1, 0] = np.zeros((idx+1,))
45+
saved = {"preprocessors": [("orange.preprocess.remove_sparse", {'sparse_thresh':90})]}
46+
model = self.widget.load(saved)
47+
48+
self.widget.set_model(model)
49+
self.send_signal(self.widget.Inputs.data, data)
50+
output = self.get_output(self.widget.Outputs.preprocessed_data)
51+
np.testing.assert_array_equal(output.X, data.X[:, 1:])
52+
np.testing.assert_array_equal(output.Y, data.Y)
53+
np.testing.assert_array_equal(output.metas, data.metas)
54+
4055
def test_normalize(self):
4156
data = Table("iris")
4257
saved = {"preprocessors": [("orange.preprocess.scale",
@@ -262,3 +277,18 @@ def test_editor(self):
262277
self.assertIsInstance(p, ProjectCUR)
263278
self.assertEqual(p.rank, 5)
264279
self.assertEqual(p.max_error, 0.5)
280+
281+
class TestRemoveSparseEditor(WidgetTest):
282+
283+
def test_editor(self):
284+
widget = owpreprocess.RemoveSparseEditor()
285+
self.assertEqual(widget.parameters(), {"sparse_thresh": 5})
286+
287+
p = widget.createinstance(widget.parameters())
288+
self.assertIsInstance(p, RemoveSparse)
289+
self.assertEqual(p.threshold, 0.05)
290+
291+
widget.setParameters({"sparse_thresh": 90})
292+
p = widget.createinstance(widget.parameters())
293+
self.assertIsInstance(p, RemoveSparse)
294+
self.assertEqual(p.threshold, 0.9)

doc/visual-programming/source/widgets/data/preprocess.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,12 @@ Preprocessors
4747

4848
![](images/Preprocess2-stamped.png)
4949

50-
1. *Select random features* outputs either a fixed number of features from the original data or a percentage. This is mainly used for advanced testing and educational purposes.
51-
2. Normalize adjusts values to a common scale. Center values by mean or median or omit centering altogether. Similar for scaling, one can scale by SD (standard deviation), by span or not at all.
52-
3. Randomize instances. Randomize classes shuffles class values and destroys connection between instances and class. Similarly, one can randomize features or meta data. If replicable shuffling is on, randomization results can be shared and repeated with a saved workflow. This is mainly used for advanced testing and educational purposes.
53-
4. Principal component analysis outputs results of a PCA transformation. Similar to the [PCA](../unsupervised/PCA.md) widget.
54-
5. [CUR matrix decomposition](https://en.wikipedia.org/wiki/CUR_matrix_approximation) is a dimensionality reduction method, similar to SVD.
50+
5. *Select random features* outputs either a fixed number of features from the original data or a percentage. This is mainly used for advanced testing and educational purposes.
51+
6. Normalize adjusts values to a common scale. Center values by mean or median, or omit centering altogether. Similarly, one can scale by SD (standard deviation), by span, or not at all.
52+
7. Randomize instances. Randomize classes shuffles class values and destroys the connection between instances and their class. Similarly, one can randomize features or meta data. If replicable shuffling is on, randomization results can be shared and repeated with a saved workflow. This is mainly used for advanced testing and educational purposes.
53+
8. *Remove sparse features* retains features in which at least a user-defined threshold percentage of values are non-zero. The rest are discarded.
54+
9. Principal component analysis outputs results of a PCA transformation. Similar to the [PCA](../unsupervised/PCA.md) widget.
55+
10. [CUR matrix decomposition](https://en.wikipedia.org/wiki/CUR_matrix_approximation) is a dimensionality reduction method, similar to SVD.
5556

5657
Examples
5758
--------

0 commit comments

Comments
 (0)