Merge pull request #4093 from AndrejaKovacic/remove_sparse_features

janezd · web-flow · commit 0f730b46aa6c · 2019-11-14T19:37:14.000+01:00
[ENH] Add remove sparse features preprocessor
diff --git a/Orange/preprocess/preprocess.py b/Orange/preprocess/preprocess.py
@@ -17,7 +17,8 @@
 __all__ = ["Continuize", "Discretize", "Impute", "RemoveNaNRows",
            "SklImpute", "Normalize", "Randomize", "Preprocess",
            "RemoveConstant", "RemoveNaNClasses", "RemoveNaNColumns",
-           "ProjectPCA", "ProjectCUR", "Scale", "AdaptiveNormalize"]
+           "ProjectPCA", "ProjectCUR", "Scale", "RemoveSparse",
+           "AdaptiveNormalize"]
 
 
 class Preprocess(Reprable):
@@ -101,11 +102,11 @@ def transform(var):
             else:
                 return var
 
-        def discretized(vars, do_discretize):
+        def discretized(vars_, do_discretize):
             if do_discretize:
-                vars = (transform(var) for var in vars)
-                vars = [var for var in vars if var is not None]
-            return vars
+                vars_ = (transform(var) for var in vars_)
+                vars_ = [var for var in vars_ if var is not None]
+            return vars_
 
         method = self.method or discretize.EqualFreq()
         domain = Orange.data.Domain(
@@ -421,7 +422,8 @@ def __call__(self, data):
             new_data.metas = self.randomize(new_data.metas, r3)
         return new_data
 
-    def randomize(self, table, rand_state=None):
+    @staticmethod
+    def randomize(table, rand_state=None):
         rstate = np.random.RandomState(rand_state)
         if sp.issparse(table):
             table = table.tocsc()  # type: sp.spmatrix
@@ -568,6 +570,32 @@ def __call__(self, data):
             data = pp(data)
         return data
 
+class RemoveSparse(Preprocess):
+    """
+    Remove sparse features. A feature is kept when its proportion of
+    non-zero entries is at least the user-defined threshold.
+
+    Parameters
+    ----------
+    threshold : float
+        Minimal proportion of non-zero entries of a feature
+    """
+
+    def __init__(self, threshold=0.05):
+        self.threshold = threshold
+
+    def __call__(self, data):
+        if sp.issparse(data.X):
+            # `!= 0` ignores explicitly stored zeros; counts all columns at once
+            nonzero = np.asarray((sp.csc_matrix(data.X) != 0).sum(axis=0)).ravel()
+        else:
+            nonzero = np.count_nonzero(data.X, axis=0)
+        # max(..., 1): a table without rows must not trigger division by zero
+        density = nonzero / max(data.X.shape[0], 1)
+        att = [a for a, d in zip(data.domain.attributes, density) if d >= self.threshold]
+        domain = Orange.data.Domain(att, data.domain.class_vars, data.domain.metas)
+        return data.transform(domain)
+
 
 class AdaptiveNormalize(Preprocess):
     """
diff --git a/Orange/tests/test_preprocess.py b/Orange/tests/test_preprocess.py
@@ -9,12 +9,12 @@
 import numpy as np
 from scipy.sparse import csr_matrix
 
-from Orange.data import Table
+from Orange.data import Table, Domain, ContinuousVariable
 from Orange.preprocess import EntropyMDL, DoNotImpute, Default, Average, \
     SelectRandomFeatures, EqualFreq, RemoveNaNColumns, DropInstances, \
     EqualWidth, SelectBestFeatures, RemoveNaNRows, Preprocess, Scale, \
     Randomize, Continuize, Discretize, Impute, SklImpute, Normalize, \
-    ProjectCUR, ProjectPCA, RemoveConstant, AdaptiveNormalize
+    ProjectCUR, ProjectPCA, RemoveConstant, AdaptiveNormalize, RemoveSparse
 from Orange.util import OrangeDeprecationWarning
 
 
@@ -126,7 +126,7 @@ def test_reprs(self):
                     Randomize, ProjectPCA, ProjectCUR, Scale,
                     EqualFreq, EqualWidth, EntropyMDL, SelectBestFeatures,
                     SelectRandomFeatures, RemoveNaNColumns, DoNotImpute, DropInstances,
-                    Average, Default]
+                    Average, Default, RemoveSparse]
 
         for preproc in preprocs:
             repr_str = repr(preproc())
@@ -176,3 +176,28 @@ def test_sparse_pps(self):
         true_out = Scale(center=Scale.NoCentering, scale=Scale.Span)(self.data)
         np.testing.assert_array_equal(out, true_out)
         self.data = self.data.X.toarray()
+
+
+class TestRemoveSparse(unittest.TestCase):
+    """RemoveSparse(0.5) must drop all-zero 'a' and keep 'b' (2/3 non-zero)."""
+    def setUp(self):
+        domain = Domain([ContinuousVariable('a'), ContinuousVariable('b')])
+        self.data = Table.from_numpy(domain, np.zeros((3, 2)))
+        self.data[1:, 1] = 7
+        self.expected = np.array([[0.], [7.], [7.]])
+
+    def test_dense(self):
+        out = RemoveSparse(0.5)(self.data)
+        self.assertEqual([a.name for a in out.domain.attributes], ['b'])
+        np.testing.assert_array_equal(out.X, self.expected)
+
+    def test_sparse(self):
+        self.data.X = csr_matrix(self.data.X)
+        out = RemoveSparse(0.5)(self.data)
+        self.assertEqual([a.name for a in out.domain.attributes], ['b'])
+        # densify first so that the values are compared element-wise
+        np.testing.assert_array_equal(out.X.toarray(), self.expected)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/Orange/widgets/data/owpreprocess.py b/Orange/widgets/data/owpreprocess.py
@@ -23,7 +23,7 @@
 import Orange.data
 from Orange import preprocess
 from Orange.preprocess import Continuize, ProjectPCA, RemoveNaNRows, \
-    ProjectCUR, Scale as _Scale, Randomize as _Randomize
+    ProjectCUR, Scale as _Scale, Randomize as _Randomize, RemoveSparse
 from Orange.widgets import widget, gui
 from Orange.widgets.settings import Setting
 from Orange.widgets.utils.overlay import OverlayWidget
@@ -250,6 +250,37 @@ def createinstance(params):
     def __repr__(self):
         return self.Continuizers[self.__treatment]
 
+class RemoveSparseEditor(BaseEditor):
+    """Editor with a spin box for the minimal percentage of nonzero values."""
+    def __init__(self, parent=None, **kwargs):
+        super().__init__(parent, **kwargs)
+        self.sparse_thresh = 5
+        self.cspin = QSpinBox(minimum=1, maximum=100, value=self.sparse_thresh)
+        self.cspin.valueChanged[int].connect(self.setThresh)
+        self.cspin.editingFinished.connect(self.edited)
+
+        form = QFormLayout()
+        form.addRow("Min % of nonzero values:", self.cspin)
+        self.setLayout(QVBoxLayout())
+        self.layout().addLayout(form)
+
+    def setThresh(self, thresh):
+        if self.sparse_thresh == thresh:
+            return  # unchanged value: nothing to update, no signal
+        self.sparse_thresh = thresh
+        self.cspin.setValue(thresh)
+        self.changed.emit()
+
+    def parameters(self):
+        return {'sparse_thresh': self.sparse_thresh}
+
+    def setParameters(self, params):
+        self.setThresh(params.get('sparse_thresh', 5))
+
+    @staticmethod
+    def createinstance(params):
+        percent = dict(params).get('sparse_thresh', 5)
+        return RemoveSparse(threshold=percent / 100)
 
 class ImputeEditor(BaseEditor):
     (NoImputation, Constant, Average,
@@ -922,6 +953,12 @@ def icon_path(basename):
                     icon_path("Random.svg")),
         Randomize
     ),
+    PreprocessAction(
+        "Remove Sparse", "orange.preprocess.remove_sparse", "Feature Selection",
+        Description("Remove Sparse Features",
+                    icon_path("PurgeDomain.svg")),
+        RemoveSparseEditor
+    ),
     PreprocessAction(
         "PCA", "orange.preprocess.pca", "PCA",
         Description("Principal Component Analysis",
diff --git a/Orange/widgets/data/tests/test_owpreprocess.py b/Orange/widgets/data/tests/test_owpreprocess.py
@@ -4,7 +4,8 @@
 
 from Orange.data import Table
 from Orange.preprocess import (
-    Randomize, Scale, Discretize, Continuize, Impute, ProjectPCA, ProjectCUR
+    Randomize, Scale, Discretize, Continuize, Impute, ProjectPCA, \
+         ProjectCUR, RemoveSparse
 )
 from Orange.preprocess import discretize, impute, fss, score
 from Orange.widgets.data import owpreprocess
@@ -37,6 +38,20 @@ def test_randomize(self):
         np.testing.assert_array_equal(self.zoo.metas, output.metas)
         self.assertFalse(np.array_equal(self.zoo.Y, output.Y))
 
+    def test_remove_sparse(self):
+        # Zero slightly more than 10 % of column 0; a 90 % threshold drops it.
+        data = Table("iris")
+        n_zeroed = data.X.shape[0] // 10 + 1
+        data.X[:n_zeroed, 0] = np.zeros((n_zeroed,))
+        settings = {"preprocessors": [
+            ("orange.preprocess.remove_sparse", {'sparse_thresh': 90})]}
+        self.widget.set_model(self.widget.load(settings))
+        self.send_signal(self.widget.Inputs.data, data)
+        result = self.get_output(self.widget.Outputs.preprocessed_data)
+        for got, want in ((result.X, data.X[:, 1:]), (result.Y, data.Y),
+                          (result.metas, data.metas)):
+            np.testing.assert_array_equal(got, want)
+
     def test_normalize(self):
         data = Table("iris")
         saved = {"preprocessors": [("orange.preprocess.scale",
@@ -262,3 +277,18 @@ def test_editor(self):
         self.assertIsInstance(p, ProjectCUR)
         self.assertEqual(p.rank, 5)
         self.assertEqual(p.max_error, 0.5)
+
+class TestRemoveSparseEditor(WidgetTest):
+    # The editor stores a percentage; createinstance turns it into a fraction.
+    def test_editor(self):
+        editor = owpreprocess.RemoveSparseEditor()
+        self.assertEqual(editor.parameters(), {"sparse_thresh": 5})
+
+        default_pp = editor.createinstance(editor.parameters())
+        self.assertIsInstance(default_pp, RemoveSparse)
+        self.assertEqual(default_pp.threshold, 0.05)
+
+        editor.setParameters({"sparse_thresh": 90})
+        changed_pp = editor.createinstance(editor.parameters())
+        self.assertIsInstance(changed_pp, RemoveSparse)
+        self.assertEqual(changed_pp.threshold, 0.9)
diff --git a/doc/visual-programming/source/widgets/data/preprocess.md b/doc/visual-programming/source/widgets/data/preprocess.md
@@ -47,11 +47,12 @@ Preprocessors
 
 ![](images/Preprocess2-stamped.png)
 
-1. *Select random features* outputs either a fixed number of features from the original data or a percentage. This is mainly used for advanced testing and educational purposes.
-2. Normalize adjusts values to a common scale. Center values by mean or median or omit centering altogether. Similar for scaling, one can scale by SD (standard deviation), by span or not at all.
-3. Randomize instances. Randomize classes shuffles class values and destroys connection between instances and class. Similarly, one can randomize features or meta data. If replicable shuffling is on, randomization results can be shared and repeated with a saved workflow. This is mainly used for advanced testing and educational purposes.
-4. Principal component analysis outputs results of a PCA transformation. Similar to the [PCA](../unsupervised/PCA.md) widget.
-5. [CUR matrix decomposition](https://en.wikipedia.org/wiki/CUR_matrix_approximation) is a dimensionality reduction method, similar to SVD.
+5. *Select random features* outputs either a fixed number of features from the original data or a percentage. This is mainly used for advanced testing and educational purposes.
+6. Normalize adjusts values to a common scale. Center values by mean or median or omit centering altogether. Similar for scaling, one can scale by SD (standard deviation), by span or not at all.
+7. Randomize instances. Randomize classes shuffles class values and destroys connection between instances and class. Similarly, one can randomize features or meta data. If replicable shuffling is on, randomization results can be shared and repeated with a saved workflow. This is mainly used for advanced testing and educational purposes.
+8. *Remove sparse features* retains features whose percentage of non-zero values is at least the user-defined threshold. The rest are discarded.
+9. Principal component analysis outputs results of a PCA transformation. Similar to the [PCA](../unsupervised/PCA.md) widget.
+10. [CUR matrix decomposition](https://en.wikipedia.org/wiki/CUR_matrix_approximation) is a dimensionality reduction method, similar to SVD.
 
 Examples
 --------