Merge pull request #2823 from jerneju/lda

markotoplak · web-flow · commit 0790842cfd9a · 2018-01-19T10:15:24.000+01:00
[ENH] Linear Discriminant Analysis: scripting part
diff --git a/Orange/projection/__init__.py b/Orange/projection/__init__.py
@@ -4,3 +4,4 @@
 from .manifold import *
 from .freeviz import *
 from .radviz import radviz
+from .lda import LDA
diff --git a/Orange/projection/lda.py b/Orange/projection/lda.py
@@ -0,0 +1,65 @@
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+
+import Orange.data
+from Orange.classification.logistic_regression import _FeatureScorerMixin
+from Orange.data.util import SharedComputeValue
+from Orange.projection import SklProjector, Projection
+
+__all__ = ["LDA"]
+
+
+class LDA(SklProjector, _FeatureScorerMixin):
+    """Linear discriminant analysis projector built on scikit-learn.
+
+    Parameters
+    ----------
+    n_components : int or None, optional (default=2)
+        Requested number of discriminant components. At fit time the value
+        is limited to ``min(X.shape)``. ``None`` is treated as 2, because
+        the resulting model needs a concrete component count.
+        NOTE(review): scikit-learn additionally limits the count to
+        ``min(n_features, n_classes - 1)`` -- confirm how the installed
+        version reacts to larger values.
+    solver : str, optional (default='eigen')
+        Solver name handed to ``LinearDiscriminantAnalysis``.
+        NOTE(review): the model reads ``scalings_`` and projects data, which
+        scikit-learn does not provide for the 'lsqr' solver -- confirm.
+    preprocessors : list, optional
+        Passed through unchanged to the ``SklProjector`` constructor.
+    """
+    name = "LDA"
+    supports_sparse = False
+
+    def __init__(self, n_components=2, solver='eigen', preprocessors=None):
+        super().__init__(preprocessors=preprocessors)
+        self.n_components = n_components
+        self.solver = solver
+
+    def fit(self, X, Y=None):
+        """Fit the discriminant on ``X`` and ``Y`` and return an LDAModel.
+
+        The configured ``solver`` and ``n_components`` are forwarded to the
+        scikit-learn estimator. The learner's own attributes are left
+        untouched, so repeated fits on tables of different sizes always
+        start from the values given to the constructor.
+        """
+        # LDAModel iterates over range(proj.n_components), so the estimator
+        # must carry an integer; None keeps the former fixed value of 2.
+        requested = 2 if self.n_components is None else self.n_components
+        # Clamp into a local instead of writing back to self: otherwise one
+        # fit on a small table would shrink the setting for all later fits.
+        n_components = min(min(X.shape), requested)
+        proj = LinearDiscriminantAnalysis(
+            solver=self.solver, n_components=n_components)
+        proj = proj.fit(X, Y)
+        return LDAModel(proj, self.domain)
+
+
+class _LDATransformDomain:
+    """Projection step shared by every LDA output variable.
+
+    Calling the object with a data table converts the table into the
+    model's ``pre_domain`` when its domain differs, and then returns the
+    model's ``transform`` of the table's ``X``. ``pre_domain`` is read from
+    the model; it is not assigned anywhere in this module.
+    """
+    def __init__(self, lda):
+        # The model is only stored here; it is consulted on every call.
+        self.lda = lda
+
+    def __call__(self, data):
+        model = self.lda
+        target_domain = model.pre_domain
+        # Convert only when needed, so matching tables are used as they are.
+        table = (data.transform(target_domain)
+                 if data.domain != target_domain else data)
+        return model.transform(table.X)
+
+
+class LDAModel(Projection):
+    """Fitted LDA projection.
+
+    Attributes set by the constructor:
+
+    - ``components_``: the transpose of the fitted estimator's ``scalings_``;
+    - ``orig_domain``: the ``domain`` argument, unchanged;
+    - ``n_components``: the number of rows of ``components_``
+      (NOTE(review): this may differ from the number of LD variables in
+      ``domain``, which follows ``proj.n_components`` -- confirm intent);
+    - ``domain``: continuous variables 'LD1', 'LD2', ... as attributes,
+      with the class variables and metas taken from ``domain``.
+
+    The estimator itself is handed to the ``Projection`` constructor as
+    ``proj``.
+    """
+    name = "LDAModel"
+
+    def __init__(self, proj, domain):
+        # One shared transform object serves all LD variables of this model.
+        shared_transform = _LDATransformDomain(self)
+        self.components_ = proj.scalings_.T
+
+        super().__init__(proj=proj)
+        self.orig_domain = domain
+        self.n_components = self.components_.shape[0]
+
+        attributes = []
+        for index in range(proj.n_components):
+            # Each variable selects column `index` of the shared projection.
+            projector = LDAProjector(self, index, shared_transform)
+            attributes.append(Orange.data.ContinuousVariable(
+                'LD%d' % (index + 1), compute_value=projector))
+        self.domain = Orange.data.Domain(
+            attributes, domain.class_vars, domain.metas)
+
+
+class LDAProjector(SharedComputeValue):
+    """Compute value that selects a single LDA component.
+
+    ``lda_transform`` is passed to ``SharedComputeValue`` as the shared
+    computation and ``feature`` is the column index picked from its output.
+    The ``projection`` argument is accepted but not stored.
+    """
+    def __init__(self, projection, feature, lda_transform):
+        super().__init__(lda_transform)
+        self.feature = feature
+
+    def compute(self, data, lda_space):
+        # presumably lda_space is the result of the shared lda_transform
+        # call -- verify against SharedComputeValue.
+        column = self.feature
+        return lda_space[:, column]
diff --git a/Orange/tests/test_lda.py b/Orange/tests/test_lda.py
@@ -0,0 +1,49 @@
+# Test methods with long descriptive names can omit docstrings
+# pylint: disable=missing-docstring
+
+import unittest
+
+import numpy as np
+
+from Orange.preprocess import Continuize, Randomize
+from Orange.projection import LDA
+from Orange.data import Table
+
+
+class TestLDA(unittest.TestCase):
+    def test_lda(self):
+        data = Table('iris')
+        wanted = 2
+        projector = LDA(n_components=wanted)
+        fitted = projector(data)
+        projected = fitted(data)
+        n_rows = len(data)
+        self.assertEqual(projected.X.shape, (n_rows, wanted))
+        self.assertEqual(projected.Y.shape, (n_rows,))
+
+    def test_transform_changed_domain(self):
+        """
+        Projecting held-out rows must not depend on where preprocessing
+        happened.
+
+        1. Preprocess the whole table, fit LDA on the first part and
+        project the second part.
+
+        2. Preprocess only the first part, fit LDA on it and project the
+        second part straight from the unprocessed table.
+
+        Both projections of the second part have to match.
+        """
+        table = Table("iris")
+        table = Randomize()(table)
+        continuize = Continuize()
+        lda = LDA()
+
+        # (1) continuize everything, then split
+        processed = continuize(table)
+        fitted = lda(processed[:75])
+        projected_a = fitted(processed[75:])
+
+        # (2) continuize only the "training" part
+        processed = continuize(table[:75])
+        fitted = lda(processed)
+        projected_b = fitted(table[75:])
+
+        np.testing.assert_almost_equal(projected_a.X, projected_b.X)
diff --git a/doc/data-mining-library/source/reference/projection.rst b/doc/data-mining-library/source/reference/projection.rst
@@ -83,3 +83,51 @@ Example
 
 
 .. autoclass:: Orange.projection.freeviz.FreeViz
+
+
+
+
+LDA
+---
+
+Linear discriminant analysis is another way of finding a linear transformation of
+data that reduces the number of dimensions required to represent it. It is often
+used for dimensionality reduction prior to classification, but can also be used as a
+classification technique itself ([1]_).
+
+
+Example
+=======
+
+    >>> from Orange.projection import LDA
+    >>> from Orange.data import Table
+    >>> iris = Table('iris')
+    >>> lda = LDA()
+    >>> model = lda(iris)
+    >>> model.components_    # LDA components
+    array([[ 0.20490976,  0.38714331, -0.54648218, -0.71378517],
+           [ 0.00898234,  0.58899857, -0.25428655,  0.76703217],
+           [-0.71507172,  0.43568045,  0.45568731, -0.30200008],
+           [ 0.06449913, -0.35780501, -0.42514529,  0.828895  ]])
+    >>> transformed_data = model(iris)    # transformed data
+    >>> transformed_data
+    [[1.492, 1.905 | Iris-setosa],
+    [1.258, 1.608 | Iris-setosa],
+    [1.349, 1.750 | Iris-setosa],
+    [1.180, 1.639 | Iris-setosa],
+    [1.510, 1.963 | Iris-setosa],
+    ...
+    ]
+
+
+
+.. autoclass:: Orange.projection.lda.LDA
+
+
+
+References
+----------
+
+.. [1] Witten, I.H., Frank, E., Hall, M.A. and Pal, C.J., 2016.
+   Data Mining: Practical machine learning tools and techniques. Morgan Kaufmann.
+