[ENH] Linear Discriminant Analysis (LDA)

jerneju · jerneju · commit fa353da932a0 · 2018-01-19T09:51:21.000+01:00
diff --git a/Orange/projection/__init__.py b/Orange/projection/__init__.py
@@ -4,3 +4,4 @@
 from .manifold import *
 from .freeviz import *
 from .radviz import radviz
+from .lda import LDA
diff --git a/Orange/projection/lda.py b/Orange/projection/lda.py
@@ -0,0 +1,65 @@
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+
+import Orange.data
+from Orange.classification.logistic_regression import _FeatureScorerMixin
+from Orange.data.util import SharedComputeValue
+from Orange.projection import SklProjector, Projection, LinearCombinationSql
+
+# Star-imports of this module expose only the LDA projector.
+__all__ = ["LDA"]
+
+
+class LDA(SklProjector, _FeatureScorerMixin):
+    name = "LDA"
+    supports_sparse = False
+
+    def __init__(self, n_components=2, solver='eigen', preprocessors=None):
+        super().__init__(preprocessors=preprocessors)
+        self.n_components = n_components
+        self.solver = solver
+
+    def fit(self, X, Y=None):
+        if self.n_components is not None:
+            self.n_components = min(min(X.shape), self.n_components)
+        proj = LinearDiscriminantAnalysis(solver='eigen', n_components=2)
+        proj = proj.fit(X, Y)
+        return LDAModel(proj, self.domain)
+
+
+class _LDATransformDomain:
+    """Computation common for all LDA variables."""
+    def __init__(self, lda):
+        self.lda = lda
+
+    def __call__(self, data):
+        if data.domain != self.lda.pre_domain:
+            data = data.transform(self.lda.pre_domain)
+        return self.lda.transform(data.X)
+
+
+class LDAModel(Projection):
+    name = "LDAModel"
+
+    def __init__(self, proj, domain):
+        lda_transform = _LDATransformDomain(self)
+        self.components_ = proj.scalings_.T
+
+        def lda_variable(i):
+            return Orange.data.ContinuousVariable(
+                'LD%d' % (i + 1), compute_value=LDAProjector(self, i, lda_transform))
+
+        super().__init__(proj=proj)
+        self.orig_domain = domain
+        self.n_components = self.components_.shape[0]
+        self.domain = Orange.data.Domain(
+            [lda_variable(i) for i in range(proj.n_components)],
+            domain.class_vars, domain.metas)
+
+
+class LDAProjector(SharedComputeValue):
+    """Transform into a given LDA component."""
+    def __init__(self, projection, feature, lda_transform):
+        super().__init__(lda_transform)
+        self.feature = feature
+
+    def compute(self, data, lda_space):
+        return lda_space[:, self.feature]
diff --git a/Orange/tests/test_lda.py b/Orange/tests/test_lda.py
@@ -0,0 +1,49 @@
+# Test methods with long descriptive names can omit docstrings
+# pylint: disable=missing-docstring
+
+import unittest
+
+import numpy as np
+
+from Orange.preprocess import Continuize, Randomize
+from Orange.projection import LDA
+from Orange.data import Table
+
+
+class TestLDA(unittest.TestCase):
+    def test_lda(self):
+        iris = Table('iris')
+        n_components = 2
+        lda = LDA(n_components=n_components)
+        model = lda(iris)
+        transformed = model(iris)
+        self.assertEqual(transformed.X.shape, (len(iris), n_components))
+        self.assertEqual(transformed.Y.shape, (len(iris),))
+
+    def test_transform_changed_domain(self):
+        """
+        1. Open data, apply some preprocessor, splits the data into two parts,
+        use LDA on the first part, and then transform the second part.
+
+        2. Open data, split into two parts, apply the same preprocessor and
+        LDA only on the first part, and then transform the second part.
+
+        The transformed second part in (1) and (2) has to be the same.
+        """
+        data = Table("iris")
+        data = Randomize()(data)
+        preprocessor = Continuize()
+        lda = LDA()
+
+        # normalize all
+        ndata = preprocessor(data)
+
+        model = lda(ndata[:75])
+        result_1 = model(ndata[75:])
+
+        # normalize only the "training" part
+        ndata = preprocessor(data[:75])
+        model = lda(ndata)
+        result_2 = model(data[75:])
+
+        np.testing.assert_almost_equal(result_1.X, result_2.X)