Skip to content

Commit c3a7d1a

Browse files
authored
Merge pull request #2454 from janezd/distances
[ENH] Proper calculation of distances
2 parents 2dc1334 + 13ede45 commit c3a7d1a

File tree

18 files changed

+34503
-896
lines changed

18 files changed

+34503
-896
lines changed

Orange/distance/__init__.py

Lines changed: 5 additions & 260 deletions
Original file line numberDiff line numberDiff line change
@@ -1,261 +1,6 @@
1-
import numpy as np
2-
from scipy import stats
3-
import sklearn.metrics as skl_metrics
1+
from .distance import (Distance, DistanceModel,
2+
Euclidean, Manhattan, Cosine, Jaccard,
3+
SpearmanR, SpearmanRAbsolute, PearsonR, PearsonRAbsolute,
4+
Mahalanobis, MahalanobisDistance)
45

5-
from Orange import data
6-
from Orange.misc import DistMatrix
7-
from Orange.preprocess import SklImpute
8-
9-
__all__ = ['Euclidean', 'Manhattan', 'Cosine', 'Jaccard', 'SpearmanR',
10-
'SpearmanRAbsolute', 'PearsonR', 'PearsonRAbsolute', 'Mahalanobis',
11-
'MahalanobisDistance']
12-
13-
14-
def _preprocess(table):
    """Keep only continuous attributes and impute missing values.

    Class variables and metas are carried over unchanged. An empty table is
    returned as it is, without any transformation.
    """
    if len(table) == 0:
        return table
    continuous = [var for var in table.domain.attributes if var.is_continuous]
    domain = data.Domain(
        continuous, table.domain.class_vars, table.domain.metas)
    transformed = table.transform(domain)
    return SklImpute()(transformed)
25-
26-
27-
def _orange_to_numpy(x):
    """Return the numpy representation of `x`.

    A :class:`Orange.data.Table` yields its ``X`` matrix; a
    :class:`Orange.data.Instance` yields its ``x`` vector as a 2d array; a
    :class:`numpy.ndarray` is promoted to at least two dimensions. Anything
    else (for instance ``None``) is handed back untouched.
    """
    if isinstance(x, data.Table):
        return x.X
    if isinstance(x, data.Instance):
        return np.atleast_2d(x.x)
    if isinstance(x, np.ndarray):
        return np.atleast_2d(x)
    return x  # e.g. None
39-
40-
41-
class Distance:
    """Base class of the distance measures defined in this module."""

    def __call__(self, e1, e2=None, axis=1, impute=False):
        """
        Compute a distance matrix; subclasses must override this method.

        :param e1: input data instances; distances are calculated between
            all pairs
        :type e1: :class:`Orange.data.Table` or
            :class:`Orange.data.RowInstance` or :class:`numpy.ndarray`
        :param e2: optional second set of data instances; if given,
            distances are calculated between each pair whose first item is
            from e1 and second item is from e2
        :type e2: :class:`Orange.data.Table` or
            :class:`Orange.data.RowInstance` or :class:`numpy.ndarray`
        :param axis: axis=1 gives distances between rows, axis=0 gives
            distances between columns
        :type axis: int
        :param impute: if True, all NaN values in the matrix are replaced
            with 0
        :type impute: bool
        :return: the matrix with distances between given examples
        :rtype: :class:`Orange.misc.distmatrix.DistMatrix`
        :raises NotImplementedError: always, in this base class
        """
        message = (
            'Distance is an abstract class and should not be used directly.')
        raise NotImplementedError(message)
64-
65-
66-
class SklDistance(Distance):
    """Generic scikit-learn distance.

    Delegates the computation to
    ``sklearn.metrics.pairwise.pairwise_distances``.
    """

    def __init__(self, metric, name, supports_sparse):
        """
        Args:
            metric: The metric to be used for distance calculation; it is
                passed on unchanged as the ``metric`` argument of
                ``pairwise_distances``.
            name (str): Name of the distance
            supports_sparse (boolean): Whether this metric works on sparse
                data or not.
        """
        self.metric = metric
        self.name = name
        self.supports_sparse = supports_sparse

    def __call__(self, e1, e2=None, axis=1, impute=False):
        """
        Compute pairwise distances with the configured metric.

        Args:
            e1: Table, row instance or numpy array with the data.
            e2: Optional second operand of the same kinds; if omitted,
                ``None`` is passed to ``pairwise_distances``.
            axis (int): 1 to compare rows, 0 to compare columns (both
                operands are transposed in that case).
            impute (bool): If True, NaN values in the resulting matrix are
                replaced by 0, as in the other distances of this module.

        Returns:
            DistMatrix: the distances; row/column items are attached only
            when `e1` is a Table or a RowInstance.
        """
        x1 = _orange_to_numpy(e1)
        x2 = _orange_to_numpy(e2)
        if axis == 0:
            x1 = x1.T
            if x2 is not None:
                x2 = x2.T
        dist = skl_metrics.pairwise.pairwise_distances(
            x1, x2, metric=self.metric)
        # Honour `impute`, which was previously accepted but silently ignored.
        if impute and np.isnan(dist).any():
            dist = np.nan_to_num(dist)
        if isinstance(e1, (data.Table, data.RowInstance)):
            dist = DistMatrix(dist, e1, e2, axis)
        else:
            dist = DistMatrix(dist)
        return dist
94-
95-
# Ready-to-use instances of the scikit-learn based distances.
Euclidean = SklDistance(
    metric='euclidean', name='Euclidean', supports_sparse=True)
Manhattan = SklDistance(
    metric='manhattan', name='Manhattan', supports_sparse=True)
Cosine = SklDistance(
    metric='cosine', name='Cosine', supports_sparse=True)
Jaccard = SklDistance(
    metric='jaccard', name='Jaccard', supports_sparse=False)
99-
100-
101-
class SpearmanDistance(Distance):
    """Distance based on Spearman's rank correlation coefficient.

    A correlation ``rho`` is turned into the distance ``(1 - rho) / 2`` or,
    for the absolute variant, ``(1 - |rho|) / 2``.
    """

    def __init__(self, absolute, name):
        """
        Constructor for Spearman's and Absolute Spearman's distances.

        Args:
            absolute (boolean): Whether to use absolute values of the
                correlation or not.
            name (str): Name of the distance
        """
        self.absolute = absolute
        self.name = name
        self.supports_sparse = False

    def __call__(self, e1, e2=None, axis=1, impute=False):
        """
        Compute Spearman-based distances.

        Args:
            e1: Table, row instance or numpy array with the data.
            e2: Optional second operand; `e1` is used when it is omitted.
            axis (int): 1 to compare rows, 0 to compare columns.
            impute (bool): If True, NaN correlations are replaced by 0
                before they are converted to distances.

        Returns:
            DistMatrix: the distances; row/column items are attached only
            when `e1` is a Table or a RowInstance.
        """
        x1 = _orange_to_numpy(e1)
        x2 = _orange_to_numpy(e2)
        if x2 is None:
            x2 = x1
        # Number of compared items contributed by x1.
        slc = len(x1) if axis == 1 else x1.shape[1]
        rho, _ = stats.spearmanr(x1, x2, axis=axis)
        if impute and np.isnan(rho).any():
            rho = np.nan_to_num(rho)
        if self.absolute:
            dist = (1. - np.abs(rho)) / 2.
        else:
            dist = (1. - rho) / 2.
        # The `np.float` alias used here before no longer exists in current
        # NumPy; test for a scalar result by its dimensionality instead.
        if np.ndim(dist) == 0:
            dist = np.array([[dist]])
        elif isinstance(dist, np.ndarray):
            # Keep only the block relating the first `slc` items (from x1)
            # to the remaining ones (from x2).
            dist = dist[:slc, slc:]
        if isinstance(e1, (data.Table, data.RowInstance)):
            dist = DistMatrix(dist, e1, e2, axis)
        else:
            dist = DistMatrix(dist)
        return dist
141-
142-
# Ready-to-use instances of the Spearman-based distances.
SpearmanR = SpearmanDistance(name='Spearman', absolute=False)
SpearmanRAbsolute = SpearmanDistance(name='Spearman absolute', absolute=True)
144-
145-
146-
class PearsonDistance(Distance):
    """Distance based on Pearson's correlation coefficient.

    A correlation ``rho`` is turned into the distance ``(1 - rho) / 2`` or,
    for the absolute variant, ``(1 - |rho|) / 2``.
    """

    def __init__(self, absolute, name):
        """
        Constructor for Pearson's and Absolute Pearson's distances.

        Args:
            absolute (boolean): Whether to use absolute values of the
                correlation or not.
            name (str): Name of the distance
        """
        self.absolute = absolute
        self.name = name
        self.supports_sparse = False

    def __call__(self, e1, e2=None, axis=1, impute=False):
        """
        Compute Pearson-based distances.

        Args:
            e1: Table, row instance or numpy array with the data.
            e2: Optional second operand; `e1` is used when it is omitted.
            axis (int): 1 to compare rows, 0 to compare columns (both
                operands are transposed in that case).
            impute (bool): If True, NaN correlations are replaced by 0
                before they are converted to distances.

        Returns:
            DistMatrix: the distances; row/column items are attached only
            when `e1` is a Table or a RowInstance.
        """
        x1 = _orange_to_numpy(e1)
        x2 = _orange_to_numpy(e2)
        if x2 is None:
            x2 = x1
        if axis == 0:
            x1 = x1.T
            x2 = x2.T
        # One correlation per (item of x1, item of x2) pair.
        rho = np.array([[stats.pearsonr(i, j)[0] for j in x2] for i in x1])
        if impute and np.isnan(rho).any():
            rho = np.nan_to_num(rho)
        if self.absolute:
            dist = (1. - np.abs(rho)) / 2.
        else:
            dist = (1. - rho) / 2.
        if isinstance(e1, (data.Table, data.RowInstance)):
            dist = DistMatrix(dist, e1, e2, axis)
        else:
            dist = DistMatrix(dist)
        return dist
184-
185-
# Ready-to-use instances of the Pearson-based distances.
PearsonR = PearsonDistance(name='Pearson', absolute=False)
PearsonRAbsolute = PearsonDistance(name='Pearson absolute', absolute=True)
187-
188-
189-
class MahalanobisDistance(Distance):
    """Mahalanobis distance.

    The inverse covariance matrix is computed by :meth:`fit` (called from
    the constructor when `data` is given) and reused by every later call.
    """

    def __init__(self, data=None, axis=1, name='Mahalanobis'):
        """
        Args:
            data: Optional dataset; if given, the distance is fitted to it
                immediately.
            axis: Axis passed to :meth:`fit` when `data` is given.
            name (str): Name of the distance
        """
        self.name = name
        self.supports_sparse = False
        self.axis = None
        self.VI = None
        if data is not None:
            self.fit(data, axis)

    def fit(self, data, axis=1):
        """
        Compute the covariance matrix needed for calculating distances.

        Args:
            data: The dataset used for calculating covariances.
            axis: If axis=1 we calculate distances between rows, if axis=0 we
                calculate distances between columns.

        Raises:
            MemoryError: if the covariance matrix cannot be computed.
            ValueError: if the covariance matrix cannot be inverted.
        """
        x = _orange_to_numpy(data)
        if axis == 0:
            x = x.T
        self.axis = axis
        # `except Exception` (not a bare `except`) keeps the raised types
        # unchanged but lets KeyboardInterrupt/SystemExit propagate; the
        # original error is chained as the cause.
        try:
            c = np.cov(x.T)
        except Exception as err:
            raise MemoryError("Covariance matrix is too large.") from err
        try:
            self.VI = np.linalg.inv(c)
        except Exception as err:
            raise ValueError(
                "Computation of inverse covariance matrix failed.") from err

    def __call__(self, e1, e2=None, axis=None, impute=False):
        """
        Compute Mahalanobis distances with the fitted inverse covariance.

        Args:
            e1: Table, row instance or numpy array with the data.
            e2: Optional second operand of the same kinds.
            axis: If given, it must equal the axis used in :meth:`fit`.
            impute (bool): If True, NaN distances are replaced by 0.

        Returns:
            DistMatrix: the distances; row/column items are attached only
            when `e1` is a Table or a RowInstance.

        Raises:
            AssertionError: if the distance has not been fitted, or `axis`
                differs from the one used for fitting.
            ValueError: if the number of features does not match the fitted
                covariance matrix.
        """
        # Raised explicitly (instead of `assert`) so that the checks are not
        # stripped when Python runs with -O; the exception type is the same.
        if self.VI is None:
            raise AssertionError(
                "Mahalanobis distance must be initialized with the fit() "
                "method.")

        x1 = _orange_to_numpy(e1)
        x2 = _orange_to_numpy(e2)

        if axis is not None and axis != self.axis:
            raise AssertionError(
                "Axis must match its value at initialization.")
        if self.axis == 0:
            x1 = x1.T
            if x2 is not None:
                x2 = x2.T
        n_features = self.VI.shape[0]
        if x1.shape[1] != n_features or \
                x2 is not None and x2.shape[1] != n_features:
            raise ValueError('Incorrect number of features.')

        dist = skl_metrics.pairwise.pairwise_distances(
            x1, x2, metric='mahalanobis', VI=self.VI)
        if impute and np.isnan(dist).any():
            dist = np.nan_to_num(dist)
        if isinstance(e1, (data.Table, data.RowInstance)):
            dist = DistMatrix(dist, e1, e2, self.axis)
        else:
            dist = DistMatrix(dist)
        return dist
248-
249-
250-
# Kept solely so that the old module-level `Mahalanobis` object raises an
# informative error when used. Remove in some future version.
class __MahalanobisDistanceError(MahalanobisDistance):
    """Placeholder whose `fit` and `__call__` always raise RuntimeError."""

    def _raise_error(self, *args, **kwargs):
        """Reject any use and point to the supported usage pattern."""
        message = "\n".join([
            "Invalid use of MahalanobisDistance.",
            "Create a new MahalanobisDistance instance first, e.g.",
            ">>> metric = MahalanobisDistance(data)",
            ">>> dist = metric(data)",
        ])
        raise RuntimeError(message)

    # Both public entry points share the same failing implementation.
    fit = _raise_error
    __call__ = _raise_error


Mahalanobis = __MahalanobisDistanceError()
6+
from .base import _preprocess, remove_discrete_features, impute

0 commit comments

Comments
 (0)