|
1 | | -import numpy as np |
2 | | -from scipy import stats |
3 | | -import sklearn.metrics as skl_metrics |
| 1 | +from .distance import (Distance, DistanceModel, |
| 2 | + Euclidean, Manhattan, Cosine, Jaccard, |
| 3 | + SpearmanR, SpearmanRAbsolute, PearsonR, PearsonRAbsolute, |
| 4 | + Mahalanobis, MahalanobisDistance) |
4 | 5 |
|
5 | | -from Orange import data |
6 | | -from Orange.misc import DistMatrix |
7 | | -from Orange.preprocess import SklImpute |
8 | | - |
# Public API of this module.
__all__ = [
    'Euclidean',
    'Manhattan',
    'Cosine',
    'Jaccard',
    'SpearmanR',
    'SpearmanRAbsolute',
    'PearsonR',
    'PearsonRAbsolute',
    'Mahalanobis',
    'MahalanobisDistance',
]
12 | | - |
13 | | - |
def _preprocess(table):
    """Remove categorical attributes and impute missing values.

    Returns a new table restricted to the continuous attributes (class
    variables and metas are kept), with missing values imputed by
    ``SklImpute``.  An empty table is returned unchanged.
    """
    if not len(table):
        return table
    continuous = [attr for attr in table.domain.attributes
                  if attr.is_continuous]
    domain = data.Domain(continuous,
                         table.domain.class_vars,
                         table.domain.metas)
    return SklImpute()(table.transform(domain))
25 | | - |
26 | | - |
def _orange_to_numpy(x):
    """Convert :class:`Orange.data.Table` and :class:`Orange.data.RowInstance`
    to :class:`numpy.ndarray`.

    Tables yield their ``X`` matrix; instances and plain arrays are
    promoted to two dimensions; anything else (e.g. ``None``) is passed
    through untouched.
    """
    if isinstance(x, data.Table):
        return x.X
    if isinstance(x, data.Instance):
        return np.atleast_2d(x.x)
    if isinstance(x, np.ndarray):
        return np.atleast_2d(x)
    return x  # e.g. None
39 | | - |
40 | | - |
class Distance:
    """Abstract base class of all distance metrics in this module."""

    def __call__(self, e1, e2=None, axis=1, impute=False):
        """
        Compute a matrix of pairwise distances.

        :param e1: input data instances; distances are computed between
            all pairs
        :type e1: :class:`Orange.data.Table` or
            :class:`Orange.data.RowInstance` or :class:`numpy.ndarray`
        :param e2: optional second set of instances; if given, distances
            are computed between each pair with the first item from e1
            and the second from e2
        :type e2: :class:`Orange.data.Table` or
            :class:`Orange.data.RowInstance` or :class:`numpy.ndarray`
        :param axis: 1 computes distances between rows, 0 between columns
        :type axis: int
        :param impute: when True, NaN entries of the result are replaced
            with 0
        :type impute: bool
        :return: the matrix of distances between the given examples
        :rtype: :class:`Orange.misc.distmatrix.DistMatrix`
        """
        raise NotImplementedError(
            'Distance is an abstract class and should not be used directly.')
64 | | - |
65 | | - |
class SklDistance(Distance):
    """Generic scikit-learn distance."""

    def __init__(self, metric, name, supports_sparse):
        """
        Args:
            metric: The metric to be used for distance calculation
            name (str): Name of the distance
            supports_sparse (boolean): Whether this metric works on sparse
                data or not.
        """
        self.metric = metric
        self.name = name
        self.supports_sparse = supports_sparse

    def __call__(self, e1, e2=None, axis=1, impute=False):
        """Return pairwise distances as a :class:`DistMatrix`.

        See :meth:`Distance.__call__` for parameter semantics.  Note that
        ``impute`` is accepted for interface compatibility but sklearn
        metrics never produce NaNs here, so it is not used.
        """
        x1 = _orange_to_numpy(e1)
        x2 = _orange_to_numpy(e2)
        if axis == 0:
            # Distances between columns: operate on the transposed data.
            x1 = x1.T
            if x2 is not None:
                x2 = x2.T
        dist = skl_metrics.pairwise.pairwise_distances(
            x1, x2, metric=self.metric)
        # Attach row metadata only when the input actually carries it.
        if isinstance(e1, (data.Table, data.RowInstance)):
            dist = DistMatrix(dist, e1, e2, axis)
        else:
            dist = DistMatrix(dist)
        return dist
94 | | - |
# Ready-made scikit-learn backed metrics.
Euclidean = SklDistance(metric='euclidean', name='Euclidean',
                        supports_sparse=True)
Manhattan = SklDistance(metric='manhattan', name='Manhattan',
                        supports_sparse=True)
Cosine = SklDistance(metric='cosine', name='Cosine',
                     supports_sparse=True)
Jaccard = SklDistance(metric='jaccard', name='Jaccard',
                      supports_sparse=False)
99 | | - |
100 | | - |
class SpearmanDistance(Distance):
    """ Generic Spearman's rank correlation coefficient. """

    def __init__(self, absolute, name):
        """
        Constructor for Spearman's and Absolute Spearman's distances.

        Args:
            absolute (boolean): Whether to use absolute values or not.
            name (str): Name of the distance

        Returns:
            If absolute=True return Spearman's Absolute rank class else return
            Spearman's rank class.
        """
        self.absolute = absolute
        self.name = name
        self.supports_sparse = False

    def __call__(self, e1, e2=None, axis=1, impute=False):
        """Return ``(1 - rho) / 2`` (or ``(1 - |rho|) / 2``) distances.

        See :meth:`Distance.__call__` for parameter semantics.
        """
        x1 = _orange_to_numpy(e1)
        x2 = _orange_to_numpy(e2)
        if x2 is None:
            x2 = x1
        # Number of e1 items; spearmanr returns the joint correlation
        # matrix over [x1; x2], which we slice back to the e1-vs-e2 block.
        slc = len(x1) if axis == 1 else x1.shape[1]
        rho, _ = stats.spearmanr(x1, x2, axis=axis)
        if np.isnan(rho).any() and impute:
            rho = np.nan_to_num(rho)
        if self.absolute:
            dist = (1. - np.abs(rho)) / 2.
        else:
            dist = (1. - rho) / 2.
        # spearmanr yields a scalar when comparing a single pair of
        # rows/columns.  np.float was removed in NumPy 1.24, so check the
        # builtin float (which NumPy float scalars subclass) instead.
        if isinstance(dist, (float, np.floating)):
            dist = np.array([[dist]])
        elif isinstance(dist, np.ndarray):
            dist = dist[:slc, slc:]
        if isinstance(e1, (data.Table, data.RowInstance)):
            dist = DistMatrix(dist, e1, e2, axis)
        else:
            dist = DistMatrix(dist)
        return dist
141 | | - |
# Ready-made Spearman metrics: plain and absolute-value variants.
SpearmanR = SpearmanDistance(False, 'Spearman')
SpearmanRAbsolute = SpearmanDistance(True, 'Spearman absolute')
144 | | - |
145 | | - |
class PearsonDistance(Distance):
    """ Generic Pearson's correlation coefficient distance.

    Note: Pearson's r measures linear correlation, not rank correlation
    (the original docstring mislabelled it as a rank coefficient).
    """

    def __init__(self, absolute, name):
        """
        Constructor for Pearson's and Absolute Pearson's distances.

        Args:
            absolute (boolean): Whether to use absolute values or not.
            name (str): Name of the distance

        Returns:
            If absolute=True return Pearson's Absolute class else return
            Pearson's class.
        """
        self.absolute = absolute
        self.name = name
        self.supports_sparse = False

    def __call__(self, e1, e2=None, axis=1, impute=False):
        """Return ``(1 - r) / 2`` (or ``(1 - |r|) / 2``) distances.

        See :meth:`Distance.__call__` for parameter semantics.
        """
        x1 = _orange_to_numpy(e1)
        x2 = _orange_to_numpy(e2)
        if x2 is None:
            x2 = x1
        if axis == 0:
            x1 = x1.T
            x2 = x2.T
        # Pairwise correlations, one stats.pearsonr call per pair.
        rho = np.array([[stats.pearsonr(i, j)[0] for j in x2] for i in x1])
        if np.isnan(rho).any() and impute:
            rho = np.nan_to_num(rho)
        if self.absolute:
            dist = (1. - np.abs(rho)) / 2.
        else:
            dist = (1. - rho) / 2.
        if isinstance(e1, (data.Table, data.RowInstance)):
            dist = DistMatrix(dist, e1, e2, axis)
        else:
            dist = DistMatrix(dist)
        return dist
184 | | - |
# Ready-made Pearson metrics: plain and absolute-value variants.
PearsonR = PearsonDistance(False, 'Pearson')
PearsonRAbsolute = PearsonDistance(True, 'Pearson absolute')
187 | | - |
188 | | - |
class MahalanobisDistance(Distance):
    """Mahalanobis distance."""

    def __init__(self, data=None, axis=1, name='Mahalanobis'):
        self.name = name
        self.supports_sparse = False
        # Set by fit(): the axis the metric was fitted for and the
        # inverse covariance matrix used by the distance computation.
        self.axis = None
        self.VI = None
        if data is not None:
            self.fit(data, axis)

    def fit(self, data, axis=1):
        """
        Compute the covariance matrix needed for calculating distances.

        Args:
            data: The dataset used for calculating covariances.
            axis: If axis=1 we calculate distances between rows, if axis=0 we
                calculate distances between columns.
        """
        x = _orange_to_numpy(data)
        if axis == 0:
            x = x.T
        self.axis = axis
        # Narrow exception handling: the original bare ``except:`` turned
        # *any* failure (even KeyboardInterrupt) into a misleading error.
        try:
            c = np.cov(x.T)
        except MemoryError:
            raise MemoryError("Covariance matrix is too large.")
        try:
            self.VI = np.linalg.inv(c)
        except (np.linalg.LinAlgError, ValueError) as e:
            raise ValueError(
                "Computation of inverse covariance matrix failed.") from e

    def __call__(self, e1, e2=None, axis=None, impute=False):
        """Return Mahalanobis distances using the fitted covariance.

        ``axis``, when given, must match the axis used in :meth:`fit`.
        See :meth:`Distance.__call__` for the remaining parameters.
        """
        assert self.VI is not None, \
            "Mahalanobis distance must be initialized with the fit() method."

        x1 = _orange_to_numpy(e1)
        x2 = _orange_to_numpy(e2)

        if axis is not None:
            assert axis == self.axis, \
                "Axis must match its value at initialization."
        if self.axis == 0:
            x1 = x1.T
            if x2 is not None:
                x2 = x2.T
        # Both inputs must have as many features as the fitted covariance.
        if x1.shape[1] != self.VI.shape[0] or \
                (x2 is not None and x2.shape[1] != self.VI.shape[0]):
            raise ValueError('Incorrect number of features.')

        dist = skl_metrics.pairwise.pairwise_distances(
            x1, x2, metric='mahalanobis', VI=self.VI)
        if np.isnan(dist).any() and impute:
            dist = np.nan_to_num(dist)
        if isinstance(e1, (data.Table, data.RowInstance)):
            dist = DistMatrix(dist, e1, e2, self.axis)
        else:
            dist = DistMatrix(dist)
        return dist
248 | | - |
249 | | - |
# Only retain this to raise errors on use. Remove in some future version.
class __MahalanobisDistanceError(MahalanobisDistance):
    """Placeholder that rejects the old module-level Mahalanobis usage."""

    def _raise_error(self, *args, **kwargs):
        raise RuntimeError(
            "Invalid use of MahalanobisDistance.\n"
            "Create a new MahalanobisDistance instance first, e.g.\n"
            ">>> metric = MahalanobisDistance(data)\n"
            ">>> dist = metric(data)"
        )

    def fit(self, *args, **kwargs):
        self._raise_error()

    def __call__(self, *args, **kwargs):
        self._raise_error()

Mahalanobis = __MahalanobisDistanceError()
| 6 | +from .base import _preprocess, remove_discrete_features, impute |
0 commit comments