Skip to content

Commit 9d6bdec

Browse files
committed
distance: Implement Cosine distance
1 parent d11cc75 commit 9d6bdec

File tree

6 files changed

+4644
-1916
lines changed

6 files changed

+4644
-1916
lines changed

Orange/distance/__init__.py

Lines changed: 64 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,10 @@ def compute_distances(self, x1, x2=None):
104104
return self.distance_by_cols(x1, self.fit_params)
105105
else:
106106
return self.distance_by_rows(
107-
x1, x2 if x2 is not None else x1, self.fit_params)
107+
x1,
108+
x2 if x2 is not None else x1,
109+
x2 is not None,
110+
self.fit_params)
108111

109112

110113
class FittedDistance(Distance):
@@ -121,6 +124,13 @@ def fit(self, data):
121124
# pylint: disable=not-callable
122125
return self.ModelType(attributes, axis=self.axis, fit_params=fit_params)
123126

127+
def fit_cols(self, x, n_vals):
    """Validate the data before fitting parameters for column distances.

    Args:
        x: the data; not inspected by this base implementation.
        n_vals: per-column values; any truthy entry marks a column with
            discrete values.

    Raises:
        ValueError: if any entry of `n_vals` is truthy.
    """
    has_discrete = any(n_vals)
    if not has_discrete:
        return None
    raise ValueError("columns with discrete values are incommensurable")
130+
131+
def fit_rows(self, x, n_vals):
    """Hook for row-wise fitting; this base implementation does nothing.

    Both arguments are accepted and ignored, and None is returned.
    """
    return None
133+
124134

125135
class EuclideanModel(FittedDistanceModel):
126136
name = "Euclidean"
@@ -137,6 +147,7 @@ def __new__(cls, *args, **kwargs):
137147
return super().__new__(cls, *args, **kwargs)
138148

139149
def fit_rows(self, x, n_vals):
150+
super().fit_rows(x, n_vals)
140151
n_cols = len(n_vals)
141152
n_bins = max(n_vals)
142153
means = np.zeros(n_cols, dtype=float)
@@ -171,9 +182,7 @@ def fit_rows(self, x, n_vals):
171182
normalize=int(self.normalize))
172183

173184
def fit_cols(self, x, n_vals):
174-
if any(n_vals):
175-
raise ValueError(
176-
"columns with discrete values are not commensurate")
185+
super().fit_cols(x, n_vals)
177186
means = np.nanmean(x, axis=0)
178187
vars = np.nanvar(x, axis=0)
179188
if np.isnan(vars).any() or not vars.all():
@@ -196,6 +205,7 @@ def __new__(cls, *args, **kwargs):
196205
return super().__new__(cls, *args, **kwargs)
197206

198207
def fit_rows(self, x, n_vals):
208+
super().fit_rows(x, n_vals)
199209
n_cols = len(n_vals)
200210
n_bins = max(n_vals)
201211

@@ -227,9 +237,7 @@ def fit_rows(self, x, n_vals):
227237
normalize=int(self.normalize))
228238

229239
def fit_cols(self, x, n_vals):
230-
if any(n_vals):
231-
raise ValueError(
232-
"columns with discrete values are not commensurate")
240+
super().fit_cols(x, n_vals)
233241
medians = np.nanmedian(x, axis=0)
234242
mads = np.nanmedian(np.abs(x - medians), axis=0)
235243
if np.isnan(mads).any() or not mads.all():
@@ -239,6 +247,49 @@ def fit_cols(self, x, n_vals):
239247
return dict(medians=medians, mads=mads, normalize=int(self.normalize))
240248

241249

250+
class CosineModel(FittedDistanceModel):
    """Fitted model for the cosine distance.

    The class only binds the routines from `_distance` that are used for
    distances by rows and by columns.
    """

    # Routines used for column-wise and row-wise distances, respectively.
    distance_by_cols = _distance.cosine_cols
    distance_by_rows = _distance.cosine_rows

    # This model does not declare support for sparse data.
    supports_sparse = False
254+
255+
256+
class Cosine(FittedDistance):
    """Cosine distance; fitting produces a `CosineModel`.

    Unless the caller passes `normalize` explicitly, it defaults to False.
    """

    ModelType = CosineModel

    def __new__(cls, *args, **kwargs):
        # Only fills in the default; an explicit `normalize` is left alone.
        kwargs.setdefault("normalize", False)
        return super().__new__(cls, *args, **kwargs)

    def fit_rows(self, x, n_vals):
        """Compute per-column statistics of `x`.

        Args:
            x: 2-D array; statistics are computed for each column.
            n_vals: per-column sequence; a truthy entry selects the
                discrete branch for that column (presumably the number of
                values of a discrete variable -- confirm with the caller).

        Returns:
            dict with three float arrays, one entry per column:

            - ``means``: for discrete columns, the share of non-missing
              values that are nonzero (NaN if every value is missing);
              for continuous columns, the mean ignoring NaNs (0 if every
              value is missing).
            - ``vars``: -1 for discrete columns, -2 for continuous columns
              that are entirely missing or have zero variance, otherwise
              the variance ignoring NaNs. The negative values look like
              markers for the `_distance` routines -- confirm there.
            - ``dist_missing2``: `means` for discrete columns, `means ** 2`
              for continuous ones; NaN is replaced by 0.
        """
        super().fit_rows(x, n_vals)
        n, n_cols = x.shape
        means = np.zeros(n_cols, dtype=float)
        variances = np.empty(n_cols, dtype=float)
        dist_missing2 = np.zeros(n_cols, dtype=float)

        for col in range(n_cols):
            column = x[:, col]
            if n_vals[col]:
                variances[col] = -1
                nonnans = n - np.sum(np.isnan(column))
                if nonnans:
                    means[col] = 1 - np.sum(column == 0) / nonnans
                else:
                    # No known values: the ratio would be 0 / 0. Store the
                    # NaN that the division would give, without triggering
                    # NumPy's invalid-value warning.
                    means[col] = np.nan
                dist_missing2[col] = means[col]
            elif np.isnan(column).all():  # avoid warnings in nanmean and nanvar
                variances[col] = -2
            else:
                means[col] = util.nanmean(column)
                variances[col] = util.nanvar(column)
                if variances[col] == 0:
                    variances[col] = -2
                dist_missing2[col] = means[col] ** 2
            if np.isnan(dist_missing2[col]):
                dist_missing2[col] = 0

        return dict(means=means, vars=variances, dist_missing2=dist_missing2)

    # Column-wise fitting uses the same statistics as row-wise fitting.
    fit_cols = fit_rows
291+
292+
242293
class JaccardModel(FittedDistanceModel):
243294
supports_sparse = False
244295
distance_by_cols = _distance.jaccard_cols
@@ -248,17 +299,14 @@ class JaccardModel(FittedDistanceModel):
248299
class Jaccard(FittedDistance):
249300
ModelType = JaccardModel
250301
name = "Jaccard"
251-
fit_rows = fit_cols = _distance.fit_jaccard
252-
253-
254-
class CosineModel(EuclideanModel):
255-
def compute_distances(self, x1, x2=None):
256-
return 1 - np.cos(1 - super().compute_distances(x1, x2))
257302

303+
def fit_rows(self, x, n_vals):
    """Compute `_distance.p_nonzero` for each column of `x`.

    Args:
        x: 2-D array, indexed by column.
        n_vals: per-column sequence; only its length is used, as the
            number of columns to process.

    Returns:
        dict with the key ``"ps"`` holding a double array with one
        `p_nonzero` result per column.
    """
    n_cols = len(n_vals)
    column_ps = (_distance.p_nonzero(x[:, col]) for col in range(n_cols))
    return {"ps": np.fromiter(column_ps, dtype=np.double, count=n_cols)}
258308

259-
class Cosine(Euclidean):
260-
ModelType = CosineModel
261-
name = "Cosine"
309+
fit_cols = fit_rows
262310

263311

264312
class SpearmanDistance(Distance):

0 commit comments

Comments
 (0)