Skip to content

Commit fc01026

Browse files
committed
distance: Speed and memory optimization
* Use numpy.corrcoef in PearsonR
* Optimize PearsonR/SpearmanR when computing pairwise distances on a single input table
1 parent 08b10bd commit fc01026

File tree

1 file changed

+35
-11
lines changed

1 file changed

+35
-11
lines changed

Orange/distance/distance.py

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -418,8 +418,6 @@ def __init__(self, absolute, axis=1, impute=False):
418418
self.absolute = absolute
419419

420420
def compute_distances(self, x1, x2):
421-
if x2 is None:
422-
x2 = x1
423421
rho = self.compute_correlation(x1, x2)
424422
if self.absolute:
425423
return (1. - np.abs(rho)) / 2.
@@ -432,11 +430,29 @@ def compute_correlation(self, x1, x2):
432430

433431
class SpearmanModel(CorrelationDistanceModel):
434432
def compute_correlation(self, x1, x2):
435-
rho = stats.spearmanr(x1, x2, axis=self.axis)[0]
436-
if isinstance(rho, np.float):
437-
return np.array([[rho]])
438-
slc = x1.shape[1 - self.axis]
439-
return rho[:slc, slc:]
433+
n1 = x1.shape[1 - self.axis]
434+
n2 = x2.shape[1 - self.axis] if x2 is not None else 0
435+
if x2 is None:
436+
if n1 == 2:
437+
# Special case to properly fill degenerate self correlations
438+
# (nan, inf on the diagonals)
439+
rho = stats.spearmanr(x1, x1, axis=self.axis)[0]
440+
assert rho.shape == (4, 4)
441+
rho = rho[:2, :2].copy()
442+
else:
443+
# scalar if n1 == 1
444+
rho = stats.spearmanr(x1, axis=self.axis)[0]
445+
return np.atleast_2d(rho)
446+
else:
447+
# this computes too much (most of it is thrown away)
448+
rho = stats.spearmanr(x1, x2, axis=self.axis)[0]
449+
if np.isscalar(rho):
450+
# scalar if n1 + n2 <= 2
451+
assert n1 + n2 <= 2
452+
return np.atleast_2d(rho)
453+
else:
454+
assert rho.shape == (n1 + n2, n1 + n2)
455+
return rho[:n1, n1:].copy()
440456

441457

442458
class CorrelationDistance(Distance):
@@ -455,10 +471,18 @@ def fit(self, _):
455471

456472
class PearsonModel(CorrelationDistanceModel):
457473
def compute_correlation(self, x1, x2):
458-
if self.axis == 0:
459-
x1 = x1.T
460-
x2 = x2.T
461-
return np.array([[stats.pearsonr(i, j)[0] for j in x2] for i in x1])
474+
if x2 is None:
475+
c = np.corrcoef(x1, rowvar=self.axis == 1)
476+
return np.atleast_2d(c)
477+
else:
478+
# this computes too much (most of it is thrown away)
479+
c = np.corrcoef(x1, x2, rowvar=self.axis == 1)
480+
if np.isscalar(c):
481+
return np.atleast_2d(c)
482+
n1 = x1.shape[1 - self.axis]
483+
n2 = x2.shape[1 - self.axis]
484+
assert c.shape[0] == c.shape[1] == n1 + n2
485+
return c[:n1, n1:].copy()
462486

463487

464488
class PearsonR(CorrelationDistance):

0 commit comments

Comments
 (0)