Skip to content

Commit 1d5c34f

Browse files
committed
distances: Optimize PearsonR/SpearmanR ...
... for the case where computing distances from two tables.
1 parent fc01026 commit 1d5c34f

File tree

1 file changed

+99
-20
lines changed

1 file changed

+99
-20
lines changed

Orange/distance/distance.py

Lines changed: 99 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -425,14 +425,13 @@ def compute_distances(self, x1, x2):
425425
return (1. - rho) / 2.
426426

427427
def compute_correlation(self, x1, x2):
428-
pass
428+
raise NotImplementedError()
429429

430430

431431
class SpearmanModel(CorrelationDistanceModel):
432432
def compute_correlation(self, x1, x2):
433-
n1 = x1.shape[1 - self.axis]
434-
n2 = x2.shape[1 - self.axis] if x2 is not None else 0
435433
if x2 is None:
434+
n1 = x1.shape[1 - self.axis]
436435
if n1 == 2:
437436
# Special case to properly fill degenerate self correlations
438437
# (nan, inf on the diagonals)
@@ -444,15 +443,102 @@ def compute_correlation(self, x1, x2):
444443
rho = stats.spearmanr(x1, axis=self.axis)[0]
445444
return np.atleast_2d(rho)
446445
else:
447-
# this computes too much (most of it is thrown away)
448-
rho = stats.spearmanr(x1, x2, axis=self.axis)[0]
449-
if np.isscalar(rho):
450-
# scalar if n1 + n2 <= 2
451-
assert n1 + n2 <= 2
452-
return np.atleast_2d(rho)
453-
else:
454-
assert rho.shape == (n1 + n2, n1 + n2)
455-
return rho[:n1, n1:].copy()
446+
return _spearmanr2(x1, x2, axis=self.axis)
447+
448+
449+
def _spearmanr2(a, b, axis=0):
450+
"""
451+
Compute all pairwise spearman rank moment correlations between rows
452+
or columns of a and b
453+
454+
Parameters
455+
----------
456+
a : (N, M) numpy.ndarray
457+
The input cases a.
458+
b : (J, K) numpy.ndarray
459+
The input cases b. In case of axis == 0: J must equal N;
460+
otherwise if axis == 1 then K must equal M.
461+
axis : int
462+
If 0 the correlation are computed between a and b's columns.
463+
Otherwise if 1 the correlations are computed between rows.
464+
465+
Returns
466+
-------
467+
cor : (N, J) or (M, K) nd.array
468+
If axis == 0 then (N, J) matrix of correlations between a x b columns
469+
else a (N, J) matrix of correlations between a x b rows.
470+
471+
See Also
472+
--------
473+
scipy.stats.spearmanr
474+
"""
475+
a, b = np.atleast_2d(a, b)
476+
assert a.shape[axis] == b.shape[axis]
477+
ar = np.apply_along_axis(stats.rankdata, axis, a)
478+
br = np.apply_along_axis(stats.rankdata, axis, b)
479+
480+
return _corrcoef2(ar, br, axis=axis)
481+
482+
483+
def _corrcoef2(a, b, axis=0):
484+
"""
485+
Compute all pairwise Pearson product-moment correlation coefficients
486+
between rows or columns of a and b
487+
488+
Parameters
489+
----------
490+
a : (N, M) numpy.ndarray
491+
The input cases a.
492+
b : (J, K) numpy.ndarray
493+
The input cases b. In case of axis == 0: J must equal N;
494+
otherwise if axis == 1 then K must equal M.
495+
axis : int
496+
If 0 the correlation are computed between a and b's columns.
497+
Otherwise if 1 the correlations are computed between rows.
498+
499+
Returns
500+
-------
501+
cor : (N, J) or (M, K) nd.array
502+
If axis == 0 then (N, J) matrix of correlations between a x b columns
503+
else a (N, J) matrix of correlations between a x b rows.
504+
505+
See Also
506+
--------
507+
numpy.corrcoef
508+
"""
509+
a, b = np.atleast_2d(a, b)
510+
mean_a = np.mean(a, axis=axis, keepdims=True)
511+
mean_b = np.mean(b, axis=axis, keepdims=True)
512+
assert a.shape[axis] == b.shape[axis]
513+
514+
n = a.shape[1 - axis]
515+
m = b.shape[1 - axis]
516+
517+
a = a - mean_a
518+
b = b - mean_b
519+
520+
if axis == 0:
521+
C = a.T.dot(b)
522+
assert C.shape == (n, m)
523+
elif axis == 1:
524+
C = a.dot(b.T)
525+
assert C.shape == (n, m)
526+
else:
527+
raise ValueError()
528+
529+
ss_a = np.sum(a ** 2, axis=axis, keepdims=True)
530+
ss_b = np.sum(b ** 2, axis=axis, keepdims=True)
531+
532+
if axis == 0:
533+
ss_a = ss_a.T
534+
else:
535+
ss_b = ss_b.T
536+
537+
assert ss_a.shape == (n, 1)
538+
assert ss_b.shape == (1, m)
539+
C /= np.sqrt(ss_a)
540+
C /= np.sqrt(ss_b)
541+
return C
456542

457543

458544
class CorrelationDistance(Distance):
@@ -475,14 +561,7 @@ def compute_correlation(self, x1, x2):
475561
c = np.corrcoef(x1, rowvar=self.axis == 1)
476562
return np.atleast_2d(c)
477563
else:
478-
# this computes too much (most of it is thrown away)`
479-
c = np.corrcoef(x1, x2, rowvar=self.axis == 1)
480-
if np.isscalar(c):
481-
return np.atleast_2d(c)
482-
n1 = x1.shape[1 - self.axis]
483-
n2 = x2.shape[1 - self.axis]
484-
assert c.shape[0] == c.shape[1] == n1 + n2
485-
return c[:n1, n1:].copy()
564+
return _corrcoef2(x1, x2, axis=self.axis)
486565

487566

488567
class PearsonR(CorrelationDistance):

0 commit comments

Comments
 (0)