Skip to content

Commit b09b282

Browse files
authored
Merge pull request #2852 from ales-erjavec/distances-correlations-optimization
[ENH] Distances: Optimize PearsonR/SpearmanR
2 parents 3fdba8d + f837920 commit b09b282

File tree

2 files changed

+169
-12
lines changed

2 files changed

+169
-12
lines changed

Orange/distance/distance.py

Lines changed: 116 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -418,25 +418,128 @@ def __init__(self, absolute, axis=1, impute=False):
418418
self.absolute = absolute
419419

420420
def compute_distances(self, x1, x2):
421-
if x2 is None:
422-
x2 = x1
423421
rho = self.compute_correlation(x1, x2)
424422
if self.absolute:
425423
return (1. - np.abs(rho)) / 2.
426424
else:
427425
return (1. - rho) / 2.
428426

429427
def compute_correlation(self, x1, x2):
430-
pass
428+
raise NotImplementedError()
431429

432430

433431
class SpearmanModel(CorrelationDistanceModel):
434432
def compute_correlation(self, x1, x2):
435-
rho = stats.spearmanr(x1, x2, axis=self.axis)[0]
436-
if isinstance(rho, np.float):
437-
return np.array([[rho]])
438-
slc = x1.shape[1 - self.axis]
439-
return rho[:slc, slc:]
433+
if x2 is None:
434+
n1 = x1.shape[1 - self.axis]
435+
if n1 == 2:
436+
# Special case to properly fill degenerate self correlations
437+
# (nan, inf on the diagonals)
438+
rho = stats.spearmanr(x1, x1, axis=self.axis)[0]
439+
assert rho.shape == (4, 4)
440+
rho = rho[:2, :2].copy()
441+
else:
442+
# scalar if n1 == 1
443+
rho = stats.spearmanr(x1, axis=self.axis)[0]
444+
return np.atleast_2d(rho)
445+
else:
446+
return _spearmanr2(x1, x2, axis=self.axis)
447+
448+
449+
def _spearmanr2(a, b, axis=0):
    """
    Compute all pairwise spearman rank moment correlations between rows
    or columns of a and b

    Parameters
    ----------
    a : (N, M) numpy.ndarray
        The input cases a.
    b : (J, K) numpy.ndarray
        The input cases b. In case of axis == 0: J must equal N;
        otherwise if axis == 1 then K must equal M.
    axis : int
        If 0 the correlations are computed between a and b's columns.
        Otherwise if 1 the correlations are computed between rows.

    Returns
    -------
    cor : (M, K) or (N, J) numpy.ndarray
        If axis == 0 then an (M, K) matrix of correlations between
        a x b columns else an (N, J) matrix of correlations between
        a x b rows.

    Raises
    ------
    ValueError
        If axis is neither 0 nor 1, or if a and b do not have the same
        length along `axis`.

    See Also
    --------
    scipy.stats.spearmanr
    """
    a, b = np.atleast_2d(a, b)
    if not (axis == 0 or axis == 1):
        raise ValueError("Invalid axis {} (only 0 or 1 accepted)".format(axis))
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if a.shape[axis] != b.shape[axis]:
        raise ValueError(
            "a and b must have the same length along axis {} ({} != {})"
            .format(axis, a.shape[axis], b.shape[axis])
        )

    # Spearman's rho is the Pearson correlation of the rank-transformed
    # data, so rank every variable of a and b independently along `axis` ...
    ar = np.apply_along_axis(stats.rankdata, axis, a)
    br = np.apply_along_axis(stats.rankdata, axis, b)

    # ... and delegate the cross-correlation of the ranks.
    return _corrcoef2(ar, br, axis=axis)
481+
482+
483+
def _corrcoef2(a, b, axis=0):
    """
    Compute all pairwise Pearson product-moment correlation coefficients
    between rows or columns of a and b

    Parameters
    ----------
    a : (N, M) numpy.ndarray
        The input cases a.
    b : (J, K) numpy.ndarray
        The input cases b. In case of axis == 0: J must equal N;
        otherwise if axis == 1 then K must equal M.
    axis : int
        If 0 the correlations are computed between a and b's columns.
        Otherwise if 1 the correlations are computed between rows.

    Returns
    -------
    cor : (M, K) or (N, J) numpy.ndarray
        If axis == 0 then an (M, K) matrix of correlations between
        a x b columns else an (N, J) matrix of correlations between
        a x b rows. Entries are nan where either variable is constant
        (zero sum of squares).

    Raises
    ------
    ValueError
        If axis is neither 0 nor 1, or if a and b do not have the same
        length along `axis`.

    See Also
    --------
    numpy.corrcoef
    """
    a, b = np.atleast_2d(a, b)
    if not (axis == 0 or axis == 1):
        raise ValueError("Invalid axis {} (only 0 or 1 accepted)".format(axis))
    # Validate before doing any work; an explicit raise (rather than an
    # `assert`) also survives `python -O`.
    if a.shape[axis] != b.shape[axis]:
        raise ValueError(
            "a and b must have the same length along axis {} ({} != {})"
            .format(axis, a.shape[axis], b.shape[axis])
        )

    # Center every variable; the subtraction also yields fresh arrays, so
    # the caller's inputs are never modified.
    a = a - np.mean(a, axis=axis, keepdims=True)
    b = b - np.mean(b, axis=axis, keepdims=True)

    # Cross products of the centered variables: one row per variable of a,
    # one column per variable of b.
    if axis == 0:
        C = a.T.dot(b)
    else:
        C = a.dot(b.T)

    # Per-variable sums of squares (1-d, one entry per variable).
    ss_a = np.sum(a ** 2, axis=axis)
    ss_b = np.sum(b ** 2, axis=axis)

    # Normalize rows by a's norms and columns by b's norms.
    C /= np.sqrt(ss_a)[:, np.newaxis]
    C /= np.sqrt(ss_b)[np.newaxis, :]

    # Rounding can push |rho| marginally above 1, which would turn into a
    # negative distance downstream; clip like numpy.corrcoef does. nan
    # entries are left untouched by clip.
    np.clip(C, -1.0, 1.0, out=C)
    return C
440543

441544

442545
class CorrelationDistance(Distance):
@@ -455,10 +558,11 @@ def fit(self, _):
455558

456559
class PearsonModel(CorrelationDistanceModel):
457560
def compute_correlation(self, x1, x2):
458-
if self.axis == 0:
459-
x1 = x1.T
460-
x2 = x2.T
461-
return np.array([[stats.pearsonr(i, j)[0] for j in x2] for i in x1])
561+
if x2 is None:
562+
c = np.corrcoef(x1, rowvar=self.axis == 1)
563+
return np.atleast_2d(c)
564+
else:
565+
return _corrcoef2(x1, x2, axis=self.axis)
462566

463567

464568
class PearsonR(CorrelationDistance):

Orange/tests/test_distances.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,16 @@
66

77
import numpy as np
88
import scipy
9+
import scipy.spatial
10+
import scipy.stats
911
from scipy.sparse import csr_matrix
1012

1113
from Orange.data import (Table, Domain, ContinuousVariable,
1214
DiscreteVariable, StringVariable, Instance)
1315
from Orange.distance import (Euclidean, SpearmanR, SpearmanRAbsolute,
1416
PearsonR, PearsonRAbsolute, Manhattan, Cosine,
1517
Jaccard, _preprocess, MahalanobisDistance)
18+
from Orange.distance.distance import _spearmanr2, _corrcoef2
1619
from Orange.misc import DistMatrix
1720
from Orange.tests import named_file, test_filename
1821
from Orange.util import OrangeDeprecationWarning
@@ -598,6 +601,30 @@ def test_spearmanr_distance_numpy(self):
598601
[0.3833333],
599602
[0.]]))
600603

604+
def test_spearmanr2(self):
    # Test that _spearmanr2 returns the same result that stats.spearmanr
    # would.
    # A locally seeded generator keeps failures reproducible and leaves
    # the global numpy random state untouched; looping over all shapes
    # replaces the single randomly drawn (n, m) of each run.
    rng = np.random.RandomState(0)
    for n in range(2, 5):
        for m in range(2, 5):
            mean = rng.uniform(-1, 1, size=m)
            # Symmetric matrix with unit diagonal and small off-diagonal
            # entries, used as the covariance of the sampled data.
            cov = rng.uniform(0, 1. / m, size=(m, m))
            cov = (cov + cov.T) / 2
            cov.flat[::m + 1] = 1.0
            X1 = rng.multivariate_normal(mean, cov, size=n)
            X2 = rng.multivariate_normal(mean, cov, size=n)

            # Row-wise: the X1-vs-X2 block of the joint correlation matrix.
            expected = scipy.stats.spearmanr(X1, X2, axis=1)[0][:n, n:]
            np.testing.assert_almost_equal(
                _spearmanr2(X1, X2, axis=1),
                expected,
                decimal=9,
            )

            # Column-wise.
            expected = scipy.stats.spearmanr(X1, X2, axis=0)[0][:m, m:]
            np.testing.assert_almost_equal(
                _spearmanr2(X1, X2, axis=0),
                expected,
                decimal=9,
            )
627+
601628

602629
# noinspection PyTypeChecker
603630
class TestSpearmanRAbsolute(TestCase):
@@ -752,6 +779,32 @@ def test_pearsonr_distance_numpy(self):
752779
[0.32783865],
753780
[0.]]))
754781

782+
def test_corrcoef2(self):
    # Test that _corrcoef2 returns the same result that np.corrcoef would.
    # A locally seeded generator keeps failures reproducible and leaves
    # the global numpy random state untouched; looping over all shapes
    # replaces the single randomly drawn (n, m) of each run.
    rng = np.random.RandomState(0)
    for n in range(2, 5):
        for m in range(2, 5):
            mean = rng.uniform(-1, 1, size=m)
            # Symmetric matrix with unit diagonal and small off-diagonal
            # entries, used as the covariance of the sampled data.
            cov = rng.uniform(0, 1. / m, size=(m, m))
            cov = (cov + cov.T) / 2
            cov.flat[::m + 1] = 1.0
            X1 = rng.multivariate_normal(mean, cov, size=n)
            X2 = rng.multivariate_normal(mean, cov, size=n)

            # Row-wise: the X1-vs-X2 block of the joint correlation matrix.
            expected = np.corrcoef(X1, X2, rowvar=True)[:n, n:]
            np.testing.assert_almost_equal(
                _corrcoef2(X1, X2, axis=1),
                expected,
                decimal=9,
            )

            # Column-wise.
            expected = np.corrcoef(X1, X2, rowvar=False)[:m, m:]
            np.testing.assert_almost_equal(
                _corrcoef2(X1, X2, axis=0),
                expected,
                decimal=9,
            )

            # Only axis 0 and 1 are supported.
            with self.assertRaises(ValueError):
                _corrcoef2(X1, X2, axis=10)
807+
755808

756809
# noinspection PyTypeChecker
757810
class TestPearsonRAbsolute(TestCase):

0 commit comments

Comments
 (0)