Skip to content

Commit 38a8659

Browse files
author
Dave Lahr
committed
math/fast_cov and math/fast_corr: made recommended changes - remove axis variable (only works on columns of provided matrices), remove print_trace, added spearman correlation
1 parent 514e796 commit 38a8659

File tree

5 files changed

+114
-257
lines changed

5 files changed

+114
-257
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
__pycache__/
22
*.pyc
3+
cmapPy.egg-info/
4+
.vscode
5+
.gitignore

cmapPy/math/fast_corr.py

Lines changed: 37 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2,42 +2,56 @@
22
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
33
import numpy
44
import cmapPy.math.fast_cov as fast_cov
5+
import pandas
56

67

78
logger = logging.getLogger(setup_logger.LOGGER_NAME)
89

910

10-
def fast_corr(x, y=None, axis=0, do_print_trace=False):
11-
"""calculate the pearson correlation matrix for x, or optionally, the correlaton matrix between x and y.
12-
axis parameter defines whether to treat the rows or columns as vectors that the covariance is calculated
13-
between
11+
def fast_corr(x, y=None):
12+
"""calculate the pearson correlation matrix for the columns of x (MxN), or optionally, the correlaton matrix between x and y (OxP).
13+
In the language of statistics the columns are the variables and the rows are the observations.
1414
1515
Args:
16-
x (numpy array-like) NxM in shape
17-
y (numpy array-like) OxP in shape
18-
axis (integer) indicates which axis of the arrays is the vectors. axis=0 indicates calculate
19-
the covariance between the rows, axis=1 indicates calculate the covariance between the columns
20-
do_print_trace (bool) whether or not to use logger.debug to print out intermediate mathematical steps
21-
(WARNING: will print out all intermediate matrices, potentially lots of output!)
22-
23-
returns (numpy array-like) QxR array of the covariance values
24-
for defaults (y=None, axis=0), shape is NxN
25-
if y=None and axis=1, shape is MxM
26-
if y is provied, axis=0, shape is NxO (and P==M in other words number of columns in x & y are the same)
27-
if y is provided, axis=1, shape is MxP (and N==O in other words number of rows in x & y are the same)
16+
x (numpy array-like) MxN in shape
17+
y (optional, numpy array-like) OxP in shape
18+
19+
returns (numpy array-like) array of the covariance values
20+
for defaults (y=None), shape is NxN
21+
if y is provied, shape is NxP
2822
"""
2923
if y is None:
3024
y = x
3125

32-
cov_mat = fast_cov.fast_cov(x, y, axis, do_print_trace)
33-
if do_print_trace: logger.debug("cov_mat:\n{}".format(cov_mat))
26+
cov_mat = fast_cov.fast_cov(x, y)
3427

35-
std_x = numpy.std(x.T, axis=axis, ddof=1)
36-
if do_print_trace: logger.debug("std_x:\n{}".format(std_x))
37-
std_y = numpy.std(y.T, axis=axis, ddof=1)
38-
if do_print_trace: logger.debug("std_y:\n{}".format(std_y))
28+
std_x = numpy.std(x, axis=0, ddof=1)
29+
std_y = numpy.std(y, axis=0, ddof=1)
3930

4031
std_outer = numpy.outer(std_x, std_y)
41-
if do_print_trace: logger.debug("std_outer:\n{}".format(std_outer))
4232

4333
return cov_mat / std_outer
34+
35+
36+
def fast_spearman(x, y=None):
37+
"""calculate the spearnab correlation matrix for the columns of x (MxN), or optionally, the spearmancorrelaton matrix between x and y (OxP).
38+
In the language of statistics the columns are the variables and the rows are the observations.
39+
40+
Args:
41+
x (numpy array-like) MxN in shape
42+
y (optional, numpy array-like) OxP in shape
43+
44+
returns:
45+
(numpy array-like) array of the covariance values
46+
for defaults (y=None), shape is NxN
47+
if y is provied, shape is NxP
48+
"""
49+
logger.debug("x.shape: {}".format(x.shape))
50+
if hasattr(y, "shape"):
51+
logger.debug("y.shape: {}".format(y.shape))
52+
53+
x_ranks = pandas.DataFrame(x).rank(method="average").values
54+
logger.debug("some min and max ranks of x_ranks:\n{}\n{}".format(numpy.min(x_ranks[:10], axis=0), numpy.max(x_ranks[:10], axis=0)))
55+
y_ranks = pandas.DataFrame(y).rank(method="average").values if y is not None else None
56+
57+
return fast_corr(x_ranks, y_ranks)

cmapPy/math/fast_cov.py

Lines changed: 21 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -6,59 +6,38 @@
66
logger = logging.getLogger(setup_logger.LOGGER_NAME)
77

88

9-
def fast_cov(x, y=None, axis=0, do_print_trace=False):
10-
"""calculate the covariance matrix for x, or optionally, the covariance matrix between x and y.
11-
axis parameter defines whether to treat the rows or columns as vectors that the correlation is calculated
12-
between
9+
def fast_cov(x, y=None):
10+
"""calculate the covariance matrix for the columns of x (MxN), or optionally, the covariance matrix between the
11+
columns of x and and the columns of y (MxP). (In the language of statistics, the columns are variables, the rows
12+
are observations).
1313
1414
Args:
15-
x (numpy array-like) NxM in shape
16-
y (numpy array-like) OxP in shape
17-
axis (integer) indicates which axis of the arrays is the vectors. axis=0 indicates calculate
18-
the covariance between the rows, axis=1 indicates calculate the covariance between the columns
19-
do_print_trace (bool) whether or not to use logger.debug to print out intermediate mathematical steps
20-
(WARNING: will print out all intermediate matrices, potentially lots of output!)
21-
22-
returns (numpy array-like) QxR array of the covariance values
23-
for defaults (y=None, axis=0), shape is NxN
24-
if y=None and axis=1, shape is MxM
25-
if y is provied, axis=0, shape is NxO (and P==M in other words number of columns in x & y are the same)
26-
if y is provided, axis=1, shape is MxP (and N==O in other words number of rows in x & y are the same)
15+
x (numpy array-like) MxN in shape
16+
y (numpy array-like) MxP in shape
17+
18+
returns (numpy array-like) array of the covariance values
19+
for defaults (y=None), shape is NxN
20+
if y is provided, shape is NxP
2721
"""
28-
validate_axis(axis)
29-
validate_x_y(x, y, axis)
22+
validate_x_y(x, y)
3023

3124
if y is None:
3225
y = x
3326

34-
mean_x = numpy.mean(x.T, axis=axis)
35-
if do_print_trace: logger.debug("mean_x:\n{}".format(mean_x))
36-
mean_y = numpy.mean(y.T, axis=axis)
37-
if do_print_trace: logger.debug("mean_y:\n{}".format(mean_y))
38-
39-
mean_centered_x = (x.T - mean_x).T if axis == 0 else (x - mean_x).T
40-
if do_print_trace: logger.debug("mean_centered_x:\n{}".format(mean_centered_x))
41-
mean_centered_y = y.T - mean_y if axis == 0 else y - mean_y
42-
if do_print_trace: logger.debug("mean_centered_y:\n{}".format(mean_centered_y))
43-
44-
dotprod = numpy.dot(mean_centered_x, mean_centered_y)
45-
if do_print_trace: logger.debug("dotprod:\n{}".format(dotprod))
27+
mean_x = numpy.mean(x, axis=0)
28+
mean_y = numpy.mean(y, axis=0)
29+
30+
mean_centered_x = x - mean_x
31+
mean_centered_y = y - mean_y
4632

47-
other_axis = 1 if axis == 0 else 0
48-
if do_print_trace: logger.debug("other_axis:\n{}".format(other_axis))
33+
dotprod = numpy.dot(mean_centered_x.T, mean_centered_y)
4934

50-
denom = float(x.shape[other_axis] - 1)
51-
if do_print_trace: logger.debug("denom:\n{}".format(denom))
35+
denom = x.shape[0] - 1
5236

5337
return dotprod / denom
5438

5539

56-
def validate_axis(axis):
57-
if not (0 == axis or 1 == axis):
58-
raise CmapPyMathFastCovInvalidInputAxis("invalid value for axis provided to fast_cov - axis: {}".format(axis))
59-
60-
61-
def validate_x_y(x, y, axis):
40+
def validate_x_y(x, y):
6241
error_msg = ""
6342

6443
if not hasattr(x, "shape"):
@@ -67,14 +46,8 @@ def validate_x_y(x, y, axis):
6746
if y is not None:
6847
if not hasattr(y, "shape"):
6948
error_msg += "y needs to be numpy array-like but it does not have \"shape\" attribute - type(y): {}\n".format(type(y))
70-
else:
71-
base_msg = "with axis={} (finding the covariance between {}), the number of {} in x and y must be the same but they are not - x.shape: {} y.shape: {}\n"
72-
if 0 == axis:
73-
if x.shape[1] != y.shape[1]:
74-
error_msg += base_msg.format(axis, "rows", "columns", x.shape, y.shape)
75-
else:
76-
if x.shape[0] != y.shape[0]:
77-
error_msg += base_msg.format(axis, "columns", "rows", x.shape, y.shape)
49+
elif x.shape[0] != y.shape[0]:
50+
error_msg += "the number of rows in the x and y matrices must be the same".format(x.shape, y.shape)
7851

7952
if error_msg != "":
8053
raise CmapPyMathFastCovInvalidInputXY(error_msg)

cmapPy/math/tests/test_fast_corr.py

Lines changed: 30 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
44
import cmapPy.math.fast_corr as fast_corr
55
import numpy
6+
import pandas
67

78

89
logger = logging.getLogger(setup_logger.LOGGER_NAME)
@@ -29,77 +30,25 @@ def test_fast_corr_just_x(self):
2930
logger.debug("*************happy path just x")
3031
x, _ = TestFastCorr.build_standard_x_y()
3132

32-
ex = numpy.corrcoef(x)
33+
ex = numpy.corrcoef(x, rowvar=False)
3334
logger.debug("expected ex: {}".format(ex))
3435

35-
r = fast_corr.fast_corr(x, do_print_trace=True)
36+
r = fast_corr.fast_corr(x)
3637
logger.debug("r: {}".format(r))
3738

3839
self.assertTrue(numpy.allclose(ex, r))
3940

40-
#happy path just x, transposed
41-
ex = numpy.corrcoef(x.T)
42-
logger.debug("happy path just x, transposed, expected ex: {}".format(ex))
43-
r = fast_corr.fast_corr(x.T, do_print_trace=True)
44-
logger.debug("r: {}".format(r))
45-
self.assertTrue(numpy.allclose(ex, r))
46-
47-
def test_fast_corr_just_x_different_axis(self):
48-
logger.debug("*************happy path just x, use different axis")
49-
x, _ = TestFastCorr.build_standard_x_y()
50-
51-
ex = numpy.corrcoef(x, rowvar=False)
52-
logger.debug("happy path just x, use different axis, expected ex: {}".format(ex))
53-
r = fast_corr.fast_corr(x, axis=1, do_print_trace=True)
54-
logger.debug("r: {}".format(r))
55-
self.assertTrue(numpy.allclose(ex, r))
56-
57-
#happy path just x, use different axis, transpose
58-
ex = numpy.corrcoef(x)
59-
logger.debug("happy path just x, use different axis, transpose, expected ex: {}".format(ex))
60-
r = fast_corr.fast_corr(x.T, axis=1, do_print_trace=True)
41+
#happy path just x, other direction
42+
ex = numpy.corrcoef(x, rowvar=True)
43+
logger.debug("happy path just x, other direction, expected ex: {}".format(ex))
44+
r = fast_corr.fast_corr(x.T)
6145
logger.debug("r: {}".format(r))
6246
self.assertTrue(numpy.allclose(ex, r))
6347

6448
def test_fast_corr_x_and_y(self):
6549
logger.debug("*************happy path x and y")
6650
x, y = TestFastCorr.build_standard_x_y()
6751

68-
combined = numpy.vstack([x, y])
69-
logger.debug("combined: {}".format(combined))
70-
logger.debug("combined.shape: {}".format(combined.shape))
71-
72-
off_diag_ind = combined.shape[0] / 2
73-
74-
raw_ex = numpy.corrcoef(combined)
75-
logger.debug("raw expected produced from numpy.cov on full combined - raw_ex: {}".format(raw_ex))
76-
ex = raw_ex[:off_diag_ind, off_diag_ind:]
77-
logger.debug("expected ex: {}".format(ex))
78-
79-
r = fast_corr.fast_corr(x, y, do_print_trace=True)
80-
logger.debug("r: {}".format(r))
81-
self.assertTrue(numpy.allclose(ex, r))
82-
83-
#happy path x and y, transposed
84-
combined = numpy.vstack([x.T, y.T])
85-
logger.debug("*************happy path x and y, transposed - combined: {}".format(combined))
86-
logger.debug("combined.shape: {}".format(combined.shape))
87-
88-
off_diag_ind = combined.shape[0] / 2
89-
90-
raw_ex = numpy.corrcoef(combined)
91-
logger.debug("raw expected produced from numpy.cov on full combined - raw_ex: {}".format(raw_ex))
92-
ex = raw_ex[:off_diag_ind, off_diag_ind:]
93-
logger.debug("expected ex: {}".format(ex))
94-
95-
r = fast_corr.fast_corr(x.T, y.T, do_print_trace=True)
96-
logger.debug("r: {}".format(r))
97-
self.assertTrue(numpy.allclose(ex, r))
98-
99-
def test_fast_corr_x_and_y_different_axis(self):
100-
logger.debug("*************happy path, x and y using different axis")
101-
x, y = TestFastCorr.build_standard_x_y()
102-
10352
combined = numpy.hstack([x, y])
10453
logger.debug("combined: {}".format(combined))
10554
logger.debug("combined.shape: {}".format(combined.shape))
@@ -111,16 +60,13 @@ def test_fast_corr_x_and_y_different_axis(self):
11160
ex = raw_ex[:off_diag_ind, off_diag_ind:]
11261
logger.debug("expected ex: {}".format(ex))
11362

114-
r = fast_corr.fast_corr(x, y, axis=1, do_print_trace=True)
63+
r = fast_corr.fast_corr(x, y)
11564
logger.debug("r: {}".format(r))
11665
self.assertTrue(numpy.allclose(ex, r))
11766

118-
def test_fast_corr_x_and_y_different_axis_transpose(self):
119-
logger.debug("*************happy path x and y different axis, transposed")
120-
x, y = TestFastCorr.build_standard_x_y()
121-
67+
#happy path x and y, other direction
12268
combined = numpy.hstack([x.T, y.T])
123-
logger.debug("combined: {}".format(combined))
69+
logger.debug("*************happy path x and y, other direction - combined: {}".format(combined))
12470
logger.debug("combined.shape: {}".format(combined.shape))
12571

12672
off_diag_ind = combined.shape[1] / 2
@@ -130,7 +76,7 @@ def test_fast_corr_x_and_y_different_axis_transpose(self):
13076
ex = raw_ex[:off_diag_ind, off_diag_ind:]
13177
logger.debug("expected ex: {}".format(ex))
13278

133-
r = fast_corr.fast_corr(x.T, y.T, axis=1, do_print_trace=True)
79+
r = fast_corr.fast_corr(x.T, y.T)
13480
logger.debug("r: {}".format(r))
13581
self.assertTrue(numpy.allclose(ex, r))
13682

@@ -153,56 +99,53 @@ def test_fast_corr_x_and_y_different_shapes(self):
15399
logger.debug("expected ex: {}".format(ex))
154100
logger.debug("ex.shape: {}".format(ex.shape))
155101

156-
r = fast_corr.fast_corr(x, y, axis=1, do_print_trace=True)
102+
r = fast_corr.fast_corr(x, y)
157103
logger.debug("r: {}".format(r))
158104
self.assertTrue(numpy.allclose(ex, r))
159105

160106
def test_fast_corr_functional(self):
161107
logger.debug("*************happy path functional test using randomly generated matrices")
162108

163-
axes = numpy.random.choice([0, 1], size=num_iterations_functional_tests)
164-
165-
for cur_axis in axes:
166-
logger.debug("*******cur_axis: {}".format(cur_axis))
167-
109+
for i in xrange(num_iterations_functional_tests):
168110
#the dimension containing the observations must have at least size 2
169-
x_shape = [numpy.random.randint(1, max_dimension_functional_tests),
170-
numpy.random.randint(2, max_dimension_functional_tests)]
171-
if cur_axis == 1:
172-
x_shape.reverse()
111+
x_shape = [numpy.random.randint(2, max_dimension_functional_tests),
112+
numpy.random.randint(1, max_dimension_functional_tests)]
173113
logger.debug("x_shape: {}".format(x_shape))
174114

175115
x = numpy.random.rand(x_shape[0], x_shape[1]) * numpy.random.randint(1, multiplier_max_functional_tests, size=1)
176116
logger.debug("x:\n{}".format(x))
177117

178118
y_other_shape = numpy.random.randint(1, max_dimension_functional_tests, size=1)[0]
179-
y_shape = (y_other_shape, x_shape[1]) if cur_axis == 0 else (x_shape[0], y_other_shape)
119+
y_shape = (x_shape[0], y_other_shape)
180120
logger.debug("y_shape: {}".format(y_shape))
181121
y = numpy.random.rand(y_shape[0], y_shape[1]) * numpy.random.randint(1, multiplier_max_functional_tests, size=1)
182122
logger.debug("y:\n{}".format(y))
183123

184-
if cur_axis == 0:
185-
row_var = True
186-
combined = numpy.vstack([x, y])
187-
off_diag_ind = (x.shape[0], -y.shape[0])
188-
else:
189-
row_var = False
190-
combined = numpy.hstack([x, y])
191-
off_diag_ind = (x.shape[1], -y.shape[1])
124+
combined = numpy.hstack([x, y])
192125

193-
raw_ex = numpy.corrcoef(combined, rowvar=row_var)
126+
raw_ex = numpy.corrcoef(combined, rowvar=False)
194127
logger.debug("raw_ex.shape: {}".format(raw_ex.shape))
195128

196-
ex = raw_ex[:off_diag_ind[0], off_diag_ind[1]:]
129+
ex = raw_ex[:x.shape[1], -y.shape[1]:]
197130
logger.debug("ex:\n{}".format(ex))
198131
logger.debug("ex.shape: {}".format(ex.shape))
199132

200-
r = fast_corr.fast_corr(x, y, axis=cur_axis)
133+
r = fast_corr.fast_corr(x, y)
201134
logger.debug("r:\n{}".format(r))
202135
logger.debug("r.shape: {}".format(r.shape))
203136

204137
self.assertTrue(numpy.allclose(ex, r))
205138

139+
def test_fast_spearman(self):
140+
x, y = TestFastCorr.build_standard_x_y()
141+
142+
ex = numpy.array([[1.0, 1.0, 1.0], [-1.0, -1.0, -1.0], [1.0, 1.0, 1.0]])
143+
144+
r = fast_corr.fast_spearman(x, y)
145+
logger.debug("r: {}".format(r))
146+
147+
self.assertTrue(numpy.allclose(ex, r))
148+
206149

207150
if __name__ == "__main__":
208151
setup_logger.setup(verbose=True)

0 commit comments

Comments
 (0)