Skip to content

Commit 6b3d0d2

Browse files
author
lev
committed
Merge branch 'master' into math
2 parents 83aa692 + 0406a37 commit 6b3d0d2

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+2391
-1187
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
__pycache__/
22
*.pyc
3+
cmapPy.egg-info/
4+
.vscode
5+
.gitignore

.travis.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ install:
1313
env:
1414
- TEST_DIR=cmapPy/pandasGEXpress/tests
1515
- TEST_DIR=cmapPy/set_io/tests
16+
- TEST_DIR=cmapPy/math/tests
1617

1718
# run all tests in each test dir
1819
script: cd $TEST_DIR && python -m unittest discover -p "test_*.py"

README.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1818
**Connectivity Map, Broad Institute of MIT and Harvard**
1919

20-
Documentation: `<http://cmappy.readthedocs.io/en/latest/>`_
20+
Documentation: `<https://clue.io/cmapPy/index.html>`_
2121

2222
For questions/problems, please add an issue (that includes code/files that reproduce your problem) to the repository.
2323

@@ -29,6 +29,7 @@ We welcome contributors! For your pull requests, please include the following:
2929
* Sample code/file that reproducibly causes the bug/issue
3030
* Documented code providing fix
3131
* Unit tests evaluating added/modified methods.
32+
3233

3334
Citation
3435
====================

cmapPy/clue_api_client/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +0,0 @@
1-
from .clue_api_client import ClueApiClient

cmapPy/math/fast_corr.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import logging
2+
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
3+
import numpy
4+
import cmapPy.math.fast_cov as fast_cov
5+
import pandas
6+
7+
8+
logger = logging.getLogger(setup_logger.LOGGER_NAME)
9+
10+
11+
def fast_corr(x, y=None):
    """Calculate the Pearson correlation matrix for the columns of x (MxN),
    or optionally the correlation matrix between the columns of x and the
    columns of y (MxP).  In the language of statistics the columns are the
    variables and the rows are the observations.

    Args:
        x (numpy array-like) MxN in shape
        y (optional, numpy array-like) MxP in shape; it must have the same
            number of rows as x (this is enforced by fast_cov)

    Returns:
        (numpy array-like) array of the correlation values
            for defaults (y=None), shape is NxN
            if y is provided, shape is NxP

    Note:
        The covariance is divided element-wise by the outer product of the
        sample standard deviations (ddof=1), so a column with zero standard
        deviation results in a division by zero for its entries.
    """
    x_only = y is None
    if x_only:
        y = x

    # fast_cov validates the inputs, so call it before any other numeric work
    cov_mat = fast_cov.fast_cov(x, y)

    std_x = numpy.std(x, axis=0, ddof=1)
    # when correlating x with itself, reuse std_x instead of recomputing it
    std_y = std_x if x_only else numpy.std(y, axis=0, ddof=1)

    std_outer = numpy.outer(std_x, std_y)

    return cov_mat / std_outer
34+
35+
36+
def fast_spearman(x, y=None):
    """Calculate the Spearman correlation matrix for the columns of x (MxN),
    or optionally the Spearman correlation matrix between the columns of x
    and the columns of y (MxP).  In the language of statistics the columns
    are the variables and the rows are the observations.

    The values in each column are replaced by their ranks (ties receive the
    average rank) and the Pearson correlation of those ranks is returned.

    Args:
        x (numpy array-like) MxN in shape
        y (optional, numpy array-like) MxP in shape

    Returns:
        (numpy array-like) array of the Spearman correlation values
            for defaults (y=None), shape is NxN
            if y is provided, shape is NxP
    """
    logger.debug("x.shape: {}".format(x.shape))
    if hasattr(y, "shape"):
        logger.debug("y.shape: {}".format(y.shape))

    x_ranks = pandas.DataFrame(x).rank(method="average").values

    # the min / max reductions are only needed for the log message, so skip
    # them entirely unless debug logging is actually enabled
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("some min and max ranks of x_ranks:\n{}\n{}".format(
            numpy.min(x_ranks[:10], axis=0), numpy.max(x_ranks[:10], axis=0)))

    if y is not None:
        y_ranks = pandas.DataFrame(y).rank(method="average").values
    else:
        y_ranks = None

    return fast_corr(x_ranks, y_ranks)

cmapPy/math/fast_cov.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import logging
2+
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
3+
import numpy
4+
5+
6+
logger = logging.getLogger(setup_logger.LOGGER_NAME)
7+
8+
9+
def fast_cov(x, y=None):
    """Compute the sample covariance matrix of the columns of x (MxN) or,
    when y is supplied, the cross-covariance between the columns of x and
    the columns of y (MxP).  Columns are treated as variables and rows as
    observations.

    Args:
        x (numpy array-like) MxN in shape
        y (numpy array-like) MxP in shape

    Returns:
        (numpy array-like) array of the covariance values
            for defaults (y=None), shape is NxN
            if y is provided, shape is NxP

    Raises:
        CmapPyMathFastCovInvalidInputXY: raised by validate_x_y when the
            inputs are not acceptable
    """
    validate_x_y(x, y)

    other = x if y is None else y

    # subtract each column's mean so the dot product sums products of deviations
    centered_x = x - numpy.mean(x, axis=0)
    centered_other = other - numpy.mean(other, axis=0)

    # normalize by (number of observations - 1)
    num_observations = x.shape[0]
    return numpy.dot(centered_x.T, centered_other) / (num_observations - 1)
38+
39+
40+
def validate_x_y(x, y):
    """Check that x and (optionally) y are usable as inputs to fast_cov.

    All detected problems are accumulated into a single message so the caller
    sees every issue at once.

    Args:
        x: object expected to be numpy array-like (must have a "shape" attribute)
        y: None, or an object expected to be numpy array-like with the same
            number of rows as x

    Raises:
        CmapPyMathFastCovInvalidInputXY: if x or y lacks a "shape" attribute,
            or if x and y have a different number of rows
    """
    error_msg = ""

    x_has_shape = hasattr(x, "shape")
    if not x_has_shape:
        error_msg += "x needs to be numpy array-like but it does not have \"shape\" attribute - type(x): {}\n".format(type(x))

    if y is not None:
        if not hasattr(y, "shape"):
            error_msg += "y needs to be numpy array-like but it does not have \"shape\" attribute - type(y): {}\n".format(type(y))
        # only compare row counts when x actually has a shape; otherwise
        # x.shape would raise AttributeError and hide the validation message
        elif x_has_shape and x.shape[0] != y.shape[0]:
            error_msg += "the number of rows in the x and y matrices must be the same - x.shape: {}  y.shape: {}\n".format(x.shape, y.shape)

    if error_msg:
        raise CmapPyMathFastCovInvalidInputXY(error_msg)


class CmapPyMathFastCovInvalidInputAxis(Exception):
    """Exception type for an invalid axis argument (not raised by validate_x_y)."""
    pass


class CmapPyMathFastCovInvalidInputXY(Exception):
    """Raised by validate_x_y when x and / or y are not valid inputs."""
    pass
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
import unittest
2+
import logging
3+
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
4+
import cmapPy.math.fast_corr as fast_corr
5+
import numpy
6+
import pandas
7+
8+
9+
logger = logging.getLogger(setup_logger.LOGGER_NAME)
10+
11+
num_iterations_functional_tests = 20
12+
max_dimension_functional_tests = 10
13+
multiplier_max_functional_tests = 100
14+
15+
16+
class TestFastCorr(unittest.TestCase):
    """Unit tests for fast_corr.fast_corr and fast_corr.fast_spearman, using
    numpy.corrcoef as the reference implementation for Pearson correlation."""

    @staticmethod
    def build_standard_x_y():
        """Build the small fixed 2x3 x and y matrices shared by several tests."""
        x = numpy.array([[1,7,2], [5,3,11]])
        logger.debug("x: {}".format(x))
        logger.debug("x.shape: {}".format(x.shape))

        y = numpy.array([[13, 17, 19], [23, 31, 29]])
        logger.debug("y: {}".format(y))
        logger.debug("y.shape: {}".format(y.shape))

        return x, y

    def test_fast_corr_just_x(self):
        logger.debug("*************happy path just x")
        x, _ = TestFastCorr.build_standard_x_y()

        ex = numpy.corrcoef(x, rowvar=False)
        logger.debug("expected ex: {}".format(ex))

        r = fast_corr.fast_corr(x)
        logger.debug("r: {}".format(r))

        self.assertTrue(numpy.allclose(ex, r))

        #happy path just x, other direction
        ex = numpy.corrcoef(x, rowvar=True)
        logger.debug("happy path just x, other direction, expected ex: {}".format(ex))
        r = fast_corr.fast_corr(x.T)
        logger.debug("r: {}".format(r))
        self.assertTrue(numpy.allclose(ex, r))

    def test_fast_corr_x_and_y(self):
        logger.debug("*************happy path x and y")
        x, y = TestFastCorr.build_standard_x_y()

        combined = numpy.hstack([x, y])
        logger.debug("combined: {}".format(combined))
        logger.debug("combined.shape: {}".format(combined.shape))

        # integer division: the result is used as a slice index, and true
        # division would produce a float (invalid as an index on Python 3)
        off_diag_ind = combined.shape[1] // 2

        raw_ex = numpy.corrcoef(combined, rowvar=False)
        logger.debug("raw expected produced from numpy.cov on full combined - raw_ex: {}".format(raw_ex))
        # the upper-right quadrant holds the x-versus-y correlations
        ex = raw_ex[:off_diag_ind, off_diag_ind:]
        logger.debug("expected ex: {}".format(ex))

        r = fast_corr.fast_corr(x, y)
        logger.debug("r: {}".format(r))
        self.assertTrue(numpy.allclose(ex, r))

        #happy path x and y, other direction
        combined = numpy.hstack([x.T, y.T])
        logger.debug("*************happy path x and y, other direction - combined: {}".format(combined))
        logger.debug("combined.shape: {}".format(combined.shape))

        off_diag_ind = combined.shape[1] // 2

        raw_ex = numpy.corrcoef(combined, rowvar=False)
        logger.debug("raw expected produced from numpy.cov on full combined - raw_ex: {}".format(raw_ex))
        ex = raw_ex[:off_diag_ind, off_diag_ind:]
        logger.debug("expected ex: {}".format(ex))

        r = fast_corr.fast_corr(x.T, y.T)
        logger.debug("r: {}".format(r))
        self.assertTrue(numpy.allclose(ex, r))

    def test_fast_corr_x_and_y_different_shapes(self):
        logger.debug("*************happy path x and y different shapes")
        x, _ = TestFastCorr.build_standard_x_y()
        y = numpy.array([[13, 17, 19, 41, 23], [23, 29, 31, 37, 43]])
        logger.debug("y.shape: {}".format(y.shape))
        logger.debug("y:\n{}".format(y))

        combined = numpy.hstack([x, y])
        logger.debug("combined: {}".format(combined))
        logger.debug("combined.shape: {}".format(combined.shape))

        raw_ex = numpy.corrcoef(combined, rowvar=False)
        logger.debug("raw expected produced from numpy.cov on full combined - raw_ex: {}".format(raw_ex))
        logger.debug("raw_ex.shape: {}".format(raw_ex.shape))

        # rows for the x variables, columns for the y variables
        ex = raw_ex[:x.shape[1], -y.shape[1]:]
        logger.debug("expected ex: {}".format(ex))
        logger.debug("ex.shape: {}".format(ex.shape))

        r = fast_corr.fast_corr(x, y)
        logger.debug("r: {}".format(r))
        self.assertTrue(numpy.allclose(ex, r))

    def test_fast_corr_functional(self):
        logger.debug("*************happy path functional test using randomly generated matrices")

        # range (not xrange) so the test runs on both Python 2 and Python 3
        for i in range(num_iterations_functional_tests):
            #the dimension containing the observations must have at least size 2
            x_shape = [numpy.random.randint(2, max_dimension_functional_tests),
                       numpy.random.randint(1, max_dimension_functional_tests)]
            logger.debug("x_shape: {}".format(x_shape))

            x = numpy.random.rand(x_shape[0], x_shape[1]) * numpy.random.randint(1, multiplier_max_functional_tests, size=1)
            logger.debug("x:\n{}".format(x))

            y_other_shape = numpy.random.randint(1, max_dimension_functional_tests, size=1)[0]
            y_shape = (x_shape[0], y_other_shape)
            logger.debug("y_shape: {}".format(y_shape))
            y = numpy.random.rand(y_shape[0], y_shape[1]) * numpy.random.randint(1, multiplier_max_functional_tests, size=1)
            logger.debug("y:\n{}".format(y))

            combined = numpy.hstack([x, y])

            raw_ex = numpy.corrcoef(combined, rowvar=False)
            logger.debug("raw_ex.shape: {}".format(raw_ex.shape))

            ex = raw_ex[:x.shape[1], -y.shape[1]:]
            logger.debug("ex:\n{}".format(ex))
            logger.debug("ex.shape: {}".format(ex.shape))

            r = fast_corr.fast_corr(x, y)
            logger.debug("r:\n{}".format(r))
            logger.debug("r.shape: {}".format(r.shape))

            self.assertTrue(numpy.allclose(ex, r))

    def test_fast_spearman(self):
        x, y = TestFastCorr.build_standard_x_y()

        # with only two observations per column every rank correlation is +/-1
        ex = numpy.array([[1.0, 1.0, 1.0], [-1.0, -1.0, -1.0], [1.0, 1.0, 1.0]])

        r = fast_corr.fast_spearman(x, y)
        logger.debug("r: {}".format(r))

        self.assertTrue(numpy.allclose(ex, r))
148+
149+
150+
if __name__ == "__main__":
151+
setup_logger.setup(verbose=True)
152+
153+
unittest.main()

0 commit comments

Comments
 (0)