Skip to content

Commit 83aa692

Browse files
author
lev
committed
cmapPy/math/agg_wt_avg.py: renamed from modz.py
1 parent 77e8e8d commit 83aa692

File tree

4 files changed

+169
-120
lines changed

4 files changed

+169
-120
lines changed

cmapPy/math/agg_wt_avg.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
'''
2+
agg_wt_avg.py
3+
4+
Aggregate a matrix of replicate profiles into a single signature using
5+
a weighted average based on the correlation between replicates. That is, if
6+
one replicate is less correlated with the other replicates, its values will
7+
not be weighted as highly in the aggregated signature.
8+
9+
Equivalent to the 'modz' method in mortar.
10+
'''
11+
12+
import numpy as np
13+
14+
# Number of decimal places that get_upper_triangle and calculate_weights
# round their returned values to.
rounding_precision = 4
15+
16+
17+
def get_upper_triangle(correlation_matrix):
    ''' Extract the strict upper triangle (diagonal excluded) of a square
    matrix in long form. Negative values are set to 0.

    Args:
        correlation_matrix (pandas df): Correlations between all replicates

    Returns:
        upper_tri_df (pandas df): One row per upper-triangle entry, listed
            row by row. The first column holds the row label of the entry
            (it is named 'index' when the matrix's row index is unnamed),
            'rid' holds the column label of the entry, and 'corr' holds the
            correlation value, clipped at 0 and rounded to
            rounding_precision decimal places.
    '''
    # Boolean mask that is True strictly above the diagonal. The builtin
    # bool is used as the dtype because the np.bool alias has been removed
    # from current NumPy releases.
    above_diagonal = np.triu(
        np.ones(correlation_matrix.shape, dtype=bool), k=1)
    upper_triangle = correlation_matrix.where(above_diagonal)

    # Convert matrix into long form. The explicit dropna() discards the
    # masked-out cells whether or not stack() drops NaN entries itself.
    upper_tri_df = upper_triangle.stack().dropna().reset_index(level=1)
    upper_tri_df.columns = ['rid', 'corr']

    # The remaining index holds the row labels; it now becomes a column
    upper_tri_df.reset_index(level=0, inplace=True)

    # Get rid of negative values
    upper_tri_df['corr'] = upper_tri_df['corr'].clip(lower=0)

    return upper_tri_df.round(rounding_precision)
42+
43+
44+
def calculate_weights(correlation_matrix, min_wt):
    ''' Calculate a weight for each profile based on its correlation to other
    replicates. Negative correlations are clipped to 0, and weights are clipped
    to be min_wt at the least.

    The input matrix is not modified.

    Args:
        correlation_matrix (pandas df): Correlations between all replicates
        min_wt (float): Lower bound applied to each raw weight before
            normalization

    Returns:
        raw weights (pandas series): Mean correlation to other replicates,
            clipped to be at least min_wt and rounded to rounding_precision
            decimal places
        weights (pandas series): raw_weights normalized such that they add
            to 1, then rounded to rounding_precision decimal places
    '''
    # Blank out the diagonal (self-correlation) with NaN. mask() returns a
    # new frame, so the caller's correlation matrix is left untouched and no
    # writable view of its underlying array is required.
    on_diagonal = np.eye(*correlation_matrix.shape, dtype=bool)
    off_diagonal = correlation_matrix.mask(on_diagonal)

    # remove negative values
    off_diagonal = off_diagonal.clip(lower=0)

    # get average correlation for each profile (will ignore NaN)
    raw_weights = off_diagonal.mean(axis=1)

    # threshold weights
    raw_weights = raw_weights.clip(lower=min_wt)

    # normalize raw_weights so that they add to 1
    weights = raw_weights / sum(raw_weights)

    return (raw_weights.round(rounding_precision),
            weights.round(rounding_precision))
72+
73+
74+
def agg_wt_avg(mat, min_wt=0.01, corr_metric='spearman'):
    ''' Aggregate a set of replicate profiles into a single signature using
    a weighted average.

    Args:
        mat (pandas df): a matrix of replicate profiles, where the columns are
            samples and the rows are features; columns correspond to the
            replicates of a single perturbagen
        min_wt (float): Minimum raw weight when calculating weighted average
        corr_metric (string): Spearman or Pearson; the correlation method

    Returns:
        out_sig (pandas series): weighted average values; when mat has a
            single column, mat itself is returned unchanged (a pandas df)
        upper_tri_df (pandas df): the correlations between each profile that
            went into the signature; None when mat has a single column
        raw weights (pandas series): weights before normalization; None when
            mat has a single column
        weights (pandas series): weights after normalization; None when mat
            has a single column

    Raises:
        AssertionError: if mat has no columns, or if mat has more than one
            column and corr_metric is not "spearman" or "pearson"
    '''
    n_replicates = mat.shape[1]

    # Raised explicitly rather than with an `assert` statement so that the
    # check is still performed when Python runs with -O; the exception type
    # and message are the same as before.
    if n_replicates == 0:
        raise AssertionError("mat is empty! mat: {}".format(mat))

    # A single replicate has nothing to be correlated with, so it is passed
    # through as-is and no correlations or weights are reported.
    if n_replicates == 1:
        return mat, None, None, None

    if corr_metric not in ("spearman", "pearson"):
        raise AssertionError(
            "corr_metric must be 'spearman' or 'pearson'; got {!r}".format(
                corr_metric))

    # Make correlation matrix column wise
    corr_mat = mat.corr(method=corr_metric)

    # Save the values in the upper triangle
    upper_tri_df = get_upper_triangle(corr_mat)

    # Calculate weight per replicate
    raw_weights, weights = calculate_weights(corr_mat, min_wt)

    # Apply weights to values; the weights are aligned to the columns of mat
    weighted_values = mat * weights
    out_sig = weighted_values.sum(axis=1)

    return out_sig, upper_tri_df, raw_weights, weights

cmapPy/math/modz.py

Lines changed: 0 additions & 89 deletions
This file was deleted.
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import unittest
2+
import logging
3+
import pandas as pd
4+
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
5+
import cmapPy.math.agg_wt_avg as agg_wt_avg
6+
7+
# Logger looked up under the name that setup_GCToo_logger exposes as LOGGER_NAME.
logger = logging.getLogger(setup_logger.LOGGER_NAME)
8+
9+
# Shared fixture: three replicate columns (A, B, C) over three features.
test_mat = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [2, 8, 6],
    'C': [6, 8, 9],
})

# Column-by-column correlation of the fixture, using the default method of
# DataFrame.corr.
test_mat_corr = test_mat.corr()
11+
12+
13+
class TestAggWtAvg(unittest.TestCase):
    ''' Unit tests for the functions in cmapPy.math.agg_wt_avg. '''

    def test_calculate_weights(self):
        ''' Weights are derived from the Pearson correlations of test_mat. '''
        # happy path; a copy is passed so the shared module-level fixture
        # cannot be altered by the function under test
        raw_weights, weights = agg_wt_avg.calculate_weights(
            test_mat_corr.copy(), min_wt=0.1)
        self.assertEqual(len(weights), 3)
        self.assertEqual(raw_weights.tolist(), [0.8183, 0.7202, 0.8838])
        self.assertEqual(weights.tolist(), [0.3378, 0.2973, 0.3649])

        # test that min_wt works: the second raw weight (0.7202) lies below
        # the threshold and must be raised to it
        raw_weights2, weights2 = agg_wt_avg.calculate_weights(
            test_mat_corr.copy(), min_wt=0.85)
        self.assertEqual(raw_weights2.iloc[1], 0.85)

    def test_get_upper_triangle(self):
        ''' The upper triangle is returned in long form, row by row. '''
        # happy path
        upper_tri_df = agg_wt_avg.get_upper_triangle(test_mat_corr.copy())
        self.assertEqual(upper_tri_df['corr'].tolist(), [0.6547, 0.982, 0.7857])
        self.assertEqual(upper_tri_df['rid'].tolist(), ['B', 'C', 'C'])
        self.assertEqual(upper_tri_df['index'].tolist(), ['A', 'A', 'B'])

    def test_agg_wt_avg(self):
        ''' End-to-end aggregation, single-replicate and empty inputs. '''
        # use spearman
        out_sig, upper_tri_df, raw_weights, weights = agg_wt_avg.agg_wt_avg(test_mat)
        self.assertEqual(out_sig.tolist(), [3.125, 5.75, 6.0])
        self.assertAlmostEqual(upper_tri_df.loc[upper_tri_df.index[0], "corr"], 0.5)
        # .iloc selects by position; the weights are labelled by column name
        self.assertAlmostEqual(raw_weights.iloc[0], 0.75)
        self.assertAlmostEqual(weights.iloc[0], 0.375)

        # test on a single signature
        out_sig2, _, _, _ = agg_wt_avg.agg_wt_avg(test_mat[["C"]])
        pd.testing.assert_frame_equal(out_sig2, test_mat[["C"]])

        # should break if empty input
        with self.assertRaises(AssertionError) as e:
            agg_wt_avg.agg_wt_avg(test_mat[[]])
        # checked after the with block so that the assertion actually runs
        self.assertIn("mat is empty!", str(e.exception))
48+
49+
if __name__ == "__main__":
    # Configure verbose logging first, then hand control to the unittest
    # command-line runner.
    setup_logger.setup(verbose=True)
    unittest.main()
52+

cmapPy/math/tests/test_modz.py

Lines changed: 0 additions & 31 deletions
This file was deleted.

0 commit comments

Comments
 (0)