Skip to content

Commit c3631b7

Browse files
authored
Merge pull request #29 from cmap/math
Math
2 parents a2963cd + 74e0ec4 commit c3631b7

File tree

6 files changed

+430
-0
lines changed

6 files changed

+430
-0
lines changed

cmapPy/math/agg_wt_avg.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
'''
2+
agg_wt_avg.py
3+
4+
Aggregate a matrix of replicate profiles into a single signature using
5+
a weighted average based on the correlation between replicates. That is, if
6+
one replicate is less correlated with the other replicates, its values will
7+
not be weighted as highly in the aggregated signature.
8+
9+
Equivalent to the 'modz' method in mortar.
10+
'''
11+
12+
import numpy as np
13+
14+
rounding_precision = 4
15+
16+
17+
def get_upper_triangle(correlation_matrix):
18+
''' Extract upper triangle from a square matrix. Negative values are
19+
set to 0.
20+
21+
Args:
22+
correlation_matrix (pandas df): Correlations between all replicates
23+
24+
Returns:
25+
upper_tri_df (pandas df): Upper triangle extracted from
26+
correlation_matrix; rid is the row index, cid is the column index,
27+
corr is the extracted correlation value
28+
'''
29+
upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(np.bool))
30+
31+
# convert matrix into long form description
32+
upper_tri_df = upper_triangle.stack().reset_index(level=1)
33+
upper_tri_df.columns = ['rid', 'corr']
34+
35+
# Index at this point is cid, it now becomes a column
36+
upper_tri_df.reset_index(level=0, inplace=True)
37+
38+
# Get rid of negative values
39+
upper_tri_df['corr'] = upper_tri_df['corr'].clip(lower=0)
40+
41+
return upper_tri_df.round(rounding_precision)
42+
43+
44+
def calculate_weights(correlation_matrix, min_wt):
45+
''' Calculate a weight for each profile based on its correlation to other
46+
replicates. Negative correlations are clipped to 0, and weights are clipped
47+
to be min_wt at the least.
48+
49+
Args:
50+
correlation_matrix (pandas df): Correlations between all replicates
51+
min_wt (float): Minimum raw weight when calculating weighted average
52+
53+
Returns:
54+
raw weights (pandas series): Mean correlation to other replicates
55+
weights (pandas series): raw_weights normalized such that they add to 1
56+
'''
57+
# fill diagonal of correlation_matrix with np.nan
58+
np.fill_diagonal(correlation_matrix.values, np.nan)
59+
60+
# remove negative values
61+
correlation_matrix = correlation_matrix.clip(lower=0)
62+
63+
# get average correlation for each profile (will ignore NaN)
64+
raw_weights = correlation_matrix.mean(axis=1)
65+
66+
# threshold weights
67+
raw_weights = raw_weights.clip(lower=min_wt)
68+
69+
# normalize raw_weights so that they add to 1
70+
weights = raw_weights / sum(raw_weights)
71+
72+
return raw_weights.round(rounding_precision), weights.round(rounding_precision)
73+
74+
75+
def agg_wt_avg(mat, min_wt = 0.01, corr_metric='spearman'):
76+
''' Aggregate a set of replicate profiles into a single signature using
77+
a weighted average.
78+
79+
Args:
80+
mat (pandas df): a matrix of replicate profiles, where the columns are
81+
samples and the rows are features; columns correspond to the
82+
replicates of a single perturbagen
83+
min_wt (float): Minimum raw weight when calculating weighted average
84+
corr_metric (string): Spearman or Pearson; the correlation method
85+
86+
Returns:
87+
out_sig (pandas series): weighted average values
88+
upper_tri_df (pandas df): the correlations between each profile that went into the signature
89+
raw weights (pandas series): weights before normalization
90+
weights (pandas series): weights after normalization
91+
'''
92+
assert mat.shape[1] > 0, "mat is empty! mat: {}".format(mat)
93+
94+
if mat.shape[1] == 1:
95+
96+
out_sig = mat
97+
upper_tri_df = None
98+
raw_weights = None
99+
weights = None
100+
101+
else:
102+
103+
assert corr_metric in ["spearman", "pearson"]
104+
105+
# Make correlation matrix column wise
106+
corr_mat = mat.corr(method=corr_metric)
107+
108+
# Save the values in the upper triangle
109+
upper_tri_df = get_upper_triangle(corr_mat)
110+
111+
# Calculate weight per replicate
112+
raw_weights, weights = calculate_weights(corr_mat, min_wt)
113+
114+
# Apply weights to values
115+
weighted_values = mat * weights
116+
out_sig = weighted_values.sum(axis=1)
117+
118+
return out_sig, upper_tri_df, raw_weights, weights

cmapPy/math/robust_zscore.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
'''
2+
robust_zscore.py
3+
4+
Robustly z-scores a pandas df along the rows (i.e. the z-score is made relative
5+
to a row). A robust z-score means that median is used instead of mean and
6+
median absolute deviation (MAD) instead of standard deviation in the
7+
standard z-score calculation:
8+
9+
z = (x - u) / s
10+
11+
x: input value
12+
u: median
13+
s: MAD
14+
15+
Optionally, the median and MAD can be computed from a control df, instead of the
16+
input df. This functionality is useful for "vehicle-control"; that is, if
17+
the control df consists only of negative control samples, the median and MAD
18+
can be computed using just those samples but applied to the input df.
19+
'''
20+
21+
rounding_precision = 4
22+
23+
24+
def robust_zscore(mat, ctrl_mat=None, min_mad=0.1):
25+
''' Robustly z-score a pandas df along the rows.
26+
27+
Args:
28+
mat (pandas df): Matrix of data that z-scoring will be applied to
29+
ctrl_mat (pandas df): Optional matrix from which to compute medians and MADs
30+
(e.g. vehicle control)
31+
min_mad (float): Minimum MAD to threshold to; tiny MAD values will cause
32+
z-scores to blow up
33+
34+
Returns:
35+
zscore_df (pandas_df): z-scored data
36+
'''
37+
38+
# If optional df exists, calc medians and mads from it
39+
if ctrl_mat is not None:
40+
medians = ctrl_mat.median(axis=1)
41+
median_devs = abs(ctrl_mat.subtract(medians, axis=0))
42+
43+
# Else just use plate medians
44+
else:
45+
medians = mat.median(axis=1)
46+
median_devs = abs(mat.subtract(medians, axis=0))
47+
48+
sub = mat.subtract(medians, axis='index')
49+
mads = median_devs.median(axis=1)
50+
51+
# Threshold mads
52+
mads = mads.clip(lower=min_mad)
53+
54+
# Must multiply values by 1.4826 to make MAD comparable to SD
55+
# (https://en.wikipedia.org/wiki/Median_absolute_deviation)
56+
zscore_df = sub.divide(mads * 1.4826, axis='index')
57+
58+
return zscore_df.round(rounding_precision)
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import unittest
2+
import logging
3+
import pandas as pd
4+
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
5+
import cmapPy.math.agg_wt_avg as agg_wt_avg
6+
7+
logger = logging.getLogger(setup_logger.LOGGER_NAME)
8+
9+
test_mat = pd.DataFrame({'A':[1,2,3], 'B': [2,8,6], 'C': [6,8,9]})
10+
test_mat_corr = test_mat.corr()
11+
12+
13+
class TestAggWtAvg(unittest.TestCase):
14+
def test_calculate_weights(self):
15+
# happy path
16+
raw_weights, weights = agg_wt_avg.calculate_weights(test_mat_corr, min_wt=0.1)
17+
self.assertTrue(len(weights == 3))
18+
self.assertTrue(raw_weights.tolist() == [0.8183, 0.7202, 0.8838])
19+
self.assertTrue(weights.tolist() == [0.3378, 0.2973, 0.3649])
20+
21+
# test that min_wt works
22+
raw_weights2, weights2 = agg_wt_avg.calculate_weights(test_mat_corr, min_wt=0.85)
23+
self.assertEqual(raw_weights2[1], 0.85)
24+
25+
def test_get_upper_triangle(self):
26+
# happy path
27+
upper_tri_df = agg_wt_avg.get_upper_triangle(test_mat_corr)
28+
self.assertTrue(upper_tri_df['corr'].tolist() == [0.6547, 0.982, 0.7857])
29+
self.assertTrue(upper_tri_df['rid'].tolist() == ['B', 'C', 'C'])
30+
self.assertTrue(upper_tri_df['index'].tolist() == ['A', 'A', 'B'])
31+
32+
def test_agg_wt_avg(self):
33+
# use spearman
34+
out_sig, upper_tri_df, raw_weights, weights = agg_wt_avg.agg_wt_avg(test_mat)
35+
self.assertTrue(out_sig.tolist() == [3.125, 5.75, 6.0])
36+
self.assertAlmostEqual(upper_tri_df.loc[upper_tri_df.index[0], "corr"], 0.5)
37+
self.assertAlmostEqual(raw_weights[0], 0.75)
38+
self.assertAlmostEqual(weights[0], 0.375)
39+
40+
# test on a single signature
41+
out_sig2, _, _, _ = agg_wt_avg.agg_wt_avg(test_mat[["C"]])
42+
pd.util.testing.assert_frame_equal(out_sig2, test_mat[["C"]])
43+
44+
# should break if empty input
45+
with self.assertRaises(AssertionError) as e:
46+
agg_wt_avg.agg_wt_avg(test_mat[[]])
47+
self.assertIn("mat is empty!", str(e.exception))
48+
49+
if __name__ == "__main__":
50+
setup_logger.setup(verbose=True)
51+
unittest.main()
52+
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import unittest
2+
import logging
3+
import pandas as pd
4+
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
5+
import cmapPy.math.robust_zscore as robust_zscore
6+
7+
logger = logging.getLogger(setup_logger.LOGGER_NAME)
8+
9+
test_mat = pd.DataFrame({'A':[4,2,3], 'B': [2,8,6], 'C': [6,5,9], 'D': [5,2,1]})
10+
test_ctl_mat = pd.DataFrame({'E':[8,8,6], 'F': [7,6,6]})
11+
test_ctl_mat2 = pd.DataFrame({'E':[8,8,6], 'F': [8,6,6]})
12+
13+
14+
class TestRobustZscore(unittest.TestCase):
15+
def test_zscore_pc(self):
16+
pc_zscores = robust_zscore.robust_zscore(test_mat)
17+
self.assertTrue(pc_zscores.shape == (3, 4))
18+
19+
pd.util.testing.assert_frame_equal(pc_zscores, pd.DataFrame(
20+
{'A': [-0.3372, -0.6745, -0.4047],
21+
'B': [-1.6862, 2.0235, 0.4047],
22+
'C': [1.0117, 0.6745, 1.2141],
23+
'D': [0.3372, -0.6745, -0.9443]}))
24+
25+
def test_zscore_vc(self):
26+
vc_zscores = robust_zscore.robust_zscore(test_mat, ctrl_mat=test_ctl_mat)
27+
self.assertTrue(vc_zscores.shape == (3, 4))
28+
pd.util.testing.assert_frame_equal(vc_zscores, pd.DataFrame(
29+
{'A': [-4.7214, -3.3725, -20.2347],
30+
'B': [-7.4194, 0.6745, 0.0],
31+
'C': [-2.0235, -1.349, 20.2347],
32+
'D': [-3.3725, -3.3725, -33.7245]}))
33+
34+
# check that min_mad works
35+
vc_zscores2 = robust_zscore.robust_zscore(test_mat, ctrl_mat=test_ctl_mat2)
36+
self.assertEqual(vc_zscores2.iloc[0, 0], -26.9796)
37+
self.assertEqual(vc_zscores2.iloc[1, 1], 0.6745)
38+
39+
if __name__ == "__main__":
40+
setup_logger.setup(verbose=True)
41+
unittest.main()
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
'''
2+
diff_gctoo.py
3+
4+
Converts a matrix of values (e.g. gene expression, viability, etc.) into a
5+
matrix of differential values. Values can be made differential relative to all
6+
samples in the dataset ("plate-control") or relative to just negative control
7+
samples ("vehicle-control"). The method of computing the differential can be
8+
either a robust z-score ("robust_z") or simply median normalization
9+
("median_norm").
10+
11+
'''
12+
import cmapPy.math.robust_zscore as robust_zscore
13+
import cmapPy.pandasGEXpress.GCToo as GCToo
14+
15+
possible_diff_methods = ["robust_z", "median_norm"]
16+
17+
18+
def diff_gctoo(gctoo, plate_control=True, group_field='pert_type', group_val='ctl_vehicle',
19+
diff_method="robust_z", upper_diff_thresh=10, lower_diff_thresh=-10):
20+
''' Converts a matrix of values (e.g. gene expression, viability, etc.)
21+
into a matrix of differential values.
22+
23+
Args:
24+
df (pandas df): data to make diff_gctoo
25+
plate_control (bool): True means calculate diff_gctoo using plate control.
26+
False means vehicle control.
27+
group_field (string): Metadata field in which to find group_val
28+
group_val (string): Value in group_field that indicates use in vehicle control
29+
diff_method (string): Method of computing differential data; currently only
30+
support either "robust_z" or "median_norm"
31+
upper_diff_thresh (float): Maximum value for diff data
32+
lower_diff_thresh (float): Minimum value for diff data
33+
34+
Returns:
35+
out_gctoo (GCToo object): GCToo with differential data values
36+
'''
37+
assert diff_method in possible_diff_methods, (
38+
"possible_diff_methods: {}, diff_method: {}".format(
39+
possible_diff_methods, diff_method))
40+
41+
# Compute median and MAD using all samples in the dataset
42+
if plate_control:
43+
44+
# Compute differential data
45+
if diff_method == "robust_z":
46+
diff_data = robust_zscore.robust_zscore(gctoo.data_df)
47+
48+
elif diff_method == "median_norm":
49+
medians = gctoo.data_df.median(axis=1)
50+
diff_data = gctoo.data_df.subtract(medians, axis='index')
51+
52+
# Compute median and MAD from negative controls, rather than all samples
53+
else:
54+
55+
assert group_field in gctoo.col_metadata_df.columns.values, (
56+
"group_field {} not present in column metadata. " +
57+
"gctoo.col_metadata_df.columns.values: {}").format(
58+
group_field, gctoo.col_metadata_df.columns.values)
59+
60+
assert sum(gctoo.col_metadata_df[group_field] == group_val) > 0, (
61+
"group_val {} not present in the {} column.").format(
62+
group_val, group_field)
63+
64+
# Find negative control samples
65+
neg_ctl_samples = gctoo.col_metadata_df.index[gctoo.col_metadata_df[group_field] == group_val]
66+
neg_ctl_df = gctoo.data_df[neg_ctl_samples]
67+
68+
# Compute differential data
69+
if diff_method == "robust_z":
70+
diff_data = robust_zscore.robust_zscore(gctoo.data_df, neg_ctl_df)
71+
72+
elif diff_method == "median_norm":
73+
medians = gctoo.data_df.median(axis=1)
74+
diff_data = gctoo.data_df.subtract(medians, axis='index')
75+
76+
# Threshold differential data before returning
77+
diff_data = diff_data.clip(lower=lower_diff_thresh, upper=upper_diff_thresh)
78+
79+
# Construct output GCToo object
80+
out_gctoo = GCToo.GCToo(data_df=diff_data,
81+
row_metadata_df=gctoo.row_metadata_df,
82+
col_metadata_df=gctoo.col_metadata_df)
83+
84+
return out_gctoo
85+

0 commit comments

Comments
 (0)