Skip to content

Commit 77e8e8d

Browse files
author
Evan Lemire
committed
Improving documentation, parameterising thresholds, making other changes requested in pull request
1 parent 7d3c548 commit 77e8e8d

File tree

5 files changed

+81
-43
lines changed

5 files changed

+81
-43
lines changed

cmapPy/math/modz.py

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,48 @@
11
'''
22
modz.py
33
4+
modz refers to a weighted average of zscores - but can be applied to other types of data as well
45
Given a matrix of profiles on which you want to calculate a weighted average, returns a single profile of modz values
6+
Weights are calculated based on the correlation between replicates - so if one replicate is less highly correlated it
7+
will not be weighted as highly in the level 5 signature.
58
'''
69

710
import pandas as pd
811
import numpy as np
912
import os
1013
import math
1114

15+
rounding_precision=4
16+
1217
def upper_triangle(correlation_matrix):
1318
'''
14-
:param correlation_matrix (pandas df): Correlations between all replicates
15-
:return upper_tri_series (pandas series): Upper triangle extracted from corr mat
19+
Args:
20+
correlation_matrix (pandas df): Correlations between all replicates
21+
22+
Returns:
23+
upper_tri_series (pandas series): Upper triangle extracted from corr mat
1624
'''
1725
upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(np.bool))
1826

1927
# convert matrix into long form description
2028
upper_tri_series = upper_triangle.stack().reset_index(level=1)
2129

22-
upper_tri_series.columns = ['rid', 'spearman_corr']
30+
upper_tri_series.columns = ['rid', 'corr']
2331

2432
# Index at this point is CID, it now becomes a column
2533
upper_tri_series.reset_index(level=0, inplace=True)
2634

27-
return upper_tri_series.round(4)
35+
return upper_tri_series.round(rounding_precision)
2836

2937

30-
def calculate_weights(correlation_matrix):
38+
def calculate_weights(correlation_matrix, min_wt = 0.01):
3139
'''
32-
:param correlation_matrix (pandas df): Correlations between all replicates
33-
:return raw weights, weights (pandas series): Weights computed by summing correlations (raw weights) and then normalized to add to 1 (weights)
40+
Args:
41+
correlation_matrix (pandas df): Correlations between all replicates
42+
43+
Returns:
44+
raw weights (pandas series): Weights computed by summing correlations
45+
weights (pandas series): Raw weights normalized so that they add to 1
3446
'''
3547

3648
# fill diagonal of corr_mat with 0s
@@ -39,27 +51,36 @@ def calculate_weights(correlation_matrix):
3951
# remove negative values
4052
correlation_matrix[correlation_matrix < 0] = 0
4153
raw_weights = correlation_matrix.sum(axis=1) / (len(correlation_matrix.index) - 1)
42-
raw_weights[raw_weights < .01] = .01
54+
raw_weights[raw_weights < min_wt] = min_wt
4355
weights = raw_weights / sum(raw_weights.abs())
4456

45-
return raw_weights.round(4), weights.round(4)
57+
return raw_weights.round(rounding_precision), weights.round(rounding_precision)
4658

4759

48-
def main(mat):
60+
def calc_modz(mat, min_wt = 0.01, corr_metric='spearman'):
4961
'''
50-
:param mat (pandas df): One matching profile from each replicate
51-
:return (4 pandas series): modz values, correlations from upper tri series, raw weights, normalized weights
62+
Args:
63+
mat (pandas df): a signature matrix, where the columns are samples and the rows are features;
64+
columns correspond to the replicates of a single perturbagen
65+
min_wt (float): Minimum raw weight when calculating weighted average
66+
corr_metric (string): Correlation method, 'spearman' or 'pearson'
67+
68+
Returns:
69+
modz values (pandas series): weighted average values
70+
upper_tri_series (pandas series): the correlations between each profile that went into the signature
71+
raw weights (pandas series): weights before normalization to add to 1
72+
weights (pandas series): weights after normalization
5273
'''
5374
# Make correlation matrix column wise
54-
corr_mat = mat.corr(method='spearman')
75+
corr_mat = mat.corr(method=corr_metric)
5576

5677
# Extract just the values in the upper triangle
5778
upper_tri_series = upper_triangle(corr_mat)
5879

5980
# Get rid of negative values
60-
upper_tri_series['spearman_corr'][upper_tri_series['spearman_corr'] < 0] = 0
81+
upper_tri_series['corr'][upper_tri_series['corr'] < 0] = 0
6182

62-
raw_weights, weights = calculate_weights(corr_mat)
83+
raw_weights, weights = calculate_weights(corr_mat, min_wt)
6384

6485
weighted_values = mat * weights
6586

cmapPy/math/robust_zscore.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,18 @@
22
robust_zscore.py
33
44
Given a pandas df, and an optional control df, will calculate zscores using plate control or vehicle control
5+
Values can be zscored relative to all samples on a plate ("plate-control")
6+
or relative to negative control samples ("vehicle-control").
57
'''
6-
7-
def calc_zscore(mat, ctrl_mat=None):
8+
rounding_precision = 4
9+
def calc_zscore(mat, ctrl_mat=None, min_mad=.1):
810
'''
9-
:param mat (pandas df): Matrix of data that zscoring will be applied to
10-
:param ctrl_mat (pandas df): Optional subset matrix from which to draw medians and MADS (vehicle control)
11-
:return zscore_data (pandas_df):
11+
Args:
12+
mat (pandas df): Matrix of data that zscoring will be applied to
13+
ctrl_mat (pandas df): Optional subset matrix from which to draw medians and MADS (vehicle control)
14+
15+
Returns:
16+
zscore_data (pandas_df): Zscored data!
1217
'''
1318

1419
# If optional df exists, calc medians and mads from it
@@ -25,7 +30,8 @@ def calc_zscore(mat, ctrl_mat=None):
2530
mads = median_devs.median(axis=1)
2631

2732
# Threshold mads
28-
mads[mads < .1] = .1
33+
mads[mads < min_mad] = min_mad
34+
# Must multiply values by 1.4826 to make MAD comparable to SD (https://en.wikipedia.org/wiki/Median_absolute_deviation)
2935
zscore_data = sub.divide(mads * 1.4826, axis='index')
3036

31-
return zscore_data.round(4)
37+
return zscore_data.round(rounding_precision)

cmapPy/math/tests/test_modz.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,12 @@ def test_calculate_weights(self):
1919

2020
def test_upper_triangle(self):
2121
upper_tri_series = modz.upper_triangle(test_mat.corr())
22-
self.assertTrue(upper_tri_series['spearman_corr'].tolist() == [0.6547, 0.982, 0.7857])
22+
self.assertTrue(upper_tri_series['corr'].tolist() == [0.6547, 0.982, 0.7857])
2323
self.assertTrue(upper_tri_series['rid'].tolist() == ['B', 'C', 'C'])
2424
self.assertTrue(upper_tri_series['index'].tolist() == ['A', 'A', 'B'])
2525

2626
def test_main(self):
27-
modz_values, x, y, z = modz.main(test_mat)
27+
modz_values, x, y, z = modz.calc_modz(test_mat)
2828
self.assertTrue(modz_values.tolist() == [3.125, 5.75, 6.0])
2929

3030

cmapPy/math/tests/test_robust_zscore.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,16 @@ class TestRobustZscore(unittest.TestCase):
1414
def test_zscore_pc(self):
1515
pc_zscores = robust_zscore.calc_zscore(test_mat)
1616
self.assertTrue(pc_zscores.shape == (3,4))
17-
self.assertTrue(pc_zscores.loc[0].tolist() == [-0.3372, -1.6862, 1.0117, 0.3372])
18-
self.assertTrue(pc_zscores.loc[1].tolist() == [-0.6745, 2.0235, 0.6745, -0.6745])
19-
self.assertTrue(pc_zscores.loc[2].tolist() == [-0.4047, 0.4047, 1.2141, -0.9443])
17+
18+
pd.util.testing.assert_frame_equal(pc_zscores, pd.DataFrame({'A': [-0.3372, -0.6745, -0.4047],
19+
'B': [-1.6862, 2.0235, 0.4047],
20+
'C': [1.0117, 0.6745, 1.2141],
21+
'D': [0.3372, -0.6745, -0.9443]}))
2022

2123
def test_zscore_vc(self):
2224
vc_zscores = robust_zscore.calc_zscore(test_mat, ctrl_mat = test_ctl_mat)
2325
self.assertTrue(vc_zscores.shape == (3, 4))
24-
self.assertTrue(vc_zscores.loc[0].tolist() == [-4.7214, -7.4194, -2.0235, -3.3725])
25-
self.assertTrue(vc_zscores.loc[1].tolist() == [-3.3725, 0.6745, -1.349, -3.3725])
26-
self.assertTrue(vc_zscores.loc[2].tolist() == [-20.2347, 0.0, 20.2347, -33.7245])
26+
pd.util.testing.assert_frame_equal(vc_zscores, pd.DataFrame({'A': [-4.7214, -3.3725, -20.2347],
27+
'B': [-7.4194, 0.6745, 0.0],
28+
'C': [-2.0235, -1.349, 20.2347],
29+
'D': [-3.3725, -3.3725, -33.7245]}))

cmapPy/pandasGEXpress/diff_gctoo.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,37 +2,45 @@
22
diff_gctoo.py
33
44
Given a GCToo object calculates differential values (expression, viability etc.)
5+
Values can be made differential relative to all samples on a plate ("plate-control")
6+
or relative to negative control samples ("vehicle-control").
57
'''
68
import sys
79
import cmapPy.math.robust_zscore as robust_zscore
810
import cmapPy.pandasGEXpress.GCToo as GCToo
911

10-
def calc_differential(gctoo, plate_control=True, group_field='pert_type', group_val='ctl_vehicle', func = robust_zscore.calc_zscore):
12+
def calc_differential(gctoo, plate_control=True, group_field='pert_type', group_val='ctl_vehicle',
13+
func = robust_zscore.calc_zscore, pos_diff_thresh=10, neg_diff_thresh=-10):
1114

1215
'''
13-
:param df (pandas df):
14-
:param plate_control (bool): True means calculate differential using plate control. False means vehicle control.
15-
:param group_field (string): Metadata field in which to find group_val
16-
:param group_val (string): Value in group_field that indicates use in vehicle control
17-
:param func (function): Function to apply to data fro calculating diff, eg. zscore, fold change
18-
:return zscore_gctoo (pandas df): Zscored data!
16+
Args:
17+
gctoo (GCToo object): data on which to perform diff
18+
plate_control (bool): True means calculate differential using plate control. False means vehicle control.
19+
group_field (string): Metadata field in which to find group_val
20+
group_val (string): Value in group_field that indicates use in vehicle control
21+
func (function): Function to apply to data for calculating diff, eg. zscore, fold change
22+
pos_diff_thresh (float): Maximum value for diff data
23+
neg_diff_thresh (float): Minimum value for diff data
24+
25+
Returns:
26+
diff_gctoo (GCToo object): Diff data!
1927
'''
2028

2129
if plate_control == False:
2230
# If using only a subset of the plate for control (usually vehicle control) extract this df
2331
neg_dex = gctoo.col_metadata_df[gctoo.col_metadata_df[group_field] == group_val].index.tolist()
2432
neg_df = gctoo.data_df[neg_dex]
25-
zscore_data = func(gctoo.data_df, neg_df)
33+
diff_data = func(gctoo.data_df, neg_df)
2634

2735
elif plate_control == True:
28-
zscore_data = func(gctoo.data_df)
36+
diff_data = func(gctoo.data_df)
2937

3038
row_metadata_df = gctoo.row_metadata_df
3139

3240
# Threshold zscore data before returning
33-
zscore_data[zscore_data < -10] = -10
34-
zscore_data[zscore_data > 10] = 10
41+
diff_data[diff_data < neg_diff_thresh] = neg_diff_thresh
42+
diff_data[diff_data > pos_diff_thresh] = pos_diff_thresh
3543

36-
zscore_gctoo = GCToo.GCToo(data_df=zscore_data, row_metadata_df=row_metadata_df, col_metadata_df=gctoo.col_metadata_df)
44+
diff_gctoo = GCToo.GCToo(data_df=diff_data, row_metadata_df=row_metadata_df, col_metadata_df=gctoo.col_metadata_df)
3745

38-
return zscore_gctoo
46+
return diff_gctoo

0 commit comments

Comments
 (0)