Skip to content

Commit 74e0ec4

Browse files
author
lev
committed
pandasGEXpress/diff_gctoo.py: added median_norm functionality, documentation, and small tweaks
1 parent 1007034 commit 74e0ec4

File tree

2 files changed

+129
-59
lines changed

2 files changed

+129
-59
lines changed

cmapPy/pandasGEXpress/diff_gctoo.py

Lines changed: 65 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,85 @@
11
'''
22
diff_gctoo.py
33
4-
Given a GCToo object calculates differential values (expression, viability etc.)
5-
Values can be made differential relative to all samples on a plate ("plate-control")
6-
or relative to negative control samples ("vehicle-control").
4+
Converts a matrix of values (e.g. gene expression, viability, etc.) into a
5+
matrix of differential values. Values can be made differential relative to all
6+
samples in the dataset ("plate-control") or relative to just negative control
7+
samples ("vehicle-control"). The method of computing the differential can be
8+
either a robust z-score ("robust_z") or simply median normalization
9+
("median_norm").
10+
711
'''
8-
import sys
912
import cmapPy.math.robust_zscore as robust_zscore
1013
import cmapPy.pandasGEXpress.GCToo as GCToo
1114

12-
def calc_differential(gctoo, plate_control=True, group_field='pert_type', group_val='ctl_vehicle',
13-
func = robust_zscore.calc_zscore, pos_diff_thresh=10, neg_diff_thresh=-10):
15+
# Names accepted for the diff_method argument of diff_gctoo(); that function
# asserts membership in this list before computing anything.
possible_diff_methods = ["robust_z", "median_norm"]
16+
17+
18+
def diff_gctoo(gctoo, plate_control=True, group_field='pert_type', group_val='ctl_vehicle',
               diff_method="robust_z", upper_diff_thresh=10, lower_diff_thresh=-10):
    ''' Converts a matrix of values (e.g. gene expression, viability, etc.)
    into a matrix of differential values.

    Args:
        gctoo (GCToo object): input whose data_df holds the values to make
            differential; its row and column metadata are passed unchanged
            to the output
        plate_control (bool): True means compute the differential relative to
            all samples in data_df (plate control). False means relative to
            only the samples whose group_field equals group_val (vehicle
            control).
        group_field (string): Metadata field in which to find group_val
        group_val (string): Value in group_field that indicates use in vehicle control
        diff_method (string): Method of computing differential data; currently only
            support either "robust_z" or "median_norm"
        upper_diff_thresh (float): Maximum value for diff data
        lower_diff_thresh (float): Minimum value for diff data

    Returns:
        out_gctoo (GCToo object): GCToo with differential data values

    Raises:
        AssertionError: if diff_method is not in possible_diff_methods, or,
            when plate_control is False, if group_field is not a column of
            the column metadata or no sample has group_val in that column.
    '''
    # NOTE: asserts are kept (rather than raising ValueError) because callers
    # and the accompanying unit tests catch AssertionError.
    assert diff_method in possible_diff_methods, (
        "possible_diff_methods: {}, diff_method: {}".format(
            possible_diff_methods, diff_method))

    # Compute median and MAD using all samples in the dataset
    if plate_control:

        # Compute differential data
        if diff_method == "robust_z":
            diff_data = robust_zscore.robust_zscore(gctoo.data_df)

        elif diff_method == "median_norm":
            medians = gctoo.data_df.median(axis=1)
            diff_data = gctoo.data_df.subtract(medians, axis='index')

    # Compute median and MAD from negative controls, rather than all samples
    else:

        assert group_field in gctoo.col_metadata_df.columns.values, (
            "group_field {} not present in column metadata. " +
            "gctoo.col_metadata_df.columns.values: {}").format(
                group_field, gctoo.col_metadata_df.columns.values)

        # Boolean mask over samples; computed once and reused for both the
        # validation below and the sample selection.
        is_neg_ctl = gctoo.col_metadata_df[group_field] == group_val

        assert is_neg_ctl.any(), (
            "group_val {} not present in the {} column.").format(
                group_val, group_field)

        # Find negative control samples
        neg_ctl_samples = gctoo.col_metadata_df.index[is_neg_ctl]
        neg_ctl_df = gctoo.data_df[neg_ctl_samples]

        # Compute differential data
        if diff_method == "robust_z":
            diff_data = robust_zscore.robust_zscore(gctoo.data_df, neg_ctl_df)

        elif diff_method == "median_norm":
            # Per-row medians must come from the negative controls only;
            # using all samples here would make this branch identical to
            # plate control and silently ignore group_field / group_val.
            medians = neg_ctl_df.median(axis=1)
            diff_data = gctoo.data_df.subtract(medians, axis='index')

    # Threshold differential data before returning
    diff_data = diff_data.clip(lower=lower_diff_thresh, upper=upper_diff_thresh)

    # Construct output GCToo object
    out_gctoo = GCToo.GCToo(data_df=diff_data,
                            row_metadata_df=gctoo.row_metadata_df,
                            col_metadata_df=gctoo.col_metadata_df)

    return out_gctoo
4585

46-
return diff_gctoo
Lines changed: 64 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,76 @@
11
import unittest
2-
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
32
import logging
43
import pandas as pd
5-
import sys
4+
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
65
import cmapPy.pandasGEXpress.GCToo as GCToo
7-
import cmapPy.pandasGEXpress.diff_gctoo as differential
6+
import cmapPy.pandasGEXpress.diff_gctoo as diff_gctoo
87

98
logger = logging.getLogger(setup_logger.LOGGER_NAME)
109

11-
test_mat = pd.DataFrame({'A':[4,2,3], 'B': [2,8,6], 'C': [6,5,9], 'D': [5,2,1], 'E':[8,8,6], 'F': [7,6,6]})
12-
test_col_meta = pd.DataFrame({'pert_type': ['trt_cp', 'trt_cp', 'trt_cp', 'trt_cp','ctl_vehicle','ctl_vehicle'],
13-
'pert_iname': ['bort', 'bort', 'DMSO', 'DMSO', 'bort', 'bort']},
14-
index=['A', 'B', 'C', 'D', 'E', 'F'])
15-
test_gctoo = GCToo.GCToo(data_df=test_mat, col_metadata_df=test_col_meta, row_metadata_df=pd.DataFrame(index=range(0,3)))
10+
# Toy data matrix: 3 rows (default integer index 0-2) by 6 samples (A-F).
test_mat = pd.DataFrame({
    'A': [4, 2, 3],
    'B': [2, 8, 6],
    'C': [6, 5, 9],
    'D': [5, 2, 1],
    'E': [8, 8, 6],
    'F': [7, 6, 6]})

# Column metadata indexed by the same sample ids as test_mat's columns.
# By pert_type, E and F are the vehicle controls; by pert_iname, C and D
# are the DMSO samples.
test_col_meta = pd.DataFrame(
    {'pert_type': ['trt_cp'] * 4 + ['ctl_vehicle'] * 2,
     'pert_iname': ['bort', 'bort', 'DMSO', 'DMSO', 'bort', 'bort']},
    index=['A', 'B', 'C', 'D', 'E', 'F'])

# No row metadata is supplied to the GCToo constructor.
test_gctoo = GCToo.GCToo(data_df=test_mat,
                         col_metadata_df=test_col_meta)
19+
1620

1721
class TestDifferential(unittest.TestCase):
    '''Tests for diff_gctoo.diff_gctoo, run against the module-level test_gctoo.'''

    def test_diff_gctoo_pc(self):
        '''Plate control: robust z-scores, lower thresholding, the
        diff_method assertion, and median normalization.'''
        pc_zscores = diff_gctoo.diff_gctoo(test_gctoo, plate_control=True, lower_diff_thresh=-2)
        self.assertEqual(pc_zscores.data_df.shape, (3, 6))

        # Expected columns are listed in the same order as test_mat's columns
        # (A-F) so the comparison does not depend on dict key ordering.
        pd.testing.assert_frame_equal(pc_zscores.data_df, pd.DataFrame(
            {'A': [-0.6745, -0.9443, -1.349],
             'B': [-1.5738, 0.6745, 0.0],
             'C': [0.2248, -0.1349, 1.349],
             'D': [-0.2248, -0.9443, -2],  # last val should be -2 bc of thresholding
             'E': [1.1242, 0.6745, 0.0],
             'F': [0.6745, 0.1349, 0.0]}))

        # test diff_method assertion
        with self.assertRaises(AssertionError) as e:
            diff_gctoo.diff_gctoo(test_gctoo, plate_control=True, diff_method="robust_zs")
        self.assertIn("diff_method: robust_zs", str(e.exception))

        # test median norm
        # Row 0 is [4, 2, 6, 5, 8, 7] (median 5.5), so A -> 4 - 5.5 = -1.5.
        # Row 2 is [3, 6, 9, 1, 6, 6] (median 6), so B -> 6 - 6 = 0.
        pc_median_normed_df = diff_gctoo.diff_gctoo(test_gctoo, diff_method="median_norm")
        self.assertEqual(pc_median_normed_df.data_df.iloc[0, 0], -1.5)
        self.assertEqual(pc_median_normed_df.data_df.loc[2, "B"], 0)

    def test_diff_gctoo_vc(self):
        '''Vehicle control: robust z-scores against two different control
        groups, default thresholding, and the group_val assertion.'''
        vc_zscores1 = diff_gctoo.diff_gctoo(test_gctoo, plate_control=False)
        vc_zscores2 = diff_gctoo.diff_gctoo(test_gctoo, plate_control=False,
                                            group_field='pert_iname',
                                            group_val='DMSO')
        self.assertEqual(vc_zscores1.data_df.shape, (3, 6))
        self.assertEqual(vc_zscores2.data_df.shape, (3, 6))

        pd.testing.assert_frame_equal(vc_zscores1.data_df, pd.DataFrame(
            {'A': [-4.7214, -3.3725, -10.0],  # check for thresholding
             'B': [-7.4194, 0.6745, 0.0],
             'C': [-2.0235, -1.349, 10.0],
             'D': [-3.3725, -3.3725, -10.0],
             'E': [0.6745, 0.6745, 0.0],
             'F': [-0.6745, -0.6745, 0.0]}))

        pd.testing.assert_frame_equal(vc_zscores2.data_df, pd.DataFrame(
            {'A': [-2.0235, -0.6745, -0.3372],
             'B': [-4.7214, 2.0235, 0.1686],
             'C': [0.6745, 0.6745, 0.6745],
             'D': [-0.6745, -0.6745, -0.6745],
             'E': [3.3725, 2.0235, 0.1686],
             'F': [2.0235, 1.1242, 0.1686]}))

        # test group_val assertion
        with self.assertRaises(AssertionError) as e:
            diff_gctoo.diff_gctoo(test_gctoo, plate_control=False, group_val="dmso")
        self.assertIn("dmso not present", str(e.exception))
71+
72+
73+
if __name__ == "__main__":
74+
setup_logger.setup(verbose=True)
75+
unittest.main()
76+

0 commit comments

Comments
 (0)