Skip to content

Commit 1007034

Browse files
author
lev
committed
math/robust_zscore.py: small documentation and code changes
1 parent b76aa8f commit 1007034

File tree

2 files changed

+58
-25
lines changed

2 files changed

+58
-25
lines changed

cmapPy/math/robust_zscore.py

Lines changed: 33 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,38 @@
11
'''
22
robust_zscore.py
33
4-
Given a pandas df, and an optional control df, will calculate zscores using plate control or vehicle control
5-
Values can be zscored relative to all samples on a plate ("plate-control")
6-
or relative to negative control samples ("vehicle-control").
4+
Robustly z-scores a pandas df along the rows (i.e. the z-score is made relative
5+
to a row). A robust z-score means that the median is used instead of the mean and
6+
median absolute deviation (MAD) instead of standard deviation in the
7+
standard z-score calculation:
8+
9+
z = (x - u) / s
10+
11+
x: input value
12+
u: median
13+
s: MAD * 1.4826 (the MAD is scaled to make it comparable to a standard deviation)
14+
15+
Optionally, the median and MAD can be computed from a control df, instead of the
16+
input df. This functionality is useful for "vehicle-control"; that is, if
17+
the control df consists only of negative control samples, the median and MAD
18+
can be computed using just those samples but applied to the input df.
719
'''
20+
821
rounding_precision = 4
9-
def calc_zscore(mat, ctrl_mat=None, min_mad=.1):
10-
'''
22+
23+
24+
def robust_zscore(mat, ctrl_mat=None, min_mad=0.1):
25+
''' Robustly z-score a pandas df along the rows.
26+
1127
Args:
12-
mat (pandas df): Matrix of data that zscoring will be applied to
13-
ctrl_mat (pandas df): Optional subset matrix from which to draw medians and MADS (vehicle control)
28+
mat (pandas df): Matrix of data that z-scoring will be applied to
29+
ctrl_mat (pandas df): Optional matrix from which to compute medians and MADs
30+
(e.g. vehicle control)
31+
min_mad (float): Minimum MAD; smaller MADs are raised to this value because tiny MAD values will cause
32+
z-scores to blow up
1433
1534
Returns:
16-
zscore_data (pandas_df): Zscored data!
35+
zscore_df (pandas df): z-scored data
1736
'''
1837

1938
# If optional df exists, calc medians and mads from it
@@ -30,8 +49,10 @@ def calc_zscore(mat, ctrl_mat=None, min_mad=.1):
3049
mads = median_devs.median(axis=1)
3150

3251
# Threshold mads
33-
mads[mads < min_mad] = min_mad
34-
# Must multiply values by 1.4826 to make MAD comparable to SD (https://en.wikipedia.org/wiki/Median_absolute_deviation)
35-
zscore_data = sub.divide(mads * 1.4826, axis='index')
52+
mads = mads.clip(lower=min_mad)
53+
54+
# Must multiply values by 1.4826 to make MAD comparable to SD
55+
# (https://en.wikipedia.org/wiki/Median_absolute_deviation)
56+
zscore_df = sub.divide(mads * 1.4826, axis='index')
3657

37-
return zscore_data.round(rounding_precision)
58+
return zscore_df.round(rounding_precision)
Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,41 @@
11
import unittest
2-
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
32
import logging
43
import pandas as pd
5-
import sys
4+
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
65
import cmapPy.math.robust_zscore as robust_zscore
76

87
logger = logging.getLogger(setup_logger.LOGGER_NAME)
98

109
test_mat = pd.DataFrame({'A':[4,2,3], 'B': [2,8,6], 'C': [6,5,9], 'D': [5,2,1]})
1110
test_ctl_mat = pd.DataFrame({'E':[8,8,6], 'F': [7,6,6]})
11+
test_ctl_mat2 = pd.DataFrame({'E':[8,8,6], 'F': [8,6,6]})
12+
1213

1314
class TestRobustZscore(unittest.TestCase):
1415
def test_zscore_pc(self):
15-
pc_zscores = robust_zscore.calc_zscore(test_mat)
16-
self.assertTrue(pc_zscores.shape == (3,4))
16+
pc_zscores = robust_zscore.robust_zscore(test_mat)
17+
self.assertTrue(pc_zscores.shape == (3, 4))
1718

18-
pd.util.testing.assert_frame_equal(pc_zscores, pd.DataFrame({'A': [-0.3372, -0.6745, -0.4047],
19-
'B': [-1.6862, 2.0235, 0.4047],
20-
'C': [1.0117, 0.6745, 1.2141],
21-
'D': [0.3372, -0.6745, -0.9443]}))
19+
pd.util.testing.assert_frame_equal(pc_zscores, pd.DataFrame(
20+
{'A': [-0.3372, -0.6745, -0.4047],
21+
'B': [-1.6862, 2.0235, 0.4047],
22+
'C': [1.0117, 0.6745, 1.2141],
23+
'D': [0.3372, -0.6745, -0.9443]}))
2224

2325
def test_zscore_vc(self):
24-
vc_zscores = robust_zscore.calc_zscore(test_mat, ctrl_mat = test_ctl_mat)
26+
vc_zscores = robust_zscore.robust_zscore(test_mat, ctrl_mat=test_ctl_mat)
2527
self.assertTrue(vc_zscores.shape == (3, 4))
26-
pd.util.testing.assert_frame_equal(vc_zscores, pd.DataFrame({'A': [-4.7214, -3.3725, -20.2347],
27-
'B': [-7.4194, 0.6745, 0.0],
28-
'C': [-2.0235, -1.349, 20.2347],
29-
'D': [-3.3725, -3.3725, -33.7245]}))
28+
pd.util.testing.assert_frame_equal(vc_zscores, pd.DataFrame(
29+
{'A': [-4.7214, -3.3725, -20.2347],
30+
'B': [-7.4194, 0.6745, 0.0],
31+
'C': [-2.0235, -1.349, 20.2347],
32+
'D': [-3.3725, -3.3725, -33.7245]}))
33+
34+
# check that min_mad works
35+
vc_zscores2 = robust_zscore.robust_zscore(test_mat, ctrl_mat=test_ctl_mat2)
36+
self.assertEqual(vc_zscores2.iloc[0, 0], -26.9796)
37+
self.assertEqual(vc_zscores2.iloc[1, 1], 0.6745)
38+
39+
if __name__ == "__main__":
40+
setup_logger.setup(verbose=True)
41+
unittest.main()

0 commit comments

Comments
 (0)