math/robust_zscore.py: small documentation and code changes

lev · lev · commit 10070344f663 · 2018-04-13T17:57:24.000-04:00
diff --git a/cmapPy/math/robust_zscore.py b/cmapPy/math/robust_zscore.py
@@ -1,19 +1,38 @@
 '''
 robust_zscore.py
 
-Given a pandas df, and an optional control df, will calculate zscores using plate control or vehicle control
-Values can be zscored relative to all samples on a plate ("plate-control")
-or relative to negative control samples ("vehicle-control").
+Robustly z-scores a pandas df along the rows (i.e. the z-score is made relative
+to a row). A robust z-score means that median is used instead of mean and
+median absolute deviation (MAD) instead of standard deviation in the
+standard z-score calculation:
+
+z = (x - u) / s
+
+x: input value
+u: median
+s: MAD (thresholded at min_mad) * 1.4826, to make it comparable to SD
+
+Optionally, the median and MAD can be computed from a control df, instead of the
+input df. This functionality is useful for "vehicle-control"; that is, if
+the control df consists only of negative control samples, the median and MAD
+can be computed using just those samples but applied to the input df.
 '''
+
 rounding_precision = 4
-def calc_zscore(mat, ctrl_mat=None, min_mad=.1):
-    '''
+
+
+def robust_zscore(mat, ctrl_mat=None, min_mad=0.1):
+    ''' Robustly z-score a pandas df along the rows.
+
     Args:
-    mat (pandas df): Matrix of data that zscoring will be applied to
-    ctrl_mat (pandas df): Optional subset matrix from which to draw medians and MADS (vehicle control)
+    mat (pandas df): Matrix of data that z-scoring will be applied to
+    ctrl_mat (pandas df): Optional matrix from which to compute medians and MADs
+        (e.g. vehicle control)
+    min_mad (float): Minimum MAD to threshold to; tiny MAD values will cause
+        z-scores to blow up
 
     Returns:
-    zscore_data (pandas_df): Zscored data!
+    zscore_df (pandas df): z-scored data, rounded to rounding_precision
     '''
 
     # If optional df exists, calc medians and mads from it
@@ -30,8 +49,10 @@ def calc_zscore(mat, ctrl_mat=None, min_mad=.1):
     mads = median_devs.median(axis=1)
 
     # Threshold mads
-    mads[mads < min_mad] = min_mad
-    # Must multiply values by 1.4826 to make MAD comparable to SD (https://en.wikipedia.org/wiki/Median_absolute_deviation)
-    zscore_data = sub.divide(mads * 1.4826, axis='index')
+    mads = mads.clip(lower=min_mad)
+
+    # Must multiply values by 1.4826 to make MAD comparable to SD
+    # (https://en.wikipedia.org/wiki/Median_absolute_deviation)
+    zscore_df = sub.divide(mads * 1.4826, axis='index')
 
-    return zscore_data.round(rounding_precision)
+    return zscore_df.round(rounding_precision)
diff --git a/cmapPy/math/tests/test_robust_zscore.py b/cmapPy/math/tests/test_robust_zscore.py
@@ -1,29 +1,41 @@
 import unittest
-import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 import logging
 import pandas as pd
-import sys
+import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 import cmapPy.math.robust_zscore as robust_zscore
 
 logger = logging.getLogger(setup_logger.LOGGER_NAME)
 
 test_mat = pd.DataFrame({'A':[4,2,3], 'B': [2,8,6], 'C': [6,5,9], 'D': [5,2,1]})
 test_ctl_mat = pd.DataFrame({'E':[8,8,6], 'F': [7,6,6]})
+test_ctl_mat2 = pd.DataFrame({'E':[8,8,6], 'F': [8,6,6]})
+
 
 class TestRobustZscore(unittest.TestCase):
     def test_zscore_pc(self):
-        pc_zscores = robust_zscore.calc_zscore(test_mat)
-        self.assertTrue(pc_zscores.shape == (3,4))
+        pc_zscores = robust_zscore.robust_zscore(test_mat)
+        self.assertTrue(pc_zscores.shape == (3, 4))
 
-        pd.util.testing.assert_frame_equal(pc_zscores, pd.DataFrame({'A': [-0.3372, -0.6745, -0.4047],
-                                                                     'B': [-1.6862, 2.0235, 0.4047],
-                                                                     'C': [1.0117, 0.6745, 1.2141],
-                                                                     'D': [0.3372, -0.6745, -0.9443]}))
+        pd.util.testing.assert_frame_equal(pc_zscores, pd.DataFrame(
+            {'A': [-0.3372, -0.6745, -0.4047],
+             'B': [-1.6862, 2.0235, 0.4047],
+             'C': [1.0117, 0.6745, 1.2141],
+             'D': [0.3372, -0.6745, -0.9443]}))
 
     def test_zscore_vc(self):
-        vc_zscores = robust_zscore.calc_zscore(test_mat, ctrl_mat = test_ctl_mat)
+        vc_zscores = robust_zscore.robust_zscore(test_mat, ctrl_mat=test_ctl_mat)
         self.assertTrue(vc_zscores.shape == (3, 4))
-        pd.util.testing.assert_frame_equal(vc_zscores, pd.DataFrame({'A': [-4.7214, -3.3725, -20.2347],
-                                                                     'B': [-7.4194, 0.6745, 0.0],
-                                                                     'C': [-2.0235, -1.349, 20.2347],
-                                                                     'D': [-3.3725, -3.3725, -33.7245]}))
+        pd.util.testing.assert_frame_equal(vc_zscores, pd.DataFrame(
+            {'A': [-4.7214, -3.3725, -20.2347],
+             'B': [-7.4194, 0.6745, 0.0],
+             'C': [-2.0235, -1.349, 20.2347],
+             'D': [-3.3725, -3.3725, -33.7245]}))
+
+        # check that min_mad works (row 0 of test_ctl_mat2 has a MAD of 0)
+        vc_zscores2 = robust_zscore.robust_zscore(test_mat, ctrl_mat=test_ctl_mat2)
+        self.assertEqual(vc_zscores2.iloc[0, 0], -26.9796)
+        self.assertEqual(vc_zscores2.iloc[1, 1], 0.6745)
+
+if __name__ == "__main__":
+    setup_logger.setup(verbose=True)
+    unittest.main()