Improving documentation, parameterising thresholds, making other changes requested in pull request

Evan Lemire · Evan Lemire · commit 77e8e8dd20df · 2018-04-12T10:44:15.000-04:00
diff --git a/cmapPy/math/modz.py b/cmapPy/math/modz.py
@@ -1,36 +1,48 @@
 '''
 modz.py
 
+modz refers to a weighted average of zscores - but can be applied to other types of data as well
 Given a matrix of profiles on which you want to calculate a weighted average, returns a single profile of modz values
+Weights are calculated based on the correlation between replicates - so if one replicate is less highly correlated it
+will not be weighted as highly in the level 5 signature.
 '''
 
 import pandas as pd
 import numpy as np
 import os
 import math
 
+rounding_precision=4
+
 def upper_triangle(correlation_matrix):
     '''
-    :param correlation_matrix (pandas df): Correlations between all replicates
-    :return upper_tri_series (pandas series): Upper triangle extracted from corr mat
+    Args:
+    correlation_matrix (pandas df): Correlations between all replicates
+
+    Returns:
+    upper_tri_series (pandas series): Upper triangle extracted from corr mat
     '''
     upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(np.bool))
 
     # convert matrix into long form description
     upper_tri_series = upper_triangle.stack().reset_index(level=1)
 
-    upper_tri_series.columns = ['rid', 'spearman_corr']
+    upper_tri_series.columns = ['rid', 'corr']
 
     # Index at this point is CID, it now becomes a column
     upper_tri_series.reset_index(level=0, inplace=True)
 
-    return upper_tri_series.round(4)
+    return upper_tri_series.round(rounding_precision)
 
 
-def calculate_weights(correlation_matrix):
+def calculate_weights(correlation_matrix, min_wt = 0.01):
     '''
-    :param correlation_matrix (pandas df): Correlations between all replicates
-    :return raw weights, weights (pandas series): Weights computed by summing correlations (raw weights) and then normalized to add to 1 (weights)
+    Args:
+    correlation_matrix (pandas df): Correlations between all replicates
+
+    Returns:
+    raw weights (pandas series): Sum of each replicate's non-negative correlations divided by (number of replicates - 1), floored at min_wt
+    weights (pandas series): Raw weights normalized to add to 1
     '''
 
     # fill diagonal of corr_mat with 0s
@@ -39,27 +51,36 @@ def calculate_weights(correlation_matrix):
     # remove negative values
     correlation_matrix[correlation_matrix < 0] = 0
     raw_weights = correlation_matrix.sum(axis=1) / (len(correlation_matrix.index) - 1)
-    raw_weights[raw_weights < .01] = .01
+    raw_weights[raw_weights < min_wt] = min_wt
     weights = raw_weights / sum(raw_weights.abs())
 
-    return raw_weights.round(4), weights.round(4)
+    return raw_weights.round(rounding_precision), weights.round(rounding_precision)
 
 
-def main(mat):
+def calc_modz(mat, min_wt = 0.01, corr_metric='spearman'):
     '''
-    :param mat (pandas df): One matching profile from each replicate
-    :return (4 pandas series): modz values, correlations from upper tri series, raw weights, normalized weights
+    Args:
+    mat (pandas df): a signature matrix, where the columns are samples and the rows are features;
+    columns correspond to the replicates of a single perturbagen
+    min_wt (float): Minimum raw weight when calculating weighted average
+    corr_metric (string): Correlation method, e.g. 'spearman' or 'pearson'
+
+    Returns:
+    modz values (pandas series): weighted average values
+    upper_tri_series (pandas series): the correlations between each profile that went into the signature
+    raw weights (pandas series): weights before normalization to add to 1
+    weights (pandas series): weights after normalization
     '''
     # Make correlation matrix column wise
-    corr_mat = mat.corr(method='spearman')
+    corr_mat = mat.corr(method=corr_metric)
 
     # Extract just the values in the upper triangle
     upper_tri_series = upper_triangle(corr_mat)
 
     # Get rid of negative values
-    upper_tri_series['spearman_corr'][upper_tri_series['spearman_corr'] < 0] = 0
+    upper_tri_series['corr'][upper_tri_series['corr'] < 0] = 0
 
-    raw_weights, weights = calculate_weights(corr_mat)
+    raw_weights, weights = calculate_weights(corr_mat, min_wt)
 
     weighted_values = mat * weights
 
diff --git a/cmapPy/math/robust_zscore.py b/cmapPy/math/robust_zscore.py
@@ -2,13 +2,18 @@
 robust_zscore.py
 
 Given a pandas df, and an optional control df, will calculate zscores using plate control or vehicle control
+Values can be zscored relative to all samples on a plate ("plate-control")
+or relative to negative control samples ("vehicle-control").
 '''
-
-def calc_zscore(mat, ctrl_mat=None):
+rounding_precision = 4
+def calc_zscore(mat, ctrl_mat=None, min_mad=.1):
     '''
-    :param mat (pandas df): Matrix of data that zscoring will be applied to
-    :param ctrl_mat (pandas df): Optional subset matrix from which to draw medians and MADS (vehicle control)
-    :return zscore_data (pandas_df):
+    Args:
+    mat (pandas df): Matrix of data that zscoring will be applied to
+    ctrl_mat (pandas df): Optional subset matrix from which to draw medians and MADS (vehicle control)
+
+    Returns:
+    zscore_data (pandas_df): Zscored data!
     '''
 
     # If optional df exists, calc medians and mads from it
@@ -25,7 +30,8 @@ def calc_zscore(mat, ctrl_mat=None):
     mads = median_devs.median(axis=1)
 
     # Threshold mads
-    mads[mads < .1] = .1
+    mads[mads < min_mad] = min_mad
+    # Must multiply values by 1.4826 to make MAD comparable to SD (https://en.wikipedia.org/wiki/Median_absolute_deviation)
     zscore_data = sub.divide(mads * 1.4826, axis='index')
 
-    return zscore_data.round(4)
+    return zscore_data.round(rounding_precision)
diff --git a/cmapPy/math/tests/test_modz.py b/cmapPy/math/tests/test_modz.py
@@ -19,12 +19,12 @@ def test_calculate_weights(self):
 
     def test_upper_triangle(self):
         upper_tri_series = modz.upper_triangle(test_mat.corr())
-        self.assertTrue(upper_tri_series['spearman_corr'].tolist() == [0.6547, 0.982, 0.7857])
+        self.assertTrue(upper_tri_series['corr'].tolist() == [0.6547, 0.982, 0.7857])
         self.assertTrue(upper_tri_series['rid'].tolist() == ['B', 'C', 'C'])
         self.assertTrue(upper_tri_series['index'].tolist() == ['A', 'A', 'B'])
 
     def test_main(self):
-        modz_values, x, y, z = modz.main(test_mat)
+        modz_values, x, y, z = modz.calc_modz(test_mat)
         self.assertTrue(modz_values.tolist() == [3.125, 5.75, 6.0])
 
 
diff --git a/cmapPy/math/tests/test_robust_zscore.py b/cmapPy/math/tests/test_robust_zscore.py
@@ -14,13 +14,16 @@ class TestRobustZscore(unittest.TestCase):
     def test_zscore_pc(self):
         pc_zscores = robust_zscore.calc_zscore(test_mat)
         self.assertTrue(pc_zscores.shape == (3,4))
-        self.assertTrue(pc_zscores.loc[0].tolist() == [-0.3372, -1.6862, 1.0117, 0.3372])
-        self.assertTrue(pc_zscores.loc[1].tolist() == [-0.6745, 2.0235, 0.6745, -0.6745])
-        self.assertTrue(pc_zscores.loc[2].tolist() == [-0.4047, 0.4047, 1.2141, -0.9443])
+
+        pd.util.testing.assert_frame_equal(pc_zscores, pd.DataFrame({'A': [-0.3372, -0.6745, -0.4047],
+                                                                     'B': [-1.6862, 2.0235, 0.4047],
+                                                                     'C': [1.0117, 0.6745, 1.2141],
+                                                                     'D': [0.3372, -0.6745, -0.9443]}))
 
     def test_zscore_vc(self):
         vc_zscores = robust_zscore.calc_zscore(test_mat, ctrl_mat = test_ctl_mat)
         self.assertTrue(vc_zscores.shape == (3, 4))
-        self.assertTrue(vc_zscores.loc[0].tolist() == [-4.7214, -7.4194, -2.0235, -3.3725])
-        self.assertTrue(vc_zscores.loc[1].tolist() == [-3.3725, 0.6745, -1.349, -3.3725])
-        self.assertTrue(vc_zscores.loc[2].tolist() == [-20.2347, 0.0, 20.2347, -33.7245])
+        pd.util.testing.assert_frame_equal(vc_zscores, pd.DataFrame({'A': [-4.7214, -3.3725, -20.2347],
+                                                                     'B': [-7.4194, 0.6745, 0.0],
+                                                                     'C': [-2.0235, -1.349, 20.2347],
+                                                                     'D': [-3.3725, -3.3725, -33.7245]}))
diff --git a/cmapPy/pandasGEXpress/diff_gctoo.py b/cmapPy/pandasGEXpress/diff_gctoo.py
@@ -2,37 +2,45 @@
 diff_gctoo.py
 
 Given a GCToo object calculates differential values (expression, viability etc.)
+Values can be made differential relative to all samples on a plate ("plate-control")
+or relative to negative control samples ("vehicle-control").
 '''
 import sys
 import cmapPy.math.robust_zscore as robust_zscore
 import cmapPy.pandasGEXpress.GCToo as GCToo
 
-def calc_differential(gctoo, plate_control=True, group_field='pert_type', group_val='ctl_vehicle', func = robust_zscore.calc_zscore):
+def calc_differential(gctoo, plate_control=True, group_field='pert_type', group_val='ctl_vehicle',
+                      func = robust_zscore.calc_zscore, pos_diff_thresh=10, neg_diff_thresh=-10):
 
     '''
-    :param df (pandas df):
-    :param plate_control (bool): True means calculate differential using plate control. False means vehicle control.
-    :param group_field (string): Metadata field in which to find group_val
-    :param group_val (string): Value in group_field that indicates use in vehicle control
-    :param func (function): Function to apply to data fro calculating diff, eg. zscore, fold change
-    :return zscore_gctoo (pandas df): Zscored data!
+    Args:
+    gctoo (GCToo object): data on which to perform diff
+    plate_control (bool): True means calculate differential using plate control. False means vehicle control.
+    group_field (string): Metadata field in which to find group_val
+    group_val (string): Value in group_field that indicates use in vehicle control
+    func (function): Function to apply to data for calculating diff, eg. zscore, fold change
+    pos_diff_thresh (float): Maximum value for diff data
+    neg_diff_thresh (float): Minimum value for diff data
+
+    Returns:
+    diff_gctoo (GCToo object): Diff data!
     '''
 
     if plate_control == False:
         # If using only a subset of the plate for control (usually vehicle control) extract this df
         neg_dex = gctoo.col_metadata_df[gctoo.col_metadata_df[group_field] == group_val].index.tolist()
         neg_df = gctoo.data_df[neg_dex]
-        zscore_data = func(gctoo.data_df, neg_df)
+        diff_data = func(gctoo.data_df, neg_df)
 
     elif plate_control == True:
-        zscore_data = func(gctoo.data_df)
+        diff_data = func(gctoo.data_df)
 
     row_metadata_df = gctoo.row_metadata_df
 
     # Threshold zscore data before returning
-    zscore_data[zscore_data < -10] = -10
-    zscore_data[zscore_data > 10] = 10
+    diff_data[diff_data < neg_diff_thresh] = neg_diff_thresh
+    diff_data[diff_data > pos_diff_thresh] = pos_diff_thresh
 
-    zscore_gctoo = GCToo.GCToo(data_df=zscore_data, row_metadata_df=row_metadata_df, col_metadata_df=gctoo.col_metadata_df)
+    diff_gctoo = GCToo.GCToo(data_df=diff_data, row_metadata_df=row_metadata_df, col_metadata_df=gctoo.col_metadata_df)
 
-    return zscore_gctoo
+    return diff_gctoo