Merge pull request #29 from cmap/math

oena · web-flow · commit c3631b726b2e · 2018-06-22T15:23:16.000-04:00
Math
diff --git a/cmapPy/math/agg_wt_avg.py b/cmapPy/math/agg_wt_avg.py
@@ -0,0 +1,118 @@
+'''
+agg_wt_avg.py
+
+Aggregate a matrix of replicate profiles into a single signature using
+a weighted average based on the correlation between replicates. That is, if
+one replicate is less correlated with the other replicates, its values will
+not be weighted as highly in the aggregated signature.
+
+Equivalent to the 'modz' method in mortar.
+'''
+
+import numpy as np
+
+rounding_precision = 4
+
+
+def get_upper_triangle(correlation_matrix):
+    ''' Extract upper triangle from a square matrix. Negative values are
+    set to 0.
+
+    Args:
+    correlation_matrix (pandas df): Correlations between all replicates
+
+    Returns:
+    upper_tri_df (pandas df): Upper triangle extracted from
+        correlation_matrix; rid is the row index, cid is the column index,
+        corr is the extracted correlation value
+    '''
+    upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(np.bool))
+
+    # convert matrix into long form description
+    upper_tri_df = upper_triangle.stack().reset_index(level=1)
+    upper_tri_df.columns = ['rid', 'corr']
+
+    # Index at this point is cid, it now becomes a column
+    upper_tri_df.reset_index(level=0, inplace=True)
+
+    # Get rid of negative values
+    upper_tri_df['corr'] = upper_tri_df['corr'].clip(lower=0)
+
+    return upper_tri_df.round(rounding_precision)
+
+
+def calculate_weights(correlation_matrix, min_wt):
+    ''' Calculate a weight for each profile based on its correlation to other
+    replicates. Negative correlations are clipped to 0, and weights are clipped
+    to be min_wt at the least.
+
+    Args:
+    correlation_matrix (pandas df): Correlations between all replicates
+    min_wt (float): Minimum raw weight when calculating weighted average
+
+    Returns:
+    raw weights (pandas series):  Mean correlation to other replicates
+    weights (pandas series): raw_weights normalized such that they add to 1
+    '''
+    # fill diagonal of correlation_matrix with np.nan
+    np.fill_diagonal(correlation_matrix.values, np.nan)
+
+    # remove negative values
+    correlation_matrix = correlation_matrix.clip(lower=0)
+
+    # get average correlation for each profile (will ignore NaN)
+    raw_weights = correlation_matrix.mean(axis=1)
+
+    # threshold weights
+    raw_weights = raw_weights.clip(lower=min_wt)
+
+    # normalize raw_weights so that they add to 1
+    weights = raw_weights / sum(raw_weights)
+
+    return raw_weights.round(rounding_precision), weights.round(rounding_precision)
+
+
+def agg_wt_avg(mat, min_wt = 0.01, corr_metric='spearman'):
+    ''' Aggregate a set of replicate profiles into a single signature using
+    a weighted average.
+
+    Args:
+    mat (pandas df): a matrix of replicate profiles, where the columns are
+        samples and the rows are features; columns correspond to the
+        replicates of a single perturbagen
+    min_wt (float): Minimum raw weight when calculating weighted average
+    corr_metric (string): Spearman or Pearson; the correlation method
+
+    Returns:
+    out_sig (pandas series): weighted average values
+    upper_tri_df (pandas df): the correlations between each profile that went into the signature
+    raw weights (pandas series): weights before normalization
+    weights (pandas series): weights after normalization
+    '''
+    assert mat.shape[1] > 0, "mat is empty! mat: {}".format(mat)
+
+    if mat.shape[1] == 1:
+
+        out_sig = mat
+        upper_tri_df = None
+        raw_weights = None
+        weights = None
+
+    else:
+
+        assert corr_metric in ["spearman", "pearson"]
+
+        # Make correlation matrix column wise
+        corr_mat = mat.corr(method=corr_metric)
+
+        # Save the values in the upper triangle
+        upper_tri_df = get_upper_triangle(corr_mat)
+
+        # Calculate weight per replicate
+        raw_weights, weights = calculate_weights(corr_mat, min_wt)
+
+        # Apply weights to values
+        weighted_values = mat * weights
+        out_sig = weighted_values.sum(axis=1)
+
+    return out_sig, upper_tri_df, raw_weights, weights
diff --git a/cmapPy/math/robust_zscore.py b/cmapPy/math/robust_zscore.py
@@ -0,0 +1,58 @@
+'''
+robust_zscore.py
+
+Robustly z-scores a pandas df along the rows (i.e. the z-score is made relative
+to a row). A robust z-score means that median is used instead of mean and
+median absolute deviation (MAD) instead of standard deviation in the
+standard z-score calculation:
+
+z = (x - u) / s
+
+x: input value
+u: median
+s: MAD
+
+Optionally, the median and MAD can be computed from a control df, instead of the
+input df. This functionality is useful for "vehicle-control"; that is, if
+the control df consists only of negative control samples, the median and MAD
+can be computed using just those samples but applied to the input df.
+'''
+
+rounding_precision = 4
+
+
+def robust_zscore(mat, ctrl_mat=None, min_mad=0.1):
+    ''' Robustly z-score a pandas df along the rows.
+
+    Args:
+    mat (pandas df): Matrix of data that z-scoring will be applied to
+    ctrl_mat (pandas df): Optional matrix from which to compute medians and MADs
+        (e.g. vehicle control)
+    min_mad (float): Minimum MAD to threshold to; tiny MAD values will cause
+        z-scores to blow up
+
+    Returns:
+    zscore_df (pandas_df): z-scored data
+    '''
+
+    # If optional df exists, calc medians and mads from it
+    if ctrl_mat is not None:
+        medians = ctrl_mat.median(axis=1)
+        median_devs = abs(ctrl_mat.subtract(medians, axis=0))
+
+    # Else just use plate medians
+    else:
+        medians = mat.median(axis=1)
+        median_devs = abs(mat.subtract(medians, axis=0))
+
+    sub = mat.subtract(medians, axis='index')
+    mads = median_devs.median(axis=1)
+
+    # Threshold mads
+    mads = mads.clip(lower=min_mad)
+
+    # Must multiply values by 1.4826 to make MAD comparable to SD
+    # (https://en.wikipedia.org/wiki/Median_absolute_deviation)
+    zscore_df = sub.divide(mads * 1.4826, axis='index')
+
+    return zscore_df.round(rounding_precision)
diff --git a/cmapPy/math/tests/test_agg_wt_avg.py b/cmapPy/math/tests/test_agg_wt_avg.py
@@ -0,0 +1,52 @@
+import unittest
+import logging
+import pandas as pd
+import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
+import cmapPy.math.agg_wt_avg as agg_wt_avg
+
+logger = logging.getLogger(setup_logger.LOGGER_NAME)
+
+test_mat = pd.DataFrame({'A':[1,2,3], 'B': [2,8,6], 'C': [6,8,9]})
+test_mat_corr = test_mat.corr()
+
+
+class TestAggWtAvg(unittest.TestCase):
+    def test_calculate_weights(self):
+        # happy path
+        raw_weights, weights = agg_wt_avg.calculate_weights(test_mat_corr, min_wt=0.1)
+        self.assertTrue(len(weights == 3))
+        self.assertTrue(raw_weights.tolist() == [0.8183, 0.7202, 0.8838])
+        self.assertTrue(weights.tolist() == [0.3378, 0.2973, 0.3649])
+
+        # test that min_wt works
+        raw_weights2, weights2 = agg_wt_avg.calculate_weights(test_mat_corr, min_wt=0.85)
+        self.assertEqual(raw_weights2[1], 0.85)
+
+    def test_get_upper_triangle(self):
+        # happy path
+        upper_tri_df = agg_wt_avg.get_upper_triangle(test_mat_corr)
+        self.assertTrue(upper_tri_df['corr'].tolist() == [0.6547, 0.982, 0.7857])
+        self.assertTrue(upper_tri_df['rid'].tolist() == ['B', 'C', 'C'])
+        self.assertTrue(upper_tri_df['index'].tolist() == ['A', 'A', 'B'])
+
+    def test_agg_wt_avg(self):
+        # use spearman
+        out_sig, upper_tri_df, raw_weights, weights = agg_wt_avg.agg_wt_avg(test_mat)
+        self.assertTrue(out_sig.tolist() == [3.125, 5.75, 6.0])
+        self.assertAlmostEqual(upper_tri_df.loc[upper_tri_df.index[0], "corr"], 0.5)
+        self.assertAlmostEqual(raw_weights[0], 0.75)
+        self.assertAlmostEqual(weights[0], 0.375)
+
+        # test on a single signature
+        out_sig2, _, _, _ = agg_wt_avg.agg_wt_avg(test_mat[["C"]])
+        pd.util.testing.assert_frame_equal(out_sig2, test_mat[["C"]])
+
+        # should break if empty input
+        with self.assertRaises(AssertionError) as e:
+            agg_wt_avg.agg_wt_avg(test_mat[[]])
+        self.assertIn("mat is empty!", str(e.exception))
+
+if __name__ == "__main__":
+    setup_logger.setup(verbose=True)
+    unittest.main()
+
diff --git a/cmapPy/math/tests/test_robust_zscore.py b/cmapPy/math/tests/test_robust_zscore.py
@@ -0,0 +1,41 @@
+import unittest
+import logging
+import pandas as pd
+import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
+import cmapPy.math.robust_zscore as robust_zscore
+
+logger = logging.getLogger(setup_logger.LOGGER_NAME)
+
+test_mat = pd.DataFrame({'A':[4,2,3], 'B': [2,8,6], 'C': [6,5,9], 'D': [5,2,1]})
+test_ctl_mat = pd.DataFrame({'E':[8,8,6], 'F': [7,6,6]})
+test_ctl_mat2 = pd.DataFrame({'E':[8,8,6], 'F': [8,6,6]})
+
+
+class TestRobustZscore(unittest.TestCase):
+    def test_zscore_pc(self):
+        pc_zscores = robust_zscore.robust_zscore(test_mat)
+        self.assertTrue(pc_zscores.shape == (3, 4))
+
+        pd.util.testing.assert_frame_equal(pc_zscores, pd.DataFrame(
+            {'A': [-0.3372, -0.6745, -0.4047],
+             'B': [-1.6862, 2.0235, 0.4047],
+             'C': [1.0117, 0.6745, 1.2141],
+             'D': [0.3372, -0.6745, -0.9443]}))
+
+    def test_zscore_vc(self):
+        vc_zscores = robust_zscore.robust_zscore(test_mat, ctrl_mat=test_ctl_mat)
+        self.assertTrue(vc_zscores.shape == (3, 4))
+        pd.util.testing.assert_frame_equal(vc_zscores, pd.DataFrame(
+            {'A': [-4.7214, -3.3725, -20.2347],
+             'B': [-7.4194, 0.6745, 0.0],
+             'C': [-2.0235, -1.349, 20.2347],
+             'D': [-3.3725, -3.3725, -33.7245]}))
+
+        # check that min_mad works
+        vc_zscores2 = robust_zscore.robust_zscore(test_mat, ctrl_mat=test_ctl_mat2)
+        self.assertEqual(vc_zscores2.iloc[0, 0], -26.9796)
+        self.assertEqual(vc_zscores2.iloc[1, 1], 0.6745)
+
+if __name__ == "__main__":
+    setup_logger.setup(verbose=True)
+    unittest.main()
diff --git a/cmapPy/pandasGEXpress/diff_gctoo.py b/cmapPy/pandasGEXpress/diff_gctoo.py
@@ -0,0 +1,85 @@
+'''
+diff_gctoo.py
+
+Converts a matrix of values (e.g. gene expression, viability, etc.) into a
+matrix of differential values. Values can be made differential relative to all
+samples in the dataset ("plate-control") or relative to just negative control
+samples ("vehicle-control"). The method of computing the differential can be
+either a robust z-score ("robust_z") or simply median normalization
+("median_norm").
+
+'''
+import cmapPy.math.robust_zscore as robust_zscore
+import cmapPy.pandasGEXpress.GCToo as GCToo
+
+possible_diff_methods = ["robust_z", "median_norm"]
+
+
+def diff_gctoo(gctoo, plate_control=True, group_field='pert_type', group_val='ctl_vehicle',
+               diff_method="robust_z", upper_diff_thresh=10, lower_diff_thresh=-10):
+    ''' Converts a matrix of values (e.g. gene expression, viability, etc.)
+    into a matrix of differential values.
+
+    Args:
+    df (pandas df): data to make diff_gctoo
+    plate_control (bool): True means calculate diff_gctoo using plate control.
+        False means vehicle control.
+    group_field (string): Metadata field in which to find group_val
+    group_val (string): Value in group_field that indicates use in vehicle control
+    diff_method (string): Method of computing differential data; currently only
+        support either "robust_z" or "median_norm"
+    upper_diff_thresh (float): Maximum value for diff data
+    lower_diff_thresh (float): Minimum value for diff data
+
+    Returns:
+    out_gctoo (GCToo object): GCToo with differential data values
+    '''
+    assert diff_method in possible_diff_methods, (
+        "possible_diff_methods: {}, diff_method: {}".format(
+            possible_diff_methods, diff_method))
+
+    # Compute median and MAD using all samples in the dataset
+    if plate_control:
+
+        # Compute differential data
+        if diff_method == "robust_z":
+            diff_data = robust_zscore.robust_zscore(gctoo.data_df)
+
+        elif diff_method == "median_norm":
+            medians = gctoo.data_df.median(axis=1)
+            diff_data = gctoo.data_df.subtract(medians, axis='index')
+
+    # Compute median and MAD from negative controls, rather than all samples
+    else:
+
+        assert group_field in gctoo.col_metadata_df.columns.values, (
+            "group_field {} not present in column metadata. " +
+            "gctoo.col_metadata_df.columns.values: {}").format(
+            group_field, gctoo.col_metadata_df.columns.values)
+
+        assert sum(gctoo.col_metadata_df[group_field] == group_val) > 0, (
+            "group_val {} not present in the {} column.").format(
+            group_val, group_field)
+
+        # Find negative control samples
+        neg_ctl_samples = gctoo.col_metadata_df.index[gctoo.col_metadata_df[group_field] == group_val]
+        neg_ctl_df = gctoo.data_df[neg_ctl_samples]
+
+        # Compute differential data
+        if diff_method == "robust_z":
+            diff_data = robust_zscore.robust_zscore(gctoo.data_df, neg_ctl_df)
+
+        elif diff_method == "median_norm":
+            medians = gctoo.data_df.median(axis=1)
+            diff_data = gctoo.data_df.subtract(medians, axis='index')
+
+    # Threshold differential data before returning
+    diff_data = diff_data.clip(lower=lower_diff_thresh, upper=upper_diff_thresh)
+
+    # Construct output GCToo object
+    out_gctoo = GCToo.GCToo(data_df=diff_data,
+                            row_metadata_df=gctoo.row_metadata_df,
+                            col_metadata_df=gctoo.col_metadata_df)
+
+    return out_gctoo
+
diff --git a/cmapPy/pandasGEXpress/tests/test_diff_gctoo.py b/cmapPy/pandasGEXpress/tests/test_diff_gctoo.py