11'''
22modz.py
33
4+ modz refers to a weighted average of zscores - but can be applied to other types of data as well
45Given a matrix of profiles on which you want to calculate a weighted average, returns a single profile of modz values
6+ Weights are calculated based on the correlation between replicates - so if one replicate is less highly correlated it
7+ will not be weighted as highly in the level 5 signature.
58'''
69
710import pandas as pd
811import numpy as np
912import os
1013import math
1114
# Number of decimal places used when rounding returned correlations and weights.
rounding_precision = 4


def upper_triangle(correlation_matrix):
    '''
    Extract the strict upper triangle (diagonal excluded) of a correlation
    matrix and return it in long form, one row per pair.

    Args:
        correlation_matrix (pandas df): Correlations between all replicates

    Returns:
        upper_tri_series (pandas df): Upper triangle extracted from corr mat.
            The first column holds the row label of each pair (named after
            the matrix index, or 'index' when the index is unnamed), 'rid'
            holds the column label and 'corr ' holds the correlation rounded
            to rounding_precision decimal places.
    '''
    # Boolean mask that is True strictly above the diagonal (k=1).
    # The builtin bool is used because the np.bool alias was removed in
    # NumPy 1.24 and raises AttributeError there.
    keep = np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
    masked = correlation_matrix.where(keep)

    # convert matrix into long form description
    # The explicit dropna() discards the masked-out cells even when stack()
    # itself keeps NaN entries (newer pandas implementation); it is a no-op
    # when stack() has already dropped them.
    upper_tri_series = masked.stack().dropna().reset_index(level=1)

    # NOTE: the trailing space in 'corr ' is kept on purpose - callers look
    # the column up by this exact key.
    upper_tri_series.columns = ['rid', 'corr ']

    # Index at this point is CID, it now becomes a column
    upper_tri_series.reset_index(level=0, inplace=True)

    return upper_tri_series.round(rounding_precision)
2836
2937
30- def calculate_weights (correlation_matrix ):
38+ def calculate_weights (correlation_matrix , min_wt = 0.01 ):
3139 '''
32- :param correlation_matrix (pandas df): Correlations between all replicates
33- :return raw weights, weights (pandas series): Weights computed by summing correlations (raw weights) and then normalized to add to 1 (weights)
40+ Args:
41+ correlation_matrix (pandas df): Correlations between all replicates
42+
43+ Returns:
44+ raw weights (pandas series): Weights computed by summing correlations
45+ weights (pandas series): Weights computed by summing correlations (raw weights) and then normalized to add to 1 (weights)
3446 '''
3547
3648 # fill diagonal of corr_mat with 0s
@@ -39,27 +51,36 @@ def calculate_weights(correlation_matrix):
3951 # remove negative values
4052 correlation_matrix [correlation_matrix < 0 ] = 0
4153 raw_weights = correlation_matrix .sum (axis = 1 ) / (len (correlation_matrix .index ) - 1 )
42- raw_weights [raw_weights < .01 ] = .01
54+ raw_weights [raw_weights < min_wt ] = min_wt
4355 weights = raw_weights / sum (raw_weights .abs ())
4456
45- return raw_weights .round (4 ), weights .round (4 )
57+ return raw_weights .round (rounding_precision ), weights .round (rounding_precision )
4658
4759
48- def main (mat ):
60+ def calc_modz (mat , min_wt = 0.01 , corr_metric = 'spearman' ):
4961 '''
50- :param mat (pandas df): One matching profile from each replicate
51- :return (4 pandas series): modz values, correlations from upper tri series, raw weights, normalized weights
62+ Args:
63+ mat (pandas df): a signature matrix, where the columns are samples and the rows are features;
64+ columns correspond to the replicates of a single perturbagen
65+ min_wt (float): Minimum raw weight when calculating weighted average
66+ corr_metric (string): Spearmen or pearson, correlation method
67+
68+ Returns:
69+ modz values (pandas series): weighted average values
70+ upper_tri_series (pandas series): the correlations between each profile that went into the signature
71+ raw weights (pandas series): weights before normalization to add to 1
72+ weights (pandas series): weights after normalization
5273 '''
5374 # Make correlation matrix column wise
54- corr_mat = mat .corr (method = 'spearman' )
75+ corr_mat = mat .corr (method = corr_metric )
5576
5677 # Extract just the values in the upper triangle
5778 upper_tri_series = upper_triangle (corr_mat )
5879
5980 # Get rid of negative values
60- upper_tri_series ['spearman_corr ' ][upper_tri_series ['spearman_corr ' ] < 0 ] = 0
81+ upper_tri_series ['corr ' ][upper_tri_series ['corr ' ] < 0 ] = 0
6182
62- raw_weights , weights = calculate_weights (corr_mat )
83+ raw_weights , weights = calculate_weights (corr_mat , min_wt )
6384
6485 weighted_values = mat * weights
6586
0 commit comments