4
4
import pandas as pd
5
5
from dfply import *
6
6
import numpy as np
7
- import glob
7
+ from scipy import stats
8
8
import os
9
9
10
10
tag = 'E'
@@ -43,6 +43,8 @@ def createkey(row):
43
43
for line in reader :
44
44
all_samples .append (line [0 ])
45
45
46
+ num_of_samples = len (all_samples )
47
+
46
48
### read in all of the regtools cse output for this cohort ###
47
49
# create list to hold each sample's df
48
50
dfs = []
@@ -123,9 +125,10 @@ def createkey(row):
123
125
samples_w_variant_df = samples_w_variant_df [['sample_y' , 'variant_info_y' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
124
126
'info' , 'genes' , 'name_y' , 'mean_norm_score_variant' , 'sd_norm_score_variant' ,
125
127
'norm_score_y' , 'score_y' , 'junction' , 'total_score_variant' ]]
126
- samples_w_variant_df .columns = ['samples ' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
128
+ samples_w_variant_df .columns = ['junction_samples ' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
127
129
'info' , 'genes' , 'names' , 'mean_norm_score_variant' , 'sd_norm_score_variant' ,
128
130
'norm_scores_variant' , 'scores' , 'junction' , 'total_score_variant' ]
131
+ samples_w_variant_df ['variant_samples' ] = samples_w_variant_df ['junction_samples' ]
129
132
samples_w_variant_df = samples_w_variant_df [~ samples_w_variant_df .astype (
130
133
str ).duplicated ()]
131
134
else :
@@ -144,12 +147,20 @@ def createkey(row):
144
147
tmp_df = samples_w_variant_df .groupby ('info' )[['norm_score' , 'score' , 'sd_norm_score_variant' ,
145
148
'mean_norm_score_variant' , 'sample' , 'name' ]].aggregate (lambda x : x .tolist ()).reset_index ()
146
149
samples_w_variant_df = pd .merge (samples_w_variant_df , tmp_df , on = 'info' )
147
- samples_w_variant_df = samples_w_variant_df [['sample_y' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
150
+ samples_w_variant_df = samples_w_variant_df [['sample_x' , ' sample_y' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
148
151
'info' , 'genes' , 'name_y' , 'mean_norm_score_variant_y' , 'sd_norm_score_variant_y' ,
149
152
'norm_score_y' , 'score_y' , 'junction' , 'total_score_variant' ]]
150
- samples_w_variant_df .columns = ['samples ' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
153
+ samples_w_variant_df .columns = ['sample_x' , 'sample_y ' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
151
154
'info' , 'genes' , 'names' , 'mean_norm_score_variant' , 'sd_norm_score_variant' ,
152
155
'norm_scores_variant' , 'scores' , 'junction' , 'total_score_variant' ]
156
+ tmp_df = samples_w_variant_df .groupby ('variant_info' )[['sample_x' ]].aggregate (lambda x : set (x .tolist ())).reset_index ()
157
+ samples_w_variant_df = pd .merge (samples_w_variant_df , tmp_df , on = 'variant_info' )
158
+ samples_w_variant_df = samples_w_variant_df [['sample_y' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
159
+ 'info' , 'genes' , 'names' , 'mean_norm_score_variant' , 'sd_norm_score_variant' ,
160
+ 'norm_scores_variant' , 'scores' , 'junction' , 'total_score_variant' , 'sample_x_y' ]]
161
+ samples_w_variant_df .columns = ['junction_samples' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
162
+ 'info' , 'genes' , 'names' , 'mean_norm_score_variant' , 'sd_norm_score_variant' ,
163
+ 'norm_scores_variant' , 'scores' , 'junction' , 'total_score_variant' , 'variant_samples' ]
153
164
samples_w_variant_df = samples_w_variant_df [~ samples_w_variant_df .astype (
154
165
str ).duplicated ()]
155
166
@@ -171,7 +182,7 @@ def createkey(row):
171
182
samples_wout_variant_df = samples_wout_variant_df .loc [samples_wout_variant_df ['variant_info' ].isin (
172
183
all_splicing_variants ['key' ])]
173
184
tmp_df = samples_wout_variant_df .groupby (
174
- 'info' )[['norm_score' ]].aggregate (lambda x : x .tolist ()).reset_index ()
185
+ 'info' )[['norm_score' , 'sample' ]].aggregate (lambda x : x .tolist ()).reset_index ()
175
186
samples_wout_variant_df = pd .merge (samples_wout_variant_df , tmp_df , on = 'info' )
176
187
samples_wout_variant_df ['samples_wout_variant_count' ] = samples_wout_variant_df ['norm_score_y' ].astype (
177
188
str ).str .count (',' ) + 1
@@ -187,37 +198,90 @@ def createkey(row):
187
198
summarize (total_score_non = X .score .sum ()) >>
188
199
outer_join (samples_wout_variant_df , by = 'info' )
189
200
)
190
- samples_wout_variant_df = samples_wout_variant_df [['sample ' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
191
- 'info' , 'genes' , 'name' , ' norm_score_x_y' , 'junction' , 'total_score_non' , 'samples_wout_variant_count' ]]
201
+ samples_wout_variant_df = samples_wout_variant_df [['sample_y ' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
202
+ 'info' , 'genes' , 'norm_score_x_y' , 'junction' , 'total_score_non' , 'samples_wout_variant_count' ]]
192
203
else :
193
204
samples_wout_variant_df = (samples_wout_variant_df >>
194
205
group_by ('info' ) >>
195
206
summarize (total_score_non = X .score .sum ()) >>
196
207
outer_join (samples_wout_variant_df , by = 'info' )
197
208
)
198
- samples_wout_variant_df = samples_wout_variant_df [['sample ' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
199
- 'info' , 'genes' , 'name' , ' norm_score_y' , 'junction' , 'total_score_non' , 'samples_wout_variant_count' ]]
209
+ samples_wout_variant_df = samples_wout_variant_df [['sample_y ' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
210
+ 'info' , 'genes' , 'norm_score_y' , 'junction' , 'total_score_non' , 'samples_wout_variant_count' ]]
200
211
samples_wout_variant_df .columns = ['sample' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' ,
201
- 'info' , 'genes' , 'name' , ' norm_scores_non' , 'junction' , 'total_score_non' , 'samples_wout_variant_count' ]
212
+ 'info' , 'genes' , 'norm_scores_non' , 'junction' , 'total_score_non' , 'samples_wout_variant_count' ]
202
213
203
214
print ('Merging dataframes' )
204
- # samples_w_variant_df['samples'] = samples_w_variant_df['samples'].astype(str)
205
- # samples_w_variant_df['variant_info'] = samples_w_variant_df['variant_info'].astype(str)
206
- samples_w_variant_df ['info' ] = samples_w_variant_df ['info' ].astype (str )
207
- # samples_w_variant_df['names'] = samples_w_variant_df['names'].astype(str)
208
- # samples_w_variant_df['norm_scores_variant'] = samples_w_variant_df['norm_scores_variant'].astype(str)
209
- # samples_w_variant_df['scores'] = samples_w_variant_df['scores'].astype(str)
210
- # samples_wout_variant_df['norm_scores_non'] = samples_wout_variant_df['norm_scores_non'].astype(str)
211
- samples_wout_variant_df ['info' ] = samples_wout_variant_df ['info' ].astype (str )
212
- master_df = pd .merge (samples_w_variant_df , samples_wout_variant_df , how = 'outer' ,on = 'info' )
215
+ master_df = pd .merge (samples_w_variant_df , samples_wout_variant_df , how = 'left' ,on = 'info' )
216
# Drop rows that are exact duplicates after the merge. Rows are compared via
# their string form (presumably because some cells hold lists, which pandas
# cannot hash -- confirm). The boolean mask must be inverted with ``~``:
# unary ``-`` is not defined for boolean arrays and raises TypeError.
master_df = master_df[~master_df.astype(str).duplicated()]
213
218
del (samples_wout_variant_df )
214
219
del (samples_w_variant_df )
215
220
216
- master_df ['samples_w_variant_count' ] = master_df ['norm_score_y ' ].astype (
221
+ master_df ['samples_w_variant_count' ] = master_df ['variant_samples ' ].astype (
217
222
str ).str .count (',' ) + 1
218
223
219
- samples_wout_variant_df = (samples_wout_variant_df >>
220
- group_by ('info' ) >>
221
- summarize (mean_norm_score_non = X .norm_score .mean (), sd_norm_score_non = X .norm_score .std (), total_score_non = X .score .sum ()) >>
222
- outer_join (samples_wout_variant_df , by = 'info' )
223
- )
224
+ tmp_df = master_df [['info' , 'norm_scores_non' , 'samples_wout_variant_count' , 'samples_w_variant_count' ]]
225
+ tmp_df = tmp_df .fillna (0 )
226
+
227
def add_zeros(row):
    """Pad a junction's non-variant norm scores with zeros and join them.

    ``row`` is read by position:

    * ``[1]`` -- list of norm scores from samples without the variant, or the
      scalar ``0`` where a missing value was filled in;
    * ``[2]`` -- number of samples without the variant;
    * ``[3]`` -- number of samples with the variant.

    One zero is appended for every sample of the cohort (module-level
    ``num_of_samples``) that is covered by neither count.

    Returns:
        The padded scores as a single comma-separated string.
    """
    # Label-indexed pandas rows have to be addressed through ``.iloc``; plain
    # ``row[1]`` relies on a deprecated positional fallback. Lists, tuples and
    # arrays have no ``.iloc`` and are indexed directly.
    cells = getattr(row, "iloc", row)
    norm_scores = cells[1]
    if norm_scores == 0:
        # NOTE(review): this placeholder is added on top of the padding
        # below -- confirm the extra zero is intended.
        norm_scores = ['0']
    samples_wout_variant = cells[2]
    samples_w_variant = cells[3]
    # The counts arrive as floats once NaNs have been filled, and np.repeat
    # only accepts an integer repeat count.
    num_of_zeros_toadd = int(num_of_samples - samples_wout_variant - samples_w_variant)
    zeros = np.repeat(0, num_of_zeros_toadd).tolist()
    return ','.join(map(str, norm_scores + zeros))
238
+
239
+ tmp_df ['new_norm_scores' ] = tmp_df .apply (lambda row : add_zeros (row ), axis = 1 )
240
+ master_df = pd .merge (master_df , tmp_df , how = 'left' ,on = 'info' )
241
+ del (tmp_df )
242
+
243
def get_mean(row):
    """Return the mean of the comma-separated scores in the last field of ``row``.

    ``row`` is a record as passed by ``DataFrame.apply(..., axis=1)`` (or any
    plain sequence); its final field must be a string such as ``"0.5,0,0"``.
    """
    # Label-indexed pandas rows have to be addressed through ``.iloc``; plain
    # ``row[-1]`` relies on a deprecated positional fallback. Lists, tuples
    # and arrays have no ``.iloc`` and are indexed directly.
    cells = getattr(row, "iloc", row)
    scores = [float(token) for token in cells[-1].split(',')]
    return np.mean(scores)
248
+
249
+ master_df ['mean_norm_score_non' ] = master_df .apply (lambda row : get_mean (row ), axis = 1 )
250
+
251
def get_sd(row):
    """Return the standard deviation of the scores in the second-to-last field.

    ``row`` is a record as passed by ``DataFrame.apply(..., axis=1)`` (or any
    plain sequence); its second-to-last field must be a comma-separated
    string of numbers. The result is ``np.std`` with its default settings,
    i.e. the population standard deviation (``ddof=0``).
    """
    # Label-indexed pandas rows have to be addressed through ``.iloc``; plain
    # ``row[-2]`` relies on a deprecated positional fallback. Lists, tuples
    # and arrays have no ``.iloc`` and are indexed directly.
    cells = getattr(row, "iloc", row)
    scores = [float(token) for token in cells[-2].split(',')]
    return np.std(scores)
256
+
257
+ master_df ['sd_norm_score_non' ] = master_df .apply (lambda row : get_sd (row ), axis = 1 )
258
+
259
def get_min(row):
    """Return the smallest value, as a float, of the collection at position 12.

    ``row`` is a record as passed by ``DataFrame.apply(..., axis=1)`` (or any
    plain sequence). The field at position 12 must be an iterable of values
    convertible to ``float`` -- presumably the ``norm_scores_variant`` list;
    confirm against the column order of the frame being processed.
    """
    # Label-indexed pandas rows have to be addressed through ``.iloc``; plain
    # ``row[12]`` relies on a deprecated positional fallback. Lists, tuples
    # and arrays have no ``.iloc`` and are indexed directly.
    cells = getattr(row, "iloc", row)
    return min(float(score) for score in cells[12])
264
+
265
+ master_df ['min_norm_score_variant' ] = master_df .apply (lambda row : get_min (row ), axis = 1 )
266
+
267
def get_pvalue_mean(row):
    """Return an empirical p-value for the score stored at position 10.

    ``row`` is a record as passed by ``DataFrame.apply(..., axis=1)`` (or any
    plain sequence). Position 32 must hold the background scores as a
    comma-separated string and position 10 the score to rank against them
    (presumably ``new_norm_scores`` and ``mean_norm_score_variant`` --
    confirm against the column order of the frame being processed).

    Returns:
        ``1 - percentile / 100``, where ``percentile`` is the percentile rank
        of the score within the background (``stats.percentileofscore``).
    """
    # Label-indexed pandas rows have to be addressed through ``.iloc``; plain
    # ``row[32]`` relies on a deprecated positional fallback. Lists, tuples
    # and arrays have no ``.iloc`` and are indexed directly.
    cells = getattr(row, "iloc", row)
    background = [float(token) for token in cells[32].split(',')]
    mean_value = cells[10]
    percentile = stats.percentileofscore(background, mean_value)
    return 1 - (percentile / 100.0)
274
+
275
+ master_df ['p_value_mean' ] = master_df .apply (lambda row : get_pvalue_mean (row ), axis = 1 )
276
+
277
def get_pvalue_min(row):
    """Return an empirical p-value for the score stored at position 35.

    ``row`` is a record as passed by ``DataFrame.apply(..., axis=1)`` (or any
    plain sequence). Position 32 must hold the background scores as a
    comma-separated string and position 35 the score to rank against them
    (presumably ``new_norm_scores`` and ``min_norm_score_variant`` --
    confirm against the column order of the frame being processed).

    Returns:
        ``1 - percentile / 100``, where ``percentile`` is the percentile rank
        of the score within the background (``stats.percentileofscore``).
    """
    # Label-indexed pandas rows have to be addressed through ``.iloc``; plain
    # ``row[32]`` relies on a deprecated positional fallback. Lists, tuples
    # and arrays have no ``.iloc`` and are indexed directly.
    cells = getattr(row, "iloc", row)
    background = [float(token) for token in cells[32].split(',')]
    min_value = cells[35]
    percentile = stats.percentileofscore(background, min_value)
    return 1 - (percentile / 100.0)
284
+
285
# The 'p_value_min' column has to be computed with get_pvalue_min; applying
# get_pvalue_mean here would only repeat the values of 'p_value_mean'.
master_df['p_value_min'] = master_df.apply(get_pvalue_min, axis=1)
286
+
287
+ # master_df = master_df[['samples', 'variant_info_x', ']]
0 commit comments