1
+ import csv
2
+ from doctest import master
3
+ from itertools import groupby
4
+ import pandas as pd
5
+ from dfply import *
6
+ import numpy as np
7
+ from scipy import stats
8
+ import os
9
+ import argparse
10
+
11
+ input_parser = argparse .ArgumentParser (
12
+ description = "Run RegTools stats script" ,
13
+ )
14
+ input_parser .add_argument (
15
+ '-t' ,
16
+ '--tag' ,
17
+ help = "Variant tag parameter used to run RegTools." ,
18
+ )
19
+ input_parser .add_argument (
20
+ '-i' ,
21
+ '--variants_file' ,
22
+ help = "File containing variants to be considered as splicing relevant."
23
+ )
24
+ input_parser .add_argument (
25
+ '-d' ,
26
+ '--dir_names' ,
27
+ help = "File containing directory names corresponding to each sample that is to be processed."
28
+ )
29
+ input_parser .add_argument (
30
+ '-v' ,
31
+ '--variant-grouping' ,
32
+ help = "" ,
33
+ choices = ['strict' , 'exclude' , 'include' ]
34
+ )
35
+
36
+ args = input_parser .parse_args ()
37
+
38
+ tag = args .tag
39
+ splicing_variants_inputfile = args .variants_file
40
+ samples_inputfile = args .dir_names
41
+ variant_grouping_mode = args .variant_grouping
42
# NOTE(review): hard-coded, user-specific working directory — every relative
# path below (sample outputs, final TSV) resolves against it.
# TODO: make this configurable instead of baked in.
os.chdir('/Users/kcotto/Desktop/CHOL/')

# read in all splicing variants (TSV with a header row; first four columns
# are used below as chrom, start, end, sample — presumed; verify input format)
all_splicing_variants = pd.read_csv(
    splicing_variants_inputfile, delimiter='\t', header=0)
47
+
48
# create key to match regtools variant_info column and key2 that is the same as key but with sample name added


def _make_variant_key(row):
    """Return 'chrom:start-end' from the first three columns of a variant row."""
    return row.iloc[0] + ':' + str(row.iloc[1]) + '-' + str(row.iloc[2])


def _make_variant_sample_key(row):
    """Return 'chrom:start-end_sample' (variant key plus column 4, the sample name)."""
    return row.iloc[0] + ':' + str(row.iloc[1]) + '-' + str(row.iloc[2]) + '_' + row.iloc[3]


# 'key' matches the regtools variant_info values; 'key2' additionally pins
# the sample the variant was observed in. (Originally both helpers were
# successive redefinitions of the same name `createkey`.)
all_splicing_variants['key'] = all_splicing_variants.apply(
    _make_variant_key, axis=1)
all_splicing_variants['key2'] = all_splicing_variants.apply(
    _make_variant_sample_key, axis=1)
66
+
67
# read in the sample names (first tab-separated field of each line)
with open(samples_inputfile, 'r') as samples:
    all_samples = [fields[0] for fields in csv.reader(samples, delimiter='\t')]

num_of_samples = len(all_samples)
75
+
76
### read in all of the regtools cse output for this cohort ###
# create list to hold each sample's df
dfs = []

# read each sample's output file into a df and subset columns, split variants into multirows,
# and require that variant is in all_splicing_variants
for sample in all_samples:
    path = f'samples/{sample}/output/cse_identify_filtered_compare_{tag}.tsv'
    print(f'Reading in {sample}')
    # (removed a dead `df = f'df_{sample}'` assignment that was immediately
    # overwritten by the read_csv below)
    df = pd.read_csv(path, delimiter='\t', header=0)
    df['sample'] = sample
    df = df[['sample', 'variant_info', 'chrom', 'start',
             'end', 'strand', 'anchor', 'score', 'name', 'genes']]
    # junctions with no associated variant are uninformative here
    df = df.dropna(subset=['variant_info'])
    # variant_info is comma-separated; explode it to one variant per row
    df = df.set_index(['sample', 'chrom', 'start', 'end', 'strand', 'anchor', 'score',
                       'name', 'genes']).apply(lambda x: x.str.split(',').explode()).reset_index()
    # keep only variants flagged as splicing-relevant
    df = df.loc[df['variant_info'].isin(all_splicing_variants['key'])]
    dfs.append(df)

# concat all individual dfs into one df
print("Concatenating each sample's df together")
master_df = pd.concat(dfs, axis=0, ignore_index=True)
del dfs
100
+
101
# create various keys
# master_df column positions after the explode/reset_index above:
# 0=sample, 1=chrom, 2=start, 3=end, 4=strand, 5=anchor, 6=score,
# 7=name, 8=genes, 9=variant_info


def _make_info_key(row):
    """Return 'chrom_start_end_anchor_variantinfo' for a master_df row."""
    return row.iloc[1] + '_' + str(row.iloc[2]) + '_' + \
        str(row.iloc[3]) + '_' + row.iloc[5] + '_' + row.iloc[9]


def _make_key(row):
    """Return 'variantinfo_sample' for a master_df row."""
    return row.iloc[9] + '_' + row.iloc[0]


def _make_junction_key(row):
    """Return 'chrom_start_end' for a master_df row."""
    return row.iloc[1] + '_' + str(row.iloc[2]) + '_' + str(row.iloc[3])


# (Originally three successive redefinitions of the same name `createkey`;
# each helper now has a distinct, descriptive name.)
master_df['info'] = master_df.apply(_make_info_key, axis=1)
master_df['key'] = master_df.apply(_make_key, axis=1)
master_df['junction'] = master_df.apply(_make_junction_key, axis=1)
127
+
128
# subset data to work on samples with splicing variant of interest
has_variant_of_interest = master_df['key'].isin(all_splicing_variants['key2'])
samples_w_variant_df = master_df.loc[has_variant_of_interest]
# print(samples_w_variant_df.info(verbose=True))

# start performing the calculations for this subset of data
print('Calculating normalized scores for samples with variants of interest')
# variant_grouping_mode = 'strict'
136
# NOTE(review): confirm 'group' is an accepted --variant-grouping value;
# the argparse choices in this file list strict/exclude/include only.
if variant_grouping_mode == 'group':
    # Per sample+variant ('key') total junction score, joined back on so each
    # row can be normalized against its own sample/variant total.
    samples_w_variant_df = (samples_w_variant_df >>
                            group_by(X.key) >>
                            summarize(score_tmp=X.score.sum()) >>
                            outer_join(samples_w_variant_df, by='key')
                            )
    samples_w_variant_df['norm_score'] = samples_w_variant_df['score'] / \
        samples_w_variant_df['score_tmp']
    # Aggregate normalized-score statistics per junction across samples.
    samples_w_variant_df = (samples_w_variant_df >>
                            group_by('junction') >>
                            summarize(mean_norm_score_variant=X.norm_score.mean(), sd_norm_score_variant=X.norm_score.std(), total_score_variant=X.score.sum()) >>
                            outer_join(samples_w_variant_df, by='junction')
                            )
    # Collapse per-junction values into lists so one output row carries all
    # supporting samples/variants; the merge below yields _x/_y suffixed
    # duplicates and the list-valued _y columns are the ones kept.
    tmp_df = samples_w_variant_df.groupby('junction')[
        ['norm_score', 'score', 'variant_info', 'sample', 'name']].aggregate(lambda x: x.tolist()).reset_index()
    samples_w_variant_df = pd.merge(
        samples_w_variant_df, tmp_df, on='junction')
    samples_w_variant_df = samples_w_variant_df[['sample_y', 'variant_info_y', 'chrom', 'start', 'end', 'strand', 'anchor',
                                                 'info', 'genes', 'name_y', 'mean_norm_score_variant', 'sd_norm_score_variant',
                                                 'norm_score_y', 'score_y', 'junction', 'total_score_variant']]
    samples_w_variant_df.columns = ['junction_samples', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                    'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
                                    'norm_scores_variant', 'scores', 'junction', 'total_score_variant']
    # In 'group' mode the junction-bearing samples double as the
    # variant-bearing samples.
    samples_w_variant_df['variant_samples'] = samples_w_variant_df['junction_samples']
    # De-duplicate rows; cast to str first because list-valued cells are
    # unhashable and would break duplicated().
    samples_w_variant_df = samples_w_variant_df[~samples_w_variant_df.astype(
        str).duplicated()]
else:
    # strict/exclude/include: normalize per sample+variant as above, but
    # aggregate per 'info' (junction+anchor+variant) instead of per junction.
    samples_w_variant_df = (samples_w_variant_df >>
                            group_by(X.key) >>
                            summarize(score_tmp=X.score.sum()) >>
                            outer_join(samples_w_variant_df, by='key')
                            )
    samples_w_variant_df['norm_score'] = samples_w_variant_df['score'] / \
        samples_w_variant_df['score_tmp']
    samples_w_variant_df = (samples_w_variant_df >>
                            group_by('info') >>
                            summarize(mean_norm_score_variant=X.norm_score.mean(), sd_norm_score_variant=X.norm_score.std(), total_score_variant=X.score.sum()) >>
                            outer_join(samples_w_variant_df, by='info')
                            )
    # Collapse per-'info' values into lists (merge adds _x/_y suffixes).
    tmp_df = samples_w_variant_df.groupby('info')[['norm_score', 'score', 'sd_norm_score_variant',
                                                   'mean_norm_score_variant', 'sample', 'name']].aggregate(lambda x: x.tolist()).reset_index()
    samples_w_variant_df = pd.merge(samples_w_variant_df, tmp_df, on='info')
    samples_w_variant_df = samples_w_variant_df[['sample_x', 'sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                                 'info', 'genes', 'name_y', 'mean_norm_score_variant_y', 'sd_norm_score_variant_y',
                                                 'norm_score_y', 'score_y', 'junction', 'total_score_variant']]
    samples_w_variant_df.columns = ['sample_x', 'sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                    'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
                                    'norm_scores_variant', 'scores', 'junction', 'total_score_variant']
    # Distinct samples carrying each variant, collapsed to a list per
    # variant_info; merged back in as 'sample_x_y'.
    tmp_df = samples_w_variant_df.groupby('variant_info')[['sample_x']].aggregate(lambda x: list(set(x.tolist()))).reset_index()
    samples_w_variant_df = pd.merge(samples_w_variant_df, tmp_df, on='variant_info')
    samples_w_variant_df = samples_w_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                                 'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
                                                 'norm_scores_variant', 'scores', 'junction', 'total_score_variant', 'sample_x_y']]
    samples_w_variant_df.columns = ['junction_samples', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                    'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
                                    'norm_scores_variant', 'scores', 'junction', 'total_score_variant', 'variant_samples']
    # De-duplicate rows (str cast: list-valued cells are unhashable).
    samples_w_variant_df = samples_w_variant_df[~samples_w_variant_df.astype(
        str).duplicated()]
194
+
195
# work on samples that don't have the variant of interest
print('Calculating normalized scores for samples without variants of interest')
# Complement of the earlier subset: rows whose sample does NOT carry the
# splicing variant.
samples_wout_variant_df = master_df[~master_df['key'].isin(
    all_splicing_variants['key2'])]
del (master_df)

# mode = 'strict' #others include 'exclude' and 'group'
# if mode == 'strict':
# Normalize each junction score by its sample+variant ('key') total.
samples_wout_variant_df = (samples_wout_variant_df >>
                           group_by(X.key) >>
                           summarize(score_tmp=X.score.sum()) >>
                           outer_join(samples_wout_variant_df, by='key')
                           )
samples_wout_variant_df['norm_score'] = samples_wout_variant_df['score'] / \
    samples_wout_variant_df['score_tmp']
# Restrict to junctions whose variant is in the splicing-relevant set.
samples_wout_variant_df = samples_wout_variant_df.loc[samples_wout_variant_df['variant_info'].isin(
    all_splicing_variants['key'])]
# Collapse per-'info' scores/samples into lists; the merge suffixes the
# list-valued copies with _y.
tmp_df = samples_wout_variant_df.groupby(
    'info')[['norm_score', 'sample']].aggregate(lambda x: x.tolist()).reset_index()
samples_wout_variant_df = pd.merge(samples_wout_variant_df, tmp_df, on='info')
# Element count of the stringified list = comma count + 1.
samples_wout_variant_df['samples_wout_variant_count'] = samples_wout_variant_df['norm_score_y'].astype(
    str).str.count(',') + 1
217
if variant_grouping_mode == 'group' or variant_grouping_mode == 'exclude':
    # Exclude junctions already attributed to variant-bearing samples, then
    # re-collapse the remaining normalized scores per 'info'.
    samples_wout_variant_df = samples_wout_variant_df[~samples_wout_variant_df['junction'].isin(
        samples_w_variant_df['junction'])]
    tmp_df = samples_wout_variant_df.groupby(
        'info')[['norm_score_x']].aggregate(lambda x: x.tolist()).reset_index()
    samples_wout_variant_df = pd.merge(
        samples_wout_variant_df, tmp_df, on='info')
    # Total (un-normalized) score per 'info' for the non-variant samples.
    samples_wout_variant_df = (samples_wout_variant_df >>
                               group_by('info') >>
                               summarize(total_score_non=X.score.sum()) >>
                               outer_join(samples_wout_variant_df, by='info')
                               )
    # 'norm_score_x_y' is the list-valued copy produced by the second merge.
    samples_wout_variant_df = samples_wout_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                                       'info', 'genes', 'norm_score_x_y', 'junction', 'total_score_non', 'samples_wout_variant_count']]
else:
    # strict/include: keep all junctions; just add the per-'info' total.
    samples_wout_variant_df = (samples_wout_variant_df >>
                               group_by('info') >>
                               summarize(total_score_non=X.score.sum()) >>
                               outer_join(samples_wout_variant_df, by='info')
                               )
    samples_wout_variant_df = samples_wout_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                                       'info', 'genes', 'norm_score_y', 'junction', 'total_score_non', 'samples_wout_variant_count']]
# Both branches end with the same 13 columns; rename to canonical names.
samples_wout_variant_df.columns = ['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                   'info', 'genes', 'norm_scores_non', 'junction', 'total_score_non', 'samples_wout_variant_count']
241
+
242
print('Merging dataframes')
# Left-join keeps every variant-bearing row even when no non-variant sample
# shares the junction (those rows get NaNs, filled later).
master_df = pd.merge(samples_w_variant_df, samples_wout_variant_df,
                     how='left', on='info')
# De-duplicate rows. Use ~ (bitwise NOT): negating a boolean Series with
# unary minus is deprecated and raises on modern pandas. Cast to str first
# because list-valued cells are unhashable.
master_df = master_df[~master_df.astype(str).duplicated()]
del samples_wout_variant_df
del samples_w_variant_df

# Element count of the stringified sample list = comma count + 1.
master_df['samples_w_variant_count'] = master_df['variant_samples'].astype(
    str).str.count(',') + 1
251
+
252
tmp_df = master_df[['info', 'norm_scores_non',
                    'samples_wout_variant_count', 'samples_w_variant_count']]
# NaNs (no non-variant data for the junction) become scalar 0 here and are
# detected inside add_zeros below.
tmp_df = tmp_df.fillna(0)


def add_zeros(row):
    """Pad a row's non-variant normalized scores out to the cohort size.

    Samples with neither the variant nor the junction implicitly contribute a
    normalized score of 0; append that many zeros, sort descending, and
    return the scores as a comma-joined string.
    """
    norm_scores = row['norm_scores_non']
    if norm_scores == 0:  # fillna(0) sentinel: no non-variant scores at all
        norm_scores = [0]
    samples_wout_variant = row['samples_wout_variant_count']
    samples_w_variant = row['samples_w_variant_count']
    # max(0, ...) guards against over-counting, where np.repeat would raise.
    num_of_zeros_toadd = max(
        0, num_of_samples - samples_wout_variant - samples_w_variant)
    norm_scores = norm_scores + np.repeat(0, num_of_zeros_toadd).tolist()
    norm_scores.sort(reverse=True)
    return ','.join(map(str, norm_scores))


tmp_df['new_norm_scores'] = tmp_df.apply(add_zeros, axis=1)
master_df = pd.merge(master_df, tmp_df, how='left', on='info')
del tmp_df
271
+
272
def get_mean(row):
    """Mean of the comma-separated normalized scores in positional column 33.

    NOTE(review): index 33 is presumed to be 'new_norm_scores' — confirm if
    the merge output ever changes shape.
    """
    return np.mean([float(v) for v in row[33].split(',')])
277
+
278
# Apply get_mean row-wise (no lambda wrapper needed).
master_df['mean_norm_score_non'] = master_df.apply(get_mean, axis=1)
279
+
280
def get_sd(row):
    """Population standard deviation (np.std) of the scores in positional column 33."""
    scores = [float(v) for v in row[33].split(',')]
    return np.std(scores)
285
+
286
# Apply get_sd row-wise (no lambda wrapper needed).
master_df['sd_norm_score_non'] = master_df.apply(get_sd, axis=1)
287
+
288
def get_min(row):
    """Minimum of the list of variant normalized scores in positional column 12.

    NOTE(review): index 12 is presumed to be 'norm_scores_variant' (already a
    list, not a comma-joined string) — confirm against the merge output.
    """
    return min(float(v) for v in row[12])
293
+
294
# Apply get_min row-wise (no lambda wrapper needed).
master_df['min_norm_score_variant'] = master_df.apply(get_min, axis=1)
295
+
296
def get_pvalue_mean(row):
    """Empirical p-value of the variant samples' mean normalized score.

    Compares the mean (positional column 10, presumed
    'mean_norm_score_variant' — confirm) against the padded non-variant score
    distribution (column 33) and returns the upper-tail fraction as a string.
    """
    values = [float(i) for i in row[33].split(',')]
    mean_value = row[10]
    # percentileofscore returns a plain float; the previous
    # re.sub/np.array_str round-trip relied on a never-imported `re` module
    # (NameError at runtime) and is unnecessary.
    pvalue = 1 - stats.percentileofscore(values, mean_value) / 100.0
    return str(pvalue)
304
+
305
# Apply get_pvalue_mean row-wise (no lambda wrapper needed).
master_df['p_value_mean'] = master_df.apply(get_pvalue_mean, axis=1)
306
+
307
def get_pvalue_min(row):
    """Empirical p-value of the variant samples' minimum normalized score.

    Compares the minimum (positional column 36, presumed
    'min_norm_score_variant' — confirm) against the padded non-variant score
    distribution (column 33) and returns the upper-tail fraction as a string.
    """
    values = [float(i) for i in row[33].split(',')]
    min_value = row[36]
    # percentileofscore returns a plain float; the previous
    # re.sub/np.array_str round-trip relied on a never-imported `re` module
    # (NameError at runtime) and is unnecessary.
    pvalue = 1 - stats.percentileofscore(values, min_value) / 100.0
    return str(pvalue)
315
+
316
# Bug fix: this column previously applied get_pvalue_mean, so p_value_min
# silently duplicated p_value_mean instead of using the minimum variant score.
master_df['p_value_min'] = master_df.apply(get_pvalue_min, axis=1)
317
+
318
# Final output shape: keep the variant-side (_x) coordinate columns from the
# merge and map every kept column to its output name, in output order.
_output_columns = {
    'variant_samples': 'variant_samples',
    'variant_info_x': 'variant_info',
    'genes_x': 'genes',
    'junction_samples': 'junction_samples',
    'chrom_x': 'chrom',
    'start_x': 'start',
    'end_x': 'end',
    'strand_x': 'strand',
    'anchor_x': 'anchor',
    'info': 'variant_junction_info',
    'names': 'names',
    'mean_norm_score_variant': 'mean_norm_score_variant',
    'sd_norm_score_variant': 'sd_norm_score_variant',
    'norm_scores_variant': 'norm_scores_variant',
    'total_score_variant': 'total_score_variant',
    'mean_norm_score_non': 'mean_norm_score_non',
    'sd_norm_score_non': 'sd_norm_score_non',
    'new_norm_scores': 'norm_scores_non',
    'total_score_non': 'total_score_non',
    'p_value_mean': 'p_value_mean',
    'p_value_min': 'p_value_min',
}
master_df = master_df[list(_output_columns)]
master_df.columns = list(_output_columns.values())
328
+
329
# Any cell still holding a list keeps only its first element; remaining NaNs
# become 0 before writing the final TSV.
master_df = master_df.applymap(
    lambda cell: cell[0] if isinstance(cell, list) else cell).fillna(0)

master_df.to_csv(f'junction_pvalues_{tag}_out.tsv', sep='\t', index=False)
print(master_df.info())
# master_df = master_df[['samples', 'variant_info_x', ']]
# why are variant_samples >1 missing?
0 commit comments