import csv
- from doctest import master
from itertools import groupby
import pandas as pd
from dfply import *
import os
import argparse

- input_parser = argparse.ArgumentParser(
-     description="Run RegTools stats script",
- )
- input_parser.add_argument(
-     '-t',
-     '--tag',
-     help="Variant tag parameter used to run RegTools.",
- )
- input_parser.add_argument(
-     '-i',
-     '--variants_file',
-     help="File containing variants to be considered as splicing relevant."
- )
- input_parser.add_argument(
-     '-d',
-     '--dir_names',
-     help="File containing directory names corresponding to each sample that is to be processed."
- )
- input_parser.add_argument(
-     '-v',
-     '--variant-grouping',
-     help="",
-     choices=['strict', 'exclude', 'include']
- )
-
- args = input_parser.parse_args()
-
- tag = args.tag
- splicing_variants_inputfile = args.variants_file
- samples_inputfile = args.dir_names
- variant_grouping_mode = args.variant_grouping
- os.chdir('/Users/kcotto/Desktop/CHOL/')
+ # input_parser = argparse.ArgumentParser(
+ #     description="Run RegTools stats script",
+ # )
+ # input_parser.add_argument(
+ #     '-t',
+ #     '--tag',
+ #     help="Variant tag parameter used to run RegTools.",
+ # )
+ # input_parser.add_argument(
+ #     '-i',
+ #     '--variants_file',
+ #     help="File containing variants to be considered as splicing relevant."
+ # )
+ # input_parser.add_argument(
+ #     '-d',
+ #     '--dir_names',
+ #     help="File containing directory names corresponding to each sample that is to be processed."
+ # )
+ # input_parser.add_argument(
+ #     '-v',
+ #     '--variant-grouping',
+ #     help="",
+ #     choices=['strict', 'exclude', 'include']
+ # )
+
+ # args = input_parser.parse_args()
+
+ # tag = args.tag
+ # splicing_variants_inputfile = args.variants_file
+ # samples_inputfile = args.dir_names
+ # variant_grouping_mode = args.variant_grouping
+
+ tag = 'default'
+ splicing_variants_inputfile = '/Users/kcotto/Desktop/MET_samples/MET_splicing_variants.bed'
+ samples_inputfile = '/Users/kcotto/Desktop/MET_samples/samples.txt'
+ variant_grouping_mode = 'strict'
+ os.chdir('/Users/kcotto/Desktop/MET_samples/')
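+ # Hedged sketch of the intended CLI once the argparse block above is restored
+ # (the script filename here is a placeholder, not from the source):
+ #   python regtools_stats.py -t default -i MET_splicing_variants.bed -d samples.txt -v strict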

# read in all splicing variants
all_splicing_variants = pd.read_csv(
-     splicing_variants_inputfile, delimiter='\t', header=0)
+     splicing_variants_inputfile, delimiter='\t', header=None)
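+ # header=None treats the first line of the BED file as data rather than column
+ # names, so downstream code addresses its columns by integer position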

# create key to match the regtools variant_info column, and key2, which is the same key with the sample name added

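# illustrative-only sketch of the key scheme described above (createkey's real
# body lies outside this hunk; these column names are assumptions):
#   key  = f"{row['chrom']}:{row['start']}-{row['end']}"
#   key2 = f"{key}_{row['sample']}"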
@@ -80,7 +84,7 @@ def createkey(row):
# read each sample's output file into a df and subset columns, split variants into multirows,
# and require that the variant is in all_splicing_variants
for sample in all_samples:
-     path = f'samples/{sample}/output/cse_identify_filtered_compare_{tag}.tsv'
+     path = f'{sample}/output/cse_identify_filtered_compare_{tag}.tsv'
    df = f'df_{sample}'
    print(f'Reading in {sample}')
    df = pd.read_csv(path, delimiter='\t', header=0)
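    # with the os.chdir above, each sample's output folder now sits directly
    # under the working directory, hence the dropped 'samples/' prefix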
@@ -130,10 +134,24 @@ def createkey(row):
    all_splicing_variants['key2'])]
# print(samples_w_variant_df.info(verbose=True))

+ def add_zeros_variant(row):
+     norm_scores = row[1]
+     if norm_scores == 0:
+         norm_scores = [0]
+     samples_wout_variant = row[2]
+     samples_w_variant = row[3]
+     num_of_zeros_toadd = num_of_samples - samples_wout_variant - samples_w_variant
+     zeros = np.repeat(0, num_of_zeros_toadd).tolist()
+     norm_scores = norm_scores + zeros
+     norm_scores.sort(reverse=True)
+     new_norm_score_value = (',').join(map(str, norm_scores))
+     return new_norm_score_value
+
+ # tmp_df['new_norm_scores'] = tmp_df.apply(lambda row: add_zeros_variant(row), axis=1)
+
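+ # Worked example with assumed values: given num_of_samples = 5, norm_scores
+ # [2.0, 1.5], 1 sample without and 2 with the variant, 5 - 1 - 2 = 2 zeros are
+ # appended and the sorted join returns '2.0,1.5,0,0'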
# start performing the calculations for this subset of data
print('Calculating normalized scores for samples with variants of interest')
- # variant_grouping_mode = 'strict'
- if variant_grouping_mode == 'group':
+ if variant_grouping_mode == 'include':
    samples_w_variant_df = (samples_w_variant_df >>
                            group_by(X.key) >>
                            summarize(score_tmp=X.score.sum()) >>
@@ -198,7 +216,7 @@ def createkey(row):
    all_splicing_variants['key2'])]
del(master_df)

- # mode = 'strict'  # others include 'exclude' and 'group'
+ # mode = 'strict'  # others include 'include' and 'exclude'
# if mode == 'strict':
samples_wout_variant_df = (samples_wout_variant_df >>
                           group_by(X.key) >>
@@ -214,7 +232,7 @@ def createkey(row):
samples_wout_variant_df = pd.merge(samples_wout_variant_df, tmp_df, on='info')
samples_wout_variant_df['samples_wout_variant_count'] = samples_wout_variant_df['norm_score_y'].astype(
    str).str.count(',') + 1
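# a string of N comma-separated scores contains N - 1 commas, so counting
# commas and adding 1 recovers the sample count, e.g. '1.0,0.5,0.2' -> 3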
- if variant_grouping_mode == 'group' or variant_grouping_mode == 'exclude':
+ if variant_grouping_mode == 'include' or variant_grouping_mode == 'exclude':
    samples_wout_variant_df = samples_wout_variant_df[~samples_wout_variant_df['junction'].isin(
        samples_w_variant_df['junction'])]
    tmp_df = samples_wout_variant_df.groupby(
@@ -226,6 +244,7 @@ def createkey(row):
                                summarize(total_score_non=X.score.sum()) >>
                                outer_join(samples_wout_variant_df, by='info')
                                )
+     print(samples_wout_variant_df.info())
    samples_wout_variant_df = samples_wout_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                                       'info', 'genes', 'norm_score_x_y', 'junction', 'total_score_non', 'samples_wout_variant_count']]
else:
@@ -252,7 +271,7 @@ def createkey(row):
    tmp_df = master_df[['info', 'norm_scores_non', 'samples_wout_variant_count', 'samples_w_variant_count']]
    tmp_df = tmp_df.fillna(0)

-     def add_zeros(row):
+     def add_zeros_nonvariant(row):
        norm_scores = row[1]
        if norm_scores == 0:
            norm_scores = [0]
@@ -265,7 +284,7 @@ def add_zeros(row):
        new_norm_score_value = (',').join(map(str, norm_scores))
        return new_norm_score_value

-     tmp_df['new_norm_scores'] = tmp_df.apply(lambda row: add_zeros(row), axis=1)
+     tmp_df['new_norm_scores'] = tmp_df.apply(lambda row: add_zeros_nonvariant(row), axis=1)
    master_df = pd.merge(master_df, tmp_df, how='left', on='info')
    del(tmp_df)

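    # the fillna(0) above is what makes the norm_scores == 0 check inside
    # add_zeros_nonvariant meaningful: rows with no recorded scores arrive as 0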
@@ -285,6 +304,8 @@ def get_sd(row):

master_df['sd_norm_score_non'] = master_df.apply(lambda row: get_sd(row), axis=1)

+ print('getting p-values for associations')
+
def get_min(row):
    values = row[12]
    values = [float(i) for i in values]
@@ -329,7 +350,5 @@ def get_pvalue_min(row):
master_df = master_df.applymap(lambda x: x[0] if isinstance(x, list) else x)
master_df = master_df.fillna(0)

- master_df.to_csv(f'junction_pvalues_{tag}_out.tsv', sep='\t', index=False)
- print(master_df.info())
- # master_df = master_df[['samples', 'variant_info_x', ']]
- # why are variant_samples > 1 missing?
+ master_df.to_csv(f'junction_pvalues_{tag}_{variant_grouping_mode}.tsv', sep='\t', index=False)
+ print(master_df.info())
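+ # embedding both the tag and the grouping mode in the output name keeps runs
+ # with different --variant-grouping settings from overwriting one another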