6
6
import numpy as np
7
7
from scipy import stats
8
8
import os
9
-
10
- tag = 'E'
9
+ import argparse
10
+
11
# Command-line interface: every run-specific value (tag, input files,
# grouping mode, working directory) is taken from these arguments.
input_parser = argparse.ArgumentParser(
    description="Run RegTools stats script",
)
input_parser.add_argument(
    '-t',
    '--tag',
    help="Variant tag parameter used to run RegTools.",
)
input_parser.add_argument(
    '-i',
    '--variants_file',
    # Without this an omitted path arrives as None and only fails later.
    required=True,
    help="File containing variants to be considered as splicing relevant.",
)
input_parser.add_argument(
    '-d',
    '--dir_names',
    required=True,
    help="File containing directory names corresponding to each sample that is to be processed.",
)
input_parser.add_argument(
    '-v',
    '--variant-grouping',
    # 'strict' was the value hard-coded before this option existed.
    default='strict',
    # The script branches on 'group' and 'exclude'; 'group' was missing from
    # the accepted choices, which made those branches unreachable from the
    # command line. 'include' is kept so existing invocations still parse.
    # NOTE(review): no visible branch handles 'include' - confirm whether it
    # is meant to be a synonym for 'group'.
    choices=['strict', 'group', 'exclude', 'include'],
    help="Variant grouping mode (default: %(default)s).",
)
input_parser.add_argument(
    '-w',
    '--working-dir',
    # Default preserves the directory the script previously hard-coded.
    default='/Users/kcotto/Desktop/CHOL/',
    help="Directory to change into before processing (default: %(default)s).",
)

args = input_parser.parse_args()

tag = args.tag
splicing_variants_inputfile = args.variants_file
samples_inputfile = args.dir_names
variant_grouping_mode = args.variant_grouping

# Relative paths used from here on are resolved against this directory.
os.chdir(args.working_dir)
12
- splicing_variants_inputfile = '/Users/kcotto/Desktop/CHOL/all_splicing_variants_E.bed'
13
- samples_inputfile = '/Users/kcotto/Desktop/CHOL/dir_names.tsv'
14
43
15
44
# read in all splicing variants
16
45
all_splicing_variants = pd .read_csv (
@@ -23,7 +52,6 @@ def createkey(row):
23
52
key = row [0 ] + ':' + str (row [1 ]) + '-' + str (row [2 ])
24
53
return key
25
54
26
-
27
55
# Build the per-row lookup key by handing each row (axis=1) to createkey.
all_splicing_variants['key'] = all_splicing_variants.apply(createkey, axis=1)
29
57
@@ -104,8 +132,8 @@ def createkey(row):
104
132
105
133
# start performing the calculations for this subset of data
106
134
print ('Calculating normalized scores for samples with variants of interest' )
107
- mode = 'strict'
108
- if mode == 'group' :
135
+ # variant_grouping_mode = 'strict'
136
+ if variant_grouping_mode == 'group' :
109
137
samples_w_variant_df = (samples_w_variant_df >>
110
138
group_by (X .key ) >>
111
139
summarize (score_tmp = X .score .sum ()) >>
@@ -186,7 +214,7 @@ def createkey(row):
186
214
# Join the contents of tmp_df onto the without-variant table via 'info'.
samples_wout_variant_df = samples_wout_variant_df.merge(tmp_df, on='info')

# Count entries in the comma-joined 'norm_score_y' string: commas + 1.
samples_wout_variant_df['samples_wout_variant_count'] = (
    samples_wout_variant_df['norm_score_y'].astype(str).str.count(',') + 1
)

# In 'group' and 'exclude' modes, drop every junction that also appears in
# the with-variant table.
if variant_grouping_mode in ('group', 'exclude'):
    samples_wout_variant_df = samples_wout_variant_df[
        ~samples_wout_variant_df['junction'].isin(samples_w_variant_df['junction'])
    ]
192
220
tmp_df = samples_wout_variant_df .groupby (
@@ -301,7 +329,7 @@ def get_pvalue_min(row):
301
329
# Replace list-valued cells with their first element; other cells are kept.
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favour
# of DataFrame.map; kept here so older pandas versions still work.
master_df = master_df.applymap(lambda x: x[0] if isinstance(x, list) else x)
master_df = master_df.fillna(0)

# Write the results as a tab-separated file named after the run's tag.
# The separator must be a single character and the file name must not
# contain stray spaces around the tag or before the extension.
master_df.to_csv(f'junction_pvalues_{tag}_out.tsv', sep='\t', index=False)

# DataFrame.info() prints its summary itself and returns None, so wrapping
# it in print() would emit an extra "None" line.
master_df.info()
306
334
# master_df = master_df[['samples', 'variant_info_x', ']]
307
335
# TODO: why are variant_samples > 1 missing?
0 commit comments