13
13
# NOTE(review): hard-coded, machine-specific path; left unchanged because the
# code that consumes it is outside this view.
samples_inputfile = '/Users/kcotto/Desktop/CHOL/dir_names.tsv'

# Read in all splicing variants (first line of the file is the header).
# The separator has to be exactly one tab: pandas interprets any separator
# longer than one character as a regular expression, so a literal such as
# '\t ' (tab + space) would fail to split ordinary tab-delimited rows.
all_splicing_variants = pd.read_csv(
    splicing_variants_inputfile, delimiter='\t', header=0)
17
18
18
19
# create key to match regtools variant_info column and key2 that is the same as key but with sample name added
20
+
21
+
19
22
def createkey(row):
    """Build the ``<field0>:<field1>-<field2>`` key for one row.

    Only the first three positional fields of ``row`` are used; any further
    fields are ignored.

    Args:
        row: A table row as passed by ``DataFrame.apply(axis=1)`` (a pandas
            Series), or any iterable with at least three items.

    Returns:
        str: The three fields joined as ``'a:b-c'``.
    """
    # Unpack by position instead of writing ``row[0]``: integer keys on a
    # label-indexed Series depend on a deprecated positional fallback.
    # Formatting (rather than ``+``) also copes with a first field that was
    # parsed as a number instead of a string.
    chrom, start, end, *_ = row
    return f'{chrom}:{start}-{end}'
22
# Store the key built by createkey() for every splicing-variant row.
all_splicing_variants['key'] = all_splicing_variants.apply(createkey, axis=1)
29
+
23
30
24
31
def createkey(row):
    """Build the ``<field0>:<field1>-<field2>_<field3>`` key for one row.

    This is the three-field coordinate key with the fourth positional field
    appended after an underscore. Fields beyond the fourth are ignored.

    Args:
        row: A table row as passed by ``DataFrame.apply(axis=1)`` (a pandas
            Series), or any iterable with at least four items.

    Returns:
        str: The four fields joined as ``'a:b-c_d'``.
    """
    # Positional unpacking avoids ``row[0]``-style integer keys, which rely
    # on a deprecated fallback for label-indexed Series; formatting also
    # tolerates fields that were parsed as numbers.
    chrom, start, end, sample, *_ = row
    return f'{chrom}:{start}-{end}_{sample}'
27
# Store the sample-qualified key built by createkey() for every row.
all_splicing_variants['key2'] = all_splicing_variants.apply(createkey, axis=1)

# read in the sample names
all_samples = []
@@ -45,9 +55,11 @@ def createkey(row):
45
55
print (f'Reading in { sample } ' )
46
56
df = pd .read_csv (path , delimiter = '\t ' , header = 0 )
47
57
df ['sample' ] = sample
48
- df = df [['sample' , 'variant_info' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' , 'score' , 'name' , 'genes' ]]
58
+ df = df [['sample' , 'variant_info' , 'chrom' , 'start' ,
59
+ 'end' , 'strand' , 'anchor' , 'score' , 'name' , 'genes' ]]
49
60
df = df .dropna (subset = ['variant_info' ])
50
- df = df .set_index (['sample' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' , 'score' , 'name' , 'genes' ]).apply (lambda x : x .str .split (',' ).explode ()).reset_index ()
61
+ df = df .set_index (['sample' , 'chrom' , 'start' , 'end' , 'strand' , 'anchor' , 'score' ,
62
+ 'name' , 'genes' ]).apply (lambda x : x .str .split (',' ).explode ()).reset_index ()
51
63
df = df .loc [df ['variant_info' ].isin (all_splicing_variants ['key' ])]
52
64
dfs .append (df )
53
65
@@ -57,84 +69,155 @@ def createkey(row):
57
69
del dfs
58
70
59
71
# create various keys
72
+
73
+
60
74
def createkey(row):
    """Build the underscore-joined ``info`` key for one row.

    Uses the fields at positions 1, 2, 3, 5 and 9 of ``row``, in that order.
    NOTE(review): given the column order used when the per-sample tables are
    read, these are presumably chrom, start, end, anchor and variant_info --
    confirm against how ``master_df`` is assembled.

    Args:
        row: A table row as passed by ``DataFrame.apply(axis=1)`` (a pandas
            Series), or any iterable with at least ten items.

    Returns:
        str: The five fields joined with ``'_'``.
    """
    # Materialise the values once and index the list: integer keys on a
    # label-indexed Series rely on a deprecated positional fallback, and
    # formatting tolerates fields that are not already strings.
    fields = list(row)
    return f'{fields[1]}_{fields[2]}_{fields[3]}_{fields[5]}_{fields[9]}'
78
+
79
+
63
80
# Store the key built by the createkey() defined just above in column 'info'.
master_df['info'] = master_df.apply(createkey, axis=1)
64
81
82
+
65
83
def createkey(row):
    """Build the ``<field9>_<field0>`` key for one row.

    NOTE(review): given the column order used when the per-sample tables are
    read, field 9 is presumably variant_info and field 0 the sample name --
    confirm against how ``master_df`` is assembled.

    Args:
        row: A table row as passed by ``DataFrame.apply(axis=1)`` (a pandas
            Series), or any iterable with at least ten items.

    Returns:
        str: Field 9 and field 0 joined with ``'_'``.
    """
    # Index a plain list of the values: integer keys on a label-indexed
    # Series rely on a deprecated positional fallback.
    fields = list(row)
    return f'{fields[9]}_{fields[0]}'
86
+
87
+
68
88
# Store the key built by the createkey() defined just above in column 'key'.
master_df['key'] = master_df.apply(createkey, axis=1)
69
89
90
+
70
91
def createkey(row):
    """Build the ``<field1>_<field2>_<field3>`` junction key for one row.

    Args:
        row: A table row as passed by ``DataFrame.apply(axis=1)`` (a pandas
            Series), or any iterable with at least four items.

    Returns:
        str: Fields 1, 2 and 3 joined with ``'_'``.
    """
    # Index a plain list of the values: integer keys on a label-indexed
    # Series rely on a deprecated positional fallback, and formatting
    # tolerates a field 1 that was parsed as a number.
    fields = list(row)
    return f'{fields[1]}_{fields[2]}_{fields[3]}'
94
+
95
+
73
96
# Store the key built by the createkey() defined just above in 'junction'.
master_df['junction'] = master_df.apply(createkey, axis=1)

# subset data to work on samples with splicing variant of interest:
# keep the rows whose 'key' also occurs in the 'key2' column of the
# splicing-variants table
samples_w_variant_df = master_df.loc[
    master_df['key'].isin(all_splicing_variants['key2'])
]
# print(samples_w_variant_df.info(verbose=True))
78
102
79
103
# start performing the calculations for this subset of data
print('Calculating normalized scores for samples with variants of interest')
# Aggregation mode. 'group' summarises per 'junction'; every other value
# (e.g. 'strict') summarises per 'info' key.
mode = 'strict'

# Normalise every score by the summed score of the rows sharing its 'key'.
# Both modes start with this step, so it is done once here instead of being
# repeated in each branch.
samples_w_variant_df = (samples_w_variant_df >>
                        group_by(X.key) >>
                        summarize(score_tmp=X.score.sum()) >>
                        outer_join(samples_w_variant_df, by='key')
                        )
samples_w_variant_df['norm_score'] = (
    samples_w_variant_df['score'] / samples_w_variant_df['score_tmp'])

if mode == 'group':
    # Mean / sd of the normalised score and total raw score per junction,
    # joined back onto every row of that junction.
    samples_w_variant_df = (samples_w_variant_df >>
                            group_by('junction') >>
                            summarize(mean_norm_score_variant=X.norm_score.mean(),
                                      sd_norm_score_variant=X.norm_score.std(),
                                      total_score_variant=X.score.sum()) >>
                            outer_join(samples_w_variant_df, by='junction')
                            )
    # Collect the per-row values of each junction into lists. After the
    # merge the list-valued copies carry pandas' default '_y' suffix.
    tmp_df = samples_w_variant_df.groupby('junction')[
        ['norm_score', 'score', 'variant_info', 'sample', 'name']
    ].aggregate(lambda x: x.tolist()).reset_index()
    samples_w_variant_df = pd.merge(
        samples_w_variant_df, tmp_df, on='junction')
    # 'info' is not aggregated into tmp_df, so it keeps its plain name here
    # (no suffix and no stray whitespace in the label).
    samples_w_variant_df = samples_w_variant_df[[
        'sample_y', 'variant_info_y', 'chrom', 'start', 'end', 'strand', 'anchor',
        'info', 'genes', 'name_y', 'mean_norm_score_variant', 'sd_norm_score_variant',
        'norm_score_y', 'score_y', 'junction', 'total_score_variant']]
else:
    # Mean / sd of the normalised score and total raw score per 'info' key,
    # joined back onto every row of that key.
    samples_w_variant_df = (samples_w_variant_df >>
                            group_by('info') >>
                            summarize(mean_norm_score_variant=X.norm_score.mean(),
                                      sd_norm_score_variant=X.norm_score.std(),
                                      total_score_variant=X.score.sum()) >>
                            outer_join(samples_w_variant_df, by='info')
                            )
    # Collect the per-row values of each 'info' key into lists; the
    # list-valued copies carry the '_y' suffix after the merge.
    tmp_df = samples_w_variant_df.groupby('info')[
        ['norm_score', 'score', 'sd_norm_score_variant',
         'mean_norm_score_variant', 'sample', 'name']
    ].aggregate(lambda x: x.tolist()).reset_index()
    samples_w_variant_df = pd.merge(samples_w_variant_df, tmp_df, on='info')
    samples_w_variant_df = samples_w_variant_df[[
        'sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
        'info', 'genes', 'name_y', 'mean_norm_score_variant_y', 'sd_norm_score_variant_y',
        'norm_score_y', 'score_y', 'junction', 'total_score_variant']]

# Both branches end with the same 16 columns in the same order; give them
# their final names and drop rows that are identical once rendered as text
# (list-valued cells are not hashable, hence the astype(str) comparison).
samples_w_variant_df.columns = [
    'samples', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
    'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
    'norm_scores_variant', 'scores', 'junction', 'total_score_variant']
samples_w_variant_df = samples_w_variant_df[
    ~samples_w_variant_df.astype(str).duplicated()]
124
155
125
156
# work on samples that don't have the variant of interest
print('Calculating normalized scores for samples without variants of interest')
samples_wout_variant_df = master_df[
    ~master_df['key'].isin(all_splicing_variants['key2'])
]
del master_df

# `mode` was chosen earlier in the script ('strict'; the other values
# mentioned are 'exclude' and 'group').

# Normalise every score by the summed score of the rows sharing its 'key'.
samples_wout_variant_df = (
    samples_wout_variant_df
    >> group_by(X.key)
    >> summarize(score_tmp=X.score.sum())
    >> outer_join(samples_wout_variant_df, by='key')
)
samples_wout_variant_df['norm_score'] = samples_wout_variant_df['score'].div(
    samples_wout_variant_df['score_tmp'])

# Keep only rows whose variant_info is one of the splicing-variant keys.
samples_wout_variant_df = samples_wout_variant_df.loc[
    samples_wout_variant_df['variant_info'].isin(all_splicing_variants['key'])
]

# Gather the normalised scores of each 'info' key into a list; the merge
# leaves the scalar in 'norm_score_x' and the list in 'norm_score_y'.
tmp_df = (
    samples_wout_variant_df.groupby('info')[['norm_score']]
    .agg(lambda scores: scores.tolist())
    .reset_index()
)
samples_wout_variant_df = samples_wout_variant_df.merge(tmp_df, on='info')

# Number of list entries = number of commas in the printed list + 1.
samples_wout_variant_df['samples_wout_variant_count'] = (
    samples_wout_variant_df['norm_score_y'].astype(str).str.count(',') + 1)

if mode in ('group', 'exclude'):
    # Drop junctions that also occur in the with-variant table, then rebuild
    # the per-'info' score lists from the rows that remain.
    samples_wout_variant_df = samples_wout_variant_df[
        ~samples_wout_variant_df['junction'].isin(samples_w_variant_df['junction'])
    ]
    tmp_df = (
        samples_wout_variant_df.groupby('info')[['norm_score_x']]
        .agg(lambda scores: scores.tolist())
        .reset_index()
    )
    samples_wout_variant_df = samples_wout_variant_df.merge(tmp_df, on='info')
    # list column produced by the second merge
    norm_list_column = 'norm_score_x_y'
else:
    # list column produced by the first merge
    norm_list_column = 'norm_score_y'

# Total raw score per 'info' key, joined back onto every row of that key.
samples_wout_variant_df = (
    samples_wout_variant_df
    >> group_by('info')
    >> summarize(total_score_non=X.score.sum())
    >> outer_join(samples_wout_variant_df, by='info')
)
samples_wout_variant_df = samples_wout_variant_df[[
    'sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
    'info', 'genes', 'name', norm_list_column, 'junction', 'total_score_non',
    'samples_wout_variant_count']]
samples_wout_variant_df.columns = [
    'sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
    'info', 'genes', 'name', 'norm_scores_non', 'junction', 'total_score_non',
    'samples_wout_variant_count']
202
+
203
print('Merging dataframes')
# Cast the join key to str on both sides so the merge compares like with like.
samples_w_variant_df['info'] = samples_w_variant_df['info'].astype(str)
samples_wout_variant_df['info'] = samples_wout_variant_df['info'].astype(str)

# Mean and standard deviation of the normalised scores of the samples
# without the variant, per 'info' key. They are derived from the
# 'norm_scores_non' list column because the scalar 'norm_score' column is
# no longer present at this point, and they must be computed before the
# table is merged and deleted below.
# Assumes every row of an 'info' key carries the same list (the list comes
# from a per-'info' groupby), so one row per key is enough.
# NOTE(review): the original attempted this summary after the table had
# already been deleted; confirm that the two columns belong in the merged
# output.
non_scores = (samples_wout_variant_df
              .drop_duplicates(subset='info')
              .set_index('info')['norm_scores_non'])
non_stats = pd.DataFrame({
    'info': non_scores.index,
    'mean_norm_score_non': [pd.Series(v, dtype=float).mean() for v in non_scores],
    # Series.std() uses ddof=1; a single-element list gives NaN
    'sd_norm_score_non': [pd.Series(v, dtype=float).std() for v in non_scores],
})
samples_wout_variant_df = samples_wout_variant_df.merge(
    non_stats, on='info', how='left')

master_df = pd.merge(samples_w_variant_df, samples_wout_variant_df,
                     how='outer', on='info')
del samples_wout_variant_df
del samples_w_variant_df

# Number of entries in the with-variant score list = commas in its printed
# form + 1. The list column is called 'norm_scores_variant' after the rename
# applied to the with-variant table; there is no 'norm_score_y' column in the
# merged frame.
# NOTE(review): rows that exist only in the without-variant table hold NaN
# here, which prints as 'nan' and therefore counts as 1.
master_df['samples_w_variant_count'] = (
    master_df['norm_scores_variant'].astype(str).str.count(',') + 1)
0 commit comments