Skip to content

Commit f51efb7

Browse files
committed
almost working version
1 parent 1e1f7cd commit f51efb7

File tree

1 file changed

+89
-25
lines changed

1 file changed

+89
-25
lines changed

scripts/compare_junctions_hist.py

Lines changed: 89 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import pandas as pd
55
from dfply import *
66
import numpy as np
7-
import glob
7+
from scipy import stats
88
import os
99

1010
tag = 'E'
@@ -43,6 +43,8 @@ def createkey(row):
4343
for line in reader:
4444
all_samples.append(line[0])
4545

46+
num_of_samples = len(all_samples)
47+
4648
### read in all of the regtools cse output for this cohort ###
4749
# create list to hold each sample's df
4850
dfs = []
@@ -123,9 +125,10 @@ def createkey(row):
123125
samples_w_variant_df = samples_w_variant_df[['sample_y', 'variant_info_y', 'chrom', 'start', 'end', 'strand', 'anchor',
124126
'info', 'genes', 'name_y', 'mean_norm_score_variant', 'sd_norm_score_variant',
125127
'norm_score_y', 'score_y', 'junction', 'total_score_variant']]
126-
samples_w_variant_df.columns = ['samples', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
128+
samples_w_variant_df.columns = ['junction_samples', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
127129
'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
128130
'norm_scores_variant', 'scores', 'junction', 'total_score_variant']
131+
samples_w_variant_df['variant_samples'] = samples_w_variant_df['junction_samples']
129132
samples_w_variant_df = samples_w_variant_df[~samples_w_variant_df.astype(
130133
str).duplicated()]
131134
else:
@@ -144,12 +147,20 @@ def createkey(row):
144147
tmp_df = samples_w_variant_df.groupby('info')[['norm_score', 'score', 'sd_norm_score_variant',
145148
'mean_norm_score_variant', 'sample', 'name']].aggregate(lambda x: x.tolist()).reset_index()
146149
samples_w_variant_df = pd.merge(samples_w_variant_df, tmp_df, on='info')
147-
samples_w_variant_df = samples_w_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
150+
samples_w_variant_df = samples_w_variant_df[['sample_x', 'sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
148151
'info', 'genes', 'name_y', 'mean_norm_score_variant_y', 'sd_norm_score_variant_y',
149152
'norm_score_y', 'score_y', 'junction', 'total_score_variant']]
150-
samples_w_variant_df.columns = ['samples', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
153+
samples_w_variant_df.columns = ['sample_x', 'sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
151154
'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
152155
'norm_scores_variant', 'scores', 'junction', 'total_score_variant']
156+
tmp_df = samples_w_variant_df.groupby('variant_info')[['sample_x']].aggregate(lambda x: set(x.tolist())).reset_index()
157+
samples_w_variant_df = pd.merge(samples_w_variant_df, tmp_df, on='variant_info')
158+
samples_w_variant_df = samples_w_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
159+
'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
160+
'norm_scores_variant', 'scores', 'junction', 'total_score_variant', 'sample_x_y']]
161+
samples_w_variant_df.columns = ['junction_samples', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
162+
'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
163+
'norm_scores_variant', 'scores', 'junction', 'total_score_variant', 'variant_samples']
153164
samples_w_variant_df = samples_w_variant_df[~samples_w_variant_df.astype(
154165
str).duplicated()]
155166

@@ -171,7 +182,7 @@ def createkey(row):
171182
samples_wout_variant_df = samples_wout_variant_df.loc[samples_wout_variant_df['variant_info'].isin(
172183
all_splicing_variants['key'])]
173184
tmp_df = samples_wout_variant_df.groupby(
174-
'info')[['norm_score']].aggregate(lambda x: x.tolist()).reset_index()
185+
'info')[['norm_score', 'sample']].aggregate(lambda x: x.tolist()).reset_index()
175186
samples_wout_variant_df = pd.merge(samples_wout_variant_df, tmp_df, on='info')
176187
samples_wout_variant_df['samples_wout_variant_count'] = samples_wout_variant_df['norm_score_y'].astype(
177188
str).str.count(',') + 1
@@ -187,37 +198,90 @@ def createkey(row):
187198
summarize(total_score_non=X.score.sum()) >>
188199
outer_join(samples_wout_variant_df, by='info')
189200
)
190-
samples_wout_variant_df = samples_wout_variant_df[['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
191-
'info', 'genes', 'name', 'norm_score_x_y', 'junction', 'total_score_non', 'samples_wout_variant_count']]
201+
samples_wout_variant_df = samples_wout_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
202+
'info', 'genes', 'norm_score_x_y', 'junction', 'total_score_non', 'samples_wout_variant_count']]
192203
else:
193204
samples_wout_variant_df = (samples_wout_variant_df >>
194205
group_by('info') >>
195206
summarize(total_score_non=X.score.sum()) >>
196207
outer_join(samples_wout_variant_df, by='info')
197208
)
198-
samples_wout_variant_df = samples_wout_variant_df[['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
199-
'info', 'genes', 'name', 'norm_score_y', 'junction', 'total_score_non', 'samples_wout_variant_count']]
209+
samples_wout_variant_df = samples_wout_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
210+
'info', 'genes', 'norm_score_y', 'junction', 'total_score_non', 'samples_wout_variant_count']]
200211
samples_wout_variant_df.columns = ['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
201-
'info', 'genes', 'name', 'norm_scores_non', 'junction', 'total_score_non', 'samples_wout_variant_count']
212+
'info', 'genes', 'norm_scores_non', 'junction', 'total_score_non', 'samples_wout_variant_count']
202213

203214
print('Merging dataframes')
204-
# samples_w_variant_df['samples'] = samples_w_variant_df['samples'].astype(str)
205-
# samples_w_variant_df['variant_info'] = samples_w_variant_df['variant_info'].astype(str)
206-
samples_w_variant_df['info'] = samples_w_variant_df['info'].astype(str)
207-
# samples_w_variant_df['names'] = samples_w_variant_df['names'].astype(str)
208-
# samples_w_variant_df['norm_scores_variant'] = samples_w_variant_df['norm_scores_variant'].astype(str)
209-
# samples_w_variant_df['scores'] = samples_w_variant_df['scores'].astype(str)
210-
# samples_wout_variant_df['norm_scores_non'] = samples_wout_variant_df['norm_scores_non'].astype(str)
211-
samples_wout_variant_df['info'] = samples_wout_variant_df['info'].astype(str)
212-
master_df = pd.merge(samples_w_variant_df, samples_wout_variant_df, how='outer' ,on='info')
215+
master_df = pd.merge(samples_w_variant_df, samples_wout_variant_df, how='left' ,on='info')
216+
# Drop rows that are exact duplicates once every cell is rendered as a string
# (presumably because some cells hold lists/sets, which cannot be hashed
# directly -- confirm). Use `~` for boolean negation, matching the other
# de-duplication statements in this script; unary `-` on a boolean mask is
# not supported by numpy and only works through a pandas special case.
master_df = master_df[~master_df.astype(str).duplicated()]
213218
del(samples_wout_variant_df)
214219
del(samples_w_variant_df)
215220

216-
master_df['samples_w_variant_count'] = master_df['norm_score_y'].astype(
221+
master_df['samples_w_variant_count'] = master_df['variant_samples'].astype(
217222
str).str.count(',') + 1
218223

219-
samples_wout_variant_df = (samples_wout_variant_df >>
220-
group_by('info') >>
221-
summarize(mean_norm_score_non=X.norm_score.mean(), sd_norm_score_non=X.norm_score.std(), total_score_non=X.score.sum()) >>
222-
outer_join(samples_wout_variant_df, by='info')
223-
)
224+
tmp_df = master_df[['info', 'norm_scores_non', 'samples_wout_variant_count', 'samples_w_variant_count']]
225+
tmp_df = tmp_df.fillna(0)
226+
227+
def add_zeros(row, total_samples=None):
    """Pad a junction's non-variant norm scores with zeros and join them.

    Samples that are counted neither in ``samples_wout_variant_count`` nor in
    ``samples_w_variant_count`` contribute one ``0`` each, so the returned
    string covers the whole cohort.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with the keys
            ``norm_scores_non`` (a list of scores, or ``0`` when the value
            was missing and filled by ``fillna(0)``),
            ``samples_wout_variant_count`` and ``samples_w_variant_count``.
        total_samples: Cohort size. Defaults to the module-level
            ``num_of_samples`` when omitted.

    Returns:
        The padded scores joined with ``','`` as a single string.
    """
    if total_samples is None:
        total_samples = num_of_samples

    norm_scores = row['norm_scores_non']
    if not isinstance(norm_scores, list) and norm_scores == 0:
        # Missing list was replaced by fillna(0): start from a single zero.
        # NOTE(review): this placeholder adds one zero on top of the padding
        # below -- kept as in the original so the result is never empty.
        norm_scores = ['0']

    missing = total_samples - row['samples_wout_variant_count'] - row['samples_w_variant_count']
    # The counts become floats after a merge that introduces NaN, and a
    # float or negative repeat count is rejected by numpy; coerce and clamp.
    missing = max(int(missing), 0)

    padded = list(norm_scores) + [0] * missing
    return ','.join(map(str, padded))
238+
239+
tmp_df['new_norm_scores'] = tmp_df.apply(lambda row: add_zeros(row), axis=1)
240+
master_df = pd.merge(master_df, tmp_df, how='left' ,on='info')
241+
del(tmp_df)
242+
243+
def get_mean(row):
    """Return the mean of the comma-separated scores in ``new_norm_scores``.

    The column is read by label rather than by position (``row[-1]``) so the
    result does not depend on the column being last, nor on the deprecated
    integer-position fallback of label-indexed pandas Series.

    Args:
        row: Mapping-like row (e.g. a pandas Series) whose
            ``new_norm_scores`` entry is a ``','``-joined string of numbers.

    Returns:
        The arithmetic mean of the parsed values.
    """
    values = [float(value) for value in row['new_norm_scores'].split(',')]
    return np.mean(values)
248+
249+
master_df['mean_norm_score_non'] = master_df.apply(lambda row: get_mean(row), axis=1)
250+
251+
def get_sd(row):
    """Return the standard deviation of the scores in ``new_norm_scores``.

    The column is read by label rather than by position (``row[-2]``), which
    only pointed at the right column because exactly one column had been
    appended after it.

    Args:
        row: Mapping-like row (e.g. a pandas Series) whose
            ``new_norm_scores`` entry is a ``','``-joined string of numbers.

    Returns:
        ``np.std`` of the parsed values (numpy's default, i.e. ``ddof=0``).
    """
    values = [float(value) for value in row['new_norm_scores'].split(',')]
    return np.std(values)
256+
257+
master_df['sd_norm_score_non'] = master_df.apply(lambda row: get_sd(row), axis=1)
258+
259+
def get_min(row):
    """Return the smallest value in the row's ``norm_scores_variant`` list.

    The column is read by label instead of the magic position ``row[12]``,
    so the lookup survives any change to the column layout.

    Args:
        row: Mapping-like row (e.g. a pandas Series) whose
            ``norm_scores_variant`` entry is an iterable of numbers or
            numeric strings.

    Returns:
        The minimum of the values after conversion to ``float``.

    Raises:
        ValueError: If ``norm_scores_variant`` is empty.
    """
    return min(float(value) for value in row['norm_scores_variant'])
264+
265+
master_df['min_norm_score_variant'] = master_df.apply(lambda row: get_min(row), axis=1)
266+
267+
def get_pvalue_mean(row):
    """Score ``mean_norm_score_variant`` against the ``new_norm_scores`` values.

    Computes ``1 - percentile/100``, where ``percentile`` is the
    ``scipy.stats.percentileofscore`` of the variant mean within the values
    parsed from ``new_norm_scores``.

    Columns are read by label instead of the magic positions ``row[32]`` and
    ``row[10]``, which break as soon as a column is added or removed.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with
            ``new_norm_scores`` (``','``-joined numbers) and
            ``mean_norm_score_variant`` (presumably a scalar -- confirm
            against the upstream aggregation).

    Returns:
        A value in ``[0, 1]``; smaller means the variant mean ranks higher
        among the parsed values.
    """
    background = [float(value) for value in row['new_norm_scores'].split(',')]
    percentile = stats.percentileofscore(background, row['mean_norm_score_variant'])
    return 1 - (percentile / 100.0)
274+
275+
master_df['p_value_mean'] = master_df.apply(lambda row: get_pvalue_mean(row), axis=1)
276+
277+
def get_pvalue_min(row):
    """Score ``min_norm_score_variant`` against the ``new_norm_scores`` values.

    Computes ``1 - percentile/100``, where ``percentile`` is the
    ``scipy.stats.percentileofscore`` of the variant minimum within the
    values parsed from ``new_norm_scores``.

    Columns are read by label instead of the magic positions ``row[32]`` and
    ``row[35]``, which break as soon as a column is added or removed.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with
            ``new_norm_scores`` (``','``-joined numbers) and
            ``min_norm_score_variant``.

    Returns:
        A value in ``[0, 1]``; smaller means the variant minimum ranks
        higher among the parsed values.
    """
    background = [float(value) for value in row['new_norm_scores'].split(',')]
    min_value = row['min_norm_score_variant']
    percentile = stats.percentileofscore(background, min_value)
    return 1 - (percentile / 100.0)
284+
285+
# p_value_min must be computed with get_pvalue_min; calling get_pvalue_mean
# here would just duplicate the p_value_mean column.
master_df['p_value_min'] = master_df.apply(get_pvalue_min, axis=1)
286+
287+
# master_df = master_df[['samples', 'variant_info_x', ']]

0 commit comments

Comments
 (0)