Skip to content

Commit 1e1f7cd

Browse files
committed
WIP still
1 parent 0a230c4 commit 1e1f7cd

File tree

1 file changed

+132
-49
lines changed

1 file changed

+132
-49
lines changed

scripts/compare_junctions_hist.py

Lines changed: 132 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -13,18 +13,28 @@
1313
samples_inputfile = '/Users/kcotto/Desktop/CHOL/dir_names.tsv'
1414

1515
# read in all splicing variants
16-
all_splicing_variants = pd.read_csv(splicing_variants_inputfile, delimiter='\t', header=0)
16+
all_splicing_variants = pd.read_csv(
17+
splicing_variants_inputfile, delimiter='\t', header=0)
1718

1819
# create key to match regtools variant_info column and key2 that is the same as key but with sample name added
20+
21+
1922
def createkey(row):
    """Build a ``<f0>:<f1>-<f2>`` key from the first three fields of *row*.

    The result is stored in the 'key' column, which the script later
    compares against regtools ``variant_info`` values.

    Args:
        row: A pandas Series (one DataFrame row) or any positionally
            indexable sequence; fields 0, 1 and 2 are read
            (presumably chrom, start, end -- confirm against the input TSV).

    Returns:
        str: The three fields joined as ``'f0:f1-f2'``.
    """
    # Integer keys on a label-indexed Series are deprecated as positional
    # lookups in recent pandas, so go through .iloc when it is available.
    fields = row.iloc if hasattr(row, 'iloc') else row
    # f-string formatting stringifies every field, so a non-string first
    # field (e.g. a chromosome parsed as an int) no longer raises TypeError.
    return f'{fields[0]}:{fields[1]}-{fields[2]}'
22-
all_splicing_variants['key'] = all_splicing_variants.apply(lambda row: createkey(row), axis=1)
25+
26+
27+
all_splicing_variants['key'] = all_splicing_variants.apply(
28+
lambda row: createkey(row), axis=1)
29+
2330

2431
def createkey(row):
    """Build a ``<f0>:<f1>-<f2>_<f3>`` key from the first four fields of *row*.

    Same layout as the plain 'key' column but with the fourth field appended
    after an underscore (per the comment above, the sample name); the result
    is stored in the 'key2' column.

    Args:
        row: A pandas Series (one DataFrame row) or any positionally
            indexable sequence; fields 0 through 3 are read.

    Returns:
        str: The fields joined as ``'f0:f1-f2_f3'``.
    """
    # Integer keys on a label-indexed Series are deprecated as positional
    # lookups in recent pandas, so go through .iloc when it is available.
    fields = row.iloc if hasattr(row, 'iloc') else row
    # f-string formatting stringifies every field, so non-string values in
    # the first or last position no longer raise TypeError.
    return f'{fields[0]}:{fields[1]}-{fields[2]}_{fields[3]}'
27-
all_splicing_variants['key2'] = all_splicing_variants.apply(lambda row: createkey(row), axis=1)
34+
35+
36+
all_splicing_variants['key2'] = all_splicing_variants.apply(
37+
lambda row: createkey(row), axis=1)
2838

2939
# read in the sample names
3040
all_samples = []
@@ -45,9 +55,11 @@ def createkey(row):
4555
print(f'Reading in {sample}')
4656
df = pd.read_csv(path, delimiter='\t', header=0)
4757
df['sample'] = sample
48-
df = df[['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor', 'score', 'name', 'genes']]
58+
df = df[['sample', 'variant_info', 'chrom', 'start',
59+
'end', 'strand', 'anchor', 'score', 'name', 'genes']]
4960
df = df.dropna(subset=['variant_info'])
50-
df = df.set_index(['sample', 'chrom', 'start', 'end', 'strand', 'anchor', 'score', 'name', 'genes']).apply(lambda x: x.str.split(',').explode()).reset_index()
61+
df = df.set_index(['sample', 'chrom', 'start', 'end', 'strand', 'anchor', 'score',
62+
'name', 'genes']).apply(lambda x: x.str.split(',').explode()).reset_index()
5163
df = df.loc[df['variant_info'].isin(all_splicing_variants['key'])]
5264
dfs.append(df)
5365

@@ -57,84 +69,155 @@ def createkey(row):
5769
del dfs
5870

5971
# create various keys
72+
73+
6074
def createkey(row):
    """Build the underscore-joined key stored in the 'info' column.

    Fields 1, 2, 3, 5 and 9 of *row* are concatenated with '_' separators.
    (Presumably chrom, start, end, anchor and variant_info given the column
    order of master_df -- confirm against the concatenated frame.)

    Args:
        row: A pandas Series (one DataFrame row) or any positionally
            indexable sequence with at least ten fields.

    Returns:
        str: ``'f1_f2_f3_f5_f9'``.
    """
    # Integer keys on a label-indexed Series are deprecated as positional
    # lookups in recent pandas, so go through .iloc when it is available.
    fields = row.iloc if hasattr(row, 'iloc') else row
    # Formatting (rather than '+') stringifies every part, so a non-string
    # field cannot raise TypeError during concatenation.
    parts = (fields[1], fields[2], fields[3], fields[5], fields[9])
    return '_'.join(f'{part}' for part in parts)
78+
79+
6380
master_df['info'] = master_df.apply(lambda row: createkey(row), axis=1)
6481

82+
6583
def createkey(row):
    """Build the ``<f9>_<f0>`` key stored in master_df's 'key' column.

    The result is later matched against the 'key2' column of the splicing
    variants table. (Presumably field 9 is variant_info and field 0 is the
    sample name -- confirm against the column order of master_df.)

    Args:
        row: A pandas Series (one DataFrame row) or any positionally
            indexable sequence with at least ten fields.

    Returns:
        str: ``'f9_f0'``.
    """
    # Integer keys on a label-indexed Series are deprecated as positional
    # lookups in recent pandas, so go through .iloc when it is available.
    fields = row.iloc if hasattr(row, 'iloc') else row
    # f-string formatting stringifies both parts, avoiding TypeError when
    # either field is not already a str.
    return f'{fields[9]}_{fields[0]}'
86+
87+
6888
master_df['key'] = master_df.apply(lambda row: createkey(row), axis=1)
6989

90+
7091
def createkey(row):
    """Build the ``<f1>_<f2>_<f3>`` key stored in the 'junction' column.

    (Presumably chrom, start and end of the junction -- confirm against the
    column order of master_df.)

    Args:
        row: A pandas Series (one DataFrame row) or any positionally
            indexable sequence with at least four fields.

    Returns:
        str: ``'f1_f2_f3'``.
    """
    # Integer keys on a label-indexed Series are deprecated as positional
    # lookups in recent pandas, so go through .iloc when it is available.
    fields = row.iloc if hasattr(row, 'iloc') else row
    # f-string formatting stringifies every field, so a non-string first
    # part no longer raises TypeError.
    return f'{fields[1]}_{fields[2]}_{fields[3]}'
94+
95+
7396
master_df['junction'] = master_df.apply(lambda row: createkey(row), axis=1)
7497

7598
# subset data to work on samples with splicing variant of interest
76-
samples_w_variant_df = master_df.loc[master_df['key'].isin(all_splicing_variants['key2'])]
99+
samples_w_variant_df = master_df.loc[master_df['key'].isin(
100+
all_splicing_variants['key2'])]
77101
# print(samples_w_variant_df.info(verbose=True))
78102

79103
# start performing the calculations for this subset of data
80-
print('Calculating for samples with variants of interest')
81-
mode = 'blah'
82-
print(samples_w_variant_df.head(10))
104+
print('Calculating normalized scores for samples with variants of interest')
105+
mode = 'strict'
83106
if mode == 'group':
84107
samples_w_variant_df = (samples_w_variant_df >>
85-
group_by(X.key) >>
86-
summarize(score_tmp = X.score.sum()) >>
87-
outer_join(samples_w_variant_df, by='key')
88-
)
89-
samples_w_variant_df['norm_score'] = samples_w_variant_df['score']/samples_w_variant_df['score_tmp']
108+
group_by(X.key) >>
109+
summarize(score_tmp=X.score.sum()) >>
110+
outer_join(samples_w_variant_df, by='key')
111+
)
112+
samples_w_variant_df['norm_score'] = samples_w_variant_df['score'] / \
113+
samples_w_variant_df['score_tmp']
90114
samples_w_variant_df = (samples_w_variant_df >>
91-
group_by('junction') >>
92-
summarize(mean_norm_score_variant=X.norm_score.mean(), sd_norm_score_variant=X.norm_score.std(), total_score_variant=X.score.sum()) >>
93-
outer_join(samples_w_variant_df, by='junction')
94-
)
95-
tmp_df = samples_w_variant_df.groupby('junction')[['norm_score', 'score', 'variant_info', 'sample', 'name', 'info']].aggregate(lambda x: x.tolist()).reset_index()
96-
samples_w_variant_df = pd.merge(samples_w_variant_df, tmp_df, on='junction')
115+
group_by('junction') >>
116+
summarize(mean_norm_score_variant=X.norm_score.mean(), sd_norm_score_variant=X.norm_score.std(), total_score_variant=X.score.sum()) >>
117+
outer_join(samples_w_variant_df, by='junction')
118+
)
119+
tmp_df = samples_w_variant_df.groupby('junction')[
120+
['norm_score', 'score', 'variant_info', 'sample', 'name']].aggregate(lambda x: x.tolist()).reset_index()
121+
samples_w_variant_df = pd.merge(
122+
samples_w_variant_df, tmp_df, on='junction')
97123
samples_w_variant_df = samples_w_variant_df[['sample_y', 'variant_info_y', 'chrom', 'start', 'end', 'strand', 'anchor',
98-
'info_y', 'genes', 'name_y', 'mean_norm_score_variant', 'sd_norm_score_variant',
99-
'norm_score_y', 'score_y', 'junction', 'total_score_variant']]
124+
'info', 'genes', 'name_y', 'mean_norm_score_variant', 'sd_norm_score_variant',
125+
'norm_score_y', 'score_y', 'junction', 'total_score_variant']]
100126
samples_w_variant_df.columns = ['samples', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
101-
'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
102-
'norm_scores_variant', 'scores', 'junction', 'total_score_variant']
103-
samples_w_variant_df = samples_w_variant_df.astype(str).drop_duplicates()
127+
'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
128+
'norm_scores_variant', 'scores', 'junction', 'total_score_variant']
129+
samples_w_variant_df = samples_w_variant_df[~samples_w_variant_df.astype(
130+
str).duplicated()]
104131
else:
105132
samples_w_variant_df = (samples_w_variant_df >>
106133
group_by(X.key) >>
107-
summarize(score_tmp = X.score.sum()) >>
134+
summarize(score_tmp=X.score.sum()) >>
108135
outer_join(samples_w_variant_df, by='key')
109136
)
110-
samples_w_variant_df['norm_score'] = samples_w_variant_df['score']/samples_w_variant_df['score_tmp']
137+
samples_w_variant_df['norm_score'] = samples_w_variant_df['score'] / \
138+
samples_w_variant_df['score_tmp']
111139
samples_w_variant_df = (samples_w_variant_df >>
112-
group_by('info') >>
113-
summarize(mean_norm_score_variant=X.norm_score.mean(), sd_norm_score_variant=X.norm_score.std(), total_score_variant=X.score.sum()) >>
114-
outer_join(samples_w_variant_df, by='info')
115-
)
116-
tmp_df = samples_w_variant_df.groupby('info')[['norm_score', 'score', 'sd_norm_score_variant', 'mean_norm_score_variant', 'sample', 'name']].aggregate(lambda x: x.tolist()).reset_index()
140+
group_by('info') >>
141+
summarize(mean_norm_score_variant=X.norm_score.mean(), sd_norm_score_variant=X.norm_score.std(), total_score_variant=X.score.sum()) >>
142+
outer_join(samples_w_variant_df, by='info')
143+
)
144+
tmp_df = samples_w_variant_df.groupby('info')[['norm_score', 'score', 'sd_norm_score_variant',
145+
'mean_norm_score_variant', 'sample', 'name']].aggregate(lambda x: x.tolist()).reset_index()
117146
samples_w_variant_df = pd.merge(samples_w_variant_df, tmp_df, on='info')
118147
samples_w_variant_df = samples_w_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
119-
'info', 'genes', 'name_y', 'mean_norm_score_variant_y', 'sd_norm_score_variant_y',
120-
'norm_score_y', 'score_y', 'junction', 'total_score_variant']]
148+
'info', 'genes', 'name_y', 'mean_norm_score_variant_y', 'sd_norm_score_variant_y',
149+
'norm_score_y', 'score_y', 'junction', 'total_score_variant']]
121150
samples_w_variant_df.columns = ['samples', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
122-
'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
123-
'norm_scores_variant', 'scores', 'junction', 'total_score_variant']
151+
'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
152+
'norm_scores_variant', 'scores', 'junction', 'total_score_variant']
153+
samples_w_variant_df = samples_w_variant_df[~samples_w_variant_df.astype(
154+
str).duplicated()]
124155

125156
# work on samples that don't have the variant of interest
157+
print('Calculating normalized scores for samples without variants of interest')
158+
samples_wout_variant_df = master_df[~master_df['key'].isin(
159+
all_splicing_variants['key2'])]
160+
del (master_df)
126161

127-
samples_wout_variant_df = master_df[-master_df['key'].isin(all_splicing_variants['key2'])]
128-
samples_wout_variant_df = (samples_wout_variant_df >>
129-
group_by(X.key) >>
130-
summarize(score_tmp = X.score.sum()) >>
131-
outer_join(samples_wout_variant_df, by='key')
132-
)
133-
samples_wout_variant_df['norm_score'] = samples_wout_variant_df['score']/samples_wout_variant_df['score_tmp']
134-
135-
mode = 'strict' #others include 'exclude' and 'group'
136-
162+
# mode = 'strict' #others include 'exclude' and 'group'
137163
# if mode == 'strict':
138-
139-
164+
samples_wout_variant_df = (samples_wout_variant_df >>
165+
group_by(X.key) >>
166+
summarize(score_tmp=X.score.sum()) >>
167+
outer_join(samples_wout_variant_df, by='key')
168+
)
169+
samples_wout_variant_df['norm_score'] = samples_wout_variant_df['score'] / \
170+
samples_wout_variant_df['score_tmp']
171+
samples_wout_variant_df = samples_wout_variant_df.loc[samples_wout_variant_df['variant_info'].isin(
172+
all_splicing_variants['key'])]
173+
tmp_df = samples_wout_variant_df.groupby(
174+
'info')[['norm_score']].aggregate(lambda x: x.tolist()).reset_index()
175+
samples_wout_variant_df = pd.merge(samples_wout_variant_df, tmp_df, on='info')
176+
samples_wout_variant_df['samples_wout_variant_count'] = samples_wout_variant_df['norm_score_y'].astype(
177+
str).str.count(',') + 1
178+
if mode == 'group' or mode == 'exclude':
179+
samples_wout_variant_df = samples_wout_variant_df[~samples_wout_variant_df['junction'].isin(
180+
samples_w_variant_df['junction'])]
181+
tmp_df = samples_wout_variant_df.groupby(
182+
'info')[['norm_score_x']].aggregate(lambda x: x.tolist()).reset_index()
183+
samples_wout_variant_df = pd.merge(
184+
samples_wout_variant_df, tmp_df, on='info')
185+
samples_wout_variant_df = (samples_wout_variant_df >>
186+
group_by('info') >>
187+
summarize(total_score_non=X.score.sum()) >>
188+
outer_join(samples_wout_variant_df, by='info')
189+
)
190+
samples_wout_variant_df = samples_wout_variant_df[['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
191+
'info', 'genes', 'name', 'norm_score_x_y', 'junction', 'total_score_non', 'samples_wout_variant_count']]
192+
else:
193+
samples_wout_variant_df = (samples_wout_variant_df >>
194+
group_by('info') >>
195+
summarize(total_score_non=X.score.sum()) >>
196+
outer_join(samples_wout_variant_df, by='info')
197+
)
198+
samples_wout_variant_df = samples_wout_variant_df[['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
199+
'info', 'genes', 'name', 'norm_score_y', 'junction', 'total_score_non', 'samples_wout_variant_count']]
200+
samples_wout_variant_df.columns = ['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
201+
'info', 'genes', 'name', 'norm_scores_non', 'junction', 'total_score_non', 'samples_wout_variant_count']
202+
203+
print('Merging dataframes')
# The two frames are joined on 'info'; cast the join key to str on both
# sides so the merge compares like with like.
samples_w_variant_df['info'] = samples_w_variant_df['info'].astype(str)
samples_wout_variant_df['info'] = samples_wout_variant_df['info'].astype(str)

# Per-'info' mean and standard deviation of the normalized scores of the
# samples without the variant. This has to happen BEFORE the merge: the
# frame is deleted right after it, and by this point the scalar
# 'norm_score'/'score' columns have already been projected away, so the
# statistics are derived from the per-'info' score lists in
# 'norm_scores_non' instead.
# NOTE(review): the original computed these with a dfply pipeline placed
# after the `del`, which could not run -- confirm these are the statistics
# wanted in the merged output.
non_variant_stats = (
    samples_wout_variant_df[['info', 'norm_scores_non']]
    .drop_duplicates(subset='info')   # the list is identical within an 'info' group
    .explode('norm_scores_non')
    .astype({'norm_scores_non': float})
    .groupby('info')['norm_scores_non']
    .agg(mean_norm_score_non='mean', sd_norm_score_non='std')
    .reset_index()
)
samples_wout_variant_df = pd.merge(
    samples_wout_variant_df, non_variant_stats, on='info', how='left')

master_df = pd.merge(samples_w_variant_df, samples_wout_variant_df,
                     how='outer', on='info')
# Release the inputs only once nothing below needs them any more.
del samples_wout_variant_df
del samples_w_variant_df

# Number of samples carrying the variant = number of entries in the score
# list. The list column is called 'norm_scores_variant' after the rename
# above ('norm_score_y' does not exist in the merged frame). Rows that come
# only from the non-variant side have no list at all and count as 0 rather
# than being mis-counted as 1 from the string 'nan'.
variant_scores = master_df['norm_scores_variant']
master_df['samples_w_variant_count'] = (
    variant_scores.astype(str).str.count(',') + 1
).where(variant_scores.notna(), 0)

0 commit comments

Comments
 (0)