Skip to content

Commit ebb700a

Browse files
authored
Merge pull request #170 from griffithlab/testing_met
New statistics script to allow for different variant grouping modes
2 parents 90156a8 + 7e2bc1e commit ebb700a

File tree

2 files changed

+359
-4
lines changed

2 files changed

+359
-4
lines changed

scripts/compare_junctions_hist.py

Lines changed: 335 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,335 @@
# Standard library
import argparse
import csv
import os
import re  # used by the p-value helpers' bracket-stripping regex
from doctest import master  # NOTE(review): appears unused (likely IDE auto-import)
from itertools import groupby  # NOTE(review): appears unused in this script

# Third-party (the dfply star import is kept before numpy/scipy, as in the
# original, so any name shadowing it performs is unchanged)
import pandas as pd
from dfply import *
import numpy as np
from scipy import stats
10+
11+
# Command-line interface for the junction-statistics script.
input_parser = argparse.ArgumentParser(
    description="Run RegTools stats script",
)
input_parser.add_argument(
    '-t',
    '--tag',
    help="Variant tag parameter used to run RegTools.",
)
input_parser.add_argument(
    '-i',
    '--variants_file',
    help="File containing variants to be considered as splicing relevant."
)
input_parser.add_argument(
    '-d',
    '--dir_names',
    help="File containing directory names corresponding to each sample that is to be processed."
)
input_parser.add_argument(
    '-v',
    '--variant-grouping',
    help="How variants are grouped when normalizing junction scores.",
    # 'group' added: the script's main branch tests for it below, but the
    # original choices list omitted it, making that mode unreachable from the
    # CLI. 'include' is kept for backward compatibility (it currently falls
    # through to the default path).
    choices=['strict', 'exclude', 'include', 'group']
)

args = input_parser.parse_args()

tag = args.tag
splicing_variants_inputfile = args.variants_file
samples_inputfile = args.dir_names
variant_grouping_mode = args.variant_grouping
# NOTE(review): hard-coded developer path — parameterize or remove before
# running anywhere but the author's machine.
os.chdir('/Users/kcotto/Desktop/CHOL/')
43+
44+
# read in all splicing variants (TSV; first four columns are assumed to be
# chrom, start, end, sample — TODO confirm against the input file's header)
all_splicing_variants = pd.read_csv(
    splicing_variants_inputfile, delimiter='\t', header=0)

# create key to match regtools variant_info column and key2 that is the same
# as key but with sample name added. The original rebound a single name
# `createkey` for both helpers (and three more times later in the script);
# distinct names avoid silent shadowing.


def variant_key(row):
    """Return 'chrom:start-end' built from the row's first three columns."""
    return row[0] + ':' + str(row[1]) + '-' + str(row[2])


def variant_sample_key(row):
    """Return 'chrom:start-end_sample' (variant key plus the 4th column)."""
    return variant_key(row) + '_' + row[3]


all_splicing_variants['key'] = all_splicing_variants.apply(
    lambda row: variant_key(row), axis=1)

all_splicing_variants['key2'] = all_splicing_variants.apply(
    lambda row: variant_sample_key(row), axis=1)
66+
67+
# read in the sample names: one per line, first tab-separated field only
with open(samples_inputfile, 'r') as samples:
    all_samples = [fields[0] for fields in csv.reader(samples, delimiter='\t')]

num_of_samples = len(all_samples)
75+
76+
### read in all of the regtools cse output for this cohort ###
# create list to hold each sample's df
dfs = []

# read each sample's output file into a df and subset columns, split variants
# into multirows, and require that variant is in all_splicing_variants
for sample in all_samples:
    path = f'samples/{sample}/output/cse_identify_filtered_compare_{tag}.tsv'
    print(f'Reading in {sample}')
    # (original assigned df = f'df_{sample}' here — a dead string assignment
    # immediately overwritten by read_csv; removed)
    df = pd.read_csv(path, delimiter='\t', header=0)
    df['sample'] = sample
    df = df[['sample', 'variant_info', 'chrom', 'start',
             'end', 'strand', 'anchor', 'score', 'name', 'genes']]
    # drop junctions that have no associated variant at all
    df = df.dropna(subset=['variant_info'])
    # explode comma-separated variant_info lists into one row per variant;
    # all other columns are parked in the index so only variant_info splits
    df = df.set_index(['sample', 'chrom', 'start', 'end', 'strand', 'anchor', 'score',
                       'name', 'genes']).apply(lambda x: x.str.split(',').explode()).reset_index()
    # keep only variants flagged as splicing-relevant
    df = df.loc[df['variant_info'].isin(all_splicing_variants['key'])]
    dfs.append(df)

# concat all individual dfs into one df
print("Concatenating each sample's df together")
master_df = pd.concat(dfs, axis=0, ignore_index=True)
del dfs
100+
101+
# create various keys. Positional indexing below relies on the column order
# fixed by the reset_index above: 0=sample, 1=chrom, 2=start, 3=end, 4=strand,
# 5=anchor, 6=score, 7=name, 8=genes, 9=variant_info.
# (The original rebound one name `createkey` for each of these helpers;
# distinct names avoid silent shadowing.)


def info_key(row):
    """Return 'chrom_start_end_anchor_variantinfo' — junction+variant identity."""
    return row[1] + '_' + str(row[2]) + '_' + \
        str(row[3]) + '_' + row[5] + '_' + row[9]


def sample_variant_key(row):
    """Return 'variantinfo_sample' — comparable to all_splicing_variants.key2."""
    return row[9] + '_' + row[0]


def junction_key(row):
    """Return 'chrom_start_end' — junction identity regardless of variant."""
    return row[1] + '_' + str(row[2]) + '_' + str(row[3])


master_df['info'] = master_df.apply(lambda row: info_key(row), axis=1)

master_df['key'] = master_df.apply(lambda row: sample_variant_key(row), axis=1)

master_df['junction'] = master_df.apply(lambda row: junction_key(row), axis=1)

# subset data to work on samples with splicing variant of interest
samples_w_variant_df = master_df.loc[master_df['key'].isin(
    all_splicing_variants['key2'])]
# print(samples_w_variant_df.info(verbose=True))
132+
133+
# start performing the calculations for this subset of data
print('Calculating normalized scores for samples with variants of interest')
# variant_grouping_mode = 'strict'
# NOTE(review): keep the mode strings here in sync with the --variant-grouping
# CLI choices; this branch requires 'group' to be an accepted choice.
if variant_grouping_mode == 'group':
    # 'group' mode: total score per (variant, sample) pair ('key'), used to
    # normalize each junction's score for that pair
    samples_w_variant_df = (samples_w_variant_df >>
                            group_by(X.key) >>
                            summarize(score_tmp=X.score.sum()) >>
                            outer_join(samples_w_variant_df, by='key')
                            )
    # normalized score = junction score / that pair's total score
    samples_w_variant_df['norm_score'] = samples_w_variant_df['score'] / \
        samples_w_variant_df['score_tmp']
    # per-junction mean/sd/total across all variants hitting the junction
    samples_w_variant_df = (samples_w_variant_df >>
                            group_by('junction') >>
                            summarize(mean_norm_score_variant=X.norm_score.mean(), sd_norm_score_variant=X.norm_score.std(), total_score_variant=X.score.sum()) >>
                            outer_join(samples_w_variant_df, by='junction')
                            )
    # collapse per-junction rows into list-valued columns
    tmp_df = samples_w_variant_df.groupby('junction')[
        ['norm_score', 'score', 'variant_info', 'sample', 'name']].aggregate(lambda x: x.tolist()).reset_index()
    samples_w_variant_df = pd.merge(
        samples_w_variant_df, tmp_df, on='junction')
    # keep the list-valued (_y) columns from the merge and rename canonically
    samples_w_variant_df = samples_w_variant_df[['sample_y', 'variant_info_y', 'chrom', 'start', 'end', 'strand', 'anchor',
                                                 'info', 'genes', 'name_y', 'mean_norm_score_variant', 'sd_norm_score_variant',
                                                 'norm_score_y', 'score_y', 'junction', 'total_score_variant']]
    samples_w_variant_df.columns = ['junction_samples', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                    'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
                                    'norm_scores_variant', 'scores', 'junction', 'total_score_variant']
    # in 'group' mode the variant carriers are exactly the junction's samples
    samples_w_variant_df['variant_samples'] = samples_w_variant_df['junction_samples']
    # stringify list columns so duplicated() can hash them, then drop dupes
    samples_w_variant_df = samples_w_variant_df[~samples_w_variant_df.astype(
        str).duplicated()]
else:
    # default path ('strict'/'exclude'/'include'): aggregate per
    # variant+junction pair ('info') rather than per junction
    samples_w_variant_df = (samples_w_variant_df >>
                            group_by(X.key) >>
                            summarize(score_tmp=X.score.sum()) >>
                            outer_join(samples_w_variant_df, by='key')
                            )
    samples_w_variant_df['norm_score'] = samples_w_variant_df['score'] / \
        samples_w_variant_df['score_tmp']
    samples_w_variant_df = (samples_w_variant_df >>
                            group_by('info') >>
                            summarize(mean_norm_score_variant=X.norm_score.mean(), sd_norm_score_variant=X.norm_score.std(), total_score_variant=X.score.sum()) >>
                            outer_join(samples_w_variant_df, by='info')
                            )
    # collapse per-'info' rows into list-valued columns
    tmp_df = samples_w_variant_df.groupby('info')[['norm_score', 'score', 'sd_norm_score_variant',
                                                   'mean_norm_score_variant', 'sample', 'name']].aggregate(lambda x: x.tolist()).reset_index()
    samples_w_variant_df = pd.merge(samples_w_variant_df, tmp_df, on='info')
    samples_w_variant_df = samples_w_variant_df[['sample_x', 'sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                                 'info', 'genes', 'name_y', 'mean_norm_score_variant_y', 'sd_norm_score_variant_y',
                                                 'norm_score_y', 'score_y', 'junction', 'total_score_variant']]
    samples_w_variant_df.columns = ['sample_x', 'sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                    'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
                                    'norm_scores_variant', 'scores', 'junction', 'total_score_variant']
    # distinct set of samples carrying each variant
    tmp_df = samples_w_variant_df.groupby('variant_info')[['sample_x']].aggregate(lambda x: list(set(x.tolist()))).reset_index()
    samples_w_variant_df = pd.merge(samples_w_variant_df, tmp_df, on='variant_info')
    samples_w_variant_df = samples_w_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                                 'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
                                                 'norm_scores_variant', 'scores', 'junction', 'total_score_variant', 'sample_x_y']]
    samples_w_variant_df.columns = ['junction_samples', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                    'info', 'genes', 'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
                                    'norm_scores_variant', 'scores', 'junction', 'total_score_variant', 'variant_samples']
    # stringify list columns so duplicated() can hash them, then drop dupes
    samples_w_variant_df = samples_w_variant_df[~samples_w_variant_df.astype(
        str).duplicated()]
194+
195+
# work on samples that don't have the variant of interest
print('Calculating normalized scores for samples without variants of interest')
samples_wout_variant_df = master_df[~master_df['key'].isin(
    all_splicing_variants['key2'])]
del (master_df)

# mode = 'strict' #others include 'exclude' and 'group'
# if mode == 'strict':
# total score per (variant, sample) pair, used to normalize junction scores
samples_wout_variant_df = (samples_wout_variant_df >>
                           group_by(X.key) >>
                           summarize(score_tmp=X.score.sum()) >>
                           outer_join(samples_wout_variant_df, by='key')
                           )
samples_wout_variant_df['norm_score'] = samples_wout_variant_df['score'] / \
    samples_wout_variant_df['score_tmp']
# restrict to junctions whose variant is in the splicing-relevant set
samples_wout_variant_df = samples_wout_variant_df.loc[samples_wout_variant_df['variant_info'].isin(
    all_splicing_variants['key'])]
# collapse per-'info' rows into list-valued columns
tmp_df = samples_wout_variant_df.groupby(
    'info')[['norm_score', 'sample']].aggregate(lambda x: x.tolist()).reset_index()
samples_wout_variant_df = pd.merge(samples_wout_variant_df, tmp_df, on='info')
# count of non-variant samples = number of list separators + 1
samples_wout_variant_df['samples_wout_variant_count'] = samples_wout_variant_df['norm_score_y'].astype(
    str).str.count(',') + 1
if variant_grouping_mode == 'group' or variant_grouping_mode == 'exclude':
    # drop junctions already attributed to variant carriers, then rebuild the
    # list-valued norm scores from what remains
    samples_wout_variant_df = samples_wout_variant_df[~samples_wout_variant_df['junction'].isin(
        samples_w_variant_df['junction'])]
    tmp_df = samples_wout_variant_df.groupby(
        'info')[['norm_score_x']].aggregate(lambda x: x.tolist()).reset_index()
    samples_wout_variant_df = pd.merge(
        samples_wout_variant_df, tmp_df, on='info')
    samples_wout_variant_df = (samples_wout_variant_df >>
                               group_by('info') >>
                               summarize(total_score_non=X.score.sum()) >>
                               outer_join(samples_wout_variant_df, by='info')
                               )
    samples_wout_variant_df = samples_wout_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                                       'info', 'genes', 'norm_score_x_y', 'junction', 'total_score_non', 'samples_wout_variant_count']]
else:
    samples_wout_variant_df = (samples_wout_variant_df >>
                               group_by('info') >>
                               summarize(total_score_non=X.score.sum()) >>
                               outer_join(samples_wout_variant_df, by='info')
                               )
    samples_wout_variant_df = samples_wout_variant_df[['sample_y', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                                       'info', 'genes', 'norm_score_y', 'junction', 'total_score_non', 'samples_wout_variant_count']]
samples_wout_variant_df.columns = ['sample', 'variant_info', 'chrom', 'start', 'end', 'strand', 'anchor',
                                   'info', 'genes', 'norm_scores_non', 'junction', 'total_score_non', 'samples_wout_variant_count']
241+
242+
print('Merging dataframes')
master_df = pd.merge(samples_w_variant_df, samples_wout_variant_df, how='left', on='info')
# drop duplicate rows; use ~ (bitwise NOT) on the boolean mask — the original
# used unary minus, which pandas/numpy no longer support on booleans (and the
# equivalent filters above already use ~)
master_df = master_df[~master_df.astype(
    str).duplicated()]
del(samples_wout_variant_df)
del(samples_w_variant_df)

# count of variant-carrying samples = number of list separators + 1
master_df['samples_w_variant_count'] = master_df['variant_samples'].astype(
    str).str.count(',') + 1

tmp_df = master_df[['info', 'norm_scores_non', 'samples_wout_variant_count', 'samples_w_variant_count']]
tmp_df = tmp_df.fillna(0)


def add_zeros(row):
    """Pad the non-variant normalized scores with zeros so every junction is
    scored over all num_of_samples samples; return them sorted descending as
    a comma-joined string."""
    norm_scores = row[1]
    if norm_scores == 0:  # fillna(0) above marks junctions with no non-variant scores
        norm_scores = [0]
    samples_wout_variant = row[2]
    samples_w_variant = row[3]
    # one zero per sample that carries neither the junction nor the variant
    num_of_zeros_toadd = num_of_samples - samples_wout_variant - samples_w_variant
    zeros = np.repeat(0, num_of_zeros_toadd).tolist()
    norm_scores = norm_scores + zeros
    norm_scores.sort(reverse=True)
    return (',').join(map(str, norm_scores))


tmp_df['new_norm_scores'] = tmp_df.apply(lambda row: add_zeros(row), axis=1)
master_df = pd.merge(master_df, tmp_df, how='left', on='info')
del(tmp_df)
271+
272+
def get_mean(row):
    """Mean of the padded non-variant normalized scores.

    Expects row[33] to be a comma-joined string of numbers (presumably the
    'new_norm_scores' column after the merge above — TODO confirm position).
    """
    scores = [float(v) for v in row[33].split(',')]
    return np.mean(scores)
277+
278+
# per-row mean of the zero-padded non-variant score distribution
master_df['mean_norm_score_non'] = master_df.apply(lambda row: get_mean(row), axis=1)
279+
280+
def get_sd(row):
    """Population standard deviation (np.std, ddof=0) of the padded
    non-variant normalized scores in row[33], a comma-joined string."""
    scores = [float(v) for v in row[33].split(',')]
    return np.std(scores)
285+
286+
# per-row sd of the zero-padded non-variant score distribution
master_df['sd_norm_score_non'] = master_df.apply(lambda row: get_sd(row), axis=1)
287+
288+
def get_min(row):
    """Smallest of the variant-sample normalized scores.

    row[12] is already a list (presumably 'norm_scores_variant' — TODO
    confirm column position); values are coerced to float before comparing.
    """
    return min(float(v) for v in row[12])
293+
294+
# per-row minimum of the variant-sample normalized scores
master_df['min_norm_score_variant'] = master_df.apply(lambda row: get_min(row), axis=1)
295+
296+
def get_pvalue_mean(row):
    """Empirical p-value for the mean variant score against the non-variant
    background.

    row[33] is the padded non-variant score distribution as a comma-joined
    string; row[10] is the observed mean (presumably 'mean_norm_score_variant'
    — TODO confirm column positions). Returns the p-value as a string.

    The original stripped brackets with re.sub on np.array_str(...), but `re`
    was never imported (NameError at runtime); percentileofscore returns a
    scalar, so plain str formatting is equivalent and dependency-free.
    """
    values = [float(v) for v in row[33].split(',')]
    mean_value = row[10]
    # fraction of the background distribution above the observed mean
    pvalue = 1 - (stats.percentileofscore(values, mean_value) / 100.0)
    return str(float(pvalue))
304+
305+
# empirical p-value of the mean variant score vs. the non-variant background
master_df['p_value_mean'] = master_df.apply(lambda row: get_pvalue_mean(row), axis=1)
306+
307+
def get_pvalue_min(row):
    """Empirical p-value for the minimum variant score against the
    non-variant background.

    row[33] is the padded non-variant score distribution as a comma-joined
    string; row[36] is the observed minimum (presumably
    'min_norm_score_variant' — TODO confirm column positions). Returns the
    p-value as a string, mirroring get_pvalue_mean.

    As in get_pvalue_mean, the original's re.sub/np.array_str cleanup is
    replaced by str(float(...)): `re` was never imported, and
    percentileofscore returns a scalar.
    """
    values = [float(v) for v in row[33].split(',')]
    min_value = row[36]
    # fraction of the background distribution above the observed minimum
    pvalue = 1 - (stats.percentileofscore(values, min_value) / 100.0)
    return str(float(pvalue))
315+
316+
# BUG FIX: the original applied get_pvalue_mean here, so 'p_value_min' was a
# duplicate of 'p_value_mean'; use get_pvalue_min as the column name intends.
master_df['p_value_min'] = master_df.apply(lambda row: get_pvalue_min(row), axis=1)
317+
318+
# final column selection (the _x/_y suffixes come from the with-variant /
# without-variant merge above) and rename to the output schema
master_df = master_df[['variant_samples', 'variant_info_x', 'genes_x', 'junction_samples',
                       'chrom_x', 'start_x', 'end_x', 'strand_x', 'anchor_x', 'info',
                       'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
                       'norm_scores_variant', 'total_score_variant', 'mean_norm_score_non',
                       'sd_norm_score_non', 'new_norm_scores', 'total_score_non', 'p_value_mean','p_value_min']]
master_df.columns = ['variant_samples', 'variant_info', 'genes', 'junction_samples',
                     'chrom', 'start', 'end', 'strand', 'anchor', 'variant_junction_info',
                     'names', 'mean_norm_score_variant', 'sd_norm_score_variant',
                     'norm_scores_variant', 'total_score_variant', 'mean_norm_score_non',
                     'sd_norm_score_non', 'norm_scores_non', 'total_score_non', 'p_value_mean','p_value_min']

# collapse single-element lists to their first element for TSV serialization
master_df = master_df.applymap(lambda x: x[0] if isinstance(x, list) else x)
master_df = master_df.fillna(0)

master_df.to_csv(f'junction_pvalues_{tag}_out.tsv', sep='\t', index=False)
# DataFrame.info() prints to stdout and returns None, so this prints 'None'
print(master_df.info())
# master_df = master_df[['samples', 'variant_info_x', ']]
#why are variant_samples >1 missing?

scripts/stats_wrapper.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,22 +8,42 @@
88
description="Run RegTools stats script",
99
)
1010
input_parser.add_argument(
11-
'tag',
11+
'-t',
12+
'--tag',
1213
help="Variant tag parameter used to run RegTools.",
1314
)
15+
input_parser.add_argument(
16+
'-i',
17+
'--variants_file',
18+
help="File containing variants to be considered as splicing relevant."
19+
)
20+
input_parser.add_argument(
21+
'-d',
22+
'--dir_names',
23+
help="File containing directory names corresponding to each sample that is to be processed."
24+
)
25+
input_parser.add_argument(
26+
'-v',
27+
'--variant-grouping',
28+
help="",
29+
choices=['strict', 'exclude', 'include']
30+
)
1431

1532
args = input_parser.parse_args()
1633

1734
tag = args.tag
35+
input_variant_file = args.variants_file
36+
input_sample_names = args.dir_names
37+
variant_grouping_mode = args.variant_grouping
1838
cwd = os.getcwd()
1939

2040
target_lines_per_file = 25000
2141
lines_per_file = 0
22-
input_file = f'all_splicing_variants_{tag}.bed'
42+
input_file = input_variant_file
2343
lines = open(input_file).readlines()
2444
count = len(lines)
2545
if count <= lines_per_file:
26-
subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R {tag} {input_file}')
46+
subprocess.run(f'python3 /scripts/compare_junctions_hist.py -t {tag} -i {input_file} -d {input_sample_names} -v {variant_grouping_mode}', shell=True, check=True)
2747
else:
2848
header = lines[0]
2949
lines.pop(0)
@@ -52,7 +72,7 @@
5272
files.sort()
5373
number_of_in_files = len(files)
5474
for file in files:
55-
subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R {tag} {file}', shell=True, check=True)
75+
subprocess.run(f'python3 /scripts/compare_junctions_hist.py -t {tag} -i {input_file} -d {input_sample_names} -v {variant_grouping_mode}', shell=True, check=True)
5676
output_files = glob.glob("*_out.tsv")
5777
output_files.sort()# glob lacks reliable ordering, so impose your own if output order matters
5878
number_of_out_files = len(output_files)

0 commit comments

Comments
 (0)