Skip to content

Commit 7e2bc1e

Browse files
committed
add new stats script and wrapper script
1 parent 519ff7d commit 7e2bc1e

File tree

2 files changed

+61
-13
lines changed

2 files changed

+61
-13
lines changed

scripts/compare_junctions_hist.py

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,40 @@
66
import numpy as np
77
from scipy import stats
88
import os
9-
10-
tag = 'E'
9+
import argparse
10+
11+
input_parser = argparse.ArgumentParser(
12+
description="Run RegTools stats script",
13+
)
14+
input_parser.add_argument(
15+
'-t',
16+
'--tag',
17+
help="Variant tag parameter used to run RegTools.",
18+
)
19+
input_parser.add_argument(
20+
'-i',
21+
'--variants_file',
22+
help="File containing variants to be considered as splicing relevant."
23+
)
24+
input_parser.add_argument(
25+
'-d',
26+
'--dir_names',
27+
help="File containing directory names corresponding to each sample that is to be processed."
28+
)
29+
input_parser.add_argument(
30+
'-v',
31+
'--variant-grouping',
32+
help="",
33+
choices=['strict', 'exclude', 'include']
34+
)
35+
36+
args = input_parser.parse_args()
37+
38+
tag = args.tag
39+
splicing_variants_inputfile = args.variants_file
40+
samples_inputfile = args.dir_names
41+
variant_grouping_mode = args.variant_grouping
1142
os.chdir('/Users/kcotto/Desktop/CHOL/')
12-
splicing_variants_inputfile = '/Users/kcotto/Desktop/CHOL/all_splicing_variants_E.bed'
13-
samples_inputfile = '/Users/kcotto/Desktop/CHOL/dir_names.tsv'
1443

1544
# read in all splicing variants
1645
all_splicing_variants = pd.read_csv(
@@ -23,7 +52,6 @@ def createkey(row):
2352
key = row[0] + ':' + str(row[1]) + '-' + str(row[2])
2453
return key
2554

26-
2755
all_splicing_variants['key'] = all_splicing_variants.apply(
2856
lambda row: createkey(row), axis=1)
2957

@@ -104,8 +132,8 @@ def createkey(row):
104132

105133
# start performing the calculations for this subset of data
106134
print('Calculating normalized scores for samples with variants of interest')
107-
mode = 'strict'
108-
if mode == 'group':
135+
# variant_grouping_mode = 'strict'
136+
if variant_grouping_mode == 'group':
109137
samples_w_variant_df = (samples_w_variant_df >>
110138
group_by(X.key) >>
111139
summarize(score_tmp=X.score.sum()) >>
@@ -186,7 +214,7 @@ def createkey(row):
186214
samples_wout_variant_df = pd.merge(samples_wout_variant_df, tmp_df, on='info')
187215
samples_wout_variant_df['samples_wout_variant_count'] = samples_wout_variant_df['norm_score_y'].astype(
188216
str).str.count(',') + 1
189-
if mode == 'group' or mode == 'exclude':
217+
if variant_grouping_mode == 'group' or variant_grouping_mode == 'exclude':
190218
samples_wout_variant_df = samples_wout_variant_df[~samples_wout_variant_df['junction'].isin(
191219
samples_w_variant_df['junction'])]
192220
tmp_df = samples_wout_variant_df.groupby(
@@ -301,7 +329,7 @@ def get_pvalue_min(row):
301329
master_df = master_df.applymap(lambda x: x[0] if isinstance(x, list) else x)
302330
master_df = master_df.fillna(0)
303331

304-
master_df.to_csv('test_results.tsv', sep='\t', index=False)
332+
master_df.to_csv(f'junction_pvalues_{tag}_out.tsv', sep='\t', index=False)
305333
print(master_df.info())
306334
# master_df = master_df[['samples', 'variant_info_x', ']]
307335
#why are variant_samples >1 missing?

scripts/stats_wrapper.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,22 +8,42 @@
88
description="Run RegTools stats script",
99
)
1010
input_parser.add_argument(
11-
'tag',
11+
'-t',
12+
'--tag',
1213
help="Variant tag parameter used to run RegTools.",
1314
)
15+
input_parser.add_argument(
16+
'-i',
17+
'--variants_file',
18+
help="File containing variants to be considered as splicing relevant."
19+
)
20+
input_parser.add_argument(
21+
'-d',
22+
'--dir_names',
23+
help="File containing directory names corresponding to each sample that is to be processed."
24+
)
25+
input_parser.add_argument(
26+
'-v',
27+
'--variant-grouping',
28+
help="",
29+
choices=['strict', 'exclude', 'include']
30+
)
1431

1532
args = input_parser.parse_args()
1633

1734
tag = args.tag
35+
input_variant_file = args.variants_file
36+
input_sample_names = args.dir_names
37+
variant_grouping_mode = args.variant_grouping
1838
cwd = os.getcwd()
1939

2040
target_lines_per_file = 25000
2141
lines_per_file = 0
22-
input_file = f'all_splicing_variants_{tag}.bed'
42+
input_file = input_variant_file
2343
lines = open(input_file).readlines()
2444
count = len(lines)
2545
if count <= lines_per_file:
26-
subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R {tag} {input_file}')
46+
subprocess.run(f'python3 /scripts/compare_junctions_hist.py -t {tag} -i {input_file} -d {input_sample_names} -v {variant_grouping_mode}', shell=True, check=True)
2747
else:
2848
header = lines[0]
2949
lines.pop(0)
@@ -52,7 +72,7 @@
5272
files.sort()
5373
number_of_in_files = len(files)
5474
for file in files:
55-
subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R {tag} {file}', shell=True, check=True)
75+
subprocess.run(f'python3 /scripts/compare_junctions_hist.py -t {tag} -i {input_file} -d {input_sample_names} -v {variant_grouping_mode}', shell=True, check=True)
5676
output_files = glob.glob("*_out.tsv")
5777
output_files.sort()# glob lacks reliable ordering, so impose your own if output order matters
5878
number_of_out_files = len(output_files)

0 commit comments

Comments
 (0)