Skip to content

Commit 5226219

Browse files
authored
Merge pull request #138 from griffithlab/docs_update
Updated wrapper script to fix bug and added script to help run new script
2 parents 3d007f5 + 8360f6b commit 5226219

File tree

3 files changed

+74
-24
lines changed

3 files changed

+74
-24
lines changed

scripts/filter_and_BH.R

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,22 +20,20 @@ if (debug){
2020
read_file=paste("compare_junctions/hist/", "junction_pvalues", tag, ".tsv", sep="")
2121
regtools_data = unique(data.table::fread(file=read_file, sep = '\t', header = TRUE, stringsAsFactors = FALSE))
2222
regtools_data_filtered = regtools_data[(regtools_data$total_score_variant > 5 &
23-
regtools_data$pvalue >= 0 &
24-
(regtools_data$anchor == "D" |
25-
regtools_data$anchor == "A" |
26-
regtools_data$anchor == "NDA"))]
23+
regtools_data$p_value_mean >= 0 &
24+
(regtools_data$anchor == "DA"))]
2725

28-
p = regtools_data_filtered$pvalue
26+
p = regtools_data_filtered$p_value_mean
2927
adjusted_p = p.adjust(p, method = "BH")
3028
regtools_data_filtered$adjusted_p = adjusted_p
3129
regtools_data_filtered_sorted = regtools_data_filtered[order(adjusted_p)]
3230

33-
write_file = paste("compare_junctions/hist/", "junction_pvalues_filtered_BH", tag, ".tsv", sep="")
31+
write_file = paste("compare_junctions/hist/", "junction_pvalues_filtered_BH_DA_junctions", tag, ".tsv", sep="")
3432
write.table(regtools_data_filtered_sorted, file=write_file, quote=FALSE, sep='\t', row.names = FALSE)
3533

3634
threshold = 0.05
3735
is_significant = regtools_data_filtered_sorted$adjusted_p < threshold
3836
regtools_data_significant_filtered_sorted = regtools_data_filtered_sorted[is_significant]
3937

40-
write_file = paste("compare_junctions/hist/", "junction_pvalues_significant_",threshold,"_filtered_BH", tag, ".tsv", sep="")
38+
write_file = paste("compare_junctions/hist/", "junction_pvalues_significant_",threshold,"_filtered_BH_DA_junctions", tag, ".tsv", sep="")
4139
write.table(regtools_data_significant_filtered_sorted, file=write_file, quote=FALSE, sep='\t', row.names = FALSE)

scripts/run_stats_modified.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import subprocess
2+
import os
3+
import glob
4+
import shutil
5+
import sys
6+
7+
def run(cmd: str) -> None:
8+
subprocess.run(cmd, shell=True, check=True, stdout=sys.stdout)
9+
10+
cohorts = ['SKCM', 'GBM', 'READ', 'ESCA', 'PAAD', 'SARC',
11+
'OV', 'KIRP', 'CESC', 'KIRC', 'LIHC', 'STAD', 'BLCA',
12+
'COAD', 'LUSC', 'HNSC', 'LGG', 'LUAD', 'UCEC', 'BRCA']
13+
14+
for cohort in cohorts:
15+
os.makedirs(f'{cohort}/samples', exist_ok=True)
16+
os.makedirs(f'{cohort}/compare_junctions/hist', exist_ok=True)
17+
bed_files = glob.glob(f'{cohort}*modified.bed')
18+
for bed_file in bed_files:
19+
shutil.copy(bed_file, f'{cohort}/{bed_file}')
20+
os.chdir(f'{cohort}/samples')
21+
run(f'aws s3 cp s3://regtools-results-unstranded/{cohort}/ . --recursive')
22+
tar_files = glob.glob('*.tar.gz')
23+
for tar_file in tar_files:
24+
run(f'tar xzf {tar_file}')
25+
os.remove(tar_file)
26+
run('rm -rf all*; rm -rf compare_junctions*')
27+
os.chdir('..')
28+
run('ls samples/ > dir_names.tsv')
29+
bed_files = glob.glob(f'*modified.bed')
30+
for bed_file in bed_files:
31+
tag = bed_file.split('_')[1]
32+
os.rename(bed_file, f'all_splicing_variants_{tag}.bed')
33+
run(f'python3 /home/ec2-user/workspace/regtools/scripts/stats_wrapper.py {tag}')
34+
run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/filter_and_BH.R {tag}')
35+
run(f'aws s3 cp compare_junctions/ s3://regtools-results-unstranded/{cohort}/compare_junctions3/ --recursive')
36+
os.chdir('..')
37+
shutil.rmtree(cohort)

scripts/stats_wrapper.py

Lines changed: 32 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -17,26 +17,42 @@
1717
tag = args.tag
1818
cwd = os.getcwd()
1919

20-
lines_per_file = 25000
21-
smallfile = None
22-
with open(f'all_splicing_variants_{tag}.bed', 'r') as bigfile:
23-
header = bigfile.readline()
24-
for lineno, line in enumerate(bigfile):
25-
if lineno % lines_per_file == 0:
26-
if smallfile:
27-
smallfile.close()
28-
small_filename = 'small_file_{}.txt'.format(lineno + lines_per_file)
29-
smallfile = open(small_filename, "w")
30-
smallfile.write(header)
20+
target_lines_per_file = 25000
21+
lines_per_file = 0
22+
input_file = f'all_splicing_variants_{tag}.bed'
23+
lines = open(input_file).readlines()
24+
count = len(lines)
25+
if count <= lines_per_file:
26+
subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R {tag} {input_file}')
27+
else:
28+
header = lines[0]
29+
lines.pop(0)
30+
lines.sort()
31+
filenum = 1
32+
small_filename = f'small_file_{filenum}.txt'
33+
smallfile = open(small_filename, "w")
34+
smallfile.write(header)
35+
lines_per_file += target_lines_per_file
36+
for lineno, line in enumerate(lines):
3137
smallfile.write(line)
32-
if smallfile:
33-
smallfile.close()
34-
#get chunks
38+
if lineno >= lines_per_file:
39+
fields1 = line.split('\t')
40+
variant1 = f'{fields1[0]}_{fields1[1]}_{fields1[2]}'
41+
fields2 = lines[lineno+1].split('\t')
42+
variant2 = f'{fields2[0]}_{fields2[1]}_{fields2[2]}'
43+
if variant1 != variant2:
44+
smallfile.close()
45+
filenum += 1
46+
small_filename = f'small_file_{filenum}.txt'
47+
smallfile = open(small_filename, "w")
48+
smallfile.write(header)
49+
lines_per_file += target_lines_per_file
50+
# get chunks
3551
files = glob.glob('small_file_*')
3652
files.sort()
3753
number_of_in_files = len(files)
3854
for file in files:
39-
subprocess.run(f'Rscript --vanilla compare_junctions_hist_v2.R {tag} {file}', shell=True, check=True)
55+
subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R {tag} {file}', shell=True, check=True)
4056
output_files = glob.glob("*_out.tsv")
4157
output_files.sort()# glob lacks reliable ordering, so impose your own if output order matters
4258
number_of_out_files = len(output_files)
@@ -53,5 +69,4 @@
5369
print("Number of output files doesn't match the number of input files that should have been processed")
5470
files = glob.glob('small_file_*')
5571
for file in files:
56-
os.remove(file)
57-
72+
os.remove(file)

0 commit comments

Comments
 (0)