Skip to content

Commit 50b3bfa

Browse files
committed
updated scripts
1 parent 9595e2b commit 50b3bfa

File tree

2 files changed

+68
-16
lines changed

2 files changed

+68
-16
lines changed

scripts/run_stats_modified.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import subprocess
2+
import os
3+
import glob
4+
import shutil
5+
import sys
6+
7+
def run(cmd: str) -> None:
8+
subprocess.run(cmd, shell=True, check=True, stdout=sys.stdout)
9+
10+
cohorts = ['SKCM', 'GBM', 'READ', 'ESCA', 'PAAD', 'SARC',
11+
'OV', 'KIRP', 'CESC', 'KIRC', 'LIHC', 'STAD', 'BLCA',
12+
'COAD', 'LUSC', 'HNSC', 'LGG', 'LUAD', 'UCEC', 'BRCA']]
13+
14+
for cohort in cohorts:
15+
os.makedirs(f'{cohort}/samples', exist_ok=True)
16+
os.makedirs(f'{cohort}/compare_junctions/hist', exist_ok=True)
17+
bed_files = glob.glob(f'{cohort}*modified.bed')
18+
for bed_file in bed_files:
19+
shutil.copy(bed_file, f'{cohort}/{bed_file}')
20+
os.chdir(f'{cohort}/samples')
21+
run(f'aws s3 cp s3://regtools-results-unstranded/{cohort}/ . --recursive')
22+
tar_files = glob.glob('*.tar.gz')
23+
for tar_file in tar_files:
24+
run(f'tar xzf {tar_file}')
25+
os.remove(tar_file)
26+
run('rm -rf all*; rm -rf compare_junctions*')
27+
os.chdir('..')
28+
run('ls samples/ > dir_names.tsv')
29+
bed_files = glob.glob(f'*modified.bed')
30+
for bed_file in bed_files:
31+
tag = bed_file.split('_')[1]
32+
os.rename(bed_file, f'all_splicing_variants_{tag}.bed')
33+
run(f'python3 /home/ec2-user/workspace/regtools/scripts/stats_wrapper.py {tag}')
34+
run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/filter_and_BH.R {tag}')
35+
run(f'aws s3 cp compare_junctions/ s3://regtools-results-unstranded/{cohort}/compare_junctions3/ --recursive)')
36+
os.chdir('..')
37+
shutil.rmtree(cohort)

scripts/stats_wrapper.py

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,21 +17,37 @@
1717
tag = args.tag
1818
cwd = os.getcwd()
1919

20-
lines_per_file = 25000
21-
smallfile = None
22-
with open(f'all_splicing_variants_{tag}.bed', 'r') as bigfile:
23-
header = bigfile.readline()
24-
for lineno, line in enumerate(bigfile):
25-
if lineno % lines_per_file == 0:
26-
if smallfile:
27-
smallfile.close()
28-
small_filename = 'small_file_{}.txt'.format(lineno + lines_per_file)
29-
smallfile = open(small_filename, "w")
30-
smallfile.write(header)
20+
target_lines_per_file = 25000
21+
lines_per_file = 0
22+
input_file = f'all_splicing_variants_{tag}.bed'
23+
lines = open(input_file).readlines()
24+
count = len(lines)
25+
if count <= lines_per_file:
26+
subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/data/compare_junctions_hist_v2.R {tag} {input_file}')
27+
else:
28+
header = lines[0]
29+
lines.pop(0)
30+
lines.sort()
31+
filenum = 1
32+
small_filename = f'small_file_{filenum}.txt'
33+
smallfile = open(small_filename, "w")
34+
smallfile.write(header)
35+
lines_per_file += target_lines_per_file
36+
for lineno, line in enumerate(lines):
3137
smallfile.write(line)
32-
if smallfile:
33-
smallfile.close()
34-
#get chunks
38+
if lineno >= lines_per_file:
39+
fields1 = line.split('\t')
40+
variant1 = f'{fields1[0]}_{fields1[1]}_{fields1[2]}'
41+
fields2 = lines[lineno+1].split('\t')
42+
variant2 = f'{fields2[0]}_{fields2[1]}_{fields2[2]}'
43+
if variant1 != variant2:
44+
smallfile.close()
45+
filenum += 1
46+
small_filename = f'small_file_{filenum}.txt'
47+
smallfile = open(small_filename, "w")
48+
smallfile.write(header)
49+
lines_per_file += target_lines_per_file
50+
# get chunks
3551
files = glob.glob('small_file_*')
3652
files.sort()
3753
number_of_in_files = len(files)
@@ -53,5 +69,4 @@
5369
print("Number of output files doesn't match the number of input files that should have been processed")
5470
files = glob.glob('small_file_*')
5571
for file in files:
56-
os.remove(file)
57-
72+
os.remove(file)

0 commit comments

Comments
 (0)