updated scripts

kcotto · kcotto · commit 50b3bfab91a9 · 2020-05-06T12:58:26.000-05:00
diff --git a/scripts/run_stats_modified.py b/scripts/run_stats_modified.py
@@ -0,0 +1,37 @@
+import subprocess
+import os
+import glob
+import shutil
+import sys
+
+def run(cmd: str) -> None:
+    subprocess.run(cmd, shell=True, check=True, stdout=sys.stdout)
+
+cohorts = ['SKCM', 'GBM', 'READ', 'ESCA', 'PAAD', 'SARC',
+          'OV', 'KIRP', 'CESC', 'KIRC', 'LIHC', 'STAD', 'BLCA',
+          'COAD', 'LUSC', 'HNSC', 'LGG', 'LUAD', 'UCEC', 'BRCA']]
+
+for cohort in cohorts:
+    os.makedirs(f'{cohort}/samples', exist_ok=True)
+    os.makedirs(f'{cohort}/compare_junctions/hist', exist_ok=True)
+    bed_files = glob.glob(f'{cohort}*modified.bed')
+    for bed_file in bed_files:
+        shutil.copy(bed_file, f'{cohort}/{bed_file}')
+    os.chdir(f'{cohort}/samples')
+    run(f'aws s3 cp s3://regtools-results-unstranded/{cohort}/ . --recursive')
+    tar_files = glob.glob('*.tar.gz')
+    for tar_file in tar_files:
+        run(f'tar xzf {tar_file}')
+        os.remove(tar_file)
+        run('rm -rf all*; rm -rf compare_junctions*')
+    os.chdir('..')
+    run('ls samples/ > dir_names.tsv')
+    bed_files = glob.glob(f'*modified.bed')
+    for bed_file in bed_files:
+        tag = bed_file.split('_')[1]
+        os.rename(bed_file, f'all_splicing_variants_{tag}.bed')
+        run(f'python3 /home/ec2-user/workspace/regtools/scripts/stats_wrapper.py {tag}')
+        run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/filter_and_BH.R {tag}')
+	run(f'aws s3 cp compare_junctions/ s3://regtools-results-unstranded/{cohort}/compare_junctions3/ --recursive)')
+	os.chdir('..')
+    shutil.rmtree(cohort)
diff --git a/scripts/stats_wrapper.py b/scripts/stats_wrapper.py
@@ -17,21 +17,37 @@
 tag = args.tag
 cwd = os.getcwd()
 
-lines_per_file = 25000
-smallfile = None
-with open(f'all_splicing_variants_{tag}.bed', 'r') as bigfile:
-    header = bigfile.readline()
-    for lineno, line in enumerate(bigfile):
-        if lineno % lines_per_file == 0:
-            if smallfile:
-                smallfile.close()
-            small_filename = 'small_file_{}.txt'.format(lineno + lines_per_file)
-            smallfile = open(small_filename, "w")
-            smallfile.write(header)
+target_lines_per_file = 25000
+lines_per_file = 0
+input_file = f'all_splicing_variants_{tag}.bed'
+lines = open(input_file).readlines()
+count = len(lines)
+if count <= lines_per_file:
+    subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/data/compare_junctions_hist_v2.R {tag} {input_file}')
+else:
+    header = lines[0]
+    lines.pop(0)
+    lines.sort()
+    filenum = 1
+    small_filename = f'small_file_{filenum}.txt'
+    smallfile = open(small_filename, "w")
+    smallfile.write(header)
+    lines_per_file += target_lines_per_file
+    for lineno, line in enumerate(lines):
         smallfile.write(line)
-    if smallfile:
-        smallfile.close()
-#get chunks
+        if lineno >= lines_per_file:
+            fields1 = line.split('\t')
+            variant1 = f'{fields1[0]}_{fields1[1]}_{fields1[2]}'
+            fields2 = lines[lineno+1].split('\t')
+            variant2 = f'{fields2[0]}_{fields2[1]}_{fields2[2]}'
+            if variant1 != variant2:
+                smallfile.close()
+                filenum += 1
+                small_filename = f'small_file_{filenum}.txt'
+                smallfile = open(small_filename, "w")
+                smallfile.write(header)
+                lines_per_file += target_lines_per_file
+# get chunks
 files = glob.glob('small_file_*')
 files.sort()
 number_of_in_files = len(files)
@@ -53,5 +69,4 @@
     print("Number of output files doesn't match the number of input files that should have been processed")
 files = glob.glob('small_file_*')
 for file in files:
-     os.remove(file)
-
+    os.remove(file)