Merge pull request #138 from griffithlab/docs_update

kcotto · web-flow · commit 52262197e9c1 · 2022-06-29T11:49:07.000-05:00
Updated wrapper script to fix bug and added script to help run new script
diff --git a/scripts/filter_and_BH.R b/scripts/filter_and_BH.R
@@ -20,22 +20,20 @@ if (debug){
 read_file=paste("compare_junctions/hist/", "junction_pvalues", tag, ".tsv", sep="")
 regtools_data = unique(data.table::fread(file=read_file, sep = '\t', header = TRUE, stringsAsFactors = FALSE))
 regtools_data_filtered = regtools_data[(regtools_data$total_score_variant > 5 & 
-											regtools_data$pvalue >= 0 & 
-											(regtools_data$anchor == "D" | 
-												regtools_data$anchor == "A" | 
-												regtools_data$anchor == "NDA"))]
+											regtools_data$p_value_mean >= 0 & 
+											(regtools_data$anchor == "DA"))]
 
-p = regtools_data_filtered$pvalue
+p = regtools_data_filtered$p_value_mean
 adjusted_p = p.adjust(p, method = "BH")
 regtools_data_filtered$adjusted_p = adjusted_p
 regtools_data_filtered_sorted = regtools_data_filtered[order(adjusted_p)]
 
-write_file = paste("compare_junctions/hist/", "junction_pvalues_filtered_BH", tag, ".tsv", sep="")
+write_file = paste("compare_junctions/hist/", "junction_pvalues_filtered_BH_DA_junctions", tag, ".tsv", sep="")
 write.table(regtools_data_filtered_sorted, file=write_file, quote=FALSE, sep='\t', row.names = FALSE)
 
 threshold = 0.05
 is_significant = regtools_data_filtered_sorted$adjusted_p < threshold
 regtools_data_significant_filtered_sorted = regtools_data_filtered_sorted[is_significant] 
 
-write_file = paste("compare_junctions/hist/", "junction_pvalues_significant_",threshold,"_filtered_BH", tag, ".tsv", sep="")
+write_file = paste("compare_junctions/hist/", "junction_pvalues_significant_",threshold,"_filtered_BH_DA_junctions", tag, ".tsv", sep="")
 write.table(regtools_data_significant_filtered_sorted, file=write_file, quote=FALSE, sep='\t', row.names = FALSE)
diff --git a/scripts/run_stats_modified.py b/scripts/run_stats_modified.py
@@ -0,0 +1,37 @@
+import subprocess
+import os
+import glob
+import shutil
+import sys
+
+def run(cmd: str) -> None:  # Execute cmd through the shell, sending its output to this process's stdout.
+    subprocess.run(cmd, shell=True, check=True, stdout=sys.stdout)  # check=True raises CalledProcessError on a non-zero exit status
+
+cohorts = ['SKCM', 'GBM', 'READ', 'ESCA', 'PAAD', 'SARC',  # each name is used below as a local directory name and as an S3 key prefix
+          'OV', 'KIRP', 'CESC', 'KIRC', 'LIHC', 'STAD', 'BLCA',
+          'COAD', 'LUSC', 'HNSC', 'LGG', 'LUAD', 'UCEC', 'BRCA']
+
+for cohort in cohorts:  # one cohort per iteration; its local working tree is deleted at the end of the iteration
+    os.makedirs(f'{cohort}/samples', exist_ok=True)
+    os.makedirs(f'{cohort}/compare_junctions/hist', exist_ok=True)  # directory that filter_and_BH.R reads from and writes to (relative to the cohort directory)
+    bed_files = glob.glob(f'{cohort}*modified.bed')  # this cohort's BED files in the starting directory
+    for bed_file in bed_files:
+        shutil.copy(bed_file, f'{cohort}/{bed_file}')
+    os.chdir(f'{cohort}/samples')  # the commands below use paths relative to the current directory
+    run(f'aws s3 cp s3://regtools-results-unstranded/{cohort}/ . --recursive')  # download everything under the cohort prefix into samples/
+    tar_files = glob.glob('*.tar.gz')
+    for tar_file in tar_files:
+        run(f'tar xzf {tar_file}')
+        os.remove(tar_file)  # the archive is deleted once it has been extracted
+        run('rm -rf all*; rm -rf compare_junctions*')  # remove anything in samples/ matching all* or compare_junctions*
+    os.chdir('..')  # back to the cohort directory
+    run('ls samples/ > dir_names.tsv')  # sample directory listing - presumably consumed by the stats scripts; confirm
+    bed_files = glob.glob(f'*modified.bed')
+    for bed_file in bed_files:
+        tag = bed_file.split('_')[1]  # assumes names of the form <cohort>_<tag>_...modified.bed - TODO confirm
+        os.rename(bed_file, f'all_splicing_variants_{tag}.bed')  # the file name stats_wrapper.py opens for this tag
+        run(f'python3 /home/ec2-user/workspace/regtools/scripts/stats_wrapper.py {tag}')
+        run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/filter_and_BH.R {tag}')  # filters the p-values and applies the BH adjustment
+    run(f'aws s3 cp compare_junctions/ s3://regtools-results-unstranded/{cohort}/compare_junctions3/ --recursive')  # upload this cohort's results
+    os.chdir('..')  # back to the starting directory
+    shutil.rmtree(cohort)  # remove the cohort's local working tree
diff --git a/scripts/stats_wrapper.py b/scripts/stats_wrapper.py
@@ -17,26 +17,57 @@
 tag = args.tag
 cwd = os.getcwd()
 
-lines_per_file = 25000
-smallfile = None
-with open(f'all_splicing_variants_{tag}.bed', 'r') as bigfile:
-    header = bigfile.readline()
-    for lineno, line in enumerate(bigfile):
-        if lineno % lines_per_file == 0:
-            if smallfile:
-                smallfile.close()
-            small_filename = 'small_file_{}.txt'.format(lineno + lines_per_file)
-            smallfile = open(small_filename, "w")
-            smallfile.write(header)
+target_lines_per_file = 25000
+lines_per_file = 0
+input_file = f'all_splicing_variants_{tag}.bed'
+# Read the whole input up front; the context manager closes the handle promptly.
+with open(input_file) as bigfile:
+    lines = bigfile.readlines()
+count = len(lines)
+# NOTE(review): lines_per_file is still 0 here, so this branch only runs for an
+# empty input file. If the intent was to skip splitting for inputs of at most
+# target_lines_per_file lines, compare against that instead - confirm.
+if count <= lines_per_file:
+    # shell=True is required because the command is a single string; check=True
+    # matches the per-chunk invocation below so an R failure aborts the run.
+    subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R {tag} {input_file}', shell=True, check=True)
+else:
+    # Split the sorted records into small_file_<n>.txt chunks, each starting
+    # with the header line. A chunk is only ended where the first three
+    # tab-separated fields change, so rows sharing them stay in one chunk.
+    header = lines.pop(0)
+    lines.sort()
+    filenum = 1
+    small_filename = f'small_file_{filenum}.txt'
+    smallfile = open(small_filename, "w")
+    smallfile.write(header)
+    lines_per_file += target_lines_per_file
+    for lineno, line in enumerate(lines):
         smallfile.write(line)
-    if smallfile:
-        smallfile.close()
-#get chunks
+        # Only consider ending the chunk once it is past the target size and
+        # there is a following record to compare with (avoids an IndexError on
+        # the last record).
+        if lineno >= lines_per_file and lineno + 1 < len(lines):
+            fields1 = line.split('\t')
+            variant1 = f'{fields1[0]}_{fields1[1]}_{fields1[2]}'
+            fields2 = lines[lineno+1].split('\t')
+            variant2 = f'{fields2[0]}_{fields2[1]}_{fields2[2]}'
+            if variant1 != variant2:
+                smallfile.close()
+                filenum += 1
+                small_filename = f'small_file_{filenum}.txt'
+                smallfile = open(small_filename, "w")
+                smallfile.write(header)
+                lines_per_file += target_lines_per_file
+    # Close the last chunk so its buffered rows are flushed to disk before
+    # Rscript reads the file.
+    smallfile.close()
+# get chunks
 files = glob.glob('small_file_*')
 files.sort()
 number_of_in_files = len(files)
 for file in files:
-    subprocess.run(f'Rscript --vanilla compare_junctions_hist_v2.R {tag} {file}', shell=True, check=True)
+    subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R {tag} {file}', shell=True, check=True)
 output_files = glob.glob("*_out.tsv")
 output_files.sort()# glob lacks reliable ordering, so impose your own if output order matters
 number_of_out_files = len(output_files)
@@ -53,5 +84,4 @@
     print("Number of output files doesn't match the number of input files that should have been processed")
 files = glob.glob('small_file_*')
 for file in files:
-     os.remove(file)
-
+    os.remove(file)