Merge branch 'master' into edit_docs

kcotto · web-flow · commit af0b36cee073 · 2022-06-29T11:50:39.000-05:00
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@ site/
 *.pdf
 *Rdata
 *Rhistory
+.DS_Store
diff --git a/docs/commands/cis-splice-effects-identify.md b/docs/commands/cis-splice-effects-identify.md
@@ -1,9 +1,9 @@
 [csei]: ../images/csei_examples.png
 
-# Overview of `cis-splice-effects identify` command
 
 The `cis-splice-effects identify` command is used to identify splicing misregulation events. This command takes in a list of variants in the VCF format and RNAseq alignments produced with a splice-aware aligner in the BAM format. The tool then proceeds to identify non-canonical splicing junctions near the variant sites.
 
+
 ## Usage
 
 `regtools cis-splice-effects identify [options] variants.vcf alignments.bam ref.fa annotations.gtf`
@@ -13,7 +13,7 @@ The `cis-splice-effects identify` command is used to identify splicing misregula
 | Input                  | Description |
 | ------                 | ----------- |
 | variants.vcf | Variant call in VCF format from which to look for cis-splice-effects.|
-| alignments.bam | Aligned RNAseq BAM produced with a splice aware aligner, that has been indexed for example with `samtools index`. We have tested this command with alignments from TopHat.|
+| alignments.bam | Aligned RNAseq BAM/CRAM file produced with a splice-aware aligner and indexed, for example with `samtools index`. We have tested this command with alignments from TopHat.|
 | ref.fa          | The reference FASTA file. The donor and acceptor sequences used in the "splice-site" column of the annotated junctions are extracted from the FASTA file. |
 | annotations.gtf | The GTF file specifies the transcriptome that is used to annotate the junctions and variants. For examples, the Ensembl GTFs for release78 are [here](ftp://ftp.ensembl.org/pub/release-78/gtf/).|
 
diff --git a/docs/commands/junctions-extract.md b/docs/commands/junctions-extract.md
@@ -2,6 +2,7 @@
 
 The `junctions extract` command can be used to extract exon-exon junctions from an RNAseq BAM file. The output is a BED file in the BED12 format. We have tested this command with alignments from TopHat and by comparing the exon-exon junctions with the `junctions.bed` file produced from TopHat.
 
+
 ## Usage
 
 `regtools junctions extract [options] indexed_alignments.bam`
@@ -10,7 +11,7 @@ The `junctions extract` command can be used to extract exon-exon junctions from
 
 | Input                  | Description |
 | ------                 | ----------- |
-| indexed_alignments.bam | Aligned RNAseq BAM which has been indexed for example with `samtools index`. We have tested this command with alignments from TopHat.|
+| indexed_alignments.bam | Aligned RNAseq BAM/CRAM file that has been indexed, for example with `samtools index`. We have tested this command with alignments from TopHat.|
 
 ## Options
 
diff --git a/docs/index.md b/docs/index.md
@@ -4,7 +4,7 @@ RegTools is a set of tools that integrate DNA-seq and RNA-seq data to help inter
 
 ## Features
 
-- Extract exon-exon junctions from a RNAseq BAM file.
+- Extract exon-exon junctions from an RNAseq BAM/CRAM file.
 - Annotate exon-exon junctions with information from a known transcriptome.
 - Annotate variants with splice-region(the definition of this region is configurable) annotations.
 
diff --git a/scripts/filter_and_BH.R b/scripts/filter_and_BH.R
@@ -20,22 +20,20 @@ if (debug){
 read_file=paste("compare_junctions/hist/", "junction_pvalues", tag, ".tsv", sep="")
 regtools_data = unique(data.table::fread(file=read_file, sep = '\t', header = TRUE, stringsAsFactors = FALSE))
 regtools_data_filtered = regtools_data[(regtools_data$total_score_variant > 5 & 
-											regtools_data$pvalue >= 0 & 
-											(regtools_data$anchor == "D" | 
-												regtools_data$anchor == "A" | 
-												regtools_data$anchor == "NDA"))]
+											regtools_data$p_value_mean >= 0 & 
+											(regtools_data$anchor == "DA"))]
 
-p = regtools_data_filtered$pvalue
+p = regtools_data_filtered$p_value_mean
 adjusted_p = p.adjust(p, method = "BH")
 regtools_data_filtered$adjusted_p = adjusted_p
 regtools_data_filtered_sorted = regtools_data_filtered[order(adjusted_p)]
 
-write_file = paste("compare_junctions/hist/", "junction_pvalues_filtered_BH", tag, ".tsv", sep="")
+write_file = paste("compare_junctions/hist/", "junction_pvalues_filtered_BH_DA_junctions", tag, ".tsv", sep="")
 write.table(regtools_data_filtered_sorted, file=write_file, quote=FALSE, sep='\t', row.names = FALSE)
 
 threshold = 0.05
 is_significant = regtools_data_filtered_sorted$adjusted_p < threshold
 regtools_data_significant_filtered_sorted = regtools_data_filtered_sorted[is_significant] 
 
-write_file = paste("compare_junctions/hist/", "junction_pvalues_significant_",threshold,"_filtered_BH", tag, ".tsv", sep="")
+write_file = paste("compare_junctions/hist/", "junction_pvalues_significant_",threshold,"_filtered_BH_DA_junctions", tag, ".tsv", sep="")
 write.table(regtools_data_significant_filtered_sorted, file=write_file, quote=FALSE, sep='\t', row.names = FALSE)
diff --git a/scripts/run_stats_modified.py b/scripts/run_stats_modified.py
@@ -0,0 +1,37 @@
+import subprocess
+import os
+import glob
+import shutil
+import sys
+
+def run(cmd: str) -> None:  # run `cmd` through the shell; check=True raises CalledProcessError on a non-zero exit
+    subprocess.run(cmd, shell=True, check=True, stdout=sys.stdout)  # the child's stdout is sent to this process's stdout
+
+cohorts = ['SKCM', 'GBM', 'READ', 'ESCA', 'PAAD', 'SARC',
+          'OV', 'KIRP', 'CESC', 'KIRC', 'LIHC', 'STAD', 'BLCA',
+          'COAD', 'LUSC', 'HNSC', 'LGG', 'LUAD', 'UCEC', 'BRCA']  # each name is used below as a local directory and as an S3 key prefix
+
+for cohort in cohorts:
+    os.makedirs(f'{cohort}/samples', exist_ok=True)
+    os.makedirs(f'{cohort}/compare_junctions/hist', exist_ok=True)
+    bed_files = glob.glob(f'{cohort}*modified.bed')  # bed files in the starting directory whose names begin with the cohort name
+    for bed_file in bed_files:
+        shutil.copy(bed_file, f'{cohort}/{bed_file}')  # copy into the cohort directory under the same file name
+    os.chdir(f'{cohort}/samples')  # the download and extraction below happen inside <cohort>/samples
+    run(f'aws s3 cp s3://regtools-results-unstranded/{cohort}/ . --recursive')  # fetch everything under the cohort prefix into the current directory
+    tar_files = glob.glob('*.tar.gz')
+    for tar_file in tar_files:
+        run(f'tar xzf {tar_file}')  # unpack the archive, then delete it
+        os.remove(tar_file)
+        run('rm -rf all*; rm -rf compare_junctions*')  # after every archive, drop paths matching all* and compare_junctions*
+    os.chdir('..')  # back up to <cohort>/
+    run('ls samples/ > dir_names.tsv')  # write the listing of samples/ to dir_names.tsv
+    bed_files = glob.glob(f'*modified.bed')  # bed files now present in <cohort>/ (copied in above)
+    for bed_file in bed_files:
+        tag = bed_file.split('_')[1]  # second '_'-separated piece of the file name
+        os.rename(bed_file, f'all_splicing_variants_{tag}.bed')
+        run(f'python3 /home/ec2-user/workspace/regtools/scripts/stats_wrapper.py {tag}')  # both scripts receive the tag as their only argument
+        run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/filter_and_BH.R {tag}')
+    run(f'aws s3 cp compare_junctions/ s3://regtools-results-unstranded/{cohort}/compare_junctions3/ --recursive')  # upload the local compare_junctions/ tree to the cohort's compare_junctions3/ prefix
+    os.chdir('..')  # back to the starting directory
+    shutil.rmtree(cohort)  # remove the local cohort directory before the next cohort
diff --git a/scripts/stats_wrapper.py b/scripts/stats_wrapper.py
@@ -17,26 +17,42 @@
 tag = args.tag
 cwd = os.getcwd()
 
-lines_per_file = 25000
-smallfile = None
-with open(f'all_splicing_variants_{tag}.bed', 'r') as bigfile:
-    header = bigfile.readline()
-    for lineno, line in enumerate(bigfile):
-        if lineno % lines_per_file == 0:
-            if smallfile:
-                smallfile.close()
-            small_filename = 'small_file_{}.txt'.format(lineno + lines_per_file)
-            smallfile = open(small_filename, "w")
-            smallfile.write(header)
+target_lines_per_file = 25000  # approximate number of data lines per chunk file
+lines_per_file = 0  # running line-index threshold after which the current chunk may end
+input_file = f'all_splicing_variants_{tag}.bed'
+with open(input_file) as bigfile:  # close the input handle as soon as it has been read
+    lines = bigfile.readlines()
+count = len(lines)
+if count <= lines_per_file:  # NOTE(review): lines_per_file is 0 here, so only an empty input takes this branch; confirm whether target_lines_per_file was intended
+    subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R {tag} {input_file}', shell=True, check=True)
+else:
+    header = lines.pop(0)
+    lines.sort()  # sorting puts lines that share their leading fields next to each other
+    filenum = 1
+    smallfile = open(f'small_file_{filenum}.txt', "w")
+    smallfile.write(header)
+    lines_per_file += target_lines_per_file
+    last_lineno = len(lines) - 1
+    for lineno, line in enumerate(lines):
         smallfile.write(line)
-    if smallfile:
-        smallfile.close()
-#get chunks
+        if lines_per_file <= lineno < last_lineno:  # the final line has no successor to compare with
+            fields1 = line.split('\t')
+            variant1 = f'{fields1[0]}_{fields1[1]}_{fields1[2]}'
+            fields2 = lines[lineno+1].split('\t')
+            variant2 = f'{fields2[0]}_{fields2[1]}_{fields2[2]}'
+            if variant1 != variant2:  # only start a new chunk where the first three fields change
+                smallfile.close()
+                filenum += 1
+                smallfile = open(f'small_file_{filenum}.txt', "w")
+                smallfile.write(header)
+                lines_per_file += target_lines_per_file
+    smallfile.close()  # flush the last chunk to disk before Rscript reads it
+# get chunks
 files = glob.glob('small_file_*')
 files.sort()
 number_of_in_files = len(files)
 for file in files:
-    subprocess.run(f'Rscript --vanilla compare_junctions_hist_v2.R {tag} {file}', shell=True, check=True)
+    subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R {tag} {file}', shell=True, check=True)
 output_files = glob.glob("*_out.tsv")
 output_files.sort()# glob lacks reliable ordering, so impose your own if output order matters
 number_of_out_files = len(output_files)
@@ -53,5 +69,4 @@
     print("Number of output files doesn't match the number of input files that should have been processed")
 files = glob.glob('small_file_*')
 for file in files:
-     os.remove(file)
-
+    os.remove(file)
diff --git a/scripts/variants.sh b/scripts/variants.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# usage: 
+# 	bash variants.sh <input.tsv> <output.bed>
+# Takes awk field 17 of every row after the first, puts each comma-separated
+# entry on its own line, turns ':' and '-' into tabs, and writes the sorted,
+# de-duplicated lines to <output.bed>. No backup or temp files are created.
+set -euo pipefail
+awk 'NR>1 {print $17}' "$1" |
+    tr ',:-' '\n\t\t' |
+    sort | uniq > "$2"
diff --git a/src/.DS_Store b/src/.DS_Store
diff --git a/tests/.DS_Store b/tests/.DS_Store