Skip to content

Commit 17bb43d

Browse files
authored
Merge pull request #128 from griffithlab/improved_compare_junctions
update scripts
2 parents 896ee94 + 2ca1e75 commit 17bb43d

File tree

3 files changed

+97
-7
lines changed

3 files changed

+97
-7
lines changed

docs/workflow.md

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
# RegTools example workflow
2+
3+
This is an example workflow for running RegTools on a cohort of samples. This analysis requires that there be a vcf and RNA bam/cram file for each samples. The outline described below was used to run our own analysis on TCGA data.
4+
5+
By the end of the analysis, the directory structure should look like so:
6+
7+
- Project/ (SCLC/)
8+
- all_splicing_variants*.bed
9+
- paths.tsv
10+
- make_vcfs.sh
11+
- dir_names.tsv
12+
- variants_all_sorted.vcf.gz
13+
- variants_all_sorted.vcf.gz.tbi
14+
- samples/
15+
- Sample_1/
16+
- tumor_rna_alignments.bam
17+
- tumor_rna_alignments.bam.bai
18+
- variants.per_gene.vep.vcf.gz
19+
- variants.per_gene.vep.vcf.gz.tbi
20+
- variants.ensembl
21+
- logs/
22+
- output/
23+
- cse_identify_filtered_*
24+
- cse_identify_filtered_compare_*
25+
- variants*.bed
26+
- compare_junctions/
27+
- hist/
28+
- junction_pvalues_*.tsv
29+
30+
## Set tag and parameter shell arguments
31+
32+
tag=<tag> param=<run option> (e.g. tag=default param=""; tag=E param="-E"; tag=i20e5 param="-i 20 -e 5")
33+
34+
# run regtools cse identify with desired options for selecting variant and window size
35+
for i in samples/*/; do bsub -oo $i/logs/regtools_actual_$tag.lsf regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_$tag.tsv -j ${i}/output/cse_identify_filtered_$tag.bed -v ${i}/output/cse_identify_filtered_$tag.vcf ${i}/variants.per_gene.vep.vcf.gz ${i}/tumor_rna_alignments.bam /gscmnt/gc2602/griffithlab/regtools/yafeng/GRCh37.fa /gscmnt/gc2602/griffithlab/regtools/GRCh37.87.exons.sorted.gtf; done
36+
37+
# make variant.bed
38+
for i in samples/*/; do bsub -oo $i/logs/make_variant_bed_$tag.lsf bash /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/variants.sh ${i}/output/cse_identify_filtered_$tag.tsv ${i}/output/variants_$tag.bed; done
39+
40+
# make bed (really just tsv with columns: chrom start end samples) with all variants that were deemed significant to splicing across all samples
41+
echo -e 'chrom\tstart\tend\tsamples' > all_splicing_variants_$tag.bed
42+
for i in samples/*/; do j=${i##samples/}; uniq ${i}output/variants_$tag.bed | awk -v var=${j%%/} '{print $0 "\t" var}' >> all_splicing_variants_$tag.bed; done
43+
44+
# make vcf of all variants across all samples (from variants.vcf)
45+
vcf-concat samples/*/variants.per_gene.vep.vcf.gz | vcf-sort > all_variants_sorted.vcf
46+
bgzip all_variants_sorted.vcf
47+
tabix all_variants_sorted.vcf.gz
48+
49+
# run cis-splice effects identify on all samples with all variants (with $tag options as example)
50+
for i in samples/*/; do bsub -oo $i/logs/regtools_compare_$tag.lsf regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_compare_$tag.tsv -j ${i}/output/cse_identify_filtered_compare_$tag.bed -v ${i}/output/cse_identify_filtered_compare_$tag.vcf variants_all_sorted.vcf.gz ${i}/tumor_rna_alignments.bam /gscmnt/gc2602/griffithlab/regtools/yafeng/GRCh37.fa /gscmnt/gc2602/griffithlab/regtools/GRCh37.87.exons.sorted.gtf; done
51+
52+
<===== JUNCTION COMPARISON ANALYSIS =====>
53+
54+
# make directories
55+
mkdir compare_junctions/
56+
mkdir compare_junctions/outlier
57+
mkdir compare_junctions/ratio
58+
59+
# outlier analysis
60+
bsub -M 650000000 -R 'select[mem>65000] span[hosts=1] rusage[mem=65000]' /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/compare_junctions_outlier.R $tag
61+
62+
# ratio analysis
63+
bsub -M 650000000 -R 'select[mem>65000] span[hosts=1] rusage[mem=65000]' /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/compare_junctions_ratio.R $tag
64+
65+
# vep comparison (outlier)
66+
bsub /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/vep_compare.R outlier $tag
67+
68+
# vep comparison (ratio)
69+
bsub /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/vep_compare.R ratio $tag
70+
71+
NOTE: in the vep comparison, since regtools doesn't really have a concept of "variants" per se but rather "variant positions" (doesn't care about the actual alleles), we are really counting (positions of variants found splicing-significant by vep AND found significant by regtools) / (positions of variants found significant by found significant by regtools)
72+
73+
To count:
74+
75+
echo -e "anchor\tvep_significant_vars\ttotal_significant_vars\tpercent_vep_significant" > comparison_counts.tsv
76+
for i in default i20e5 i50e5 E I; do
77+
echo -n $i >> comparison_counts.tsv;
78+
echo -en "\t" >> comparison_counts.tsv;
79+
vep=$(cut -f 1,2,7 vep_comparison_$i.tsv | sort | uniq | grep "TRUE" | wc -l)
80+
echo -n $vep >> comparison_counts.tsv;
81+
echo -en "\t" >> comparison_counts.tsv;
82+
total=$(($(cut -f 1,2,7 vep_comparison_$i.tsv | sort | uniq | wc -l)-1))
83+
echo -n $total >> comparison_counts.tsv;
84+
echo -en "\t" >> comparison_counts.tsv;
85+
echo $(bc -l <<< "$vep/$total") >> comparison_counts.tsv;
86+
done

scripts/compare_junctions_hist_v2.R

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# load libraries
66
library(data.table)
77
library(plyr)
8+
library(tidyverse)
89

910
debug = F
1011

@@ -20,12 +21,11 @@ if (debug){
2021
stop("Please specify an option tag (e.g. \"default\", \"i20e5\")")
2122
}
2223
}
23-
24+
2425
# tag = 'I'
2526
# input_file = '~/Desktop/CHOL/all_splicing_variants_I.bed'
2627

2728
# All splicing relevant variants (union of rows from variants.bed files; add column with comma-separated list of sample names)
28-
input_file = args[2]
2929
all_splicing_variants = unique(data.table::fread(input_file), sep = '\t', header = T, stringsAsFactors = FALSE)
3030
colnames(all_splicing_variants) <- c("chrom", "start", "end", "samples")
3131

@@ -158,7 +158,6 @@ a <- function(x, y, z){
158158
return(x)
159159
}
160160
x <- mapply(a, regtools_data$norm_scores_non, length(all_samples), regtools_data$samples)
161-
x = split(x, rep(1:ncol(x), each = nrow(x)))
162161
regtools_data$norm_scores_non = x
163162
print("test7")
164163

@@ -214,6 +213,6 @@ all_splicing_variants <- as.data.table(all_splicing_variants)
214213
regtools_data = regtools_data %>% distinct()
215214

216215

217-
write.table(regtools_data, file=paste(input_file, "_out.tsv", sep=""), quote=FALSE, sep='\t', row.names = F)
216+
write.table(regtools_data, file=paste(input_file, "_out_test.tsv", sep=""), quote=FALSE, sep='\t', row.names = F)
218217

219218
})

scripts/stats_wrapper.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@
1717
tag = args.tag
1818
cwd = os.getcwd()
1919

20-
lines_per_file = 50000
20+
lines_per_file = 1000
2121
smallfile = None
22+
num_small_file = 0
2223
with open(f'all_splicing_variants_{tag}.bed', 'r') as bigfile:
24+
num_small_file +=1
2325
for lineno, line in enumerate(bigfile):
2426
if lineno % lines_per_file == 0:
2527
if smallfile:
@@ -28,11 +30,13 @@
2830
smallfile = open(small_filename, "w")
2931
smallfile.write(line)
3032
if smallfile:
33+
num_small_file += 1
3134
smallfile.close()
3235
#get chunks
3336
files = glob.glob('small_file_*')
37+
files.sort()
3438
for file in files:
35-
subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R {tag} ~{cwd}/{file}', shell=True, check=True)
39+
subprocess.run(f'Rscript --vanilla ~/Git/regtools/scripts/compare_junctions_hist_v2.R {tag} {file}', shell=True, check=False)
3640
output_files = glob.glob("*_out.tsv")
3741
output_files.sort() # glob lacks reliable ordering, so impose your own if output order matters
3842
with open(f'junction_pvalues_{tag}.tsv', 'wb') as outfile:
@@ -43,5 +47,6 @@
4347
# Block copy rest of file from input to output without parsing
4448
shutil.copyfileobj(infile, outfile)
4549
print(fname + " has been imported.")
46-
os.remove('small_file*')
50+
for file in files:
51+
os.remove(file)
4752

0 commit comments

Comments
 (0)