Skip to content

Commit 2ca1e75

Browse files
committed
update to workflow readthedocs
1 parent 8d186d0 commit 2ca1e75

File tree

1 file changed

+86
-0
lines changed

1 file changed

+86
-0
lines changed

docs/workflow.md

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
# RegTools example workflow
2+
3+
This is an example workflow for running RegTools on a cohort of samples. This analysis requires that there be a vcf and RNA bam/cram file for each samples. The outline described below was used to run our own analysis on TCGA data.
4+
5+
By the end of the analysis, the directory structure should look like so:
6+
7+
- Project/ (SCLC/)
8+
- all_splicing_variants*.bed
9+
- paths.tsv
10+
- make_vcfs.sh
11+
- dir_names.tsv
12+
- variants_all_sorted.vcf.gz
13+
- variants_all_sorted.vcf.gz.tbi
14+
- samples/
15+
- Sample_1/
16+
- tumor_rna_alignments.bam
17+
- tumor_rna_alignments.bam.bai
18+
- variants.per_gene.vep.vcf.gz
19+
- variants.per_gene.vep.vcf.gz.tbi
20+
- variants.ensembl
21+
- logs/
22+
- output/
23+
- cse_identify_filtered_*
24+
- cse_identify_filtered_compare_*
25+
- variants*.bed
26+
- compare_junctions/
27+
- hist/
28+
- junction_pvalues_*.tsv
29+
30+
## Set tag and parameter shell arguments
31+
32+
tag=<tag> param=<run option> (e.g. tag=default param=""; tag=E param="-E"; tag=i20e5 param="-i 20 -e 5")
33+
34+
# run regtools cse identify with desired options for selecting variant and window size
35+
for i in samples/*/; do bsub -oo $i/logs/regtools_actual_$tag.lsf regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_$tag.tsv -j ${i}/output/cse_identify_filtered_$tag.bed -v ${i}/output/cse_identify_filtered_$tag.vcf ${i}/variants.per_gene.vep.vcf.gz ${i}/tumor_rna_alignments.bam /gscmnt/gc2602/griffithlab/regtools/yafeng/GRCh37.fa /gscmnt/gc2602/griffithlab/regtools/GRCh37.87.exons.sorted.gtf; done
36+
37+
# make variant.bed
38+
for i in samples/*/; do bsub -oo $i/logs/make_variant_bed_$tag.lsf bash /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/variants.sh ${i}/output/cse_identify_filtered_$tag.tsv ${i}/output/variants_$tag.bed; done
39+
40+
# make bed (really just tsv with columns: chrom start end samples) with all variants that were deemed significant to splicing across all samples
41+
echo -e 'chrom\tstart\tend\tsamples' > all_splicing_variants_$tag.bed
42+
for i in samples/*/; do j=${i##samples/}; uniq ${i}output/variants_$tag.bed | awk -v var=${j%%/} '{print $0 "\t" var}' >> all_splicing_variants_$tag.bed; done
43+
44+
# make vcf of all variants across all samples (from variants.vcf)
45+
vcf-concat samples/*/variants.per_gene.vep.vcf.gz | vcf-sort > all_variants_sorted.vcf
46+
bgzip all_variants_sorted.vcf
47+
tabix all_variants_sorted.vcf.gz
48+
49+
# run cis-splice effects identify on all samples with all variants (with $tag options as example)
50+
for i in samples/*/; do bsub -oo $i/logs/regtools_compare_$tag.lsf regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_compare_$tag.tsv -j ${i}/output/cse_identify_filtered_compare_$tag.bed -v ${i}/output/cse_identify_filtered_compare_$tag.vcf variants_all_sorted.vcf.gz ${i}/tumor_rna_alignments.bam /gscmnt/gc2602/griffithlab/regtools/yafeng/GRCh37.fa /gscmnt/gc2602/griffithlab/regtools/GRCh37.87.exons.sorted.gtf; done
51+
52+
<===== JUNCTION COMPARISON ANALYSIS =====>
53+
54+
# make directories
55+
mkdir compare_junctions/
56+
mkdir compare_junctions/outlier
57+
mkdir compare_junctions/ratio
58+
59+
# outlier analysis
60+
bsub -M 650000000 -R 'select[mem>65000] span[hosts=1] rusage[mem=65000]' /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/compare_junctions_outlier.R $tag
61+
62+
# ratio analysis
63+
bsub -M 650000000 -R 'select[mem>65000] span[hosts=1] rusage[mem=65000]' /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/compare_junctions_ratio.R $tag
64+
65+
# vep comparison (outlier)
66+
bsub /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/vep_compare.R outlier $tag
67+
68+
# vep comparison (ratio)
69+
bsub /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/vep_compare.R ratio $tag
70+
71+
NOTE: in the vep comparison, since regtools doesn't really have a concept of "variants" per se but rather "variant positions" (doesn't care about the actual alleles), we are really counting (positions of variants found splicing-significant by vep AND found significant by regtools) / (positions of variants found significant by found significant by regtools)
72+
73+
To count:
74+
75+
echo -e "anchor\tvep_significant_vars\ttotal_significant_vars\tpercent_vep_significant" > comparison_counts.tsv
76+
for i in default i20e5 i50e5 E I; do
77+
echo -n $i >> comparison_counts.tsv;
78+
echo -en "\t" >> comparison_counts.tsv;
79+
vep=$(cut -f 1,2,7 vep_comparison_$i.tsv | sort | uniq | grep "TRUE" | wc -l)
80+
echo -n $vep >> comparison_counts.tsv;
81+
echo -en "\t" >> comparison_counts.tsv;
82+
total=$(($(cut -f 1,2,7 vep_comparison_$i.tsv | sort | uniq | wc -l)-1))
83+
echo -n $total >> comparison_counts.tsv;
84+
echo -en "\t" >> comparison_counts.tsv;
85+
echo $(bc -l <<< "$vep/$total") >> comparison_counts.tsv;
86+
done

0 commit comments

Comments
 (0)