|
| 1 | +# RegTools example workflow |
| 2 | + |
| 3 | +This is an example workflow for running RegTools on a cohort of samples. This analysis requires that there be a vcf and RNA bam/cram file for each samples. The outline described below was used to run our own analysis on TCGA data. |
| 4 | + |
| 5 | +By the end of the analysis, the directory structure should look like so: |
| 6 | + |
| 7 | +- Project/ (SCLC/) |
| 8 | + - all_splicing_variants*.bed |
| 9 | + - paths.tsv |
| 10 | + - make_vcfs.sh |
| 11 | + - dir_names.tsv |
| 12 | + - variants_all_sorted.vcf.gz |
| 13 | + - variants_all_sorted.vcf.gz.tbi |
| 14 | + - samples/ |
| 15 | + - Sample_1/ |
| 16 | + - tumor_rna_alignments.bam |
| 17 | + - tumor_rna_alignments.bam.bai |
| 18 | + - variants.per_gene.vep.vcf.gz |
| 19 | + - variants.per_gene.vep.vcf.gz.tbi |
| 20 | + - variants.ensembl |
| 21 | + - logs/ |
| 22 | + - output/ |
| 23 | + - cse_identify_filtered_* |
| 24 | + - cse_identify_filtered_compare_* |
| 25 | + - variants*.bed |
| 26 | + - compare_junctions/ |
| 27 | + - hist/ |
| 28 | + - junction_pvalues_*.tsv |
| 29 | + |
| 30 | +## Set tag and parameter shell arguments |
| 31 | + |
| 32 | +tag=<tag> param=<run option> (e.g. tag=default param=""; tag=E param="-E"; tag=i20e5 param="-i 20 -e 5") |
| 33 | + |
| 34 | +# run regtools cse identify with desired options for selecting variant and window size |
| 35 | +for i in samples/*/; do bsub -oo $i/logs/regtools_actual_$tag.lsf regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_$tag.tsv -j ${i}/output/cse_identify_filtered_$tag.bed -v ${i}/output/cse_identify_filtered_$tag.vcf ${i}/variants.per_gene.vep.vcf.gz ${i}/tumor_rna_alignments.bam /gscmnt/gc2602/griffithlab/regtools/yafeng/GRCh37.fa /gscmnt/gc2602/griffithlab/regtools/GRCh37.87.exons.sorted.gtf; done |
| 36 | + |
| 37 | +# make variant.bed |
| 38 | +for i in samples/*/; do bsub -oo $i/logs/make_variant_bed_$tag.lsf bash /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/variants.sh ${i}/output/cse_identify_filtered_$tag.tsv ${i}/output/variants_$tag.bed; done |
| 39 | + |
| 40 | +# make bed (really just tsv with columns: chrom start end samples) with all variants that were deemed significant to splicing across all samples |
| 41 | +echo -e 'chrom\tstart\tend\tsamples' > all_splicing_variants_$tag.bed |
| 42 | +for i in samples/*/; do j=${i##samples/}; uniq ${i}output/variants_$tag.bed | awk -v var=${j%%/} '{print $0 "\t" var}' >> all_splicing_variants_$tag.bed; done |
| 43 | + |
| 44 | +# make vcf of all variants across all samples (from variants.vcf) |
| 45 | +vcf-concat samples/*/variants.per_gene.vep.vcf.gz | vcf-sort > all_variants_sorted.vcf |
| 46 | +bgzip all_variants_sorted.vcf |
| 47 | +tabix all_variants_sorted.vcf.gz |
| 48 | + |
| 49 | +# run cis-splice effects identify on all samples with all variants (with $tag options as example) |
| 50 | +for i in samples/*/; do bsub -oo $i/logs/regtools_compare_$tag.lsf regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_compare_$tag.tsv -j ${i}/output/cse_identify_filtered_compare_$tag.bed -v ${i}/output/cse_identify_filtered_compare_$tag.vcf variants_all_sorted.vcf.gz ${i}/tumor_rna_alignments.bam /gscmnt/gc2602/griffithlab/regtools/yafeng/GRCh37.fa /gscmnt/gc2602/griffithlab/regtools/GRCh37.87.exons.sorted.gtf; done |
| 51 | + |
| 52 | +<===== JUNCTION COMPARISON ANALYSIS =====> |
| 53 | + |
| 54 | +# make directories |
| 55 | +mkdir compare_junctions/ |
| 56 | +mkdir compare_junctions/outlier |
| 57 | +mkdir compare_junctions/ratio |
| 58 | + |
| 59 | +# outlier analysis |
| 60 | +bsub -M 650000000 -R 'select[mem>65000] span[hosts=1] rusage[mem=65000]' /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/compare_junctions_outlier.R $tag |
| 61 | + |
| 62 | +# ratio analysis |
| 63 | +bsub -M 650000000 -R 'select[mem>65000] span[hosts=1] rusage[mem=65000]' /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/compare_junctions_ratio.R $tag |
| 64 | + |
| 65 | +# vep comparison (outlier) |
| 66 | +bsub /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/vep_compare.R outlier $tag |
| 67 | + |
| 68 | +# vep comparison (ratio) |
| 69 | +bsub /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/vep_compare.R ratio $tag |
| 70 | + |
| 71 | +NOTE: in the vep comparison, since regtools doesn't really have a concept of "variants" per se but rather "variant positions" (doesn't care about the actual alleles), we are really counting (positions of variants found splicing-significant by vep AND found significant by regtools) / (positions of variants found significant by found significant by regtools) |
| 72 | + |
| 73 | +To count: |
| 74 | + |
| 75 | +echo -e "anchor\tvep_significant_vars\ttotal_significant_vars\tpercent_vep_significant" > comparison_counts.tsv |
| 76 | +for i in default i20e5 i50e5 E I; do |
| 77 | + echo -n $i >> comparison_counts.tsv; |
| 78 | + echo -en "\t" >> comparison_counts.tsv; |
| 79 | + vep=$(cut -f 1,2,7 vep_comparison_$i.tsv | sort | uniq | grep "TRUE" | wc -l) |
| 80 | + echo -n $vep >> comparison_counts.tsv; |
| 81 | + echo -en "\t" >> comparison_counts.tsv; |
| 82 | + total=$(($(cut -f 1,2,7 vep_comparison_$i.tsv | sort | uniq | wc -l)-1)) |
| 83 | + echo -n $total >> comparison_counts.tsv; |
| 84 | + echo -en "\t" >> comparison_counts.tsv; |
| 85 | + echo $(bc -l <<< "$vep/$total") >> comparison_counts.tsv; |
| 86 | +done |
0 commit comments