Skip to content

Commit 7b80c9c

Browse files
authored
Merge pull request #132 from griffithlab/improved_compare_junctions
Fixes docker files Improves stats script Updates readthedocs
2 parents eab866a + 7438f66 commit 7b80c9c

File tree

4 files changed

+136
-83
lines changed

4 files changed

+136
-83
lines changed

Dockerfile

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,3 @@
1-
################################################################################
2-
##################### Add Container Labels #####################################
3-
LABEL "Regtools_License"="MIT"
4-
LABEL "Description"="Software package which integrate DNA-seq and RNA-seq data\
5-
to help interpret mutations in a regulatory and splicing\
6-
context."
7-
81
################################################################################
92
##################### Set Initial Image to work from ###########################
103

@@ -34,6 +27,13 @@ RUN apt-get update -y && apt-get install -y \
3427
cmake \
3528
python3
3629

30+
################################################################################
31+
##################### Add Container Labels #####################################
32+
LABEL "Regtools_License"="MIT"
33+
LABEL "Description"="Software package which integrate DNA-seq and RNA-seq data\
34+
to help interpret mutations in a regulatory and splicing\
35+
context."
36+
3737
################################################################################
3838
####################### Install R ##############################################
3939

@@ -55,18 +55,21 @@ RUN R --vanilla -e 'install.packages(c("data.table", "plyr", "tidyverse"), repos
5555
##################### Install Regtools #########################################
5656

5757
# clone git repository
58-
RUN git clone https://github.com/griffithlab/regtools.git
58+
RUN cd / && git clone https://github.com/griffithlab/regtools.git
5959

6060
# make a build directory for regtools
6161
WORKDIR /regtools/
62-
RUN mkdir build
62+
6363

6464
# compile from source
65-
RUN cd /regtools/build && cmake ..
66-
RUN cd /regtools/build && make
65+
RUN mkdir build && cd build && cmake .. && make
6766

6867
################################################################################
6968
###################### set environment path #################################
7069

70+
# set the working directory to the regtools scripts directory
71+
WORKDIR /regtools/scripts/
72+
7173
# add regtools executable to path
72-
ENV PATH="/regtools/build:${PATH}"
74+
ENV PATH="/regtools/build:/usr/local/bin/R-${r_version}:${PATH}"
75+

docs/workflow.md

Lines changed: 61 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22

33
This is an example workflow for running RegTools on a cohort of samples. This analysis requires that there be a vcf and an RNA bam/cram file for each sample. The outline described below was used to run our own analysis on TCGA data.
44

5-
By the end of the analysis, the directory structure should look like so:
5+
By the end of the analysis, the directory structure should look like the example below. The `*` in the example stands for the tag that identifies the parameters `regtools cis-splice-effects identify` was run with.
66

7-
- Project/ (SCLC/)
7+
```bash
8+
- Project/
89
- all_splicing_variants*.bed
910
- paths.tsv
1011
- make_vcfs.sh
@@ -23,64 +24,82 @@ By the end of the analysis, the directory structure should look like so:
2324
- cse_identify_filtered_*
2425
- cse_identify_filtered_compare_*
2526
- variants*.bed
27+
- Sample_2/
28+
- tumor_rna_alignments.bam
29+
- tumor_rna_alignments.bam.bai
30+
- variants.per_gene.vep.vcf.gz
31+
- variants.per_gene.vep.vcf.gz.tbi
32+
- variants.ensembl
33+
- logs/
34+
- output/
35+
- cse_identify_filtered_*
36+
- cse_identify_filtered_compare_*
37+
- variants*.bed
2638
- compare_junctions/
2739
- hist/
2840
- junction_pvalues_*.tsv
41+
```
42+
43+
### Set tag and parameter shell arguments
44+
45+
```bash
46+
tag=<tag>
47+
param=<run option>
48+
# (e.g. tag=default param=""; tag=E param="-E"; tag=i20e5 param="-i 20 -e 5")
49+
```
2950

30-
## Set tag and parameter shell arguments
51+
### Run `regtools cis-splice-effects identify` with desired options for selecting variant and window size
3152

32-
tag=<tag> param=<run option> (e.g. tag=default param=""; tag=E param="-E"; tag=i20e5 param="-i 20 -e 5")
53+
```bash
54+
for i in samples/*/; do regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_$tag.tsv -j ${i}/output/cse_identify_filtered_$tag.bed -v ${i}/output/cse_identify_filtered_$tag.vcf ${i}/variants.per_gene.vep.vcf.gz ${i}/tumor_rna_alignments.bam /reference.fa reference.gtf; done
55+
```
3356

34-
# run regtools cse identify with desired options for selecting variant and window size
35-
for i in samples/*/; do bsub -oo $i/logs/regtools_actual_$tag.lsf regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_$tag.tsv -j ${i}/output/cse_identify_filtered_$tag.bed -v ${i}/output/cse_identify_filtered_$tag.vcf ${i}/variants.per_gene.vep.vcf.gz ${i}/tumor_rna_alignments.bam /gscmnt/gc2602/griffithlab/regtools/yafeng/GRCh37.fa /gscmnt/gc2602/griffithlab/regtools/GRCh37.87.exons.sorted.gtf; done
57+
### Make `variant.bed` for each sample
3658

37-
# make variant.bed
38-
for i in samples/*/; do bsub -oo $i/logs/make_variant_bed_$tag.lsf bash /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/variants.sh ${i}/output/cse_identify_filtered_$tag.tsv ${i}/output/variants_$tag.bed; done
59+
```bash
60+
for i in samples/*/; do bash variants.sh ${i}/output/cse_identify_filtered_$tag.tsv ${i}/output/variants_$tag.bed; done
61+
```
3962

40-
# make bed (really just tsv with columns: chrom start end samples) with all variants that were deemed significant to splicing across all samples
63+
### Combine each sample's `variant.bed` file per tag to get all variants that were deemed significant to splicing across all samples for a given tag
64+
65+
```bash
4166
echo -e 'chrom\tstart\tend\tsamples' > all_splicing_variants_$tag.bed
4267
for i in samples/*/; do j=${i##samples/}; uniq ${i}output/variants_$tag.bed | awk -v var=${j%%/} '{print $0 "\t" var}' >> all_splicing_variants_$tag.bed; done
68+
```
69+
70+
### Make vcf of all variants across all samples (from each sample's variants.vcf). Then, compress it and index it
71+
72+
```bash
73+
vcf-concat samples/*/variants.vcf.gz | vcf-sort > all_variants_sorted.vcf
4374

44-
# make vcf of all variants across all samples (from variants.vcf)
45-
vcf-concat samples/*/variants.per_gene.vep.vcf.gz | vcf-sort > all_variants_sorted.vcf
46-
bgzip all_variants_sorted.vcf
47-
tabix all_variants_sorted.vcf.gz
75+
###### optional ######
76+
bgzip all_variants_sorted.vcf
4877

49-
# run cis-splice effects identify on all samples with all variants (with $tag options as example)
50-
for i in samples/*/; do bsub -oo $i/logs/regtools_compare_$tag.lsf regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_compare_$tag.tsv -j ${i}/output/cse_identify_filtered_compare_$tag.bed -v ${i}/output/cse_identify_filtered_compare_$tag.vcf variants_all_sorted.vcf.gz ${i}/tumor_rna_alignments.bam /gscmnt/gc2602/griffithlab/regtools/yafeng/GRCh37.fa /gscmnt/gc2602/griffithlab/regtools/GRCh37.87.exons.sorted.gtf; done
78+
tabix all_variants_sorted.vcf.gz
79+
```
5180

52-
<===== JUNCTION COMPARISON ANALYSIS =====>
81+
### Run `regtools cis-splice-effects identify` on all samples with all variants (with `$tag` options as example)
5382

54-
# make directories
55-
mkdir compare_junctions/
56-
mkdir compare_junctions/outlier
57-
mkdir compare_junctions/ratio
83+
```bash
84+
for i in samples/*/; do bsub -oo $i/logs/regtools_compare_$tag.lsf regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_compare_$tag.tsv -j ${i}/output/cse_identify_filtered_compare_$tag.bed -v ${i}/output/cse_identify_filtered_compare_$tag.vcf all_variants_sorted.vcf.gz ${i}/tumor_rna_alignments.bam reference.fa reference.gtf; done
85+
```
5886

59-
# outlier analysis
60-
bsub -M 650000000 -R 'select[mem>65000] span[hosts=1] rusage[mem=65000]' /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/compare_junctions_outlier.R $tag
87+
## Beginning of compare junctions analysis
6188

62-
# ratio analysis
63-
bsub -M 650000000 -R 'select[mem>65000] span[hosts=1] rusage[mem=65000]' /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/compare_junctions_ratio.R $tag
89+
### Make directory to store comparison results
6490

65-
# vep comparison (outlier)
66-
bsub /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/vep_compare.R outlier $tag
91+
```bash
92+
mkdir -p compare_junctions/hist
93+
```
6794

68-
# vep comparison (ratio)
69-
bsub /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/vep_compare.R ratio $tag
95+
### Run `compare_junctions_hist.R` on sample data
7096

71-
NOTE: in the vep comparison, since regtools doesn't really have a concept of "variants" per se but rather "variant positions" (doesn't care about the actual alleles), we are really counting (positions of variants found splicing-significant by vep AND found significant by regtools) / (positions of variants found significant by found significant by regtools)
97+
```bash
98+
Rscript --vanilla compare_junctions_hist.R <tag>
99+
```
72100

73-
To count:
101+
### Run `filter_and_BH.R` to adjust p values and filter results
74102

75-
echo -e "anchor\tvep_significant_vars\ttotal_significant_vars\tpercent_vep_significant" > comparison_counts.tsv
76-
for i in default i20e5 i50e5 E I; do
77-
echo -n $i >> comparison_counts.tsv;
78-
echo -en "\t" >> comparison_counts.tsv;
79-
vep=$(cut -f 1,2,7 vep_comparison_$i.tsv | sort | uniq | grep "TRUE" | wc -l)
80-
echo -n $vep >> comparison_counts.tsv;
81-
echo -en "\t" >> comparison_counts.tsv;
82-
total=$(($(cut -f 1,2,7 vep_comparison_$i.tsv | sort | uniq | wc -l)-1))
83-
echo -n $total >> comparison_counts.tsv;
84-
echo -en "\t" >> comparison_counts.tsv;
85-
echo $(bc -l <<< "$vep/$total") >> comparison_counts.tsv;
86-
done
103+
```bash
104+
Rscript --vanilla filter_and_BH.R <tag>
105+
```

scripts/compare_junctions_hist_v2.R

Lines changed: 59 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,21 @@ library(tidyverse)
99

1010
debug = F
1111

12-
system.time({
13-
if (debug){
14-
tag = paste("_", "default", sep="")
15-
} else {
16-
# get options tag
17-
args = commandArgs(trailingOnly = TRUE)
18-
tag = args[1]
19-
input_file = args[2]
20-
if ( substr(tag, 2, 3) == "--"){
21-
stop("Please specify an option tag (e.g. \"default\", \"i20e5\")")
22-
}
23-
}
12+
# system.time({
13+
# if (debug){
14+
# tag = paste("_", "default", sep="")
15+
# } else {
16+
# # get options tag
17+
# args = commandArgs(trailingOnly = TRUE)
18+
# tag = args[1]
19+
# input_file = args[2]
20+
# if ( substr(tag, 2, 3) == "--"){
21+
# stop("Please specify an option tag (e.g. \"default\", \"i20e5\")")
22+
# }
23+
# }
2424

25-
# tag = 'I'
26-
# input_file = '~/Desktop/CHOL/all_splicing_variants_I.bed'
25+
tag = 'E'
26+
input_file = 'all_splicing_variants_E.bed'
2727

2828
# All splicing relevant variants (union of rows from variants.bed files; add column with comma-separated list of sample names)
2929
all_splicing_variants = unique(data.table::fread(input_file), sep = '\t', header = T, stringsAsFactors = FALSE)
@@ -150,21 +150,14 @@ regtools_data = subset(regtools_data, select=columns_to_keep)
150150

151151

152152
# zeroes need to be added in for some samples
153-
a <- function(x, y, z){
153+
a <- function(x, y){
154154
toAdd <- y - length(x) - 1
155155
# browser()
156156
toAdd <- rep(0.0000000, toAdd)
157157
x <- c(x, toAdd)
158158
return(x)
159159
}
160-
x <- mapply(a, regtools_data$norm_scores_non, length(all_samples), regtools_data$samples)
161-
162-
163-
# if (typeof(x) == 'list') {
164-
# x <- matrix(pad(unlist(x), ncols),nrow = rows, byrow = TRUE, ncol = cols)
165-
# x <- t(x)
166-
# }
167-
# browser()
160+
x <- mapply(a, regtools_data$norm_scores_non, length(all_samples))
168161

169162
get_num_zeros_to_rm <- function(z){
170163
num_zeroes_to_rm = str_count(z, ',')
@@ -188,6 +181,47 @@ if (max(num_zeroes_to_rm > 0)) {
188181
x <- mapply(rm_zeroes, regtools_data$norm_scores_non, regtools_data$zeroes_to_rm)
189182
regtools_data$norm_scores_non = x
190183
}
184+
185+
# Arithmetic mean of one entry of scores.
#
# x: a vector of scores (numeric, or character holding numbers); it is
#    coerced with as.numeric() before averaging.
# Returns a single numeric value. Elements that as.numeric() cannot parse
# become NA, in which case mean() returns NA as well.
get_mean <- function(x) {
  scores <- as.numeric(x)
  mean(scores)
}
189+
190+
x <- mapply(get_mean, regtools_data$norm_scores_non)
191+
regtools_data$mean_norm_score_non <- x
192+
193+
# Sample standard deviation of one entry of scores.
#
# x: a vector of scores (numeric, or character holding numbers); it is
#    coerced with as.numeric() before sd() is applied.
# Returns a single numeric value; sd() yields NA for a single-element input.
get_sd <- function(x) {
  scores <- as.numeric(x)
  sd(scores)
}
197+
198+
x <- mapply(get_sd, regtools_data$norm_scores_non)
199+
regtools_data$sd_norm_score_non <- x
200+
201+
# Split a comma-separated score string into individual values and pad it
# with zeros so it has one value per entry of a comma-separated list.
#
# x: a single comma-separated string of scores (e.g. "0.1,0.2").
# y: a single comma-separated string whose number of fields is the target
#    length (called below with the row's `samples` value).
# Returns a one-element list holding a character vector of the split
# scores, followed by "0" for every field that y has beyond x. Wrapping the
# vector in a list lets mapply() store it as one list-column entry.
#
# The previous version only split x when no padding was needed; in the
# padding branch the joined string was kept as a single element
# (c("0.1,0.2", "0")), which as.numeric() later turned into NA and made the
# downstream mean/sd NA. x is now always split before padding.
a <- function(x, y) {
  # Number of comma-separated fields = number of commas + 1
  # (gregexpr() reports -1 when there is no match).
  count_fields <- function(s) {
    hits <- gregexpr(",", s, fixed = TRUE)[[1]]
    sum(hits > 0) + 1
  }

  values <- unlist(strsplit(x, ",", fixed = TRUE))
  n_missing <- count_fields(y) - count_fields(x)

  if (n_missing > 0) {
    # c() coerces the numeric zeros to "0", keeping the vector character
    # like the values read from the input string.
    values <- c(values, rep(0.0000000, n_missing))
  }

  list(values)
}
216+
x <- mapply(a, regtools_data$norm_scores_variant, regtools_data$samples)
217+
regtools_data$norm_scores_variant = x
218+
219+
x <- mapply(get_mean, regtools_data$norm_scores_variant)
220+
regtools_data$mean_norm_score_variant <- x
221+
222+
x <- mapply(get_sd, regtools_data$norm_scores_variant)
223+
regtools_data$sd_norm_score_variant <- x
224+
191225
print("test7")
192226

193227
################ calculate p-values ############################################
@@ -207,9 +241,6 @@ a <- function(x){
207241
breaks = seq(0.5, max(non_variant_norm_scores_ranked)+1.5, by=1), plot=F)
208242
mids = histinfo$mids
209243
cd = cumsum(histinfo$density)
210-
# if(x$info == "chr1_729955_735423_D_chr1:809966-809967"){
211-
# browser()
212-
# }
213244
underestimate = max(which(mids <= variant_norm_score_ranked))
214245
pvalue = 1-cd[underestimate]
215246
return(pvalue)
@@ -243,5 +274,5 @@ regtools_data = regtools_data %>% distinct()
243274

244275

245276
write.table(regtools_data, file=paste(input_file, "_out.tsv", sep=""), quote=FALSE, sep='\t', row.names = F)
246-
247-
})
277+
#
278+
# })

scripts/stats_wrapper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
files.sort()
3737
number_of_in_files = len(files)
3838
for file in files:
39-
subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R {tag} {file}', shell=True, check=True)
39+
subprocess.run(f'Rscript --vanilla compare_junctions_hist_v2.R {tag} {file}', shell=True, check=True)
4040
output_files = glob.glob("*_out.tsv")
4141
output_files.sort()# glob lacks reliable ordering, so impose your own if output order matters
4242
number_of_out_files = len(output_files)

0 commit comments

Comments
 (0)