Skip to content

Commit 3efe961

Browse files
authored
Merge pull request #118 from griffithlab/add_scripts
add R stats script
2 parents d511bfc + 1251082 commit 3efe961

File tree

1 file changed

+201
-0
lines changed

1 file changed

+201
-0
lines changed

scripts/compare_junctions_hist.R

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
# compare_junctions_hist.R
2+
# Rscript --vanilla compare_junctions_hist.R <tag>
3+
4+
# load libraries
5+
library(data.table)
6+
library(graphics)
7+
library(plyr)
8+
library(foreach)
9+
library(doParallel)
10+
registerDoParallel(cores=32)
11+
12+
system.time({
13+
# get options tag
14+
#argc = length(commandArgs())
15+
tag = paste("_", "default", sep="")
16+
17+
if ( substr(tag, 2, 3) == "--"){
18+
stop("Please specify an option tag (e.g. \"default\", \"i20e5\")")
19+
}
20+
21+
## All splicing relevant variants (union of rows from variants.bed files; add column with comma-separated list of sample names)
22+
all_splicing_variants = unique(data.table::fread(paste("all_splicing_variants", tag, ".bed",sep=""), sep = '\t', header = T, stringsAsFactors = FALSE))
23+
colnames(all_splicing_variants) <- c("chrom", "start", "end", "samples")
24+
25+
all_splicing_variants = as.data.table(aggregate(samples ~ chrom + start + end, paste, data=all_splicing_variants))
26+
all_splicing_variants$key <- paste0(all_splicing_variants$chrom, ":", all_splicing_variants$start, "-", all_splicing_variants$end)
27+
28+
## Get all of the samples
29+
all_samples = strsplit(scan("dir_names.tsv", what="", sep="\n"), "[[:space:]]+")
30+
31+
################################################################################
32+
##### Helper functions #########################################################
33+
################################################################################
34+
35+
get_sample_data <- function(sample){
36+
37+
file_name = paste("samples/", sample, "/output/cse_identify_filtered_compare", tag,".tsv", sep = "")
38+
cse_identify_data = data.table::fread(file_name, sep = '\t', header = T, stringsAsFactors = FALSE, strip.white = TRUE)
39+
cse_identify_data$sample <- sample
40+
s <- strsplit(cse_identify_data$variant_info, split = ",")
41+
cse_identify_data <- data.table::data.table(chrom=rep(cse_identify_data$chrom, sapply(s, length)),
42+
start=rep(cse_identify_data$start, sapply(s, length)),
43+
end=rep(cse_identify_data$end, sapply(s, length)),
44+
name=rep(cse_identify_data$name, sapply(s, length)),
45+
score=rep(cse_identify_data$score, sapply(s, length)),
46+
strand=rep(cse_identify_data$strand, sapply(s, length)),
47+
splice_site=rep(cse_identify_data$splice_site, sapply(s, length)),
48+
acceptors_skipped=rep(cse_identify_data$acceptors_skipped, sapply(s, length)),
49+
exons_skipped=rep(cse_identify_data$exons_skipped, sapply(s, length)),
50+
donors_skipped=rep(cse_identify_data$donors_skipped, sapply(s, length)),
51+
anchor=rep(cse_identify_data$anchor, sapply(s, length)),
52+
known_donor=rep(cse_identify_data$known_donor, sapply(s, length)),
53+
known_acceptor=rep(cse_identify_data$known_acceptor, sapply(s, length)),
54+
known_junction=rep(cse_identify_data$known_junction, sapply(s, length)),
55+
genes=rep(cse_identify_data$genes, sapply(s, length)),
56+
transcripts=rep(cse_identify_data$transcripts, sapply(s, length)),
57+
sample=rep(cse_identify_data$sample, sapply(s, length)),
58+
variant_info=unlist(s))
59+
return(cse_identify_data)
60+
}
61+
62+
################################################################################
63+
##### Combine cse_identify_filtered_compare.tsv files for each sample ##########
64+
################################################################################
65+
66+
# this is regtools output per sample
67+
df <- rbindlist(lapply(all_samples, get_sample_data))
68+
df$info <- paste(df$chrom, df$start,
69+
df$end, df$anchor,
70+
df$variant_info, sep = "_")
71+
72+
################################################################################
73+
##### Function to create reviewable regtools output files ######################
74+
################################################################################
75+
76+
a <- function(x, all_cse_identify_data){
77+
78+
## Get data from samples with variant
79+
variant_samples <- unlist(x$samples)
80+
81+
##############################################################################
82+
##### Get junction and variant information for variant samples ###############
83+
##############################################################################
84+
85+
## Get the data from sample with the variant and the significant junctions
86+
variant_junctions_data <- all_cse_identify_data[all_cse_identify_data$variant_info==x$key & all_cse_identify_data$sample %in% variant_samples,]
87+
88+
## Make dataset with the sample and the junction/variant information
89+
sample_data <- variant_junctions_data[,c("sample", "info")]
90+
91+
# Normalize junction scores in this variant region for each sample
92+
summed_variant_scores = variant_junctions_data[, list(score=sum(score)), by=list(sample)]
93+
colnames(summed_variant_scores) <- c("sample", "score.tmp")
94+
variant_junctions_data <- merge(variant_junctions_data, summed_variant_scores, by="sample")
95+
variant_junctions_data$norm_score <- variant_junctions_data$score / variant_junctions_data$score.tmp
96+
97+
# Aggregate data across junction-samples
98+
if (nrow(variant_junctions_data)[[1]] > 0){
99+
variant_junctions_aggr = variant_junctions_data[, list(average_norm_score=mean(norm_score),total_score=sum(score)),
100+
by=list(chrom,start,end,name,strand,anchor,variant_info,info)]
101+
102+
} else {
103+
return(data.table())
104+
}
105+
106+
## Identify mutated samples
107+
a <- function(x, tempData){
108+
return(paste(as.character(tempData[info == x, "sample"]$sample), collapse="|"))
109+
}
110+
variant_junctions_aggr$sample <- sapply(variant_junctions_aggr$info, a, tempData = sample_data)
111+
112+
##############################################################################
113+
##### Get junction and variant information for non-variant samples###############
114+
##############################################################################
115+
116+
## Samples without variant
117+
non_samples = setdiff(all_samples, variant_samples)
118+
non_variant_junctions_data <- all_cse_identify_data[all_cse_identify_data$variant_info==x$key & all_cse_identify_data$sample %in% non_samples]
119+
120+
## Get variant info
121+
non_variant_junctions_data$info <- paste(non_variant_junctions_data$chrom, non_variant_junctions_data$start,
122+
non_variant_junctions_data$end, non_variant_junctions_data$anchor,
123+
non_variant_junctions_data$variant_info, sep = "_")
124+
125+
## Make dataset with the sample and the junction/variant information
126+
non_variant_sample_data <- non_variant_junctions_data[,c("sample", "info")]
127+
128+
# Normalize junction scores in this variant region for each sample
129+
summed_non_variant_scores = non_variant_junctions_data[, list(score=sum(score)), by=list(sample)]
130+
colnames(summed_non_variant_scores) <- c("sample", "score.tmp")
131+
non_variant_junctions_data <- merge(non_variant_junctions_data, summed_non_variant_scores, by="sample")
132+
non_variant_junctions_data$norm_score <- non_variant_junctions_data$score / non_variant_junctions_data$score.tmp
133+
134+
135+
136+
137+
# Generate histogram of normalized junction scores
138+
collapsed_norm_scores = non_variant_junctions_data[, list(norm_scores=list(norm_score)), by=list(chrom,start,end,strand)]
139+
140+
# split out what p-values can be calculated on vs not
141+
collapsed_norm_scores$key <- paste(collapsed_norm_scores$chrom, collapsed_norm_scores$start, collapsed_norm_scores$end, collapsed_norm_scores$strand, sep='.')
142+
variant_junctions_aggr$key <- paste(variant_junctions_aggr$chrom, variant_junctions_aggr$start, variant_junctions_aggr$end, variant_junctions_aggr$strand, sep='.')
143+
variant_junctions_aggr_run_pvalues <- variant_junctions_aggr[variant_junctions_aggr$key %in% collapsed_norm_scores$key,]
144+
variant_junctions_aggr_norun_pvalues <- variant_junctions_aggr[!variant_junctions_aggr$key %in% collapsed_norm_scores$key,]
145+
variant_junctions_aggr_norun_pvalues$pvalue <- 0
146+
147+
calculate_pvalues <- function(junction_data){ #### pull out
148+
chrom = trimws(junction_data[["chrom"]])
149+
start = trimws(junction_data[["start"]])
150+
end = trimws(junction_data[["end"]])
151+
strand = trimws(junction_data[["strand"]])
152+
variant_norm_score = junction_data[["average_norm_score"]]
153+
### pass this variable
154+
i = which(collapsed_norm_scores$chrom == chrom & collapsed_norm_scores$start == start & collapsed_norm_scores$end == end & collapsed_norm_scores$strand == strand)
155+
if (length(i)){
156+
non_variant_norm_scores = collapsed_norm_scores[i]$norm_scores[[1]]
157+
missing = length(non_samples) - length(non_variant_norm_scores)
158+
missing_zeros = integer(missing)
159+
non_variant_norm_scores = c(non_variant_norm_scores, missing_zeros)
160+
161+
variant_norm_score = as.numeric(variant_norm_score)
162+
all_norm_scores = c(non_variant_norm_scores, variant_norm_score)
163+
countable = rank(all_norm_scores)
164+
non_variant_norm_scores_ranked = head(countable, -1)
165+
variant_norm_score_ranked = tail(countable, 1)
166+
167+
histinfo = hist(non_variant_norm_scores_ranked,
168+
breaks = seq(0.5, max(non_variant_norm_scores_ranked)+1.5, by=1), plot=F)
169+
mids = histinfo$mids
170+
cd = cumsum(histinfo$density)
171+
underestimate = max(which(mids <= variant_norm_score_ranked))
172+
pvalue = 1-cd[underestimate]
173+
174+
return(pvalue)
175+
} else {
176+
return(0)
177+
}
178+
}
179+
if(nrow(variant_junctions_aggr_run_pvalues) > 0){
180+
# apply above function to each aggregated variant junction
181+
pvalues = apply(variant_junctions_aggr_run_pvalues, 1, FUN = calculate_pvalues)
182+
variant_junctions_aggr_run_pvalues$pvalue = pvalues
183+
184+
variant_junctions_aggr <- rbind(variant_junctions_aggr_norun_pvalues, variant_junctions_aggr_run_pvalues)
185+
} else {
186+
variant_junctions_aggr <- variant_junctions_aggr_norun_pvalues
187+
}
188+
189+
return(variant_junctions_aggr)
190+
191+
}
192+
193+
regtools_data <- adply(all_splicing_variants, 1, a, all_cse_identify_data=df, .parallel=TRUE)
194+
195+
#regtools_data <- rbindlist(regtools_data, fill=TRUE)
196+
197+
regtools_data <- regtools_data[order(anchor, sample, pvalue)]
198+
regtools_data$samples <- paste(regtools_data$samples)
199+
})
200+
write.table(regtools_data, file=paste("compare_junctions/hist/", "junction_pvalues", tag, ".tsv", sep=""), quote=FALSE, sep='\t', row.names = F)
201+

0 commit comments

Comments
 (0)