Skip to content

Commit d678387

Browse files
committed
compare_junctions updates: samples vs sample column, add non-variant stats
1 parent db76b98 commit d678387

File tree

2 files changed

+60
-28
lines changed

2 files changed

+60
-28
lines changed

.DS_Store

2 KB
Binary file not shown.

scripts/compare_junctions_hist.R

Lines changed: 60 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -9,29 +9,36 @@ library(foreach)
99
library(doParallel)
1010
registerDoParallel(cores=32)
1111

12-
system.time({
13-
# get options tag
14-
#argc = length(commandArgs())
15-
tag = paste("_", "default", sep="")
12+
debug = F
1613

17-
if ( substr(tag, 2, 3) == "--"){
18-
stop("Please specify an option tag (e.g. \"default\", \"i20e5\")")
14+
system.time({
15+
if (debug){
16+
tag = paste("_", "default", sep="")
17+
} else {
18+
# get options tag
19+
argc = length(commandArgs())
20+
tag = paste("_", commandArgs(trailingOnly = F)[argc], sep="")
21+
22+
if ( substr(tag, 2, 3) == "--"){
23+
stop("Please specify an option tag (e.g. \"default\", \"i20e5\")")
24+
}
1925
}
2026

2127
## All splicing relevant variants (union of rows from variants.bed files; add column with comma-separated list of sample names)
2228
all_splicing_variants = unique(data.table::fread(paste("all_splicing_variants", tag, ".bed",sep=""), sep = '\t', header = T, stringsAsFactors = FALSE))
2329
colnames(all_splicing_variants) <- c("chrom", "start", "end", "samples")
2430

2531
all_splicing_variants = as.data.table(aggregate(samples ~ chrom + start + end, paste, data=all_splicing_variants))
26-
all_splicing_variants$key <- paste0(all_splicing_variants$chrom, ":", all_splicing_variants$start, "-", all_splicing_variants$end)
32+
all_splicing_variants$key <- paste0(all_splicing_variants$chrom, ":", all_splicing_variants$start, "-", all_splicing_variants$end) #this key is just a 1bp-long chrom:start-stop designed to match the regtools output variant_info column
2733

28-
## Get all of the samples
34+
## Get all of the samples
2935
all_samples = strsplit(scan("dir_names.tsv", what="", sep="\n"), "[[:space:]]+")
3036

3137
################################################################################
3238
##### Helper functions #########################################################
3339
################################################################################
3440

41+
# basically combines all the cse_identify_filtered_compare files for this cohort
3542
get_sample_data <- function(sample){
3643

3744
file_name = paste("samples/", sample, "/output/cse_identify_filtered_compare", tag,".tsv", sep = "")
@@ -63,8 +70,8 @@ get_sample_data <- function(sample){
6370
##### Combine cse_identify_filtered_compare.tsv files for each sample ##########
6471
################################################################################
6572

66-
# this is regtools output per sample
67-
df <- rbindlist(lapply(all_samples, get_sample_data))
73+
# this is regtools compare output across samples (so it contains variant-junction lines even from samples without variant)
74+
df <- rbindlist(lapply(all_samples, get_sample_data)) # eventually becomes all_cse_identify_data in function a
6875
df$info <- paste(df$chrom, df$start,
6976
df$end, df$anchor,
7077
df$variant_info, sep = "_")
@@ -74,15 +81,14 @@ df$info <- paste(df$chrom, df$start,
7481
################################################################################
7582

7683
a <- function(x, all_cse_identify_data){
77-
78-
## Get data from samples with variant
79-
variant_samples <- unlist(x$samples)
84+
## x comes from all_splicing_variants, so this just gives the samples with the variant
85+
variant_samples <- unlist(x$samples)
8086

8187
##############################################################################
8288
##### Get junction and variant information for variant samples ###############
8389
##############################################################################
8490

85-
## Get the data from sample with the variant and the significant junctions
91+
## Get junctions from concatenated regtools output associated with each variant, only if the data came from a sample actually containing the variant
8692
variant_junctions_data <- all_cse_identify_data[all_cse_identify_data$variant_info==x$key & all_cse_identify_data$sample %in% variant_samples,]
8793

8894
## Make dataset with the sample and the junction/variant information
@@ -95,22 +101,22 @@ a <- function(x, all_cse_identify_data){
95101
variant_junctions_data$norm_score <- variant_junctions_data$score / variant_junctions_data$score.tmp
96102

97103
# Aggregate data across junction-samples
98-
if (nrow(variant_junctions_data)[[1]] > 0){
99-
variant_junctions_aggr = variant_junctions_data[, list(average_norm_score=mean(norm_score),total_score=sum(score)),
100-
by=list(chrom,start,end,name,strand,anchor,variant_info,info)]
104+
if (nrow(variant_junctions_data)[[1]] > 0){ # names column refers to names that regtools gave each junction in the variant output (only will be multiple if variant and junction was found in multiple samples)
105+
variant_junctions_aggr = variant_junctions_data[, list(names=list(name),mean_norm_score_variant=mean(norm_score),sd_norm_score_variant=sd(norm_score),norm_scores_variant=list(norm_score),total_score_variant=sum(score)),
106+
by=list(chrom,start,end,strand,anchor,variant_info,info)]
101107

102108
} else {
103109
return(data.table())
104110
}
105111

106112
## Identify mutated samples
107113
a <- function(x, tempData){
108-
return(paste(as.character(tempData[info == x, "sample"]$sample), collapse="|"))
114+
return(paste(as.character(tempData[info == x, "sample"]$sample), collapse=",")) # for super-junction aggregated across samples with the variant, get list of samples with variant (and junction)
109115
}
110116
variant_junctions_aggr$sample <- sapply(variant_junctions_aggr$info, a, tempData = sample_data)
111117

112118
##############################################################################
113-
##### Get junction and variant information for non-variant samples###############
119+
##### Get junction and variant information for non-variant samples ###########
114120
##############################################################################
115121

116122
## Samples without variant
@@ -130,13 +136,22 @@ a <- function(x, all_cse_identify_data){
130136
colnames(summed_non_variant_scores) <- c("sample", "score.tmp")
131137
non_variant_junctions_data <- merge(non_variant_junctions_data, summed_non_variant_scores, by="sample")
132138
non_variant_junctions_data$norm_score <- non_variant_junctions_data$score / non_variant_junctions_data$score.tmp
133-
134-
135139

140+
# Aggregate data across junction-samples
141+
if (nrow(variant_junctions_data)[[1]] > 0){
142+
non_variant_junctions_aggr = non_variant_junctions_data[, list(mean_norm_score_non=mean(norm_score),sd_norm_score_non=sd(norm_score),norm_scores_non=sd(norm_score),total_score_non=sum(score)),
143+
by=list(chrom,start,end,strand,anchor,variant_info,info)]
144+
} else {
145+
return(data.table())
146+
}
136147

137148
# Generate histogram of normalized junction scores
138149
collapsed_norm_scores = non_variant_junctions_data[, list(norm_scores=list(norm_score)), by=list(chrom,start,end,strand)]
139150

151+
# Add non-variant sample aggregate stats to output
152+
variant_junctions_aggr = merge(x=variant_junctions_aggr, y=non_variant_junctions_aggr,by=
153+
c("chrom","start","end","strand","anchor","variant_info","info"),all.x=T)
154+
140155
# split out what p-values can be calculated on vs not
141156
collapsed_norm_scores$key <- paste(collapsed_norm_scores$chrom, collapsed_norm_scores$start, collapsed_norm_scores$end, collapsed_norm_scores$strand, sep='.')
142157
variant_junctions_aggr$key <- paste(variant_junctions_aggr$chrom, variant_junctions_aggr$start, variant_junctions_aggr$end, variant_junctions_aggr$strand, sep='.')
@@ -149,7 +164,7 @@ a <- function(x, all_cse_identify_data){
149164
start = trimws(junction_data[["start"]])
150165
end = trimws(junction_data[["end"]])
151166
strand = trimws(junction_data[["strand"]])
152-
variant_norm_score = junction_data[["average_norm_score"]]
167+
variant_norm_score = junction_data[["mean_norm_score_variant"]]
153168
### pass this variable
154169
i = which(collapsed_norm_scores$chrom == chrom & collapsed_norm_scores$start == start & collapsed_norm_scores$end == end & collapsed_norm_scores$strand == strand)
155170
if (length(i)){
@@ -186,16 +201,33 @@ a <- function(x, all_cse_identify_data){
186201
variant_junctions_aggr <- variant_junctions_aggr_norun_pvalues
187202
}
188203

189-
return(variant_junctions_aggr)
190-
204+
return(variant_junctions_aggr)
191205
}
192206

193-
regtools_data <- adply(all_splicing_variants, 1, a, all_cse_identify_data=df, .parallel=TRUE)
207+
# for each iteration (i.e. line in all_splicing_variants),
208+
# return a dataframe of regtools info and stats, basically what you end up seeing in regtools_data
209+
# so the samples column comes from all_splicing_variants, since adply uses rbind.fill, which fills in missing columns
210+
regtools_data <- adply(all_splicing_variants, 1, a, all_cse_identify_data=df, .parallel=!debug)
194211

195-
#regtools_data <- rbindlist(regtools_data, fill=TRUE)
212+
paste_commas <- function(v){
213+
return(paste(v,collapse = ","))
214+
}
196215

197216
regtools_data <- regtools_data[order(anchor, sample, pvalue)]
198-
regtools_data$samples <- paste(regtools_data$samples)
217+
regtools_data$norm_scores_variant <- unlist(lapply(regtools_data$norm_scores_variant,paste_commas))
218+
regtools_data$norm_scores_non <- unlist(lapply(regtools_data$norm_scores_non,paste_commas))
219+
regtools_data$samples <- unlist(lapply(regtools_data$samples,paste_commas))
220+
regtools_data$names <- unlist(lapply(regtools_data$names,paste_commas))
221+
regtools_data$sd_norm_score_variant[is.na(regtools_data$sd_norm_score_variant)] = 0
222+
regtools_data$mean_norm_score_non[is.na(regtools_data$mean_norm_score_non)] = 0
223+
regtools_data$sd_norm_score_non[is.na(regtools_data$sd_norm_score_non)] = 0
224+
regtools_data$total_score_variant[is.na(regtools_data$total_score_variant)] = 0
225+
regtools_data$total_score_non[is.na(regtools_data$total_score_non)] = 0
226+
227+
regtools_data = subset(regtools_data, select = !(names(regtools_data) %in% c("key")))
228+
colnames(regtools_data)[colnames(regtools_data)=="sample"] <- "variant_junction_samples"
229+
colnames(regtools_data)[colnames(regtools_data)=="samples"] <- "variant_samples"
230+
colnames(regtools_data)[colnames(regtools_data)=="info"] <- "variant_junction_info"
199231
})
200-
write.table(regtools_data, file=paste("compare_junctions/hist/", "junction_pvalues", tag, ".tsv", sep=""), quote=FALSE, sep='\t', row.names = F)
201232

233+
write.table(regtools_data, file=paste("compare_junctions/hist/", "junction_pvalues", tag, ".tsv", sep=""), quote=FALSE, sep='\t', row.names = F)

0 commit comments

Comments
 (0)