|
| 1 | +# compare_junctions_hist.R |
| 2 | +# Rscript --vanilla compare_junctions_hist.R <tag> |
| 3 | + |
| 4 | +# load libraries |
| 5 | +library(data.table) |
| 6 | +library(graphics) |
| 7 | +library(plyr) |
| 8 | +library(foreach) |
| 9 | +library(doParallel) |
| 10 | +registerDoParallel(cores=32) |
| 11 | + |
| 12 | +system.time({ |
| 13 | +# get options tag |
| 14 | +#argc = length(commandArgs()) |
| 15 | +tag = paste("_", "default", sep="") |
| 16 | + |
| 17 | +if ( substr(tag, 2, 3) == "--"){ |
| 18 | + stop("Please specify an option tag (e.g. \"default\", \"i20e5\")") |
| 19 | +} |
| 20 | + |
| 21 | +## All splicing relevant variants (union of rows from variants.bed files; add column with comma-separated list of sample names) |
| 22 | +all_splicing_variants = unique(data.table::fread(paste("all_splicing_variants", tag, ".bed",sep=""), sep = '\t', header = T, stringsAsFactors = FALSE)) |
| 23 | +colnames(all_splicing_variants) <- c("chrom", "start", "end", "samples") |
| 24 | + |
| 25 | +all_splicing_variants = as.data.table(aggregate(samples ~ chrom + start + end, paste, data=all_splicing_variants)) |
| 26 | +all_splicing_variants$key <- paste0(all_splicing_variants$chrom, ":", all_splicing_variants$start, "-", all_splicing_variants$end) |
| 27 | + |
| 28 | +## Get all of the samples |
| 29 | +all_samples = strsplit(scan("dir_names.tsv", what="", sep="\n"), "[[:space:]]+") |
| 30 | + |
| 31 | +################################################################################ |
| 32 | +##### Helper functions ######################################################### |
| 33 | +################################################################################ |
| 34 | + |
| 35 | +get_sample_data <- function(sample){ |
| 36 | + |
| 37 | + file_name = paste("samples/", sample, "/output/cse_identify_filtered_compare", tag,".tsv", sep = "") |
| 38 | + cse_identify_data = data.table::fread(file_name, sep = '\t', header = T, stringsAsFactors = FALSE, strip.white = TRUE) |
| 39 | + cse_identify_data$sample <- sample |
| 40 | + s <- strsplit(cse_identify_data$variant_info, split = ",") |
| 41 | + cse_identify_data <- data.table::data.table(chrom=rep(cse_identify_data$chrom, sapply(s, length)), |
| 42 | + start=rep(cse_identify_data$start, sapply(s, length)), |
| 43 | + end=rep(cse_identify_data$end, sapply(s, length)), |
| 44 | + name=rep(cse_identify_data$name, sapply(s, length)), |
| 45 | + score=rep(cse_identify_data$score, sapply(s, length)), |
| 46 | + strand=rep(cse_identify_data$strand, sapply(s, length)), |
| 47 | + splice_site=rep(cse_identify_data$splice_site, sapply(s, length)), |
| 48 | + acceptors_skipped=rep(cse_identify_data$acceptors_skipped, sapply(s, length)), |
| 49 | + exons_skipped=rep(cse_identify_data$exons_skipped, sapply(s, length)), |
| 50 | + donors_skipped=rep(cse_identify_data$donors_skipped, sapply(s, length)), |
| 51 | + anchor=rep(cse_identify_data$anchor, sapply(s, length)), |
| 52 | + known_donor=rep(cse_identify_data$known_donor, sapply(s, length)), |
| 53 | + known_acceptor=rep(cse_identify_data$known_acceptor, sapply(s, length)), |
| 54 | + known_junction=rep(cse_identify_data$known_junction, sapply(s, length)), |
| 55 | + genes=rep(cse_identify_data$genes, sapply(s, length)), |
| 56 | + transcripts=rep(cse_identify_data$transcripts, sapply(s, length)), |
| 57 | + sample=rep(cse_identify_data$sample, sapply(s, length)), |
| 58 | + variant_info=unlist(s)) |
| 59 | + return(cse_identify_data) |
| 60 | +} |
| 61 | + |
| 62 | +################################################################################ |
| 63 | +##### Combine cse_identify_filtered_compare.tsv files for each sample ########## |
| 64 | +################################################################################ |
| 65 | + |
| 66 | +# this is regtools output per sample |
| 67 | +df <- rbindlist(lapply(all_samples, get_sample_data)) |
| 68 | +df$info <- paste(df$chrom, df$start, |
| 69 | + df$end, df$anchor, |
| 70 | + df$variant_info, sep = "_") |
| 71 | + |
| 72 | +################################################################################ |
| 73 | +##### Function to create reviewable regtools output files ###################### |
| 74 | +################################################################################ |
| 75 | + |
| 76 | +a <- function(x, all_cse_identify_data){ |
| 77 | + |
| 78 | + ## Get data from samples with variant |
| 79 | + variant_samples <- unlist(x$samples) |
| 80 | + |
| 81 | + ############################################################################## |
| 82 | + ##### Get junction and variant information for variant samples ############### |
| 83 | + ############################################################################## |
| 84 | + |
| 85 | + ## Get the data from sample with the variant and the significant junctions |
| 86 | + variant_junctions_data <- all_cse_identify_data[all_cse_identify_data$variant_info==x$key & all_cse_identify_data$sample %in% variant_samples,] |
| 87 | + |
| 88 | + ## Make dataset with the sample and the junction/variant information |
| 89 | + sample_data <- variant_junctions_data[,c("sample", "info")] |
| 90 | + |
| 91 | + # Normalize junction scores in this variant region for each sample |
| 92 | + summed_variant_scores = variant_junctions_data[, list(score=sum(score)), by=list(sample)] |
| 93 | + colnames(summed_variant_scores) <- c("sample", "score.tmp") |
| 94 | + variant_junctions_data <- merge(variant_junctions_data, summed_variant_scores, by="sample") |
| 95 | + variant_junctions_data$norm_score <- variant_junctions_data$score / variant_junctions_data$score.tmp |
| 96 | + |
| 97 | + # Aggregate data across junction-samples |
| 98 | + if (nrow(variant_junctions_data)[[1]] > 0){ |
| 99 | + variant_junctions_aggr = variant_junctions_data[, list(average_norm_score=mean(norm_score),total_score=sum(score)), |
| 100 | + by=list(chrom,start,end,name,strand,anchor,variant_info,info)] |
| 101 | + |
| 102 | + } else { |
| 103 | + return(data.table()) |
| 104 | + } |
| 105 | + |
| 106 | + ## Identify mutated samples |
| 107 | + a <- function(x, tempData){ |
| 108 | + return(paste(as.character(tempData[info == x, "sample"]$sample), collapse="|")) |
| 109 | + } |
| 110 | + variant_junctions_aggr$sample <- sapply(variant_junctions_aggr$info, a, tempData = sample_data) |
| 111 | + |
| 112 | + ############################################################################## |
| 113 | + ##### Get junction and variant information for non-variant samples############### |
| 114 | + ############################################################################## |
| 115 | + |
| 116 | + ## Samples without variant |
| 117 | + non_samples = setdiff(all_samples, variant_samples) |
| 118 | + non_variant_junctions_data <- all_cse_identify_data[all_cse_identify_data$variant_info==x$key & all_cse_identify_data$sample %in% non_samples] |
| 119 | + |
| 120 | + ## Get variant info |
| 121 | + non_variant_junctions_data$info <- paste(non_variant_junctions_data$chrom, non_variant_junctions_data$start, |
| 122 | + non_variant_junctions_data$end, non_variant_junctions_data$anchor, |
| 123 | + non_variant_junctions_data$variant_info, sep = "_") |
| 124 | + |
| 125 | + ## Make dataset with the sample and the junction/variant information |
| 126 | + non_variant_sample_data <- non_variant_junctions_data[,c("sample", "info")] |
| 127 | + |
| 128 | + # Normalize junction scores in this variant region for each sample |
| 129 | + summed_non_variant_scores = non_variant_junctions_data[, list(score=sum(score)), by=list(sample)] |
| 130 | + colnames(summed_non_variant_scores) <- c("sample", "score.tmp") |
| 131 | + non_variant_junctions_data <- merge(non_variant_junctions_data, summed_non_variant_scores, by="sample") |
| 132 | + non_variant_junctions_data$norm_score <- non_variant_junctions_data$score / non_variant_junctions_data$score.tmp |
| 133 | + |
| 134 | + |
| 135 | + |
| 136 | + |
| 137 | + # Generate histogram of normalized junction scores |
| 138 | + collapsed_norm_scores = non_variant_junctions_data[, list(norm_scores=list(norm_score)), by=list(chrom,start,end,strand)] |
| 139 | + |
| 140 | + # split out what p-values can be calculated on vs not |
| 141 | + collapsed_norm_scores$key <- paste(collapsed_norm_scores$chrom, collapsed_norm_scores$start, collapsed_norm_scores$end, collapsed_norm_scores$strand, sep='.') |
| 142 | + variant_junctions_aggr$key <- paste(variant_junctions_aggr$chrom, variant_junctions_aggr$start, variant_junctions_aggr$end, variant_junctions_aggr$strand, sep='.') |
| 143 | + variant_junctions_aggr_run_pvalues <- variant_junctions_aggr[variant_junctions_aggr$key %in% collapsed_norm_scores$key,] |
| 144 | + variant_junctions_aggr_norun_pvalues <- variant_junctions_aggr[!variant_junctions_aggr$key %in% collapsed_norm_scores$key,] |
| 145 | + variant_junctions_aggr_norun_pvalues$pvalue <- 0 |
| 146 | + |
| 147 | + calculate_pvalues <- function(junction_data){ #### pull out |
| 148 | + chrom = trimws(junction_data[["chrom"]]) |
| 149 | + start = trimws(junction_data[["start"]]) |
| 150 | + end = trimws(junction_data[["end"]]) |
| 151 | + strand = trimws(junction_data[["strand"]]) |
| 152 | + variant_norm_score = junction_data[["average_norm_score"]] |
| 153 | + ### pass this variable |
| 154 | + i = which(collapsed_norm_scores$chrom == chrom & collapsed_norm_scores$start == start & collapsed_norm_scores$end == end & collapsed_norm_scores$strand == strand) |
| 155 | + if (length(i)){ |
| 156 | + non_variant_norm_scores = collapsed_norm_scores[i]$norm_scores[[1]] |
| 157 | + missing = length(non_samples) - length(non_variant_norm_scores) |
| 158 | + missing_zeros = integer(missing) |
| 159 | + non_variant_norm_scores = c(non_variant_norm_scores, missing_zeros) |
| 160 | + |
| 161 | + variant_norm_score = as.numeric(variant_norm_score) |
| 162 | + all_norm_scores = c(non_variant_norm_scores, variant_norm_score) |
| 163 | + countable = rank(all_norm_scores) |
| 164 | + non_variant_norm_scores_ranked = head(countable, -1) |
| 165 | + variant_norm_score_ranked = tail(countable, 1) |
| 166 | + |
| 167 | + histinfo = hist(non_variant_norm_scores_ranked, |
| 168 | + breaks = seq(0.5, max(non_variant_norm_scores_ranked)+1.5, by=1), plot=F) |
| 169 | + mids = histinfo$mids |
| 170 | + cd = cumsum(histinfo$density) |
| 171 | + underestimate = max(which(mids <= variant_norm_score_ranked)) |
| 172 | + pvalue = 1-cd[underestimate] |
| 173 | + |
| 174 | + return(pvalue) |
| 175 | + } else { |
| 176 | + return(0) |
| 177 | + } |
| 178 | + } |
| 179 | + if(nrow(variant_junctions_aggr_run_pvalues) > 0){ |
| 180 | + # apply above function to each aggregated variant junction |
| 181 | + pvalues = apply(variant_junctions_aggr_run_pvalues, 1, FUN = calculate_pvalues) |
| 182 | + variant_junctions_aggr_run_pvalues$pvalue = pvalues |
| 183 | + |
| 184 | + variant_junctions_aggr <- rbind(variant_junctions_aggr_norun_pvalues, variant_junctions_aggr_run_pvalues) |
| 185 | + } else { |
| 186 | + variant_junctions_aggr <- variant_junctions_aggr_norun_pvalues |
| 187 | + } |
| 188 | + |
| 189 | + return(variant_junctions_aggr) |
| 190 | + |
| 191 | +} |
| 192 | + |
| 193 | +regtools_data <- adply(all_splicing_variants, 1, a, all_cse_identify_data=df, .parallel=TRUE) |
| 194 | + |
| 195 | +#regtools_data <- rbindlist(regtools_data, fill=TRUE) |
| 196 | + |
| 197 | +regtools_data <- regtools_data[order(anchor, sample, pvalue)] |
| 198 | +regtools_data$samples <- paste(regtools_data$samples) |
| 199 | +}) |
| 200 | +write.table(regtools_data, file=paste("compare_junctions/hist/", "junction_pvalues", tag, ".tsv", sep=""), quote=FALSE, sep='\t', row.names = F) |
| 201 | + |
0 commit comments