|
| 1 | +#' @title find_dmrs_wgbs |
| 2 | +#' |
| 3 | +#' @description Add here. |
| 4 | +#' |
| 5 | +#' @param verbose Add here. |
| 6 | +#' @param gr_target Add here. |
| 7 | +#' @param include_cpgs Add here. |
| 8 | +#' @param include_dmrs Add here. |
| 9 | +#' @param num_cpgs Add here. |
| 10 | +#' @param num_regions Add here. |
| 11 | +#' @param bumphunter_beta_cutoff Add here. |
| 12 | +#' @param dmr_up_cutoff Add here. |
| 13 | +#' @param dmr_down_cutoff Add here. |
| 14 | +#' @param dmr_pval_cutoff Add here. |
| 15 | +#' @param cpg_pval_cutoff Add here. |
| 16 | +#' @param cpg_up_dm_cutoff Add here. |
| 17 | +#' @param cpg_down_dm_cutoff Add here. |
| 18 | +#' @param pairwise_comparison Add here. |
| 19 | +#' @param mset_train_flow_sort Add here. |
| 20 | +#' |
| 21 | +#' @export |
| 22 | +#' |
| 23 | +#' @return Add here. |
| 24 | +#' |
| 25 | +#' @details |
| 26 | +#' Add here. |
| 27 | +#' |
| 28 | +#' @aliases find_dmrs_wgbs |
| 29 | +#' |
| 30 | +#' @docType methods |
| 31 | +#' @name find_dmrs_wgbs |
| 32 | +#' |
| 33 | +#' @import minfi |
| 34 | +#' @import FlowSorted.Blood.450k |
| 35 | +#' @importFrom plyranges arrange |
| 36 | +#' @importFrom S4Vectors queryHits |
| 37 | +#' @importFrom utils head |
| 38 | +#' @importFrom bumphunter clusterMaker |
| 39 | +#' @importFrom stats model.matrix |
| 40 | +#' @importFrom genefilter rowttests |
| 41 | +#' |
| 42 | +#' @examples |
| 43 | +#' found_regions <- find_dmrs_wgbs() |
| 44 | +#' |
| 45 | +#' @rdname find_dmrs_wgbs |
| 46 | +#' @export |
| 47 | +find_dmrs_wgbs <- function(verbose=TRUE, gr_target=NULL, |
| 48 | + num_regions=50, bumphunter_beta_cutoff = 0.2, |
| 49 | + dmr_up_cutoff = 0.5, dmr_down_cutoff = 0.4, |
| 50 | + dmr_pval_cutoff = 1e-11, |
| 51 | + dmrseq_search = FALSE, workers = 1) { |
| 52 | + |
| 53 | + |
| 54 | + library(rhdf5) |
| 55 | + library(HDF5Array) |
| 56 | + library(bsseq) |
| 57 | + |
| 58 | + dataPath <- "/users/shicks1/data/DNAm/blueprint_ihec" |
| 59 | + hdf5_bs_se_path <- file.path(dataPath, "files_bsseq_hdf5_se") |
| 60 | + bs <- loadHDF5SummarizedExperiment(hdf5_bs_se_path) |
| 61 | + |
| 62 | + IDs = c("Gran", "CD4T", "CD8T", "Bcell","Mono", "NK") |
| 63 | + |
| 64 | + # extract beta values, phenotypic information and GRanges objects |
| 65 | + cell <- factor(bs$cell_type, levels = IDs) |
| 66 | + cell_levels <- levels(cell) |
| 67 | + |
| 68 | + # define design matrix to search for DMRs |
| 69 | + xmat = model.matrix(~ cell - 1) |
| 70 | + colnames(xmat) = cell_levels |
| 71 | + pData(bs) <- cbind(pData(bs), xmat) |
| 72 | + |
| 73 | + all_poss = diag(length(cell_levels)) |
| 74 | + all_poss <- (all_poss == TRUE) |
| 75 | + colnames(all_poss) <- cell_levels |
| 76 | + |
| 77 | + # find celltype specific regions using only overlap CpGs in target object |
| 78 | + if(!is.null(gr_target)){ |
| 79 | + # which of the WGBS CpGs overlap with the target CpGs |
| 80 | + zz <- findOverlaps(granges(bs), gr_target) |
| 81 | + bs <- bs[queryHits(zz), ] |
| 82 | + if(verbose){ |
| 83 | + mes <- "[estimatecc] gr_target is not null. Using %s overlapping CpGs." |
| 84 | + message(sprintf(mes, nrow(bs))) |
| 85 | + } |
| 86 | + } |
| 87 | + |
| 88 | + regs <- vector("list", length(IDs)) |
| 89 | + regions_all <- GRanges() |
| 90 | + zmat <- c() # regions_all, will contain all celltype-specific DMRs |
| 91 | + |
| 92 | + for(ind in seq_len(nrow(all_poss))){ |
| 93 | + |
| 94 | + if(!dmrseq_search){ |
| 95 | + if(verbose){ |
| 96 | + mes <- "[estimatecc] Loading %s cell type-specific regions." |
| 97 | + message(sprintf(mes, cell_levels[ind])) |
| 98 | + } |
| 99 | + regs[[ind]] <- readRDS(file = file.path("data", |
| 100 | + paste0("blueprint_blood_regions_dmrseq_", |
| 101 | + tolower(IDs[ind]),".RDS"))) |
| 102 | + } else { |
| 103 | + library(BiocParallel) |
| 104 | + register(MulticoreParam(workers)) |
| 105 | + |
| 106 | + if(verbose){ |
| 107 | + mes <- "[estimatecc] Searching for %s cell type-specific regions." |
| 108 | + message(sprintf(mes, cell_levels[ind])) |
| 109 | + } |
| 110 | + regs[[ind]] <- dmrseq(bs=bs, testCovariate=IDs[ind], minNumRegion = 3, |
| 111 | + cutoff = bumphunter_beta_cutoff, verbose = TRUE) |
| 112 | + } |
| 113 | + |
| 114 | + keep_dmrs = .pick_target_positions(target_granges = granges(bs), |
| 115 | + target_object = getCoverage(bs, type = "M"), |
| 116 | + target_cvg = getCoverage(bs, type = "Cov"), |
| 117 | + dmp_regions = regs[[ind]])$dmp_granges |
| 118 | + |
| 119 | + # y_regions are the beta values collapsed (CpGs averaged) by regions from bumphunter |
| 120 | + y_regions <- as.matrix(mcols(keep_dmrs)) |
| 121 | + |
| 122 | + tmp <- genefilter::rowttests(x = y_regions,fac = factor(pData(bs)[, IDs[ind]])) |
| 123 | + regs[[ind]]$ttest_pval <- tmp$p.value |
| 124 | + regs[[ind]]$ttest_dm <- tmp$dm |
| 125 | + regs[[ind]]$dmr_up_max_diff <- apply(abs(sweep(y_regions, 2, (pData(bs)[, IDs[ind]]), FUN = "-")), 1, max) |
| 126 | + regs[[ind]]$dmr_down_max_diff <- apply(abs(sweep(y_regions, 2, (1 - pData(bs)[, IDs[ind]]), FUN = "-")), 1, max) |
| 127 | + |
| 128 | + # Apply quality control |
| 129 | + keep_ind_regions <- (regs[[ind]]$qval < 0.05) & # (regs[[ind]]$area > 1) # & |
| 130 | + (regs[[ind]]$ttest_pval < dmr_pval_cutoff) # ideally less than 1e-11 |
| 131 | + regs[[ind]] <- regs[[ind]][keep_ind_regions, ] |
| 132 | + gr_regions_up <- regs[[ind]][(regs[[ind]]$stat > 0) & |
| 133 | + (regs[[ind]]$dmr_up_max_diff < dmr_up_cutoff), ] |
| 134 | + gr_regions_down <- regs[[ind]][(regs[[ind]]$stat < 0) & |
| 135 | + (regs[[ind]]$dmr_down_max_diff < dmr_down_cutoff), ] |
| 136 | + |
| 137 | + if(length(gr_regions_up) > 0){ |
| 138 | + gr_regions_up <- gr_regions_up[, names(mcols(gr_regions_up)) %in% |
| 139 | + c("L", "area", "stat", "ttest_dm", |
| 140 | + "ttest_pval", "dmr_up_max_diff")] |
| 141 | + names(mcols(gr_regions_up))[ |
| 142 | + names(mcols(gr_regions_up)) == "dmr_up_max_diff"] <- "dmr_max_diff" |
| 143 | + gr_regions_up <- gr_regions_up[order(-gr_regions_up$stat), ] |
| 144 | + gr_regions_up <- gr_regions_up[seq_len(min(length(gr_regions_up), num_regions)), ] |
| 145 | + } else { |
| 146 | + gr_regions_up <- GRanges() |
| 147 | + } |
| 148 | + |
| 149 | + if(length(gr_regions_down) > 0){ |
| 150 | + gr_regions_down <- gr_regions_down[, names(mcols(gr_regions_down)) %in% |
| 151 | + c("L", "area", "stat", "ttest_dm", |
| 152 | + "ttest_pval", "dmr_down_max_diff")] |
| 153 | + names(mcols(gr_regions_down))[ |
| 154 | + names(mcols(gr_regions_down)) == "dmr_down_max_diff"] <- "dmr_max_diff" |
| 155 | + gr_regions_down <- gr_regions_down[order(gr_regions_down$stat), ] |
| 156 | + gr_regions_down <- gr_regions_down[seq_len(min(length(gr_regions_down), num_regions)), ] |
| 157 | + } else { |
| 158 | + gr_regions_down <- GRanges() |
| 159 | + } |
| 160 | + |
| 161 | + mcols(gr_regions_up)$status <- rep("Up", length(gr_regions_up)) |
| 162 | + mcols(gr_regions_down)$status <- rep("Down", length(gr_regions_down)) |
| 163 | + bump_mat_all <- c(gr_regions_up, gr_regions_down) |
| 164 | + mcols(bump_mat_all)$cellType <- rep(cell_levels[ind], length(bump_mat_all)) |
| 165 | + |
| 166 | + if(verbose){ |
| 167 | + mes <- "[estimatecc] Found %s %s cell type-specific regions." |
| 168 | + message(sprintf(mes, length(bump_mat_all), cell_levels[ind])) |
| 169 | + } |
| 170 | + if(length(bump_mat_all) > 0){ |
| 171 | + regions_all <- c(regions_all, bump_mat_all) |
| 172 | + } |
| 173 | + if(length(gr_regions_up) > 0){ |
| 174 | + zmat <- rbind(zmat, t(replicate(min(length(gr_regions_up), num_regions), |
| 175 | + as.numeric(all_poss[ind,])))) |
| 176 | + } |
| 177 | + if(length(gr_regions_down) > 0){ |
| 178 | + zmat <- rbind(zmat, t(replicate(min(length(gr_regions_down), num_regions), |
| 179 | + as.numeric(!all_poss[ind,])))) |
| 180 | + } |
| 181 | + } |
| 182 | + |
| 183 | + colnames(zmat) <- cell_levels |
| 184 | + |
| 185 | + keep_dmrs = .pick_target_positions(target_granges = granges(bs), |
| 186 | + target_object = getCoverage(bs, type = "M"), |
| 187 | + target_cvg = getCoverage(bs, type = "Cov"), |
| 188 | + dmp_regions = regions_all)$dmp_granges |
| 189 | + y_regions <- as.matrix(mcols(keep_dmrs)) |
| 190 | + |
| 191 | + profiles <- sapply(.splitit(cell),function(ind){ rowMeans(y_regions[,ind]) } ) |
| 192 | + |
| 193 | + removeMe <- duplicated(regions_all) |
| 194 | + list(regions_all = regions_all[!removeMe,], |
| 195 | + zmat = zmat[!removeMe,], |
| 196 | + y_regions = y_regions[!removeMe,], |
| 197 | + profiles = profiles[!removeMe,], |
| 198 | + cell = cell, cell_mat = all_poss, |
| 199 | + cell_levels = cell_levels, pd = pData(bs)) |
| 200 | +} |
0 commit comments