Skip to content

Commit a0a8f02

Browse files
adding DMRs from WGBS BLUEPRINT data
1 parent 414f9a7 commit a0a8f02

7 files changed

+200
-0
lines changed

R/find_dmrs_wgbs.R

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
#' @title find_dmrs_wgbs
2+
#'
3+
#' @description Add here.
4+
#'
5+
#' @param verbose Add here.
6+
#' @param gr_target Add here.
7+
#' @param include_cpgs Add here.
8+
#' @param include_dmrs Add here.
9+
#' @param num_cpgs Add here.
10+
#' @param num_regions Add here.
11+
#' @param bumphunter_beta_cutoff Add here.
12+
#' @param dmr_up_cutoff Add here.
13+
#' @param dmr_down_cutoff Add here.
14+
#' @param dmr_pval_cutoff Add here.
15+
#' @param cpg_pval_cutoff Add here.
16+
#' @param cpg_up_dm_cutoff Add here.
17+
#' @param cpg_down_dm_cutoff Add here.
18+
#' @param pairwise_comparison Add here.
19+
#' @param mset_train_flow_sort Add here.
20+
#'
21+
#' @export
22+
#'
23+
#' @return Add here.
24+
#'
25+
#' @details
26+
#' Add here.
27+
#'
28+
#' @aliases find_dmrs_wgbs
29+
#'
30+
#' @docType methods
31+
#' @name find_dmrs_wgbs
32+
#'
33+
#' @import minfi
34+
#' @import FlowSorted.Blood.450k
35+
#' @importFrom plyranges arrange
36+
#' @importFrom S4Vectors queryHits
37+
#' @importFrom utils head
38+
#' @importFrom bumphunter clusterMaker
39+
#' @importFrom stats model.matrix
40+
#' @importFrom genefilter rowttests
41+
#'
42+
#' @examples
43+
#' found_regions <- find_dmrs_wgbs()
44+
#'
45+
#' @rdname find_dmrs_wgbs
46+
#' @export
47+
find_dmrs_wgbs <- function(verbose=TRUE, gr_target=NULL,
48+
num_regions=50, bumphunter_beta_cutoff = 0.2,
49+
dmr_up_cutoff = 0.5, dmr_down_cutoff = 0.4,
50+
dmr_pval_cutoff = 1e-11,
51+
dmrseq_search = FALSE, workers = 1) {
52+
53+
54+
library(rhdf5)
55+
library(HDF5Array)
56+
library(bsseq)
57+
58+
dataPath <- "/users/shicks1/data/DNAm/blueprint_ihec"
59+
hdf5_bs_se_path <- file.path(dataPath, "files_bsseq_hdf5_se")
60+
bs <- loadHDF5SummarizedExperiment(hdf5_bs_se_path)
61+
62+
IDs = c("Gran", "CD4T", "CD8T", "Bcell","Mono", "NK")
63+
64+
# extract beta values, phenotypic information and GRanges objects
65+
cell <- factor(bs$cell_type, levels = IDs)
66+
cell_levels <- levels(cell)
67+
68+
# define design matrix to search for DMRs
69+
xmat = model.matrix(~ cell - 1)
70+
colnames(xmat) = cell_levels
71+
pData(bs) <- cbind(pData(bs), xmat)
72+
73+
all_poss = diag(length(cell_levels))
74+
all_poss <- (all_poss == TRUE)
75+
colnames(all_poss) <- cell_levels
76+
77+
# find celltype specific regions using only overlap CpGs in target object
78+
if(!is.null(gr_target)){
79+
# which of the WGBS CpGs overlap with the target CpGs
80+
zz <- findOverlaps(granges(bs), gr_target)
81+
bs <- bs[queryHits(zz), ]
82+
if(verbose){
83+
mes <- "[estimatecc] gr_target is not null. Using %s overlapping CpGs."
84+
message(sprintf(mes, nrow(bs)))
85+
}
86+
}
87+
88+
regs <- vector("list", length(IDs))
89+
regions_all <- GRanges()
90+
zmat <- c() # regions_all, will contain all celltype-specific DMRs
91+
92+
for(ind in seq_len(nrow(all_poss))){
93+
94+
if(!dmrseq_search){
95+
if(verbose){
96+
mes <- "[estimatecc] Loading %s cell type-specific regions."
97+
message(sprintf(mes, cell_levels[ind]))
98+
}
99+
regs[[ind]] <- readRDS(file = file.path("data",
100+
paste0("blueprint_blood_regions_dmrseq_",
101+
tolower(IDs[ind]),".RDS")))
102+
} else {
103+
library(BiocParallel)
104+
register(MulticoreParam(workers))
105+
106+
if(verbose){
107+
mes <- "[estimatecc] Searching for %s cell type-specific regions."
108+
message(sprintf(mes, cell_levels[ind]))
109+
}
110+
regs[[ind]] <- dmrseq(bs=bs, testCovariate=IDs[ind], minNumRegion = 3,
111+
cutoff = bumphunter_beta_cutoff, verbose = TRUE)
112+
}
113+
114+
keep_dmrs = .pick_target_positions(target_granges = granges(bs),
115+
target_object = getCoverage(bs, type = "M"),
116+
target_cvg = getCoverage(bs, type = "Cov"),
117+
dmp_regions = regs[[ind]])$dmp_granges
118+
119+
# y_regions are the beta values collapsed (CpGs averaged) by regions from bumphunter
120+
y_regions <- as.matrix(mcols(keep_dmrs))
121+
122+
tmp <- genefilter::rowttests(x = y_regions,fac = factor(pData(bs)[, IDs[ind]]))
123+
regs[[ind]]$ttest_pval <- tmp$p.value
124+
regs[[ind]]$ttest_dm <- tmp$dm
125+
regs[[ind]]$dmr_up_max_diff <- apply(abs(sweep(y_regions, 2, (pData(bs)[, IDs[ind]]), FUN = "-")), 1, max)
126+
regs[[ind]]$dmr_down_max_diff <- apply(abs(sweep(y_regions, 2, (1 - pData(bs)[, IDs[ind]]), FUN = "-")), 1, max)
127+
128+
# Apply quality control
129+
keep_ind_regions <- (regs[[ind]]$qval < 0.05) & # (regs[[ind]]$area > 1) # &
130+
(regs[[ind]]$ttest_pval < dmr_pval_cutoff) # ideally less than 1e-11
131+
regs[[ind]] <- regs[[ind]][keep_ind_regions, ]
132+
gr_regions_up <- regs[[ind]][(regs[[ind]]$stat > 0) &
133+
(regs[[ind]]$dmr_up_max_diff < dmr_up_cutoff), ]
134+
gr_regions_down <- regs[[ind]][(regs[[ind]]$stat < 0) &
135+
(regs[[ind]]$dmr_down_max_diff < dmr_down_cutoff), ]
136+
137+
if(length(gr_regions_up) > 0){
138+
gr_regions_up <- gr_regions_up[, names(mcols(gr_regions_up)) %in%
139+
c("L", "area", "stat", "ttest_dm",
140+
"ttest_pval", "dmr_up_max_diff")]
141+
names(mcols(gr_regions_up))[
142+
names(mcols(gr_regions_up)) == "dmr_up_max_diff"] <- "dmr_max_diff"
143+
gr_regions_up <- gr_regions_up[order(-gr_regions_up$stat), ]
144+
gr_regions_up <- gr_regions_up[seq_len(min(length(gr_regions_up), num_regions)), ]
145+
} else {
146+
gr_regions_up <- GRanges()
147+
}
148+
149+
if(length(gr_regions_down) > 0){
150+
gr_regions_down <- gr_regions_down[, names(mcols(gr_regions_down)) %in%
151+
c("L", "area", "stat", "ttest_dm",
152+
"ttest_pval", "dmr_down_max_diff")]
153+
names(mcols(gr_regions_down))[
154+
names(mcols(gr_regions_down)) == "dmr_down_max_diff"] <- "dmr_max_diff"
155+
gr_regions_down <- gr_regions_down[order(gr_regions_down$stat), ]
156+
gr_regions_down <- gr_regions_down[seq_len(min(length(gr_regions_down), num_regions)), ]
157+
} else {
158+
gr_regions_down <- GRanges()
159+
}
160+
161+
mcols(gr_regions_up)$status <- rep("Up", length(gr_regions_up))
162+
mcols(gr_regions_down)$status <- rep("Down", length(gr_regions_down))
163+
bump_mat_all <- c(gr_regions_up, gr_regions_down)
164+
mcols(bump_mat_all)$cellType <- rep(cell_levels[ind], length(bump_mat_all))
165+
166+
if(verbose){
167+
mes <- "[estimatecc] Found %s %s cell type-specific regions."
168+
message(sprintf(mes, length(bump_mat_all), cell_levels[ind]))
169+
}
170+
if(length(bump_mat_all) > 0){
171+
regions_all <- c(regions_all, bump_mat_all)
172+
}
173+
if(length(gr_regions_up) > 0){
174+
zmat <- rbind(zmat, t(replicate(min(length(gr_regions_up), num_regions),
175+
as.numeric(all_poss[ind,]))))
176+
}
177+
if(length(gr_regions_down) > 0){
178+
zmat <- rbind(zmat, t(replicate(min(length(gr_regions_down), num_regions),
179+
as.numeric(!all_poss[ind,]))))
180+
}
181+
}
182+
183+
colnames(zmat) <- cell_levels
184+
185+
keep_dmrs = .pick_target_positions(target_granges = granges(bs),
186+
target_object = getCoverage(bs, type = "M"),
187+
target_cvg = getCoverage(bs, type = "Cov"),
188+
dmp_regions = regions_all)$dmp_granges
189+
y_regions <- as.matrix(mcols(keep_dmrs))
190+
191+
profiles <- sapply(.splitit(cell),function(ind){ rowMeans(y_regions[,ind]) } )
192+
193+
removeMe <- duplicated(regions_all)
194+
list(regions_all = regions_all[!removeMe,],
195+
zmat = zmat[!removeMe,],
196+
y_regions = y_regions[!removeMe,],
197+
profiles = profiles[!removeMe,],
198+
cell = cell, cell_mat = all_poss,
199+
cell_levels = cell_levels, pd = pData(bs))
200+
}
35.4 KB
Binary file not shown.
18.9 KB
Binary file not shown.
21.2 KB
Binary file not shown.
23.9 KB
Binary file not shown.
15.9 KB
Binary file not shown.
46.5 KB
Binary file not shown.

0 commit comments

Comments
 (0)