Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ export(get_hierarchical_clusters)
export(get_robust_colocalization)
export(get_robust_ucos)
export(get_ucos_summary)
importFrom(Rfast,correls)
importFrom(Rfast,med)
importFrom(Rfast,standardise)
importFrom(Rfast,upper_tri)
importFrom(grDevices,adjustcolor)
importFrom(graphics,abline)
importFrom(graphics,axis)
Expand Down
124 changes: 89 additions & 35 deletions R/colocboost.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,20 @@
#' \code{variant} is required if sumstat for different outcomes do not have the same number of variables.
#' \code{var_y} is the variance of phenotype (default is 1 meaning that the Y is in the \dQuote{standardized} scale).
#' @param LD A list of correlation matrix indicating the LD matrix for each genotype. It also could be a single matrix if all sumstats were
#' obtained from the same genotypes.
#' obtained from the same genotypes. Provide either \code{LD} or \code{X_ref}, not both.
#' If neither is provided, LD-free mode is used.
#' @param X_ref A reference panel genotype matrix (N_ref x P) or a list of matrices, as an alternative to providing a precomputed
#' \code{LD} matrix. Column names must include variant names matching those in \code{sumstat}.
#' When the number of reference panel samples is less than the number of variants (N_ref < P),
#' this avoids storing the full P x P LD matrix and reduces memory usage.
#' When N_ref >= P, LD is precomputed from \code{X_ref} internally.
#' Provide either \code{LD} or \code{X_ref}, not both. If neither is provided, LD-free mode is used.
#' @param dict_YX A L by 2 matrix of dictionary for \code{X} and \code{Y} if there exist subsets of outcomes corresponding to the same X matrix.
#' The first column should be 1:L for L outcomes. The second column should be the index of \code{X} corresponding to the outcome.
#' The innovation: do not provide the same matrix in \code{X} to reduce the computational burden.
#' @param dict_sumstatLD A L by 2 matrix of dictionary for \code{sumstat} and \code{LD} if there exist subsets of outcomes corresponding to the same sumstat.
#' The first column should be 1:L for L sumstat The second column should be the index of \code{LD} corresponding to the sumstat.
#' @param dict_sumstatLD A L by 2 matrix of dictionary for \code{sumstat} and \code{LD} (or \code{X_ref}) if there exist subsets of outcomes
#' corresponding to the same sumstat.
#' The first column should be 1:L for L sumstat The second column should be the index of \code{LD} (or \code{X_ref}) corresponding to the sumstat.
#' The innovation: do not provide the same matrix in \code{LD} to reduce the computational burden.
#' @param outcome_names The names of outcomes, which has the same order for Y.
#' @param focal_outcome_idx The index of the focal outcome if perform GWAS-xQTL ColocBoost
Expand Down Expand Up @@ -129,9 +137,10 @@
#'
#' @family colocboost
#' @importFrom stats na.omit
#' @importFrom Rfast correls standardise upper_tri med
#' @export
colocboost <- function(X = NULL, Y = NULL, # individual data
sumstat = NULL, LD = NULL, # summary statistics: either Z, bhat, sebhat, N, var_Y,
sumstat = NULL, LD = NULL, X_ref = NULL, # summary statistics: either Z, bhat, sebhat, N, var_Y,
###### - index dict for X match multiple Y / LD match multiple sumstat
dict_YX = NULL, # Y index for 1st column, X index for 2nd column
dict_sumstatLD = NULL, # sumstat index for 1st column, LD index for 2nd column
Expand Down Expand Up @@ -202,11 +211,15 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
warning("Error: No individual data (X, Y) or summary statistics (sumstat) or (effect_est, effect_se) are provided! Please check!")
return(NULL)
}
if (!is.null(LD) && !is.null(X_ref)) {
warning("Error: Provide either LD or X_ref, not both.")
return(NULL)
}

# - check input data: individual level data and summary-level data
validated_data <- colocboost_validate_input_data(
X = X, Y = Y,
sumstat = sumstat, LD = LD,
sumstat = sumstat, LD = LD, X_ref = X_ref,
dict_YX = dict_YX, dict_sumstatLD = dict_sumstatLD,
effect_est = effect_est, effect_se = effect_se, effect_n = effect_n,
overlap_variables = overlap_variables,
Expand All @@ -227,6 +240,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
keep_variable_individual <- validated_data$keep_variable_individual
sumstat <- validated_data$sumstat
LD <- validated_data$LD
X_ref <- validated_data$X_ref
sumstatLD_dict <- validated_data$sumstatLD_dict
keep_variable_sumstat <- validated_data$keep_variable_sumstat
Z <- validated_data$Z
Expand Down Expand Up @@ -277,7 +291,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
}
cb_data <- colocboost_init_data(
X = X, Y = Y, dict_YX = yx_dict,
Z = Z, LD = LD, N_sumstat = N_sumstat, dict_sumstatLD = sumstatLD_dict,
Z = Z, LD = LD, X_ref = X_ref, N_sumstat = N_sumstat, dict_sumstatLD = sumstatLD_dict,
Var_y = Var_y, SeBhat = SeBhat,
keep_variables = keep_variables,
focal_outcome_idx = focal_outcome_idx,
Expand Down Expand Up @@ -408,7 +422,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
#'
#' @keywords internal
colocboost_validate_input_data <- function(X = NULL, Y = NULL,
sumstat = NULL, LD = NULL,
sumstat = NULL, LD = NULL, X_ref = NULL,
dict_YX = NULL, dict_sumstatLD = NULL,
effect_est = NULL, effect_se = NULL, effect_n = NULL,
overlap_variables = FALSE,
Expand Down Expand Up @@ -659,10 +673,36 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL,
cos_npc_cutoff_updated <- cos_npc_cutoff
npc_outcome_cutoff_updated <- npc_outcome_cutoff

if (is.null(LD)) {
# if no LD input, set diagonal matrix to LD
# --- Handle X_ref: convert to LD when N_ref >= P, or keep for on-the-fly computation
if (!is.null(X_ref)) {
if (is.data.frame(X_ref)) X_ref <- as.matrix(X_ref)
if (is.matrix(X_ref)) X_ref <- list(X_ref)
N_ref <- nrow(X_ref[[1]])
P_ref <- ncol(X_ref[[1]])
if (N_ref >= P_ref) {
# Precompute LD from X_ref (more compact when N_ref >= P)
message("N_ref >= P: precomputing LD from X_ref.")
LD <- lapply(X_ref, get_cormat)
for (idx in seq_along(LD)) {
rownames(LD[[idx]]) <- colnames(LD[[idx]]) <- colnames(X_ref[[idx]])
}
X_ref <- NULL
} else {
# N_ref < P: keep X_ref for on-the-fly computation
# Set rownames/colnames on X_ref for variant matching (use colnames as LD would)
# Build LD-like variant mapping using X_ref colnames
# Standardize X_ref
for (idx in seq_along(X_ref)) {
X_ref[[idx]] <- standardise(X_ref[[idx]], center = TRUE, scale = TRUE)
X_ref[[idx]][which(is.na(X_ref[[idx]]))] <- 0
}
}
}

if (is.null(LD) && is.null(X_ref)) {
# if no LD or X_ref input, set diagonal matrix to LD
warning(
"Providing the LD for summary statistics data is highly recommended. ",
"Providing the LD or X_ref for summary statistics data is highly recommended. ",
"Without LD, only a single iteration will be performed under the assumption of one causal variable per outcome. ",
"Additionally, the purity of CoS cannot be evaluated!"
)
Expand All @@ -679,59 +719,72 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL,
npc_outcome_cutoff_updated <- 0

} else {

if (is.data.frame(LD)) LD <- as.matrix(LD)
if (is.matrix(LD)) LD <- list(LD)
# - check if NA in LD matrix
num_na <- sapply(LD, sum)
if (any(is.na(num_na))){
warning("Error: Input LD must not contain missing values (NA).")
return(NULL)
# --- Determine reference list and variant extraction for LD or X_ref
if (!is.null(LD)) {
if (is.data.frame(LD)) LD <- as.matrix(LD)
if (is.matrix(LD)) LD <- list(LD)
# - check if NA in LD matrix
num_na <- sapply(LD, sum)
if (any(is.na(num_na))){
warning("Error: Input LD must not contain missing values (NA).")
return(NULL)
}
ref_list <- LD
get_ref_variants <- function(ref_mat) rownames(ref_mat)
ref_label <- "LD"
ref_ncol <- function(ref_mat) ncol(ref_mat)
} else {
# X_ref path (N_ref < P, already standardized above)
ref_list <- X_ref
get_ref_variants <- function(ref_mat) colnames(ref_mat)
ref_label <- "X_ref"
ref_ncol <- function(ref_mat) ncol(ref_mat)
}
# Create sumstat-LD mapping ===
if (length(LD) == 1) {

# Create sumstat-LD/X_ref mapping ===
if (length(ref_list) == 1) {
sumstatLD_dict <- rep(1, length(sumstat))
} else if (length(LD) == length(sumstat)) {
} else if (length(ref_list) == length(sumstat)) {
sumstatLD_dict <- seq_along(sumstat)
} else {
if (is.null(dict_sumstatLD)) {
warning('Error: Please provide dict_sumstatLD: you have ', length(sumstat),
' sumstats but only ', length(LD), ' LD matrices')
warning('Error: Please provide dict_sumstatLD: you have ', length(sumstat),
' sumstats but only ', length(ref_list), ' ', ref_label, ' matrices')
return(NULL)
} else {
# - dict for sumstat to LD mapping
# - dict for sumstat to LD/X_ref mapping
sumstatLD_dict <- rep(NA, length(sumstat))
for (i in 1:length(sumstat)) {
tmp <- unique(dict_sumstatLD[dict_sumstatLD[, 1] == i, 2])
if (length(tmp) == 0) {
warning(paste("Error: You don't provide matched LD for sumstat", i))
warning(paste("Error: You don't provide matched", ref_label, "for sumstat", i))
return(NULL)
} else if (length(tmp) != 1) {
warning(paste("Error: You provide multiple matched LD for sumstat", i))
warning(paste("Error: You provide multiple matched", ref_label, "for sumstat", i))
return(NULL)
} else {
sumstatLD_dict[i] <- tmp
}
}
if (max(sumstatLD_dict) > length(LD)) {
warning("Error: You don't provide enough LD matrices!")
if (max(sumstatLD_dict) > length(ref_list)) {
warning(paste("Error: You don't provide enough", ref_label, "matrices!"))
return(NULL)
}
}
}

# === Filter variants for each sumstat ===
for (i in seq_along(sumstat)) {
# Get sumstat variants (adjust field name based on your data structure)
sumstat_variants <- sumstat[[i]]$variant
n_total <- length(sumstat_variants)
# Get LD variants
# Get LD/X_ref variants
ld_idx <- sumstatLD_dict[i]
current_ld <- LD[[ld_idx]]
ld_variants <- rownames(current_ld)
current_ref <- ref_list[[ld_idx]]
ld_variants <- get_ref_variants(current_ref)
if (is.null(ld_variants)) {
if (ncol(current_ld) != n_total){
warning('Error: LD matrix ', ld_idx, ' has no rownames. Please ensure all LD matrices have variant names as rownames.')
if (ref_ncol(current_ref) != n_total){
warning('Error: ', ref_label, ' matrix ', ld_idx, ' has no variant names. Please ensure all ', ref_label, ' matrices have variant names.')
return(NULL)
}
}
Expand Down Expand Up @@ -834,7 +887,7 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL,
Z[[i.summstat]] <- z
}
} else {
Z <- N_sumstat <- Var_y <- SeBhat <- sumstatLD_dict <- keep_variable_sumstat <- NULL
Z <- N_sumstat <- Var_y <- SeBhat <- sumstatLD_dict <- keep_variable_sumstat <- X_ref <- NULL
M_updated <- M
min_abs_corr_updated <- min_abs_corr
jk_equiv_corr_updated <- jk_equiv_corr
Expand All @@ -851,6 +904,7 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL,
keep_variable_individual = keep_variable_individual,
sumstat = sumstat,
LD = LD,
X_ref = X_ref,
sumstatLD_dict = sumstatLD_dict,
keep_variable_sumstat = keep_variable_sumstat,
Z = Z,
Expand Down
5 changes: 4 additions & 1 deletion R/colocboost_assemble.R
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,13 @@ colocboost_assemble <- function(cb_obj,
}
}
if (!is.null(cb_obj_single$cb_data$data[[1]][["XtY"]])) {
X_dict <- cb_obj$cb_data$dict[i]
if (is.null(cb_obj_single$cb_data$data[[1]]$XtX)) {
X_dict <- cb_obj$cb_data$dict[i]
cb_obj_single$cb_data$data[[1]]$XtX <- cb_obj$cb_data$data[[X_dict]]$XtX
}
if (is.null(cb_obj_single$cb_data$data[[1]]$X_ref)) {
cb_obj_single$cb_data$data[[1]]$X_ref <- cb_obj$cb_data$data[[X_dict]]$X_ref
}
}
class(cb_obj_single) <- "colocboost"
out_ucos_each <- colocboost_assemble_ucos(cb_obj_single,
Expand Down
12 changes: 6 additions & 6 deletions R/colocboost_assemble_cos.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ colocboost_assemble_cos <- function(cb_obj,
for (iiii in 1:length(coloc_outcomes)) {
X_dict <- cb_data$dict[coloc_outcomes[iiii]]
tmp <- w_purity(avWeight[, iiii, drop = FALSE],
X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX,
X = get_genotype_matrix(cb_data$data[[X_dict]]), Xcorr = cb_data$data[[X_dict]]$XtX,
N = cb_data$data[[coloc_outcomes[iiii]]]$N, n = n_purity, coverage = sec_coverage_thresh,
min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr,
miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss
Expand Down Expand Up @@ -81,7 +81,7 @@ colocboost_assemble_cos <- function(cb_obj,
pos <- match(pos, setdiff(1:cb_model_para$P, cb_data$data[[i]]$variable_miss))
}
p_tmp <- matrix(get_purity(pos,
X = cb_data$data[[X_dict]]$X,
X = get_genotype_matrix(cb_data$data[[X_dict]]),
Xcorr = cb_data$data[[X_dict]]$XtX,
N = cb_data$data[[i]]$N, n = n_purity
), 1, 3)
Expand Down Expand Up @@ -145,7 +145,7 @@ colocboost_assemble_cos <- function(cb_obj,
for (iiii in 1:length(coloc_outcomes)) {
X_dict <- cb_data$dict[coloc_outcomes[iiii]]
tmp <- w_purity(avWeight[, iiii, drop = FALSE],
X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX,
X = get_genotype_matrix(cb_data$data[[X_dict]]), Xcorr = cb_data$data[[X_dict]]$XtX,
N = cb_data$data[[coloc_outcomes[iiii]]]$N, n = n_purity, coverage = sec_coverage_thresh,
min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr,
miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss
Expand Down Expand Up @@ -205,7 +205,7 @@ colocboost_assemble_cos <- function(cb_obj,
weight_cluster <- t(av[[iiii]][idx, , drop = FALSE])
X_dict <- cb_data$dict[coloc_outcomes[iiii]]
tmp <- w_purity(weight_cluster,
X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX,
X = get_genotype_matrix(cb_data$data[[X_dict]]), Xcorr = cb_data$data[[X_dict]]$XtX,
N = cb_data$data[[coloc_outcomes[iiii]]]$N, n = n_purity, coverage = sec_coverage_thresh,
min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr,
miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss
Expand Down Expand Up @@ -348,7 +348,7 @@ colocboost_assemble_cos <- function(cb_obj,
for (i in 1:cb_model_para$L) {
X_dict <- cb_data$dict[i]
res[[i]] <- get_between_purity(cset1, cset2,
X = cb_data$data[[X_dict]]$X,
X = get_genotype_matrix(cb_data$data[[X_dict]]),
Xcorr = cb_data$data[[X_dict]]$XtX,
miss_idx = cb_data$data[[i]]$variable_miss,
P = cb_model_para$P
Expand Down Expand Up @@ -436,7 +436,7 @@ colocboost_assemble_cos <- function(cb_obj,
pos <- match(pos, setdiff(1:cb_model_para$P, cb_data$data[[i3]]$variable_miss))
}
tmp <- matrix(get_purity(pos,
X = cb_data$data[[X_dict]]$X,
X = get_genotype_matrix(cb_data$data[[X_dict]]),
Xcorr = cb_data$data[[X_dict]]$XtX,
N = cb_data$data[[i3]]$N, n = n_purity
), 1, 3)
Expand Down
8 changes: 4 additions & 4 deletions R/colocboost_assemble_ucos.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ colocboost_assemble_ucos <- function(cb_obj_single,
pos <- match(pos, setdiff(1:cb_model_para$P, cb_data$data[[1]]$variable_miss))
}
purity <- matrix(get_purity(pos,
X = cb_data$data[[1]]$X, Xcorr = cb_data$data[[1]]$XtX,
X = get_genotype_matrix(cb_data$data[[1]]), Xcorr = cb_data$data[[1]]$XtX,
N = cb_data$data[[1]]$N, n = n_purity
), 1, 3)
purity <- as.data.frame(purity)
Expand Down Expand Up @@ -128,7 +128,7 @@ colocboost_assemble_ucos <- function(cb_obj_single,
w <- LogLik_change[idx]
weight_cluster <- t(weights[idx, , drop = FALSE])
check_purity <- w_purity(weight_cluster,
X = cb_data$data[[1]]$X, Xcorr = cb_data$data[[1]]$XtX,
X = get_genotype_matrix(cb_data$data[[1]]), Xcorr = cb_data$data[[1]]$XtX,
N = cb_data$data[[1]]$N, n = n_purity, coverage = coverage,
min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr,
miss_idx = cb_data$data[[1]]$variable_miss
Expand Down Expand Up @@ -234,7 +234,7 @@ colocboost_assemble_ucos <- function(cb_obj_single,
cset1 <- confidence_sets[[i.between]]
cset2 <- confidence_sets[[j.between]]
res <- get_between_purity(cset1, cset2,
X = cb_data$data[[1]]$X,
X = get_genotype_matrix(cb_data$data[[1]]),
Xcorr = cb_data$data[[1]]$XtX,
miss_idx = cb_data$data[[1]]$variable_miss,
P = cb_model_para$P
Expand Down Expand Up @@ -300,7 +300,7 @@ colocboost_assemble_ucos <- function(cb_obj_single,
rbind(
purity,
matrix(get_purity(pos,
X = cb_data$data[[1]]$X, Xcorr = cb_data$data[[1]]$XtX,
X = get_genotype_matrix(cb_data$data[[1]]), Xcorr = cb_data$data[[1]]$XtX,
N = cb_data$data[[1]]$N, n = n_purity
), 1, 3)
)
Expand Down
Loading