diff --git a/NAMESPACE b/NAMESPACE index 00b6170..8878dc0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,10 @@ export(get_hierarchical_clusters) export(get_robust_colocalization) export(get_robust_ucos) export(get_ucos_summary) +importFrom(Rfast,correls) +importFrom(Rfast,med) +importFrom(Rfast,standardise) +importFrom(Rfast,upper_tri) importFrom(grDevices,adjustcolor) importFrom(graphics,abline) importFrom(graphics,axis) diff --git a/R/colocboost.R b/R/colocboost.R index 3f9f860..02e3005 100644 --- a/R/colocboost.R +++ b/R/colocboost.R @@ -22,12 +22,20 @@ #' \code{variant} is required if sumstat for different outcomes do not have the same number of variables. #' \code{var_y} is the variance of phenotype (default is 1 meaning that the Y is in the \dQuote{standardized} scale). #' @param LD A list of correlation matrix indicating the LD matrix for each genotype. It also could be a single matrix if all sumstats were -#' obtained from the same genotypes. +#' obtained from the same genotypes. Provide either \code{LD} or \code{X_ref}, not both. +#' If neither is provided, LD-free mode is used. +#' @param X_ref A reference panel genotype matrix (N_ref x P) or a list of matrices, as an alternative to providing a precomputed +#' \code{LD} matrix. Column names must include variant names matching those in \code{sumstat}. +#' When the number of reference panel samples is less than the number of variants (N_ref < P), +#' this avoids storing the full P x P LD matrix and reduces memory usage. +#' When N_ref >= P, LD is precomputed from \code{X_ref} internally. +#' Provide either \code{LD} or \code{X_ref}, not both. If neither is provided, LD-free mode is used. #' @param dict_YX A L by 2 matrix of dictionary for \code{X} and \code{Y} if there exist subsets of outcomes corresponding to the same X matrix. #' The first column should be 1:L for L outcomes. The second column should be the index of \code{X} corresponding to the outcome. 
#' The innovation: do not provide the same matrix in \code{X} to reduce the computational burden. -#' @param dict_sumstatLD A L by 2 matrix of dictionary for \code{sumstat} and \code{LD} if there exist subsets of outcomes corresponding to the same sumstat. -#' The first column should be 1:L for L sumstat The second column should be the index of \code{LD} corresponding to the sumstat. +#' @param dict_sumstatLD A L by 2 matrix of dictionary for \code{sumstat} and \code{LD} (or \code{X_ref}) if there exist subsets of outcomes +#' corresponding to the same sumstat. +#' The first column should be 1:L for L sumstat. The second column should be the index of \code{LD} (or \code{X_ref}) corresponding to the sumstat. #' The innovation: do not provide the same matrix in \code{LD} to reduce the computational burden. #' @param outcome_names The names of outcomes, which has the same order for Y. #' @param focal_outcome_idx The index of the focal outcome if perform GWAS-xQTL ColocBoost @@ -129,9 +137,10 @@ #' #' @family colocboost #' @importFrom stats na.omit +#' @importFrom Rfast correls standardise upper_tri med #' @export colocboost <- function(X = NULL, Y = NULL, # individual data - sumstat = NULL, LD = NULL, # summary statistics: either Z, bhat, sebhat, N, var_Y, + sumstat = NULL, LD = NULL, X_ref = NULL, # summary statistics: either Z, bhat, sebhat, N, var_Y, ###### - index dict for X match multiple Y / LD match multiple sumstat dict_YX = NULL, # Y index for 1st column, X index for 2nd column dict_sumstatLD = NULL, # sumstat index for 1st column, LD index for 2nd column @@ -202,11 +211,15 @@ colocboost <- function(X = NULL, Y = NULL, # individual data warning("Error: No individual data (X, Y) or summary statistics (sumstat) or (effect_est, effect_se) are provided! 
Please check!") return(NULL) } + if (!is.null(LD) && !is.null(X_ref)) { + warning("Error: Provide either LD or X_ref, not both.") + return(NULL) + } # - check input data: individual level data and summary-level data validated_data <- colocboost_validate_input_data( X = X, Y = Y, - sumstat = sumstat, LD = LD, + sumstat = sumstat, LD = LD, X_ref = X_ref, dict_YX = dict_YX, dict_sumstatLD = dict_sumstatLD, effect_est = effect_est, effect_se = effect_se, effect_n = effect_n, overlap_variables = overlap_variables, @@ -227,6 +240,8 @@ colocboost <- function(X = NULL, Y = NULL, # individual data keep_variable_individual <- validated_data$keep_variable_individual sumstat <- validated_data$sumstat LD <- validated_data$LD + X_ref <- validated_data$X_ref + ref_label <- validated_data$ref_label sumstatLD_dict <- validated_data$sumstatLD_dict keep_variable_sumstat <- validated_data$keep_variable_sumstat Z <- validated_data$Z @@ -277,7 +292,8 @@ colocboost <- function(X = NULL, Y = NULL, # individual data } cb_data <- colocboost_init_data( X = X, Y = Y, dict_YX = yx_dict, - Z = Z, LD = LD, N_sumstat = N_sumstat, dict_sumstatLD = sumstatLD_dict, + Z = Z, LD = LD, X_ref = X_ref, ref_label = ref_label, + N_sumstat = N_sumstat, dict_sumstatLD = sumstatLD_dict, Var_y = Var_y, SeBhat = SeBhat, keep_variables = keep_variables, focal_outcome_idx = focal_outcome_idx, @@ -377,7 +393,15 @@ colocboost <- function(X = NULL, Y = NULL, # individual data #' @param X A list of genotype matrices for different outcomes, or a single matrix if all outcomes share the same genotypes. #' @param Y A list of vectors of outcomes or an N by L matrix if it is considered for the same X and multiple outcomes. #' @param sumstat A list of data.frames of summary statistics. -#' @param LD A list of correlation matrices indicating the LD matrix for each genotype. +#' @param LD A list of correlation matrix indicating the LD matrix for each genotype. 
It also could be a single matrix if all sumstats were +#' obtained from the same genotypes. Provide either \code{LD} or \code{X_ref}, not both. +#' If neither is provided, LD-free mode is used. +#' @param X_ref A reference panel genotype matrix (N_ref x P) or a list of matrices, as an alternative to providing a precomputed +#' \code{LD} matrix. Column names must include variant names matching those in \code{sumstat}. +#' When the number of reference panel samples is less than the number of variants (N_ref < P), +#' this avoids storing the full P x P LD matrix and reduces memory usage. +#' When N_ref >= P, LD is precomputed from \code{X_ref} internally. +#' Provide either \code{LD} or \code{X_ref}, not both. If neither is provided, LD-free mode is used. #' @param dict_YX A L by 2 matrix of dictionary for X and Y if there exist subsets of outcomes corresponding to the same X matrix. #' @param dict_sumstatLD A L by 2 matrix of dictionary for sumstat and LD if there exist subsets of outcomes corresponding to the same sumstat. #' @param effect_est Matrix of variable regression coefficients (i.e. 
regression beta values) in the genomic region @@ -394,6 +418,8 @@ colocboost <- function(X = NULL, Y = NULL, # individual data #' \item{keep_variable_individual}{List of variable names for each X matrix} #' \item{sumstat}{Processed list of summary statistics data.frames} #' \item{LD}{Processed list of LD matrices} +#' \item{X_ref}{Processed list of reference genotype matrices} +#' \item{ref_label}{Type of reference provided: \dQuote{LD}, \dQuote{X_ref}, or \dQuote{No_ref}} #' \item{sumstatLD_dict}{Dictionary mapping sumstat to LD} #' \item{keep_variable_sumstat}{List of variant names for each sumstat} #' \item{Z}{List of z-scores for each outcome} @@ -408,7 +434,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data #' #' @keywords internal colocboost_validate_input_data <- function(X = NULL, Y = NULL, - sumstat = NULL, LD = NULL, + sumstat = NULL, LD = NULL, X_ref = NULL, dict_YX = NULL, dict_sumstatLD = NULL, effect_est = NULL, effect_se = NULL, effect_n = NULL, overlap_variables = FALSE, @@ -418,7 +444,7 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, cos_npc_cutoff = 0.2, npc_outcome_cutoff = 0.2) { - # - check individual level data + ############### Check individual level data ########################### if (!is.null(X) & !is.null(Y)) { # --- check input if (is.data.frame(X)) X <- as.matrix(X) @@ -557,8 +583,9 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, } else { yx_dict <- keep_variable_individual <- NULL } + ############### Done! Check individual level data ########################### - # - check summary-level data + ############### Check summary level data ########################### if ((!is.null(sumstat)) | (!is.null(effect_est) & !is.null(effect_se))) { # --- check input of (effect_est, effect_se) if ((!is.null(effect_est) & !is.null(effect_se))) { @@ -649,8 +676,9 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, } return(xx) }) + ############### Done! 
Check summary level data ########################### - # --- check input of LD + ############### Check input of LD or reference X_ref ########################### M_updated <- M min_abs_corr_updated <- min_abs_corr jk_equiv_corr_updated <- jk_equiv_corr @@ -659,10 +687,13 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, cos_npc_cutoff_updated <- cos_npc_cutoff npc_outcome_cutoff_updated <- npc_outcome_cutoff - if (is.null(LD)) { + # --- Handle X_ref: convert to LD when N_ref >= P, or keep for on-the-fly computation + + + if (is.null(LD) && is.null(X_ref)) { # if no LD input, set diagonal matrix to LD warning( - "Providing the LD for summary statistics data is highly recommended. ", + "Providing the LD or X_ref for summary statistics data is highly recommended. ", "Without LD, only a single iteration will be performed under the assumption of one causal variable per outcome. ", "Additionally, the purity of CoS cannot be evaluated!" ) @@ -677,61 +708,98 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, func_simplex_updated <- "only_z2z" cos_npc_cutoff_updated <- 0 npc_outcome_cutoff_updated <- 0 + ref_label <- "No_ref" } else { - if (is.data.frame(LD)) LD <- as.matrix(LD) - if (is.matrix(LD)) LD <- list(LD) - # - check if NA in LD matrix - num_na <- sapply(LD, sum) - if (any(is.na(num_na))){ - warning("Error: Input LD must not contain missing values (NA).") - return(NULL) + # -- Determine reference list and variant extraction for LD or X_ref + if (!is.null(LD)){ + + if (is.data.frame(LD)) LD <- as.matrix(LD) + if (is.matrix(LD)) LD <- list(LD) + # - check if NA in LD matrix + num_na <- sapply(LD, sum) + if (any(is.na(num_na))){ + warning("Error: Input LD must not contain missing values (NA).") + return(NULL) + } + ref_list <- LD + ref_label <- "LD" + + } else { + if (is.data.frame(X_ref)) X_ref <- as.matrix(X_ref) + if (is.matrix(X_ref)) X_ref <- list(X_ref) + + # When N_ref >= P, precompute LD (avoids repeated O(N_ref*P) in boosting 
loop) + # When N_ref < P, keep X_ref for on-the-fly computation (avoids P*P memory) + all_large <- all(sapply(X_ref, function(xr) nrow(xr) >= ncol(xr))) + if (all_large) { + message("N_ref >= P: precomputing LD from X_ref.") + LD <- lapply(X_ref, function(xr) { + ld <- get_cormat(xr) + rownames(ld) <- colnames(ld) <- colnames(xr) + ld + }) + X_ref <- NULL + ref_list <- LD + ref_label <- "LD" + } else { + # N_ref < P: standardize and keep for on-the-fly crossprod/(N_ref-1) + for (idx in seq_along(X_ref)) { + X_ref[[idx]] <- standardise(X_ref[[idx]], center = TRUE, scale = TRUE) + X_ref[[idx]][which(is.na(X_ref[[idx]]))] <- 0 + } + ref_list <- X_ref + ref_label <- "X_ref" + } } - # Create sumstat-LD mapping === - if (length(LD) == 1) { + + # -- Create sumstat - LD/X_ref mapping === + if (length(ref_list) == 1) { sumstatLD_dict <- rep(1, length(sumstat)) - } else if (length(LD) == length(sumstat)) { + } else if (length(ref_list) == length(sumstat)) { sumstatLD_dict <- seq_along(sumstat) } else { if (is.null(dict_sumstatLD)) { warning('Error: Please provide dict_sumstatLD: you have ', length(sumstat), - ' sumstats but only ', length(LD), ' LD matrices') + ' sumstats but only ', length(ref_list), ' ', ref_label, ' matrices') return(NULL) } else { - # - dict for sumstat to LD mapping + # - dict for sumstat to LD/X_ref mapping sumstatLD_dict <- rep(NA, length(sumstat)) for (i in 1:length(sumstat)) { tmp <- unique(dict_sumstatLD[dict_sumstatLD[, 1] == i, 2]) if (length(tmp) == 0) { - warning(paste("Error: You don't provide matched LD for sumstat", i)) + warning("Error: You don't provide matched ", ref_label, " for sumstat ", i) return(NULL) } else if (length(tmp) != 1) { - warning(paste("Error: You provide multiple matched LD for sumstat", i)) + warning("Error: You provide multiple matched ", ref_label, " for sumstat ", i) return(NULL) } else { sumstatLD_dict[i] <- tmp } } - if (max(sumstatLD_dict) > length(LD)) { - warning("Error: You don't provide enough LD matrices!") + if 
(max(sumstatLD_dict) > length(ref_list)) { + warning("Error: You don't provide enough ", ref_label, " matrices!") return(NULL) } } } - # === Filter variants for each sumstat === + # -- Filter variants for each sumstat + get_ref_variants <- function(ref_mat) colnames(ref_mat) + ref_ncol <- function(ref_mat) ncol(ref_mat) for (i in seq_along(sumstat)) { # Get sumstat variants (adjust field name based on your data structure) sumstat_variants <- sumstat[[i]]$variant n_total <- length(sumstat_variants) - # Get LD variants + # Get LD/X_ref variants ld_idx <- sumstatLD_dict[i] - current_ld <- LD[[ld_idx]] - ld_variants <- rownames(current_ld) + current_ref <- ref_list[[ld_idx]] + ld_variants <- get_ref_variants(current_ref) if (is.null(ld_variants)) { - if (ncol(current_ld) != n_total){ - warning('Error: LD matrix ', ld_idx, ' has no rownames. Please ensure all LD matrices have variant names as rownames.') + if (ncol(current_ref) != n_total){ + warning('Error: ', ref_label, ' matrix ', ld_idx, ' has no variant names. Please ensure all ', ref_label, ' matrices have variant names.') return(NULL) } } @@ -755,6 +823,7 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, keep_variable_sumstat <- lapply(sumstat, function(xx) { xx$variant }) + ############### Done! 
Check input of LD or reference X_ref ########################### # - checking sample size existency n_exist <- sapply(sumstat, function(ss) { @@ -834,7 +903,7 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, Z[[i.summstat]] <- z } } else { - Z <- N_sumstat <- Var_y <- SeBhat <- sumstatLD_dict <- keep_variable_sumstat <- NULL + Z <- N_sumstat <- Var_y <- SeBhat <- sumstatLD_dict <- keep_variable_sumstat <- X_ref <- ref_label <- NULL M_updated <- M min_abs_corr_updated <- min_abs_corr jk_equiv_corr_updated <- jk_equiv_corr @@ -851,6 +920,8 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, keep_variable_individual = keep_variable_individual, sumstat = sumstat, LD = LD, + X_ref = X_ref, + ref_label = ref_label, sumstatLD_dict = sumstatLD_dict, keep_variable_sumstat = keep_variable_sumstat, Z = Z, diff --git a/R/colocboost_assemble.R b/R/colocboost_assemble.R index 94cbbe5..46e1d52 100644 --- a/R/colocboost_assemble.R +++ b/R/colocboost_assemble.R @@ -132,10 +132,13 @@ colocboost_assemble <- function(cb_obj, } } if (!is.null(cb_obj_single$cb_data$data[[1]][["XtY"]])) { + X_dict <- cb_obj$cb_data$dict[i] if (is.null(cb_obj_single$cb_data$data[[1]]$XtX)) { - X_dict <- cb_obj$cb_data$dict[i] cb_obj_single$cb_data$data[[1]]$XtX <- cb_obj$cb_data$data[[X_dict]]$XtX } + if (is.null(cb_obj_single$cb_data$data[[1]]$ref_label)) { + cb_obj_single$cb_data$data[[1]]$ref_label <- cb_obj$cb_data$data[[X_dict]]$ref_label + } } class(cb_obj_single) <- "colocboost" out_ucos_each <- colocboost_assemble_ucos(cb_obj_single, diff --git a/R/colocboost_assemble_cos.R b/R/colocboost_assemble_cos.R index 6a1243a..00a052c 100644 --- a/R/colocboost_assemble_cos.R +++ b/R/colocboost_assemble_cos.R @@ -43,7 +43,8 @@ colocboost_assemble_cos <- function(cb_obj, X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX, N = cb_data$data[[coloc_outcomes[iiii]]]$N, n = n_purity, coverage = sec_coverage_thresh, min_abs_corr = min_abs_corr, median_abs_corr = 
median_abs_corr, - miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss + miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss, + ref_label = cb_data$data[[X_dict]]$ref_label ) check_purity[iiii] <- length(tmp) == 1 } @@ -83,7 +84,8 @@ colocboost_assemble_cos <- function(cb_obj, p_tmp <- matrix(get_purity(pos, X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX, - N = cb_data$data[[i]]$N, n = n_purity + N = cb_data$data[[i]]$N, n = n_purity, + ref_label = cb_data$data[[X_dict]]$ref_label ), 1, 3) purity <- c(purity, list(p_tmp)) } @@ -148,7 +150,8 @@ colocboost_assemble_cos <- function(cb_obj, X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX, N = cb_data$data[[coloc_outcomes[iiii]]]$N, n = n_purity, coverage = sec_coverage_thresh, min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr, - miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss + miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss, + ref_label = cb_data$data[[X_dict]]$ref_label ) check_purity[iiii] <- length(tmp) == 1 } @@ -208,7 +211,8 @@ colocboost_assemble_cos <- function(cb_obj, X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX, N = cb_data$data[[coloc_outcomes[iiii]]]$N, n = n_purity, coverage = sec_coverage_thresh, min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr, - miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss + miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss, + ref_label = cb_data$data[[X_dict]]$ref_label ) check_purity[[iiii]] <- tmp } @@ -351,7 +355,8 @@ colocboost_assemble_cos <- function(cb_obj, X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX, miss_idx = cb_data$data[[i]]$variable_miss, - P = cb_model_para$P + P = cb_model_para$P, + ref_label = cb_data$data[[X_dict]]$ref_label ) } res <- Reduce(pmax, res) @@ -438,7 +443,8 @@ colocboost_assemble_cos <- function(cb_obj, tmp <- matrix(get_purity(pos, X = cb_data$data[[X_dict]]$X, Xcorr = 
cb_data$data[[X_dict]]$XtX, - N = cb_data$data[[i3]]$N, n = n_purity + N = cb_data$data[[i3]]$N, n = n_purity, + ref_label = cb_data$data[[X_dict]]$ref_label ), 1, 3) p_tmp <- rbind(p_tmp, tmp) } diff --git a/R/colocboost_assemble_ucos.R b/R/colocboost_assemble_ucos.R index 3336f19..2b44f54 100644 --- a/R/colocboost_assemble_ucos.R +++ b/R/colocboost_assemble_ucos.R @@ -64,7 +64,8 @@ colocboost_assemble_ucos <- function(cb_obj_single, } purity <- matrix(get_purity(pos, X = cb_data$data[[1]]$X, Xcorr = cb_data$data[[1]]$XtX, - N = cb_data$data[[1]]$N, n = n_purity + N = cb_data$data[[1]]$N, n = n_purity, + ref_label = cb_data$data[[1]]$ref_label ), 1, 3) purity <- as.data.frame(purity) colnames(purity) <- c("min_abs_corr", "mean_abs_corr", "median_abs_corr") @@ -131,7 +132,8 @@ colocboost_assemble_ucos <- function(cb_obj_single, X = cb_data$data[[1]]$X, Xcorr = cb_data$data[[1]]$XtX, N = cb_data$data[[1]]$N, n = n_purity, coverage = coverage, min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr, - miss_idx = cb_data$data[[1]]$variable_miss + miss_idx = cb_data$data[[1]]$variable_miss, + ref_label = cb_data$data[[1]]$ref_label ) if (length(check_purity) != 0) { w <- w[check_purity] @@ -237,7 +239,8 @@ colocboost_assemble_ucos <- function(cb_obj_single, X = cb_data$data[[1]]$X, Xcorr = cb_data$data[[1]]$XtX, miss_idx = cb_data$data[[1]]$variable_miss, - P = cb_model_para$P + P = cb_model_para$P, + ref_label = cb_data$data[[1]]$ref_label ) min_between[i.between, j.between] <- min_between[j.between, i.between] <- res[1] max_between[i.between, j.between] <- max_between[j.between, i.between] <- res[2] @@ -301,7 +304,8 @@ colocboost_assemble_ucos <- function(cb_obj_single, purity, matrix(get_purity(pos, X = cb_data$data[[1]]$X, Xcorr = cb_data$data[[1]]$XtX, - N = cb_data$data[[1]]$N, n = n_purity + N = cb_data$data[[1]]$N, n = n_purity, + ref_label = cb_data$data[[1]]$ref_label ), 1, 3) ) } diff --git a/R/colocboost_check_update_jk.R 
b/R/colocboost_check_update_jk.R index d449b03..57ebbc3 100644 --- a/R/colocboost_check_update_jk.R +++ b/R/colocboost_check_update_jk.R @@ -122,7 +122,9 @@ boost_check_update_jk_nofocal <- function(cb_model, cb_model_para, cb_data) { YtY = data_update[[ii]]$YtY, XtY = data_update[[ii]]$XtY, beta_k = model_update[[ii]]$beta, - miss_idx = data_update[[ii]]$variable_miss + miss_idx = data_update[[ii]]$variable_miss, + ref_label = cb_data$data[[X_dict[ii]]]$ref_label, + XtX_beta_cache = model_update[[ii]]$XtX_beta_cache ) }) change_res_each <- as.numeric(unlist(change_res_each)) @@ -197,7 +199,9 @@ boost_check_update_jk_nofocal <- function(cb_model, cb_model_para, cb_data) { YtY = data_update[[ii]]$YtY, XtY = data_update[[ii]]$XtY, beta_k = model_update[[ii]]$beta, - miss_idx = data_update[[ii]]$variable_miss + miss_idx = data_update[[ii]]$variable_miss, + ref_label = cb_data$data[[X_dict[ii]]]$ref_label, + XtX_beta_cache = model_update[[ii]]$XtX_beta_cache ) }) change_res_each <- as.numeric(unlist(change_res_each)) @@ -241,7 +245,9 @@ boost_check_update_jk_nofocal <- function(cb_model, cb_model_para, cb_data) { YtY = data_update[[ii]]$YtY, XtY = data_update[[ii]]$XtY, beta_k = model_update[[ii]]$beta, - miss_idx = data_update[[ii]]$variable_miss + miss_idx = data_update[[ii]]$variable_miss, + ref_label = cb_data$data[[X_dict[ii]]]$ref_label, + XtX_beta_cache = model_update[[ii]]$XtX_beta_cache ) }) change_res_each <- as.numeric(unlist(change_res_each)) @@ -368,7 +374,8 @@ boost_check_update_jk_focal <- function(cb_model, cb_model_para, cb_data, X = cb_data$data[[X_dict[pp_focal]]]$X, XtX = cb_data$data[[X_dict[pp_focal]]]$XtX, N = cb_data$data[[X_dict[pp_focal]]]$N, - remain_jk = 1:cb_model_para$P + remain_jk = 1:cb_model_para$P, + ref_label = cb_data$data[[X_dict[pp_focal]]]$ref_label ) }) # ----- second, if within the same LD buddies, select the following variants @@ -379,7 +386,8 @@ boost_check_update_jk_focal <- function(cb_model, cb_model_para, cb_data, # ----- 
third, if picked variant within the same LD buddies ld_tmp <- get_LD_jk1_jk2(jk_focal, jk_focal_tmp, XtX = cb_data$data[[X_dict[pp_focal]]]$XtX, - remain_jk = 1:cb_model_para$P + remain_jk = 1:cb_model_para$P, + ref_label = cb_data$data[[X_dict[pp_focal]]]$ref_label ) if (ld_tmp > jk_equiv_corr) { jk_focal <- jk_focal_tmp @@ -491,7 +499,7 @@ boost_check_update_jk_focal <- function(cb_model, cb_model_para, cb_data, #' @importFrom stats cor get_LD_jk1_jk2 <- function(jk1, jk2, X = NULL, XtX = NULL, N = NULL, - remain_jk = NULL) { + remain_jk = NULL, ref_label = "LD") { if (!is.null(X)) { LD_temp <- suppressWarnings({ get_cormat(X[, c(jk1, jk2)]) @@ -499,12 +507,18 @@ get_LD_jk1_jk2 <- function(jk1, jk2, LD_temp[which(is.na(LD_temp))] <- 0 LD_temp <- LD_temp[1, 2] } else if (!is.null(XtX)) { - if (length(XtX) == 1){ + if (identical(ref_label, "No_ref")) { LD_temp <- 0 } else { jk1.remain <- match(jk1, remain_jk) jk2.remain <- match(jk2, remain_jk) - LD_temp <- XtX[jk1.remain, jk2.remain] + if (identical(ref_label, "X_ref")) { + LD_temp <- suppressWarnings({ get_cormat(XtX[, c(jk1.remain, jk2.remain)]) }) + LD_temp[which(is.na(LD_temp))] <- 0 + LD_temp <- LD_temp[1, 2] + } else { + LD_temp <- XtX[jk1.remain, jk2.remain] + } } } return(LD_temp) @@ -528,7 +542,8 @@ check_jk_jkeach <- function(jk, jk_each, X = cb_data$data[[X_dict[i]]]$X, XtX = cb_data$data[[X_dict[i]]]$XtX, N = data_update[[i]]$N, - remain_jk = setdiff(1:length(model_update[[i]]$res), data_update[[i]]$variable_miss) + remain_jk = setdiff(1:length(model_update[[i]]$res), data_update[[i]]$variable_miss), + ref_label = cb_data$data[[X_dict[i]]]$ref_label ) judge[i] <- (change_each <= jk_equiv_loglik) & (abs(LD_temp) >= jk_equiv_corr) } else { @@ -549,7 +564,7 @@ check_pair_jkeach <- function(jk_each, #' @importFrom stats cor get_LD_jk_each <- function(jk_each, X = NULL, XtX = NULL, N = NULL, - remain_jk = NULL) { + remain_jk = NULL, ref_label = "LD") { if (!is.null(X)) { LD_temp <- suppressWarnings({ 
get_cormat(X[, jk_each]) @@ -557,11 +572,15 @@ check_pair_jkeach <- function(jk_each, LD_temp[which(is.na(LD_temp))] <- 0 # LD_temp <- LD_temp[1, 2] } else if (!is.null(XtX)) { - if (length(XtX) == 1){ + if (identical(ref_label, "No_ref")) { LD_temp <- matrix(0, length(jk_each), length(jk_each)) } else { jk.remain <- match(jk_each, remain_jk) - LD_temp <- XtX[jk.remain, jk.remain] + if (identical(ref_label, "X_ref")) { + LD_temp <- suppressWarnings({ get_cormat(XtX[, jk.remain]) }) + } else { + LD_temp <- XtX[jk.remain, jk.remain] + } LD_temp[which(is.na(LD_temp))] <- 0 } } @@ -582,7 +601,8 @@ check_pair_jkeach <- function(jk_each, X = cb_data$data[[X_dict[idx]]]$X, XtX = cb_data$data[[X_dict[idx]]]$XtX, N = data_update[[idx]]$N, - remain_jk = setdiff(1:length(model_update[[idx]]$res), data_update[[idx]]$variable_miss) + remain_jk = setdiff(1:length(model_update[[idx]]$res), data_update[[idx]]$variable_miss), + ref_label = cb_data$data[[X_dict[idx]]]$ref_label ) }) @@ -612,7 +632,9 @@ estimate_change_profile_res <- function(jk, X = NULL, res = NULL, N = NULL, XtX = NULL, YtY = NULL, XtY = NULL, beta_k = NULL, - miss_idx = NULL) { + miss_idx = NULL, + ref_label = "LD", + XtX_beta_cache = NULL) { if (!is.null(X)) { rtr <- sum(res^2) / (N - 1) xtr <- t(X[, jk]) %*% res / (N - 1) @@ -629,10 +651,13 @@ estimate_change_profile_res <- function(jk, } else { xty <- XtY / scaling_factor } - if (length(xtx) == 1){ + if (identical(ref_label, "No_ref")) { rtr <- yty - 2 * sum(beta_k * xty) + sum(beta_k^2) + } else if (!is.null(XtX_beta_cache)) { + rtr <- yty - 2 * sum(beta_k * xty) + sum(XtX_beta_cache * beta_k) } else { - rtr <- yty - 2 * sum(beta_k * xty) + sum((xtx %*% as.matrix(beta_k)) * beta_k) + XtX_beta <- compute_XtX_product(xtx, beta_k, ref_label) + rtr <- yty - 2 * sum(beta_k * xty) + sum(XtX_beta * beta_k) } } numerator <- xtr^2 / (2 * rtr) diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R index ff9f312..6df5ab5 100644 --- a/R/colocboost_inference.R 
+++ b/R/colocboost_inference.R @@ -190,7 +190,7 @@ get_n_cluster <- function(hc, Sigma, m = ncol(Sigma), min_cluster_corr = 0.8) { #' @keywords cb_post_inference #' @noRd w_purity <- function(weights, X = NULL, Xcorr = NULL, N = NULL, n = 100, coverage = 0.95, - min_abs_corr = 0.5, median_abs_corr = NULL, miss_idx = NULL) { + min_abs_corr = 0.5, median_abs_corr = NULL, miss_idx = NULL, ref_label = "LD") { tmp_purity <- apply(weights, 2, function(w) { pos <- w_cs(w, coverage = coverage) @@ -198,7 +198,7 @@ w_purity <- function(weights, X = NULL, Xcorr = NULL, N = NULL, n = 100, coverag if (!is.null(Xcorr)) { pos <- match(pos, setdiff(1:length(w), miss_idx)) } - get_purity(pos, X = X, Xcorr = Xcorr, N = N, n = n) + get_purity(pos, X = X, Xcorr = Xcorr, N = N, n = n, ref_label = ref_label) }) if (is.null(median_abs_corr)) { is_pure <- which(tmp_purity[1, ] >= min_abs_corr) @@ -227,7 +227,8 @@ check_null_post <- function(cb_obj, } get_profile <- function(cs_beta, X = NULL, Y = NULL, N = NULL, - XtX = NULL, YtY = NULL, XtY = NULL, miss_idx, adj_dep = 1) { + XtX = NULL, YtY = NULL, XtY = NULL, miss_idx, adj_dep = 1, + ref_label = "LD") { if (!is.null(X)) { mean((Y - X %*% as.matrix(cs_beta))^2) * N / (N - 1) } else if (!is.null(XtY)) { @@ -242,20 +243,23 @@ check_null_post <- function(cb_obj, } else { xty <- XtY / scaling_factor } - if (length(xtx) == 1){ + if (identical(ref_label, "No_ref")) { (yty - 2 * sum(cs_beta * xty) + sum(cs_beta^2)) * adj_dep } else { - (yty - 2 * sum(cs_beta * xty) + sum((xtx %*% as.matrix(cs_beta)) * cs_beta)) * adj_dep + XtX_beta <- compute_XtX_product(xtx, cs_beta, ref_label) + (yty - 2 * sum(cs_beta * xty) + sum(XtX_beta * cs_beta)) * adj_dep } } } get_cs_obj <- function(cs_beta, res, tau, func_simplex, lambda, adj_dep, LD_free, X = NULL, Y = NULL, N = NULL, - XtX = NULL, YtY = NULL, XtY = NULL, miss_idx = NULL) { + XtX = NULL, YtY = NULL, XtY = NULL, miss_idx = NULL, + ref_label = "LD") { correlation <- get_correlation( X = X, res = res, 
XtY = XtY, N = N, YtY = YtY, - XtX = XtX, beta_k = cs_beta, miss_idx = miss_idx + XtX = XtX, beta_k = cs_beta, miss_idx = miss_idx, + ref_label = ref_label ) z <- get_z(correlation, n = N, res) abs_cor <- abs(correlation) @@ -264,7 +268,7 @@ check_null_post <- function(cb_obj, P <- length(z) ld_jk <- get_LD_jk(jk, X = X, XtX = XtX, N = N, - remain_idx = setdiff(1:P, miss_idx), P = P + remain_idx = setdiff(1:P, miss_idx), P = P, ref_label = ref_label ) ld_feature <- sqrt(abs(ld_jk)) # - calculate delta @@ -284,27 +288,28 @@ check_null_post <- function(cb_obj, return(tau * matrixStats::logSumExp(exp_term / tau + log(delta))) } - update_res <- function(X = NULL, Y = NULL, XtX = NULL, XtY = NULL, N = NULL, cs_beta, miss_idx) { + update_res <- function(X = NULL, Y = NULL, XtX = NULL, XtY = NULL, N = NULL, cs_beta, miss_idx, ref_label = "LD") { if (!is.null(X)) { return(Y - X %*% cs_beta) } else if (!is.null(XtX)) { scaling.factor <- if (!is.null(N)) N - 1 else 1 beta_scaling <- if (!is.null(N)) 1 else 100 - xtx <- XtX / scaling.factor if (length(miss_idx) != 0) { xty <- XtY[-miss_idx] / scaling.factor res.tmp <- rep(0, length(XtY)) - if (length(xtx) == 1){ + if (identical(ref_label, "No_ref")) { res.tmp[-miss_idx] <- xty - cs_beta[-miss_idx] / beta_scaling } else { - res.tmp[-miss_idx] <- xty - xtx %*% (cs_beta[-miss_idx] / beta_scaling) + XtX_beta <- compute_XtX_product(XtX, cs_beta[-miss_idx] / beta_scaling, ref_label) + res.tmp[-miss_idx] <- xty - XtX_beta / scaling.factor } } else { xty <- XtY / scaling.factor - if (length(xtx) == 1){ + if (identical(ref_label, "No_ref")) { res.tmp <- xty - (cs_beta / beta_scaling) } else { - res.tmp <- xty - xtx %*% (cs_beta / beta_scaling) + XtX_beta <- compute_XtX_product(XtX, cs_beta / beta_scaling, ref_label) + res.tmp <- xty - XtX_beta / scaling.factor } } return(res.tmp) @@ -325,12 +330,14 @@ check_null_post <- function(cb_obj, cs_beta[cs_variants] <- 0 X_dict <- cb_data$dict[j] adj_dep <- cb_data$data[[j]]$dependency + 
ref_label_j <- cb_data$data[[X_dict]]$ref_label if (check_null_method == "profile") { cs_profile <- get_profile(cs_beta, X = cb_data$data[[X_dict]]$X, Y = cb_data$data[[j]]$Y, XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[j]]$XtY, YtY = cb_data$data[[j]]$YtY, N = cb_data$data[[j]]$N, - miss_idx = cb_data$data[[j]]$variable_miss, adj_dep = adj_dep + miss_idx = cb_data$data[[j]]$variable_miss, adj_dep = adj_dep, + ref_label = ref_label_j ) last_profile <- extract_last(cb_obj$cb_model[[j]]$profile_loglike_each) change <- abs(cs_profile - last_profile) @@ -348,7 +355,8 @@ check_null_post <- function(cb_obj, X = cb_data$data[[X_dict]]$X, Y = cb_data$data[[j]]$Y, XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[j]]$XtY, N = cb_data$data[[j]]$N, cs_beta, - miss_idx = cb_data$data[[j]]$variable_miss + miss_idx = cb_data$data[[j]]$variable_miss, + ref_label = ref_label_j ) cs_obj <- get_cs_obj(cs_beta, res, cb_obj$cb_model_para$tau, cb_obj$cb_model_para$func_simplex, cb_obj$cb_model_para$lambda, @@ -357,7 +365,8 @@ check_null_post <- function(cb_obj, X = cb_data$data[[X_dict]]$X, N = cb_data$data[[j]]$N, XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[X_dict]]$XtY, YtY = cb_data$data[[X_dict]]$YtY, - miss_idx = cb_data$data[[j]]$variable_miss + miss_idx = cb_data$data[[j]]$variable_miss, + ref_label = ref_label_j ) last_obj <- min(cb_obj$cb_model[[j]]$obj_path) change <- abs(cs_obj - last_obj) @@ -401,7 +410,7 @@ check_null_post <- function(cb_obj, #' @keywords cb_post_inference #' @noRd #' @importFrom stats na.omit -get_purity <- function(pos, X = NULL, Xcorr = NULL, N = NULL, n = 100) { +get_purity <- function(pos, X = NULL, Xcorr = NULL, N = NULL, n = 100, ref_label = "LD") { get_upper_tri <- Rfast::upper_tri get_median <- Rfast::med @@ -420,16 +429,17 @@ get_purity <- function(pos, X = NULL, Xcorr = NULL, N = NULL, n = 100) { if (is.null(Xcorr)) { X_sub <- X[, pos] X_sub <- as.matrix(X_sub) - corr <- suppressWarnings({ - get_cormat(X_sub) - }) + corr 
<- suppressWarnings({ get_cormat(X_sub) }) corr[which(is.na(corr))] <- 0 value <- abs(get_upper_tri(corr)) } else { - if (length(Xcorr) == 1){ + if (identical(ref_label, "No_ref") || length(Xcorr) == 1) { value <- 0 + } else if (identical(ref_label, "X_ref")) { + corr <- suppressWarnings({ get_cormat(Xcorr[, pos]) }) + corr[which(is.na(corr))] <- 0 + value <- abs(get_upper_tri(corr)) } else { - Xcorr <- Xcorr # if (!is.null(N)) Xcorr/(N-1) else Xcorr value <- abs(get_upper_tri(Xcorr[pos, pos])) } } @@ -445,7 +455,7 @@ get_purity <- function(pos, X = NULL, Xcorr = NULL, N = NULL, n = 100) { #' @keywords cb_post_inference #' @noRd #' @importFrom stats na.omit -get_between_purity <- function(pos1, pos2, X = NULL, Xcorr = NULL, miss_idx = NULL, P = NULL) { +get_between_purity <- function(pos1, pos2, X = NULL, Xcorr = NULL, miss_idx = NULL, P = NULL, ref_label = "LD") { get_matrix_mult <- function(X_sub1, X_sub2) { X_sub1 <- t(X_sub1) X_sub2 <- t(X_sub2) @@ -465,15 +475,21 @@ get_between_purity <- function(pos1, pos2, X = NULL, Xcorr = NULL, miss_idx = NU X_sub2 <- scale(X[, pos2, drop = FALSE], center = T, scale = F) value <- abs(get_matrix_mult(X_sub1, X_sub2)) } else { - if (sum(Xcorr)==1){ + if (identical(ref_label, "No_ref") || sum(Xcorr) == 1) { value <- 0 } else { - if (length(miss_idx)!=0){ + if (length(miss_idx) != 0) { pos1 <- na.omit(match(pos1, setdiff(1:P, miss_idx))) pos2 <- na.omit(match(pos2, setdiff(1:P, miss_idx))) } if (length(pos1) != 0 & length(pos2) != 0) { - value <- abs(Xcorr[pos1, pos2]) + if (identical(ref_label, "X_ref")) { + X_sub1 <- scale(Xcorr[, pos1, drop = FALSE], center = T, scale = F) + X_sub2 <- scale(Xcorr[, pos2, drop = FALSE], center = T, scale = F) + value <- abs(get_matrix_mult(X_sub1, X_sub2)) + } else { + value <- abs(Xcorr[pos1, pos2]) + } } else { value <- 0 } @@ -498,7 +514,8 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info) { } get_cos_profile <- function(cs_beta, outcome_idx, X = NULL, Y = NULL, N = NULL, - XtX 
= NULL, YtY = NULL, XtY = NULL, miss_idx = NULL, adj_dep = 1) { + XtX = NULL, YtY = NULL, XtY = NULL, miss_idx = NULL, adj_dep = 1, + ref_label = "LD") { if (!is.null(X)) { cos_profile <- mean((Y - X %*% as.matrix(cs_beta))^2) * N / (N - 1) yty <- var(Y) @@ -514,10 +531,11 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info) { } else { xty <- XtY / scaling_factor } - if (length(xtx) == 1){ + if (identical(ref_label, "No_ref")) { cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum(cs_beta^2)) * adj_dep } else { - cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum((xtx %*% as.matrix(cs_beta)) * cs_beta)) * adj_dep + XtX_beta <- compute_XtX_product(xtx, cs_beta, ref_label) + cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum(XtX_beta * cs_beta)) * adj_dep } } delta <- yty - cos_profile @@ -543,7 +561,8 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info) { XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[outcome_idx]]$XtY, YtY = cb_data$data[[outcome_idx]]$YtY, N = cb_data$data[[outcome_idx]]$N, miss_idx = cb_data$data[[outcome_idx]]$variable_miss, - adj_dep = cb_data$data[[outcome_idx]]$dependency + adj_dep = cb_data$data[[outcome_idx]]$dependency, + ref_label = cb_data$data[[X_dict]]$ref_label ) max_profile <- max(cb_obj$cb_model[[outcome_idx]]$profile_loglike_each) ifelse(max_profile < cos_profile, 0, max_profile - cos_profile) @@ -612,7 +631,8 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info) { get_ucos_evidence <- function(cb_obj, ucoloc_info) { get_ucos_profile <- function(cs_beta, outcome_idx, X = NULL, Y = NULL, N = NULL, - XtX = NULL, YtY = NULL, XtY = NULL, miss_idx = NULL, adj_dep = 1) { + XtX = NULL, YtY = NULL, XtY = NULL, miss_idx = NULL, adj_dep = 1, + ref_label = "LD") { if (!is.null(X)) { cos_profile <- mean((Y - X %*% as.matrix(cs_beta))^2) * N / (N - 1) yty <- var(Y) @@ -628,10 +648,11 @@ get_ucos_evidence <- function(cb_obj, ucoloc_info) { } else { xty <- XtY / scaling_factor } - if (length(xtx) == 1){ + if 
(identical(ref_label, "No_ref")) { cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum(cs_beta^2)) * adj_dep } else { - cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum((xtx %*% as.matrix(cs_beta)) * cs_beta)) * adj_dep + XtX_beta <- compute_XtX_product(xtx, cs_beta, ref_label) + cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum(XtX_beta * cs_beta)) * adj_dep } } delta <- yty - cos_profile @@ -657,7 +678,8 @@ get_ucos_evidence <- function(cb_obj, ucoloc_info) { XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[outcome_idx]]$XtY, YtY = cb_data$data[[outcome_idx]]$YtY, N = cb_data$data[[outcome_idx]]$N, miss_idx = cb_data$data[[outcome_idx]]$variable_miss, - adj_dep = cb_data$data[[outcome_idx]]$dependency + adj_dep = cb_data$data[[outcome_idx]]$dependency, + ref_label = cb_data$data[[X_dict]]$ref_label ) max_profile <- max(cb_obj$cb_model[[outcome_idx]]$profile_loglike_each) ifelse(max_profile < cos_profile, 0, max_profile - cos_profile) diff --git a/R/colocboost_init.R b/R/colocboost_init.R index 4fa8815..7385dd9 100644 --- a/R/colocboost_init.R +++ b/R/colocboost_init.R @@ -28,7 +28,8 @@ colocboost_inits <- function() { #' @noRd #' @keywords cb_objects colocboost_init_data <- function(X, Y, dict_YX, - Z, LD, N_sumstat, dict_sumstatLD, + Z, LD, X_ref, ref_label, + N_sumstat, dict_sumstatLD, Var_y, SeBhat, keep_variables, focal_outcome_idx = NULL, @@ -98,6 +99,7 @@ colocboost_init_data <- function(X, Y, dict_YX, ) for (i in 1:length(Y)) { cb_data$data[[flag]] <- ind_formated$result[[i]] + cb_data$data[[flag]]$ref_label <- "individual" names(cb_data$data)[flag] <- paste0("ind_outcome_", i) flag <- flag + 1 } @@ -105,17 +107,20 @@ colocboost_init_data <- function(X, Y, dict_YX, } n_ind <- flag - 1 # if summary: XtX XtY, YtY - if (!is.null(Z) & !is.null(LD)) { + if (!is.null(Z)) { ####################### need to consider more ######################### # ------ only code up one sumstat variant_lists <- keep_variables[c((n_ind_variable+1):length(keep_variables))] 
sumstat_formated <- process_sumstat( - Z, N_sumstat, Var_y, SeBhat, LD, + Z, N_sumstat, Var_y, SeBhat, + ld_matrices = if (ref_label == "X_ref") X_ref else LD, variant_lists, dict_sumstatLD, - keep_variable_names + keep_variable_names, + ref_label = ref_label ) for (i in 1:length(Z)) { cb_data$data[[flag]] <- sumstat_formated$results[[i]] + cb_data$data[[flag]]$ref_label <- ref_label names(cb_data$data)[flag] <- paste0("sumstat_outcome_", i) flag <- flag + 1 } @@ -196,7 +201,8 @@ colocboost_init_model <- function(cb_data, N = data_each$N, YtY = data_each$YtY, XtX = cb_data$data[[X_dict]]$XtX, beta_k = tmp$beta, - miss_idx = data_each$variable_miss + miss_idx = data_each$variable_miss, + ref_label = cb_data$data[[X_dict]]$ref_label ) # - initial z-score between X and residual based on correlation tmp$z <- get_z(tmp$correlation, n = data_each$N, tmp$res) @@ -378,7 +384,8 @@ inital_residual <- function(Y = NULL, XtY = NULL) { # - Calculate correlation between X and res get_correlation <- function(X = NULL, res = NULL, XtY = NULL, N = NULL, YtY = NULL, XtX = NULL, beta_k = NULL, miss_idx = NULL, - XtX_beta_cache = NULL) { + XtX_beta_cache = NULL, + ref_label = "LD") { if (!is.null(X)) { corr <- suppressWarnings({ Rfast::correls(res, X)[, "correlation"] @@ -400,12 +407,13 @@ get_correlation <- function(X = NULL, res = NULL, XtY = NULL, N = NULL, Xtr <- res / scaling_factor XtY <- XtY / scaling_factor } - if (length(XtX) == 1){ + if (identical(ref_label, "No_ref")) { var_r <- YtY - 2 * sum(beta_k * XtY) + sum(beta_k^2) } else if (!is.null(XtX_beta_cache)) { var_r <- YtY - 2 * sum(beta_k * XtY) + sum(XtX_beta_cache * beta_k) } else { - var_r <- YtY - 2 * sum(beta_k * XtY) + sum((XtX %*% as.matrix(beta_k)) * beta_k) + XtX_beta_val <- compute_XtX_product(XtX, beta_k, ref_label) + var_r <- YtY - 2 * sum(beta_k * XtY) + sum(XtX_beta_val * beta_k) } if (var_r > 1e-6) { corr_nomiss <- Xtr / sqrt(var_r) @@ -561,7 +569,10 @@ get_multiple_testing_correction <- function(z, miss_idx 
= NULL, func_multi_test #' #' @return List containing processed data with optimized LD submatrix storage #' @noRd -process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dict, target_variants) { +process_sumstat <- function(Z, N, Var_y, SeBhat, + ld_matrices, variant_lists, dict, + target_variants, + ref_label = "LD") { # Step 1: Identify unique combinations of (variant list, LD matrix) @@ -582,7 +593,6 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic break } } - if (!is_duplicate) { # If not a duplicate, assign its exact index unified_dict[i] <- i @@ -606,14 +616,12 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic current_variants <- variant_lists[[i]] current_z <- Z[[i]] current_n <- N[[i]] - # Get corresponding LD matrix from original dictionary mapping ld_index <- dict[i] - current_ld_matrix <- ld_matrices[[ld_index]] + current_ref <- ld_matrices[[ld_index]] # Find common variants between current list and target variants common_variants <- intersect(current_variants, target_variants) - # Find variants in target but not in current list missing_variants <- setdiff(target_variants, current_variants) tmp$variable_miss <- which(target_variants %in% missing_variants) @@ -625,51 +633,76 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic Z_extend[pos_target] <- current_z[pos_z] # Calculate submatrix for each unique entry (not duplicates) - if (length(current_ld_matrix) == 1){ - ld_submatrix <- current_ld_matrix + ref_submatrix <- NULL + + if (ref_label == "No_ref"){ + + ref_submatrix <- current_ref + + } else if (ref_label == "X_ref") { + + # match by column names only (rows = samples) + if (length(common_variants) > 0) { + if (i == unified_dict[i]) { + col_idx <- match(common_variants, colnames(current_ref)) + col_idx <- col_idx[!is.na(col_idx)] + if (length(col_idx) > 0) { + ref_submatrix <- current_ref[, col_idx, drop = FALSE] + 
colnames(ref_submatrix) <- common_variants + } + } + } + } else { - ld_submatrix <- NULL + + # ref_label == "LD": match by both row and column names if (length(common_variants) > 0) { - # Only include the submatrix if this entry is unique or is the first occurrence if (i == unified_dict[i]) { - # Check if common_variants and rownames have identical order - if (identical(common_variants, rownames(current_ld_matrix))) { - # If order is identical, use the matrix directly without reordering - ld_submatrix <- current_ld_matrix + if (identical(common_variants, rownames(current_ref))) { + # Order is identical, use matrix directly without reordering + ref_submatrix <- current_ref } else { - # If order is different, reorder using matched indices - matched_indices <- match(common_variants, rownames(current_ld_matrix)) - ld_submatrix <- current_ld_matrix[matched_indices, matched_indices, drop = FALSE] - rownames(ld_submatrix) <- common_variants - colnames(ld_submatrix) <- common_variants + # Reorder to match common_variants order + matched_indices <- match(common_variants, rownames(current_ref)) + ref_submatrix <- current_ref[matched_indices, matched_indices, drop = FALSE] + rownames(ref_submatrix) <- common_variants + colnames(ref_submatrix) <- common_variants } } } + } # Organize data if (is.null(current_n)) { - tmp$XtX <- ld_submatrix + tmp$XtX <- ref_submatrix tmp$XtY <- Z_extend tmp$YtY <- 1 } else { if (!is.null(SeBhat[[i]]) & !is.null(Var_y[[i]])) { + # var_y, shat (and bhat) are provided, so the effects are on the # *original scale*. 
adj <- 1 / (Z_extend^2 + current_n - 2) - if (!is.null(ld_submatrix)) { + tmp$YtY <- (current_n - 1) * Var_y[[i]] + tmp$XtY <- Z_extend * sqrt(adj) * Var_y[[i]] / SeBhat[[i]] + if (ref_label == "X_ref") { + # Store X_ref slice directly; XtX computed on-the-fly downstream + tmp$XtX <- ref_submatrix + } else { + # LD or No_ref: scale to original scale XtXdiag <- Var_y[[i]] * adj / (SeBhat[[i]]^2) - xtx <- t(ld_submatrix * sqrt(XtXdiag)) * sqrt(XtXdiag) + xtx <- t(ref_submatrix * sqrt(XtXdiag)) * sqrt(XtXdiag) tmp$XtX <- (xtx + t(xtx)) / 2 } - tmp$YtY <- (current_n - 1) * Var_y[[i]] - tmp$XtY <- Z_extend * sqrt(adj) * Var_y[[i]] / SeBhat[[i]] + } else { - if (!is.null(ld_submatrix)) { - tmp$XtX <- ld_submatrix - } + # Standardised scale tmp$YtY <- (current_n - 1) tmp$XtY <- sqrt(current_n - 1) * Z_extend + if (!is.null(ref_submatrix)) { + tmp$XtX <- ref_submatrix # LD, X_ref, or scalar 1 stored as-is + } } } diff --git a/R/colocboost_update.R b/R/colocboost_update.R index 0e50aba..52efd62 100644 --- a/R/colocboost_update.R +++ b/R/colocboost_update.R @@ -32,7 +32,8 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) { XtX = cb_data$data[[X_dict]]$XtX, N = cb_data$data[[i]]$N, remain_idx = setdiff(1:cb_model_para$P, cb_data$data[[i]]$variable_miss), - P = cb_model_para$P + P = cb_model_para$P, + ref_label = cb_data$data[[X_dict]]$ref_label ) cb_model[[i]]$ld_jk <- rbind(cb_model[[i]]$ld_jk, ld_jk) } @@ -107,39 +108,27 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) { beta_scaling <- cb_model[[i]]$beta_scaling # - summary statistics xtx <- cb_data$data[[X_dict]]$XtX + ref_label_i <- cb_data$data[[X_dict]]$ref_label cb_model[[i]]$res <- rep(0, cb_model_para$P) if (length(cb_data$data[[i]]$variable_miss) != 0) { beta <- cb_model[[i]]$beta[-cb_data$data[[i]]$variable_miss] / beta_scaling xty <- cb_data$data[[i]]$XtY[-cb_data$data[[i]]$variable_miss] - if (length(xtx) == 1){ - XtX_beta <- beta - 
cb_model[[i]]$res[-cb_data$data[[i]]$variable_miss] <- xty - scaling_factor * beta - } else { - XtX_beta <- xtx %*% beta - cb_model[[i]]$res[-cb_data$data[[i]]$variable_miss] <- xty - scaling_factor * XtX_beta - } + XtX_beta <- compute_XtX_product(xtx, beta, ref_label_i) + cb_model[[i]]$res[-cb_data$data[[i]]$variable_miss] <- xty - scaling_factor * XtX_beta } else { beta <- cb_model[[i]]$beta / beta_scaling xty <- cb_data$data[[i]]$XtY - if (length(xtx) == 1){ - XtX_beta <- beta - cb_model[[i]]$res <- xty - scaling_factor * beta - } else { - XtX_beta <- xtx %*% beta - cb_model[[i]]$res <- xty - scaling_factor * XtX_beta - } + XtX_beta <- compute_XtX_product(xtx, beta, ref_label_i) + cb_model[[i]]$res <- xty - scaling_factor * XtX_beta } # - cache XtX %*% beta for reuse in get_correlation (avoids redundant O(P^2) computation) cb_model[[i]]$XtX_beta_cache <- XtX_beta # - profile-loglikelihood (reuses cached XtX_beta) yty <- cb_data$data[[i]]$YtY / scaling_factor xty <- xty / scaling_factor - if (length(xtx) == 1){ - profile_log <- (yty - 2 * sum(beta * xty) + sum(beta^2)) * adj_dep - } else { - profile_log <- (yty - 2 * sum(beta * xty) + sum(XtX_beta * beta)) * adj_dep - } + profile_log <- (yty - 2 * sum(beta * xty) + sum(XtX_beta * beta)) * adj_dep + } cb_model[[i]]$profile_loglike_each <- c(cb_model[[i]]$profile_loglike_each, profile_log) } @@ -149,7 +138,7 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) { # - calculate LD for update_jk -get_LD_jk <- function(jk1, X = NULL, XtX = NULL, N = NULL, remain_idx = NULL, P = NULL) { +get_LD_jk <- function(jk1, X = NULL, XtX = NULL, N = NULL, remain_idx = NULL, P = NULL, ref_label = "LD") { if (!is.null(X)) { corr <- suppressWarnings({ Rfast::correls(X[, jk1], X)[, "correlation"] @@ -158,8 +147,14 @@ get_LD_jk <- function(jk1, X = NULL, XtX = NULL, N = NULL, remain_idx = NULL, P } else if (!is.null(XtX)) { jk1.remain <- which(remain_idx == jk1) corr <- rep(0, P) - if (length(XtX) == 1 | 
length(jk1.remain)==0){ + if (identical(ref_label, "No_ref") | length(jk1.remain) == 0) { corr[remain_idx] <- 1 + } else if (identical(ref_label, "X_ref")) { + corr_common <- suppressWarnings({ + correls(XtX[, jk1.remain], XtX)[, "correlation"] + }) + corr_common[which(is.na(corr_common))] <- 0 + corr[remain_idx] <- corr_common } else { corr[remain_idx] <- XtX[, jk1.remain] } @@ -262,7 +257,8 @@ boost_obj_last <- function(cb_data, cb_model, cb_model_para) { XtX = cb_data$data[[X_dict]]$XtX, N = cb_data$data[[i]]$N, remain_idx = setdiff(1:cb_model_para$P, cb_data$data[[i]]$variable_miss), - P = cb_model_para$P + P = cb_model_para$P, + ref_label = cb_data$data[[X_dict]]$ref_label ) ld_feature <- sqrt(abs(ld_jk)) diff --git a/R/colocboost_utils.R b/R/colocboost_utils.R index b486bf8..4c75b1a 100644 --- a/R/colocboost_utils.R +++ b/R/colocboost_utils.R @@ -43,7 +43,8 @@ merge_cos_ucos <- function(cb_obj, out_cos, out_ucos, coverage = 0.95, X = cb_obj$cb_data$data[[X_dict]]$X, Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, miss_idx = cb_obj$cb_data$data[[fine_outcome]]$variable_miss, - P = cb_obj$cb_model_para$P + P = cb_obj$cb_model_para$P, + ref_label = cb_obj$cb_data$data[[X_dict]]$ref_label ) # is.between <- length(intersect(cset1, cset2)) != 0 is.between <- (abs(res[2] - 1) < tol) @@ -66,7 +67,8 @@ merge_cos_ucos <- function(cb_obj, out_cos, out_ucos, coverage = 0.95, X = cb_obj$cb_data$data[[X_dict]]$X, Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, - P = cb_obj$cb_model_para$P + P = cb_obj$cb_model_para$P, + ref_label = cb_obj$cb_data$data[[X_dict]]$ref_label ) } res <- Reduce(pmax, res) @@ -151,7 +153,8 @@ merge_ucos <- function(cb_obj, past_out, X = cb_obj$cb_data$data[[X_dict]]$X, Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, - P = cb_obj$cb_model_para$P + P = cb_obj$cb_model_para$P, + ref_label = cb_obj$cb_data$data[[X_dict]]$ref_label ) flag <- flag + 1 } @@ -221,7 
+224,8 @@ merge_ucos <- function(cb_obj, past_out, tmp <- matrix(get_purity(pos, X = cb_obj$cb_data$data[[X_dict]]$X, Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, - N = cb_obj$cb_data$data[[i3]]$N, n = n_purity + N = cb_obj$cb_data$data[[i3]]$N, n = n_purity, + ref_label = cb_obj$cb_data$data[[X_dict]]$ref_label ), 1, 3) p_tmp <- rbind(p_tmp, tmp) } @@ -689,7 +693,8 @@ get_cos_details <- function(cb_obj, coloc_out, data_info = NULL) { tmp <- matrix(get_purity(pos, X = cb_obj$cb_data$data[[X_dict]]$X, Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, - N = cb_obj$cb_data$data[[i3]]$N, n = cb_obj$cb_model_para$n_purity + N = cb_obj$cb_data$data[[i3]]$N, n = cb_obj$cb_model_para$n_purity, + ref_label = cb_obj$cb_data$data[[X_dict]]$ref_label ), 1, 3) p_tmp <- rbind(p_tmp, tmp) } @@ -763,7 +768,8 @@ get_cos_details <- function(cb_obj, coloc_out, data_info = NULL) { X = cb_obj$cb_data$data[[X_dict]]$X, Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, - P = cb_obj$cb_model_para$P + P = cb_obj$cb_model_para$P, + ref_label = cb_obj$cb_data$data[[X_dict]]$ref_label ) flag <- flag + 1 } @@ -1025,7 +1031,8 @@ get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output X = cb_obj$cb_data$data[[X_dict]]$X, Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, - P = cb_obj$cb_model_para$P + P = cb_obj$cb_model_para$P, + ref_label = cb_obj$cb_data$data[[X_dict]]$ref_label ) flag <- flag + 1 } @@ -1069,7 +1076,8 @@ get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output X = cb_obj$cb_data$data[[X_dict]]$X, Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, - P = cb_obj$cb_model_para$P + P = cb_obj$cb_model_para$P, + ref_label = cb_obj$cb_data$data[[X_dict]]$ref_label ) flag <- flag + 1 } @@ -1124,3 +1132,17 @@ get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output return(ll) } + +#' Compute 
XtX %*% beta, dispatching on ref_label +#' @noRd +compute_XtX_product <- function(XtX, beta, ref_label = "LD") { + if (identical(ref_label, "No_ref")) return(beta) + if (identical(ref_label, "X_ref")) { + N_ref <- nrow(XtX) + temp <- XtX %*% as.matrix(beta) + return(as.vector(crossprod(XtX, temp)) / (N_ref - 1)) + } + as.vector(XtX %*% as.matrix(beta)) +} + + diff --git a/R/colocboost_workhorse.R b/R/colocboost_workhorse.R index 2635e00..2f0a4f7 100644 --- a/R/colocboost_workhorse.R +++ b/R/colocboost_workhorse.R @@ -284,7 +284,8 @@ cb_model_update <- function(cb_data, cb_model, cb_model_para) { XtX = cb_data$data[[X_dict]]$XtX, beta_k = model_each$beta, miss_idx = data_each$variable_miss, - XtX_beta_cache = model_each$XtX_beta_cache + XtX_beta_cache = model_each$XtX_beta_cache, + ref_label = cb_data$data[[X_dict]]$ref_label ) cb_model[[i]]$correlation <- tmp cb_model[[i]]$z <- get_z(tmp, n = data_each$N, model_each$res) diff --git a/man/colocboost.Rd b/man/colocboost.Rd index 4f6c3df..7e1b0a0 100644 --- a/man/colocboost.Rd +++ b/man/colocboost.Rd @@ -12,6 +12,7 @@ colocboost( Y = NULL, sumstat = NULL, LD = NULL, + X_ref = NULL, dict_YX = NULL, dict_sumstatLD = NULL, outcome_names = NULL, @@ -81,14 +82,23 @@ The columns of data.frame should include either \code{z} or \code{beta}/\code{se \code{var_y} is the variance of phenotype (default is 1 meaning that the Y is in the \dQuote{standardized} scale).} \item{LD}{A list of correlation matrix indicating the LD matrix for each genotype. It also could be a single matrix if all sumstats were -obtained from the same genotypes.} +obtained from the same genotypes. Provide either \code{LD} or \code{X_ref}, not both. +If neither is provided, LD-free mode is used.} + +\item{X_ref}{A reference panel genotype matrix (N_ref x P) or a list of matrices, as an alternative to providing a precomputed +\code{LD} matrix. Column names must include variant names matching those in \code{sumstat}. 
+When the number of reference panel samples is less than the number of variants (N_ref < P), +this avoids storing the full P x P LD matrix and reduces memory usage. +When N_ref >= P, LD is precomputed from \code{X_ref} internally. +Provide either \code{LD} or \code{X_ref}, not both. If neither is provided, LD-free mode is used.} \item{dict_YX}{A L by 2 matrix of dictionary for \code{X} and \code{Y} if there exist subsets of outcomes corresponding to the same X matrix. The first column should be 1:L for L outcomes. The second column should be the index of \code{X} corresponding to the outcome. The innovation: do not provide the same matrix in \code{X} to reduce the computational burden.} -\item{dict_sumstatLD}{A L by 2 matrix of dictionary for \code{sumstat} and \code{LD} if there exist subsets of outcomes corresponding to the same sumstat. -The first column should be 1:L for L sumstat The second column should be the index of \code{LD} corresponding to the sumstat. +\item{dict_sumstatLD}{A L by 2 matrix of dictionary for \code{sumstat} and \code{LD} (or \code{X_ref}) if there exist subsets of outcomes +corresponding to the same sumstat. +The first column should be 1:L for L sumstat The second column should be the index of \code{LD} (or \code{X_ref}) corresponding to the sumstat. 
The innovation: do not provide the same matrix in \code{LD} to reduce the computational burden.} \item{outcome_names}{The names of outcomes, which has the same order for Y.} diff --git a/man/colocboost_validate_input_data.Rd b/man/colocboost_validate_input_data.Rd index d9c45f4..ec7e40a 100644 --- a/man/colocboost_validate_input_data.Rd +++ b/man/colocboost_validate_input_data.Rd @@ -9,6 +9,7 @@ colocboost_validate_input_data( Y = NULL, sumstat = NULL, LD = NULL, + X_ref = NULL, dict_YX = NULL, dict_sumstatLD = NULL, effect_est = NULL, @@ -31,7 +32,16 @@ colocboost_validate_input_data( \item{sumstat}{A list of data.frames of summary statistics.} -\item{LD}{A list of correlation matrices indicating the LD matrix for each genotype.} +\item{LD}{A list of correlation matrix indicating the LD matrix for each genotype. It also could be a single matrix if all sumstats were +obtained from the same genotypes. Provide either \code{LD} or \code{X_ref}, not both. +If neither is provided, LD-free mode is used.} + +\item{X_ref}{A reference panel genotype matrix (N_ref x P) or a list of matrices, as an alternative to providing a precomputed +\code{LD} matrix. Column names must include variant names matching those in \code{sumstat}. +When the number of reference panel samples is less than the number of variants (N_ref < P), +this avoids storing the full P x P LD matrix and reduces memory usage. +When N_ref >= P, LD is precomputed from \code{X_ref} internally. +Provide either \code{LD} or \code{X_ref}, not both. 
If neither is provided, LD-free mode is used.} \item{dict_YX}{A L by 2 matrix of dictionary for X and Y if there exist subsets of outcomes corresponding to the same X matrix.} @@ -57,6 +67,8 @@ A list containing: \item{keep_variable_individual}{List of variable names for each X matrix} \item{sumstat}{Processed list of summary statistics data.frames} \item{LD}{Processed list of LD matrices} +\item{X_ref}{Processed list of reference genotype matrices} +\item{ref_label}{Style of reference matrices} \item{sumstatLD_dict}{Dictionary mapping sumstat to LD} \item{keep_variable_sumstat}{List of variant names for each sumstat} \item{Z}{List of z-scores for each outcome} diff --git a/tests/testthat/Rplots.pdf b/tests/testthat/Rplots.pdf new file mode 100644 index 0000000..55da3dc Binary files /dev/null and b/tests/testthat/Rplots.pdf differ diff --git a/tests/testthat/test_Xref.R b/tests/testthat/test_Xref.R new file mode 100644 index 0000000..5d30abc --- /dev/null +++ b/tests/testthat/test_Xref.R @@ -0,0 +1,758 @@ +library(testthat) + +# ============================================================================ +# Shared test data generators for X_ref tests +# ============================================================================ + +generate_xref_test_data <- function(n = 200, n_ref = 50, p = 30, L = 2, seed = 42) { + set.seed(seed) + + # Generate X with LD structure (for individual-level truth) + sigma <- matrix(0, p, p) + for (i in 1:p) { + for (j in 1:p) { + sigma[i, j] <- 0.9^abs(i - j) + } + } + X <- MASS::mvrnorm(n, rep(0, p), sigma) + colnames(X) <- paste0("SNP", 1:p) + + # Generate X_ref (reference panel, separate samples) + X_ref <- MASS::mvrnorm(n_ref, rep(0, p), sigma) + colnames(X_ref) <- paste0("SNP", 1:p) + + # Generate true effects + true_beta <- matrix(0, p, L) + true_beta[5, 1] <- 0.7 # SNP5 affects trait 1 + true_beta[5, 2] <- 0.6 # SNP5 also affects trait 2 (colocalized) + true_beta[20, 2] <- 0.5 # SNP20 only affects trait 2 + + # Generate Y with 
some noise + Y <- matrix(0, n, L) + for (l in 1:L) { + Y[, l] <- X %*% true_beta[, l] + rnorm(n, 0, 1) + } + + # Calculate summary statistics + beta <- matrix(0, p, L) + se <- matrix(0, p, L) + for (i in 1:L) { + for (j in 1:p) { + fit <- summary(lm(Y[, i] ~ X[, j]))$coef + if (nrow(fit) == 2) { + beta[j, i] <- fit[2, 1] + se[j, i] <- fit[2, 2] + } + } + } + + sumstat_list <- lapply(1:L, function(i) { + data.frame( + beta = beta[, i], + sebeta = se[, i], + n = n, + variant = colnames(X) + ) + }) + + LD <- cor(X) + + list( + X = X, + Y = Y, + X_ref = X_ref, + LD = LD, + sumstat = sumstat_list, + true_beta = true_beta + ) +} + + +# ============================================================================ +# Test 1: colocboost rejects LD + X_ref together +# ============================================================================ + +test_that("colocboost rejects when both LD and X_ref are provided", { + test_data <- generate_xref_test_data() + + expect_warning( + result <- colocboost( + sumstat = test_data$sumstat, + LD = test_data$LD, + X_ref = test_data$X_ref, + M = 1 + ), + "Provide either LD or X_ref, not both" + ) + expect_null(result) +}) + + +# ============================================================================ +# Test 2: X_ref with N_ref >= P precomputes LD +# ============================================================================ + +test_that("X_ref with N_ref >= P precomputes LD and produces valid results", { + # N_ref = 50, P = 30 => N_ref >= P, should precompute LD + test_data <- generate_xref_test_data(n_ref = 50, p = 30) + + expect_message( + suppressWarnings({ + result_xref <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + M = 10, + output_level = 2 + ) + }), + "N_ref >= P: precomputing LD from X_ref" + ) + + expect_s3_class(result_xref, "colocboost") + expect_equal(result_xref$data_info$n_outcomes, 2) + expect_equal(length(result_xref$data_info$variables), 30) +}) + + +# 
============================================================================ +# Test 3: X_ref with N_ref < P keeps X_ref for on-the-fly computation +# ============================================================================ + +test_that("X_ref with N_ref < P runs correctly without precomputing LD", { + # N_ref = 20, P = 30 => N_ref < P, should keep X_ref + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + suppressWarnings(suppressMessages({ + result_xref <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + M = 10, + output_level = 2 + ) + })) + + expect_s3_class(result_xref, "colocboost") + expect_equal(result_xref$data_info$n_outcomes, 2) + expect_equal(length(result_xref$data_info$variables), 30) +}) + + +# ============================================================================ +# Test 4: X_ref results are consistent with LD results (N_ref >= P case) +# ============================================================================ + +test_that("X_ref (N_ref >= P) produces results consistent with precomputed LD", { + test_data <- generate_xref_test_data(n = 200, n_ref = 200, p = 20, seed = 99) + + # Run with precomputed LD + suppressWarnings(suppressMessages({ + result_ld <- colocboost( + sumstat = test_data$sumstat, + LD = test_data$LD, + M = 50, + output_level = 2 + ) + })) + + # Run with X_ref (same samples as LD source, so N_ref >= P => precomputes LD) + suppressWarnings(suppressMessages({ + result_xref <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + M = 50, + output_level = 2 + ) + })) + + # Both should produce valid colocboost objects + expect_s3_class(result_ld, "colocboost") + expect_s3_class(result_xref, "colocboost") + + # Same number of outcomes and variables + expect_equal(result_ld$data_info$n_outcomes, result_xref$data_info$n_outcomes) + expect_equal(length(result_ld$data_info$variables), length(result_xref$data_info$variables)) +}) + + +# 
============================================================================ +# Test 5: X_ref with N_ref < P and one iteration (one_causal mode) +# ============================================================================ + +test_that("X_ref with N_ref < P works in one-causal mode (M=1 with LD)", { + test_data <- generate_xref_test_data(n_ref = 15, p = 30) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + M = 1, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + expect_equal(result$model_info$model_coveraged, "one_causal") + expect_equal(result$model_info$n_updates, 1) +}) + + +# ============================================================================ +# Test 6: X_ref with 3 outcomes +# ============================================================================ + +test_that("X_ref works with 3 outcomes", { + set.seed(123) + n <- 200 + n_ref <- 25 + p <- 25 + L <- 3 + + sigma <- 0.9^abs(outer(1:p, 1:p, "-")) + X <- MASS::mvrnorm(n, rep(0, p), sigma) + colnames(X) <- paste0("SNP", 1:p) + X_ref <- MASS::mvrnorm(n_ref, rep(0, p), sigma) + colnames(X_ref) <- paste0("SNP", 1:p) + + true_beta <- matrix(0, p, L) + true_beta[5, 1] <- 0.6 + true_beta[5, 2] <- 0.5 + true_beta[15, 3] <- 0.7 + + Y <- matrix(0, n, L) + for (l in 1:L) { + Y[, l] <- X %*% true_beta[, l] + rnorm(n, 0, 1) + } + + beta <- se <- matrix(0, p, L) + for (i in 1:L) { + for (j in 1:p) { + fit <- summary(lm(Y[, i] ~ X[, j]))$coef + if (nrow(fit) == 2) { beta[j, i] <- fit[2, 1]; se[j, i] <- fit[2, 2] } + } + } + sumstat <- lapply(1:L, function(i) data.frame(beta = beta[, i], sebeta = se[, i], n = n, variant = colnames(X))) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = sumstat, + X_ref = X_ref, + M = 10, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + expect_equal(result$data_info$n_outcomes, 3) +}) + + +# 
============================================================================ +# Test 7: X_ref with focal outcome +# ============================================================================ + +test_that("X_ref works with focal outcome", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + focal_outcome_idx = 1, + M = 10, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + expect_equal(result$data_info$outcome_info$is_focal[1], TRUE) + expect_equal(result$data_info$outcome_info$is_focal[2], FALSE) +}) + + +# ============================================================================ +# Test 8: X_ref with dict_sumstatLD mapping +# ============================================================================ + +test_that("X_ref works with dict_sumstatLD mapping for shared reference panel", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + # Both sumstats share the same X_ref + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = list(test_data$X_ref), + dict_sumstatLD = matrix(c(1, 1, 2, 1), ncol = 2), + M = 10, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + expect_equal(result$data_info$n_outcomes, 2) +}) + + +# ============================================================================ +# Test 9: X_ref with multiple reference panels +# ============================================================================ + +test_that("X_ref works with multiple reference panels", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + # Two separate X_ref panels (identical for testing) + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = list(test_data$X_ref, test_data$X_ref), + M = 10, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + 
expect_equal(result$data_info$n_outcomes, 2) +}) + + +# ============================================================================ +# Test 10: X_ref with missing variants (partial overlap) +# ============================================================================ + +test_that("X_ref handles partial variant overlap with sumstat", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + # X_ref has fewer variants than sumstat + X_ref_partial <- test_data$X_ref[, 1:25] # Only first 25 of 30 SNPs + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = X_ref_partial, + M = 10, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + # Should have 25 variables (intersection) + expect_equal(length(result$data_info$variables), 25) +}) + + +# ============================================================================ +# Test 11: compute_XtX_product correctness +# ============================================================================ + +test_that("compute_XtX_product produces correct results for all ref_label types", { + # Access internal function + compute_XtX_product <- get("compute_XtX_product", envir = asNamespace("colocboost")) + + set.seed(42) + n_ref <- 50 + p <- 20 + + sigma <- 0.9^abs(outer(1:p, 1:p, "-")) + X_ref_raw <- MASS::mvrnorm(n_ref, rep(0, p), sigma) + + # Standardize X_ref (as done in colocboost_validate_input_data) + X_ref <- Rfast::standardise(X_ref_raw, center = TRUE, scale = TRUE) + X_ref[is.na(X_ref)] <- 0 + + # Precompute LD from X_ref + LD <- get_cormat(X_ref_raw) + + beta <- rnorm(p) + + # Test LD case + result_ld <- compute_XtX_product(LD, beta, ref_label = "LD") + expected_ld <- as.vector(LD %*% beta) + expect_equal(result_ld, expected_ld) + + # Test X_ref case + result_xref <- compute_XtX_product(X_ref, beta, ref_label = "X_ref") + # Should approximate LD %*% beta + # X_ref' * X_ref / (N_ref - 1) ≈ LD + expected_xref <- as.vector(crossprod(X_ref, X_ref %*% 
beta) / (n_ref - 1)) + expect_equal(result_xref, expected_xref) + + # Test No_ref case + result_noref <- compute_XtX_product(1, beta, ref_label = "No_ref") + expect_equal(result_noref, beta) + + # X_ref and LD results should be similar (not exact due to finite N_ref) + expect_equal(result_xref, expected_ld, tolerance = 0.3) +}) + + +# ============================================================================ +# Test 12: X_ref validation in colocboost_validate_input_data +# ============================================================================ + +test_that("colocboost_validate_input_data correctly handles X_ref", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + # Test N_ref < P case + validated <- colocboost_validate_input_data( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref + ) + + expect_equal(validated$ref_label, "X_ref") + expect_true(!is.null(validated$X_ref)) + expect_null(validated$LD) + + # Test N_ref >= P case + test_data_large <- generate_xref_test_data(n_ref = 50, p = 30) + + validated_large <- suppressMessages( + colocboost_validate_input_data( + sumstat = test_data_large$sumstat, + X_ref = test_data_large$X_ref + ) + ) + + expect_equal(validated_large$ref_label, "LD") + expect_null(validated_large$X_ref) + expect_true(!is.null(validated_large$LD)) +}) + + +# ============================================================================ +# Test 13: X_ref with output_level 3 (diagnostic details) +# ============================================================================ + +test_that("X_ref works with output_level 3 and has correct ref_label in cb_data", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + M = 10, + output_level = 3 + ) + })) + + expect_s3_class(result, "colocboost") + expect_true("diagnostic_details" %in% names(result)) + + # Check that ref_label is correctly stored in the 
model's cb_data + cb_data <- result$diagnostic_details$cb_data + for (i in seq_along(cb_data$data)) { + ref_label_i <- cb_data$data[[i]]$ref_label + expect_true(!is.null(ref_label_i), + info = paste("ref_label missing for data element", i)) + expect_true(ref_label_i %in% c("X_ref", "LD", "No_ref", "individual"), + info = paste("Invalid ref_label for data element", i)) + } +}) + + +# ============================================================================ +# Test 14: X_ref with XtX_beta_cache in diagnostic output +# ============================================================================ + +test_that("X_ref model has XtX_beta_cache in diagnostic output", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + M = 10, + output_level = 3 + ) + })) + + cb_model <- result$diagnostic_details$cb_model + for (i in seq_along(cb_model)) { + expect_true("XtX_beta_cache" %in% names(cb_model[[i]]), + info = paste("XtX_beta_cache missing for outcome", i)) + } +}) + + +# ============================================================================ +# Test 15: X_ref purity functions work correctly +# ============================================================================ + +test_that("purity functions dispatch correctly for X_ref", { + get_purity <- get("get_purity", envir = asNamespace("colocboost")) + get_between_purity <- get("get_between_purity", envir = asNamespace("colocboost")) + + set.seed(42) + n_ref <- 50 + p <- 20 + + sigma <- 0.9^abs(outer(1:p, 1:p, "-")) + X_ref <- MASS::mvrnorm(n_ref, rep(0, p), sigma) + X_ref <- Rfast::standardise(X_ref, center = TRUE, scale = TRUE) + X_ref[is.na(X_ref)] <- 0 + + LD <- cor(MASS::mvrnorm(200, rep(0, p), sigma)) + + # Test get_purity with X_ref + purity_xref <- get_purity(c(1, 2, 3), Xcorr = X_ref, ref_label = "X_ref") + expect_length(purity_xref, 3) + expect_true(all(purity_xref >= 0 & 
purity_xref <= 1)) + + # Test get_purity with LD + purity_ld <- get_purity(c(1, 2, 3), Xcorr = LD, ref_label = "LD") + expect_length(purity_ld, 3) + expect_true(all(purity_ld >= 0 & purity_ld <= 1)) + + # Test get_purity with No_ref + purity_noref <- get_purity(c(1, 2, 3), Xcorr = 1, ref_label = "No_ref") + expect_equal(purity_noref, c(0, 0, 0)) + + # Results should be similar between X_ref and LD (not exact due to different N) + expect_equal(purity_xref[1], purity_ld[1], tolerance = 0.3) + + # Test get_between_purity with X_ref + between_xref <- get_between_purity(c(1, 2), c(3, 4), Xcorr = X_ref, + miss_idx = NULL, P = p, ref_label = "X_ref") + expect_length(between_xref, 3) + expect_true(all(between_xref >= 0 & between_xref <= 1)) + + # Test get_between_purity with LD + between_ld <- get_between_purity(c(1, 2), c(3, 4), Xcorr = LD, + miss_idx = NULL, P = p, ref_label = "LD") + expect_length(between_ld, 3) + + # Test get_between_purity with No_ref + between_noref <- get_between_purity(c(1, 2), c(3, 4), Xcorr = 1, + miss_idx = NULL, P = p, ref_label = "No_ref") + expect_equal(between_noref, c(0, 0, 0)) +}) + + +# ============================================================================ +# Test 16: X_ref with get_robust_colocalization post-processing +# ============================================================================ + +test_that("get_robust_colocalization works with X_ref results", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + M = 10, + output_level = 2 + ) + })) + + # Should not error + expect_error( + suppressMessages( + filtered <- get_robust_colocalization(result, cos_npc_cutoff = 0.2, npc_outcome_cutoff = 0.1) + ), + NA + ) + expect_s3_class(filtered, "colocboost") +}) + + +# ============================================================================ +# Test 17: X_ref with get_robust_ucos post-processing +# 
============================================================================ + +test_that("get_robust_ucos works with X_ref results", { + test_data <- generate_xref_test_data(n_ref = 500, p = 30) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + M = 10, + output_level = 2 + ) + })) + + skip_if(is.null(result$ucos_details), "No ucos detected") + + expect_error( + suppressMessages( + filtered <- get_robust_ucos(result, npc_outcome_cutoff = 0.1) + ), + NA + ) + expect_s3_class(filtered, "colocboost") +}) + + +# ============================================================================ +# Test 18: X_ref plotting works +# ============================================================================ + +test_that("colocboost_plot works with X_ref results", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + M = 10, + output_level = 2 + ) + })) + + expect_error(suppressWarnings(colocboost_plot(result)), NA) + expect_error(suppressWarnings(colocboost_plot(result, y = "vcp")), NA) +}) + + +# ============================================================================ +# Test 19: X_ref with get_colocboost_summary +# ============================================================================ + +test_that("get_colocboost_summary works with X_ref results", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = test_data$sumstat, + X_ref = test_data$X_ref, + M = 10, + output_level = 2 + ) + })) + + summary1 <- get_colocboost_summary(result, summary_level = 1) + expect_type(summary1, "list") + + summary2 <- get_colocboost_summary(result, summary_level = 2) + expect_type(summary2, "list") +}) + + +# ============================================================================ +# Test 20: X_ref 
with HyPrColoc format input +# ============================================================================ + +test_that("X_ref works with effect_est and effect_se input", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + beta <- se <- matrix(0, 30, 2) + for (i in 1:2) { + for (j in 1:30) { + fit <- summary(lm(test_data$Y[, i] ~ test_data$X[, j]))$coef + if (nrow(fit) == 2) { beta[j, i] <- fit[2, 1]; se[j, i] <- fit[2, 2] } + } + } + rownames(beta) <- rownames(se) <- colnames(test_data$X) + + suppressWarnings(suppressMessages({ + result <- colocboost( + effect_est = beta, + effect_se = se, + effect_n = rep(200, 2), + X_ref = test_data$X_ref, + M = 10, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + expect_equal(result$data_info$n_outcomes, 2) +}) + + +# ============================================================================ +# Test 21: LD_jk functions dispatch correctly for X_ref +# ============================================================================ + +test_that("get_LD_jk and get_LD_jk1_jk2 dispatch correctly for X_ref", { + get_LD_jk <- get("get_LD_jk", envir = asNamespace("colocboost")) + get_LD_jk1_jk2 <- get("get_LD_jk1_jk2", envir = asNamespace("colocboost")) + + set.seed(42) + n_ref <- 50 + p <- 20 + + sigma <- 0.9^abs(outer(1:p, 1:p, "-")) + X_ref <- MASS::mvrnorm(n_ref, rep(0, p), sigma) + X_ref <- Rfast::standardise(X_ref, center = TRUE, scale = TRUE) + X_ref[is.na(X_ref)] <- 0 + + LD <- cor(MASS::mvrnorm(200, rep(0, p), sigma)) + + remain_idx <- 1:p + + # get_LD_jk with X_ref + ld_jk_xref <- get_LD_jk(5, XtX = X_ref, remain_idx = remain_idx, P = p, ref_label = "X_ref") + expect_length(ld_jk_xref, p) + expect_equal(ld_jk_xref[5], 1.0, tolerance = 0.01) # Self-correlation ~ 1 + + # get_LD_jk with LD + ld_jk_ld <- get_LD_jk(5, XtX = LD, remain_idx = remain_idx, P = p, ref_label = "LD") + expect_length(ld_jk_ld, p) + expect_equal(ld_jk_ld[5], 1.0, tolerance = 0.01) + + # get_LD_jk with No_ref + 
ld_jk_noref <- get_LD_jk(5, XtX = 1, remain_idx = remain_idx, P = p, ref_label = "No_ref") + expect_true(all(ld_jk_noref[remain_idx] == 1)) + + # get_LD_jk1_jk2 with X_ref + ld_pair_xref <- get_LD_jk1_jk2(1, 2, XtX = X_ref, remain_jk = remain_idx, ref_label = "X_ref") + expect_true(abs(ld_pair_xref) <= 1) + expect_true(abs(ld_pair_xref) > 0.5) # Adjacent SNPs with 0.9 LD should be high + + # get_LD_jk1_jk2 with LD + ld_pair_ld <- get_LD_jk1_jk2(1, 2, XtX = LD, remain_jk = remain_idx, ref_label = "LD") + expect_true(abs(ld_pair_ld) <= 1) + + # get_LD_jk1_jk2 with No_ref + ld_pair_noref <- get_LD_jk1_jk2(1, 2, XtX = 1, remain_jk = remain_idx, ref_label = "No_ref") + expect_equal(ld_pair_noref, 0) +}) + + +# ============================================================================ +# Test 22: ref_label is never NULL in internal cb_data after processing +# ============================================================================ + +test_that("ref_label is always explicitly set in cb_data, never NULL", { + test_data <- generate_xref_test_data(n_ref = 20, p = 30) + + # Build cb_data directly to inspect ref_label + sumstat <- test_data$sumstat + Z_list <- lapply(sumstat, function(s) s$beta / s$sebeta) + + # X_ref case + cb_data_xref <- colocboost_init_data( + X = NULL, Y = NULL, dict_YX = NULL, + Z = Z_list, LD = NULL, X_ref = list(test_data$X_ref), ref_label = "X_ref", + N_sumstat = lapply(sumstat, function(s) s$n[1]), + dict_sumstatLD = c(1, 1), Var_y = NULL, SeBhat = NULL, + keep_variables = lapply(sumstat, function(s) s$variant) + ) + for (i in seq_along(cb_data_xref$data)) { + expect_equal(cb_data_xref$data[[i]]$ref_label, "X_ref") + } + + # LD case + cb_data_ld <- colocboost_init_data( + X = NULL, Y = NULL, dict_YX = NULL, + Z = Z_list, LD = list(test_data$LD), X_ref = NULL, ref_label = "LD", + N_sumstat = lapply(sumstat, function(s) s$n[1]), + dict_sumstatLD = c(1, 1), Var_y = NULL, SeBhat = NULL, + keep_variables = lapply(sumstat, function(s) s$variant) + ) 
+ for (i in seq_along(cb_data_ld$data)) { + expect_equal(cb_data_ld$data[[i]]$ref_label, "LD") + } + + # Individual data case + Y_list <- lapply(1:2, function(l) as.matrix(test_data$Y[, l])) + X_list <- list(test_data$X, test_data$X) + cb_data_ind <- colocboost_init_data( + X = X_list, Y = Y_list, dict_YX = c(1, 2), + Z = NULL, LD = NULL, X_ref = NULL, ref_label = NULL, + N_sumstat = NULL, dict_sumstatLD = NULL, Var_y = NULL, SeBhat = NULL, + keep_variables = lapply(X_list, colnames) + ) + for (i in seq_along(cb_data_ind$data)) { + expect_equal(cb_data_ind$data[[i]]$ref_label, "individual") + } +}) \ No newline at end of file diff --git a/tests/testthat/test_inference.R b/tests/testthat/test_inference.R index 9c2c9d0..1b23c0f 100644 --- a/tests/testthat/test_inference.R +++ b/tests/testthat/test_inference.R @@ -790,6 +790,8 @@ generate_test_cb_obj_with_ucos <- function(n = 100, p = 20, L = 2, seed = 42) { dict_YX = c(1, 2), Z = NULL, LD = NULL, + X_ref = NULL, + ref_label = NULL, N_sumstat = NULL, dict_sumstatLD = NULL, Var_y = NULL, @@ -935,6 +937,8 @@ test_that("get_ucos_evidence handles individual-level data", { dict_YX = c(1, 2), Z = NULL, LD = NULL, + X_ref = NULL, + ref_label = NULL, N_sumstat = NULL, dict_sumstatLD = NULL, Var_y = NULL, @@ -1040,6 +1044,8 @@ test_that("get_ucos_evidence handles summary statistics data", { dict_YX = NULL, Z = Z_list, LD = list(LD_matrix), + X_ref = NULL, + ref_label = "LD", N_sumstat = lapply(sumstat_list, function(s) s$n[1]), dict_sumstatLD = c(1, 1), Var_y = NULL, diff --git a/tests/testthat/test_model.R b/tests/testthat/test_model.R index df998e8..fa28c1d 100644 --- a/tests/testthat/test_model.R +++ b/tests/testthat/test_model.R @@ -237,6 +237,7 @@ test_that("colocboost correctly maps focal outcome to keep_variables with dict_k dict_YX = dict_YX, Z = lapply(sumstat_list, function(s) s$beta / s$sebeta), # z-scores LD = list(LD_superset), + ref_label = "LD", N_sumstat = lapply(sumstat_list, function(s) s$n[1]), 
dict_sumstatLD = dict_sumstatLD, Var_y = NULL, diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_utils.R index fb1b999..e62fcf9 100644 --- a/tests/testthat/test_utils.R +++ b/tests/testthat/test_utils.R @@ -571,6 +571,7 @@ test_that("colocboost_init_data handles complex dictionary mappings", { dict_YX = dict_YX, Z = Z_list, LD = LD_list, + ref_label = "LD", N_sumstat = N_sumstat, dict_sumstatLD = dict_sumstatLD, Var_y = NULL, diff --git a/vignettes/Input_Data_Format.Rmd b/vignettes/Input_Data_Format.Rmd index d1deba3..9a7aca5 100644 --- a/vignettes/Input_Data_Format.Rmd +++ b/vignettes/Input_Data_Format.Rmd @@ -59,6 +59,8 @@ head(Sumstat_5traits$sumstat[[1]]) - `LD` is a matrix of LD. This matrix does not need to contain the exact same variants as in `sumstat`, but the `colnames` and `rownames` of `LD` should include the `variant` names for proper alignment. +- `X_ref` (alternative to `LD`) is a reference panel genotype matrix (N_ref x P). When the number of variants is large, passing `X_ref` directly avoids storing the full P x P LD matrix. See [Summary Statistics Colocalization](https://statfungen.github.io/colocboost/articles/Summary_Level_Colocalization.html) for details. + The input format for multiple traits is similar, but `sumstat` should be a list of data frames `sumstat = list(sumstat1, sumstat2, sumstat3)`. 
The flexibility of input format for multiple traits is as follows (see detailed usage with different input formats, refer to [Summary Statistics Colocalization](https://statfungen.github.io/colocboost/articles/Summary_Level_Colocalization.html)): diff --git a/vignettes/Partial_Overlap_Variants.Rmd b/vignettes/Partial_Overlap_Variants.Rmd index e179fa4..b74d7f3 100644 --- a/vignettes/Partial_Overlap_Variants.Rmd +++ b/vignettes/Partial_Overlap_Variants.Rmd @@ -22,7 +22,7 @@ This vignette demonstrates how ColocBoost handles partial overlapping variants a library(colocboost) ``` -![](../man/figures/missing_representation.png) +![Illustration of partial overlapping variants across traits](../man/figures/missing_representation.png) diff --git a/vignettes/Summary_Statistics_Colocalization.Rmd b/vignettes/Summary_Statistics_Colocalization.Rmd index 3177a68..2dc127a 100644 --- a/vignettes/Summary_Statistics_Colocalization.Rmd +++ b/vignettes/Summary_Statistics_Colocalization.Rmd @@ -177,7 +177,31 @@ res$cos_details$cos$cos_index ``` -## 3.4. HyPrColoc compatible format: effect size and standard error matrices +## 3.4. Using a reference panel genotype matrix (X_ref) instead of LD + +When the number of variants P is very large, storing the full P x P LD matrix may be infeasible. +If you have the reference panel genotype matrix from which LD would be computed, you can pass it directly via `X_ref`. +ColocBoost will compute LD products on the fly, avoiding the P x P memory cost. + +This is beneficial when the reference panel sample size (N_ref) is less than the number of variants (P). +When N_ref >= P, ColocBoost automatically precomputes the LD matrix internally for efficiency. + +Provide either `LD` or `X_ref`, not both. The `dict_sumstatLD` dictionary works with `X_ref` the same way as with `LD`. 
+ +```{r x-ref-example} +# Use genotype matrix directly as reference panel +data("Ind_5traits") +X_ref <- Ind_5traits$X[[1]] + +# Run colocboost with X_ref instead of LD +res <- colocboost(sumstat = Sumstat_5traits$sumstat, X_ref = X_ref) + +# Identified CoS +res$cos_details$cos$cos_index +``` + + +## 3.5. HyPrColoc compatible format: effect size and standard error matrices ColocBoost also provides a flexibility to use HyPrColoc compatible format for summary statistics with and without LD matrix.