StatFunGen · gaow · Mar 29, 2026 · Mar 29, 2026
diff --git a/NAMESPACE b/NAMESPACE
@@ -12,6 +12,10 @@ export(get_hierarchical_clusters)
 export(get_robust_colocalization)
 export(get_robust_ucos)
 export(get_ucos_summary)
+importFrom(Rfast,correls)
+importFrom(Rfast,med)
+importFrom(Rfast,standardise)
+importFrom(Rfast,upper_tri)
 importFrom(grDevices,adjustcolor)
 importFrom(graphics,abline)
 importFrom(graphics,axis)

diff --git a/R/colocboost.R b/R/colocboost.R
@@ -22,12 +22,20 @@
 #'                  \code{variant} is required if sumstat for different outcomes do not have the same number of variables.
 #'                  \code{var_y} is the variance of phenotype (default is 1 meaning that the Y is in the \dQuote{standardized} scale).
 #' @param LD A list of correlation matrix indicating the LD matrix for each genotype. It also could be a single matrix if all sumstats were
-#'           obtained from the same genotypes.
+#'           obtained from the same genotypes. Provide either \code{LD} or \code{X_ref}, not both.
+#'           If neither is provided, LD-free mode is used.
+#' @param X_ref A reference panel genotype matrix (N_ref x P) or a list of matrices, as an alternative to providing a precomputed
+#'              \code{LD} matrix. Column names must include variant names matching those in \code{sumstat}.
+#'              When the number of reference panel samples is less than the number of variants (N_ref < P),
+#'              this avoids storing the full P x P LD matrix and reduces memory usage.
+#'              When N_ref >= P, LD is precomputed from \code{X_ref} internally.
+#'              Provide either \code{LD} or \code{X_ref}, not both. If neither is provided, LD-free mode is used.
 #' @param dict_YX A L by 2 matrix of dictionary for \code{X} and \code{Y} if there exist subsets of outcomes corresponding to the same X matrix.
 #'                  The first column should be 1:L for L outcomes. The second column should be the index of \code{X} corresponding to the outcome.
 #'                  The innovation: do not provide the same matrix in \code{X} to reduce the computational burden.
-#' @param dict_sumstatLD A L by 2 matrix of dictionary for \code{sumstat} and \code{LD} if there exist subsets of outcomes corresponding to the same sumstat.
-#'                  The first column should be 1:L for L sumstat The second column should be the index of \code{LD} corresponding to the sumstat.
+#' @param dict_sumstatLD A L by 2 matrix of dictionary for \code{sumstat} and \code{LD} (or \code{X_ref}) if there exist subsets of outcomes
+#'                  corresponding to the same sumstat.
+#'                  The first column should be 1:L for L sumstat The second column should be the index of \code{LD} (or \code{X_ref}) corresponding to the sumstat.
 #'                  The innovation: do not provide the same matrix in \code{LD} to reduce the computational burden.
 #' @param outcome_names The names of outcomes, which has the same order for Y.
 #' @param focal_outcome_idx The index of the focal outcome if perform GWAS-xQTL ColocBoost
@@ -129,9 +137,10 @@
 #'
 #' @family colocboost
 #' @importFrom stats na.omit
+#' @importFrom Rfast correls standardise upper_tri med
 #' @export
 colocboost <- function(X = NULL, Y = NULL, # individual data
-                       sumstat = NULL, LD = NULL, # summary statistics: either Z, bhat, sebhat, N, var_Y,
+                       sumstat = NULL, LD = NULL, X_ref = NULL, # summary statistics: either Z, bhat, sebhat, N, var_Y,
                        ###### - index dict for X match multiple Y / LD match multiple sumstat
                        dict_YX = NULL, # Y index for 1st column, X index for 2nd column
                        dict_sumstatLD = NULL, # sumstat index for 1st column, LD index for 2nd column
@@ -202,11 +211,15 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
     warning("Error: No individual data (X, Y) or summary statistics (sumstat) or (effect_est, effect_se) are provided! Please check!")
     return(NULL)
   }
+  if (!is.null(LD) && !is.null(X_ref)) {
+    warning("Error: Provide either LD or X_ref, not both.")
+    return(NULL)
+  }
 
   # - check input data: individual level data and summary-level data
   validated_data <- colocboost_validate_input_data(
     X = X, Y = Y,
-    sumstat = sumstat, LD = LD,
+    sumstat = sumstat, LD = LD, X_ref = X_ref,
     dict_YX = dict_YX, dict_sumstatLD = dict_sumstatLD,
     effect_est = effect_est, effect_se = effect_se, effect_n = effect_n,
     overlap_variables = overlap_variables,
@@ -227,6 +240,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
   keep_variable_individual <- validated_data$keep_variable_individual
   sumstat <- validated_data$sumstat
   LD <- validated_data$LD
+  X_ref <- validated_data$X_ref
   sumstatLD_dict <- validated_data$sumstatLD_dict
   keep_variable_sumstat <- validated_data$keep_variable_sumstat
   Z <- validated_data$Z
@@ -277,7 +291,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
   }
   cb_data <- colocboost_init_data(
     X = X, Y = Y, dict_YX = yx_dict,
-    Z = Z, LD = LD, N_sumstat = N_sumstat, dict_sumstatLD = sumstatLD_dict,
+    Z = Z, LD = LD, X_ref = X_ref, N_sumstat = N_sumstat, dict_sumstatLD = sumstatLD_dict,
     Var_y = Var_y, SeBhat = SeBhat,
     keep_variables = keep_variables,
     focal_outcome_idx = focal_outcome_idx,
@@ -408,7 +422,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
 #'
 #' @keywords internal
 colocboost_validate_input_data <- function(X = NULL, Y = NULL,
-                                           sumstat = NULL, LD = NULL,
+                                           sumstat = NULL, LD = NULL, X_ref = NULL,
                                            dict_YX = NULL, dict_sumstatLD = NULL,
                                            effect_est = NULL, effect_se = NULL, effect_n = NULL,
                                            overlap_variables = FALSE,
@@ -659,10 +673,36 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL,
     cos_npc_cutoff_updated <- cos_npc_cutoff
     npc_outcome_cutoff_updated <- npc_outcome_cutoff
 
-    if (is.null(LD)) {
-      # if no LD input, set diagonal matrix to LD
+    # --- Handle X_ref: convert to LD when N_ref >= P, or keep for on-the-fly computation
+    if (!is.null(X_ref)) {
+      if (is.data.frame(X_ref)) X_ref <- as.matrix(X_ref)
+      if (is.matrix(X_ref)) X_ref <- list(X_ref)
+      N_ref <- nrow(X_ref[[1]])
+      P_ref <- ncol(X_ref[[1]])
+      if (N_ref >= P_ref) {
+        # Precompute LD from X_ref (more compact when N_ref >= P)
+        message("N_ref >= P: precomputing LD from X_ref.")
+        LD <- lapply(X_ref, get_cormat)
+        for (idx in seq_along(LD)) {
+          rownames(LD[[idx]]) <- colnames(LD[[idx]]) <- colnames(X_ref[[idx]])
+        }
+        X_ref <- NULL
+      } else {
+        # N_ref < P: keep X_ref for on-the-fly computation
+        # Set rownames/colnames on X_ref for variant matching (use colnames as LD would)
+        # Build LD-like variant mapping using X_ref colnames
+        # Standardize X_ref
+        for (idx in seq_along(X_ref)) {
+          X_ref[[idx]] <- standardise(X_ref[[idx]], center = TRUE, scale = TRUE)
+          X_ref[[idx]][which(is.na(X_ref[[idx]]))] <- 0
+        }
+      }
+    }
+
+    if (is.null(LD) && is.null(X_ref)) {
+      # if no LD or X_ref input, set diagonal matrix to LD
       warning(
-        "Providing the LD for summary statistics data is highly recommended. ",
+        "Providing the LD or X_ref for summary statistics data is highly recommended. ",
         "Without LD, only a single iteration will be performed under the assumption of one causal variable per outcome. ",
         "Additionally, the purity of CoS cannot be evaluated!"
       )
@@ -679,59 +719,72 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL,
       npc_outcome_cutoff_updated <- 0
 
     } else {
-
-      if (is.data.frame(LD)) LD <- as.matrix(LD)
-      if (is.matrix(LD)) LD <- list(LD)
-      # - check if NA in LD matrix
-      num_na <- sapply(LD, sum)
-      if (any(is.na(num_na))){
-        warning("Error: Input LD must not contain missing values (NA).")
-        return(NULL)
+      # --- Determine reference list and variant extraction for LD or X_ref
+      if (!is.null(LD)) {
+        if (is.data.frame(LD)) LD <- as.matrix(LD)
+        if (is.matrix(LD)) LD <- list(LD)
+        # - check if NA in LD matrix
+        num_na <- sapply(LD, sum)
+        if (any(is.na(num_na))){
+          warning("Error: Input LD must not contain missing values (NA).")
+          return(NULL)
+        }
+        ref_list <- LD
+        get_ref_variants <- function(ref_mat) rownames(ref_mat)
+        ref_label <- "LD"
+        ref_ncol <- function(ref_mat) ncol(ref_mat)
+      } else {
+        # X_ref path (N_ref < P, already standardized above)
+        ref_list <- X_ref
+        get_ref_variants <- function(ref_mat) colnames(ref_mat)
+        ref_label <- "X_ref"
+        ref_ncol <- function(ref_mat) ncol(ref_mat)
       }
-      # Create sumstat-LD mapping ===
-      if (length(LD) == 1) {
+
+      # Create sumstat-LD/X_ref mapping ===
+      if (length(ref_list) == 1) {
         sumstatLD_dict <- rep(1, length(sumstat))
-      } else if (length(LD) == length(sumstat)) {
+      } else if (length(ref_list) == length(sumstat)) {
         sumstatLD_dict <- seq_along(sumstat)
       } else {
         if (is.null(dict_sumstatLD)) {
-          warning('Error: Please provide dict_sumstatLD: you have ', length(sumstat), 
-                  ' sumstats but only ', length(LD), ' LD matrices')
+          warning('Error: Please provide dict_sumstatLD: you have ', length(sumstat),
+                  ' sumstats but only ', length(ref_list), ' ', ref_label, ' matrices')
           return(NULL)
         } else {
-          # - dict for sumstat to LD mapping
+          # - dict for sumstat to LD/X_ref mapping
           sumstatLD_dict <- rep(NA, length(sumstat))
           for (i in 1:length(sumstat)) {
             tmp <- unique(dict_sumstatLD[dict_sumstatLD[, 1] == i, 2])
             if (length(tmp) == 0) {
-              warning(paste("Error: You don't provide matched LD for sumstat", i))
+              warning(paste("Error: You don't provide matched", ref_label, "for sumstat", i))
               return(NULL)
             } else if (length(tmp) != 1) {
-              warning(paste("Error: You provide multiple matched LD for sumstat", i))
+              warning(paste("Error: You provide multiple matched", ref_label, "for sumstat", i))
               return(NULL)
             } else {
               sumstatLD_dict[i] <- tmp
             }
           }
-          if (max(sumstatLD_dict) > length(LD)) {
-            warning("Error: You don't provide enough LD matrices!")
+          if (max(sumstatLD_dict) > length(ref_list)) {
+            warning(paste("Error: You don't provide enough", ref_label, "matrices!"))
             return(NULL)
           }
         }
       }
-      
+
       # === Filter variants for each sumstat ===
       for (i in seq_along(sumstat)) {
         # Get sumstat variants (adjust field name based on your data structure)
         sumstat_variants <- sumstat[[i]]$variant
         n_total <- length(sumstat_variants)
-        # Get LD variants
+        # Get LD/X_ref variants
         ld_idx <- sumstatLD_dict[i]
-        current_ld <- LD[[ld_idx]]
-        ld_variants <- rownames(current_ld)
+        current_ref <- ref_list[[ld_idx]]
+        ld_variants <- get_ref_variants(current_ref)
         if (is.null(ld_variants)) {
-          if (ncol(current_ld) != n_total){
-            warning('Error: LD matrix ', ld_idx, ' has no rownames. Please ensure all LD matrices have variant names as rownames.')
+          if (ref_ncol(current_ref) != n_total){
+            warning('Error: ', ref_label, ' matrix ', ld_idx, ' has no variant names. Please ensure all ', ref_label, ' matrices have variant names.')
             return(NULL)
           }
         }
@@ -834,7 +887,7 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL,
       Z[[i.summstat]] <- z
     }
   } else {
-    Z <- N_sumstat <- Var_y <- SeBhat <- sumstatLD_dict <- keep_variable_sumstat <- NULL
+    Z <- N_sumstat <- Var_y <- SeBhat <- sumstatLD_dict <- keep_variable_sumstat <- X_ref <- NULL
     M_updated <- M
     min_abs_corr_updated <- min_abs_corr
     jk_equiv_corr_updated <- jk_equiv_corr
@@ -851,6 +904,7 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL,
     keep_variable_individual = keep_variable_individual,
     sumstat = sumstat,
     LD = LD,
+    X_ref = X_ref,
     sumstatLD_dict = sumstatLD_dict,
     keep_variable_sumstat = keep_variable_sumstat,
     Z = Z,

diff --git a/R/colocboost_assemble.R b/R/colocboost_assemble.R
@@ -132,10 +132,13 @@ colocboost_assemble <- function(cb_obj,
           }
         }
         if (!is.null(cb_obj_single$cb_data$data[[1]][["XtY"]])) {
+          X_dict <- cb_obj$cb_data$dict[i]
           if (is.null(cb_obj_single$cb_data$data[[1]]$XtX)) {
-            X_dict <- cb_obj$cb_data$dict[i]
             cb_obj_single$cb_data$data[[1]]$XtX <- cb_obj$cb_data$data[[X_dict]]$XtX
           }
+          if (is.null(cb_obj_single$cb_data$data[[1]]$X_ref)) {
+            cb_obj_single$cb_data$data[[1]]$X_ref <- cb_obj$cb_data$data[[X_dict]]$X_ref
+          }
         }
         class(cb_obj_single) <- "colocboost"
         out_ucos_each <- colocboost_assemble_ucos(cb_obj_single,

diff --git a/R/colocboost_assemble_cos.R b/R/colocboost_assemble_cos.R
@@ -40,7 +40,7 @@ colocboost_assemble_cos <- function(cb_obj,
     for (iiii in 1:length(coloc_outcomes)) {
       X_dict <- cb_data$dict[coloc_outcomes[iiii]]
       tmp <- w_purity(avWeight[, iiii, drop = FALSE],
-        X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX,
+        X = get_genotype_matrix(cb_data$data[[X_dict]]), Xcorr = cb_data$data[[X_dict]]$XtX,
         N = cb_data$data[[coloc_outcomes[iiii]]]$N, n = n_purity, coverage = sec_coverage_thresh,
         min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr,
         miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss
@@ -81,7 +81,7 @@ colocboost_assemble_cos <- function(cb_obj,
           pos <- match(pos, setdiff(1:cb_model_para$P, cb_data$data[[i]]$variable_miss))
         }
         p_tmp <- matrix(get_purity(pos,
-          X = cb_data$data[[X_dict]]$X,
+          X = get_genotype_matrix(cb_data$data[[X_dict]]),
           Xcorr = cb_data$data[[X_dict]]$XtX,
           N = cb_data$data[[i]]$N, n = n_purity
         ), 1, 3)
@@ -145,7 +145,7 @@ colocboost_assemble_cos <- function(cb_obj,
         for (iiii in 1:length(coloc_outcomes)) {
           X_dict <- cb_data$dict[coloc_outcomes[iiii]]
           tmp <- w_purity(avWeight[, iiii, drop = FALSE],
-            X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX,
+            X = get_genotype_matrix(cb_data$data[[X_dict]]), Xcorr = cb_data$data[[X_dict]]$XtX,
             N = cb_data$data[[coloc_outcomes[iiii]]]$N, n = n_purity, coverage = sec_coverage_thresh,
             min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr,
             miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss
@@ -205,7 +205,7 @@ colocboost_assemble_cos <- function(cb_obj,
             weight_cluster <- t(av[[iiii]][idx, , drop = FALSE])
             X_dict <- cb_data$dict[coloc_outcomes[iiii]]
             tmp <- w_purity(weight_cluster,
-              X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX,
+              X = get_genotype_matrix(cb_data$data[[X_dict]]), Xcorr = cb_data$data[[X_dict]]$XtX,
               N = cb_data$data[[coloc_outcomes[iiii]]]$N, n = n_purity, coverage = sec_coverage_thresh,
               min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr,
               miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss
@@ -348,7 +348,7 @@ colocboost_assemble_cos <- function(cb_obj,
             for (i in 1:cb_model_para$L) {
               X_dict <- cb_data$dict[i]
               res[[i]] <- get_between_purity(cset1, cset2,
-                X = cb_data$data[[X_dict]]$X,
+                X = get_genotype_matrix(cb_data$data[[X_dict]]),
                 Xcorr = cb_data$data[[X_dict]]$XtX,
                 miss_idx = cb_data$data[[i]]$variable_miss,
                 P = cb_model_para$P
@@ -436,7 +436,7 @@ colocboost_assemble_cos <- function(cb_obj,
             pos <- match(pos, setdiff(1:cb_model_para$P, cb_data$data[[i3]]$variable_miss))
           }
           tmp <- matrix(get_purity(pos,
-            X = cb_data$data[[X_dict]]$X,
+            X = get_genotype_matrix(cb_data$data[[X_dict]]),
             Xcorr = cb_data$data[[X_dict]]$XtX,
             N = cb_data$data[[i3]]$N, n = n_purity
           ), 1, 3)

diff --git a/R/colocboost_assemble_ucos.R b/R/colocboost_assemble_ucos.R
@@ -63,7 +63,7 @@ colocboost_assemble_ucos <- function(cb_obj_single,
         pos <- match(pos, setdiff(1:cb_model_para$P, cb_data$data[[1]]$variable_miss))
       }
       purity <- matrix(get_purity(pos,
-        X = cb_data$data[[1]]$X, Xcorr = cb_data$data[[1]]$XtX,
+        X = get_genotype_matrix(cb_data$data[[1]]), Xcorr = cb_data$data[[1]]$XtX,
         N = cb_data$data[[1]]$N, n = n_purity
       ), 1, 3)
       purity <- as.data.frame(purity)
@@ -128,7 +128,7 @@ colocboost_assemble_ucos <- function(cb_obj_single,
       w <- LogLik_change[idx]
       weight_cluster <- t(weights[idx, , drop = FALSE])
       check_purity <- w_purity(weight_cluster,
-        X = cb_data$data[[1]]$X, Xcorr = cb_data$data[[1]]$XtX,
+        X = get_genotype_matrix(cb_data$data[[1]]), Xcorr = cb_data$data[[1]]$XtX,
         N = cb_data$data[[1]]$N, n = n_purity, coverage = coverage,
         min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr,
         miss_idx = cb_data$data[[1]]$variable_miss
@@ -234,7 +234,7 @@ colocboost_assemble_ucos <- function(cb_obj_single,
             cset1 <- confidence_sets[[i.between]]
             cset2 <- confidence_sets[[j.between]]
             res <- get_between_purity(cset1, cset2,
-              X = cb_data$data[[1]]$X,
+              X = get_genotype_matrix(cb_data$data[[1]]),
               Xcorr = cb_data$data[[1]]$XtX,
               miss_idx = cb_data$data[[1]]$variable_miss,
               P = cb_model_para$P
@@ -300,7 +300,7 @@ colocboost_assemble_ucos <- function(cb_obj_single,
           rbind(
             purity,
             matrix(get_purity(pos,
-              X = cb_data$data[[1]]$X, Xcorr = cb_data$data[[1]]$XtX,
+              X = get_genotype_matrix(cb_data$data[[1]]), Xcorr = cb_data$data[[1]]$XtX,
               N = cb_data$data[[1]]$N, n = n_purity
             ), 1, 3)
           )