Skip to content

Commit ce939bf

Browse files
Merge pull request #24 from alexmccreight/main
add sparse sim, update complex, add vignettes
2 parents c49a63c + 6406e88 commit ce939bf

11 files changed

+482
-113
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ Description: Provides a fast and memory-efficient method for
99
URL: https://statfungen.github.io/simxQTL/, https://github.com/StatFunGen/simxQTL
1010
BugReports: https://github.com/StatFunGen/simxQTL/issues
1111
Authors@R: c(
12-
person("Alexander", "McCreight", role = c("cre", "aut")),
12+
person("Alexander", "McCreight", email = "alexmccreight01@gmail.com", role = c("cre", "aut")),
1313
person("Xuewei", "Cao", role = "aut"),
1414
person("Haochen", "Sun", role = "aut"),
1515
person("Gao", "Wang", role = "aut", email = "wang.gao@columbia.edu"))

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ export(sim_multi_traits)
2020
export(sim_single_trait_simple)
2121
export(sim_sumstats)
2222
export(simulate_causal_config)
23+
export(simulate_phenotype)
2324
export(simulate_polygenic_trait)
2425
export(simulate_trans_expression)
2526
export(simulate_trans_mixture_celltype)

R/simulate_eQTL.R

Lines changed: 60 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,18 @@
1313
#' \item{beta}{A vector of effect sizes with nonzero entries for the sparse SNPs.}
1414
#' \item{sparse_indices}{Indices of the sparse SNPs.}
1515
#' @keywords internal
16-
simulate_sparse_effects <- function(G, h2_sparse, n_sparse, effect_sd = 0.5) {
16+
simulate_sparse_effects <- function(G, h2_sparse, n_sparse, effect_sd = 0.5, valid_cols = NULL) {
1717
n_features <- ncol(G)
1818
beta <- rep(0, n_features)
1919

20-
# Select sparse SNPs
21-
if (n_sparse > 0) {
22-
sparse_indices <- sample(1:n_features, n_sparse)
20+
# Use valid columns if provided, otherwise use all
21+
if (is.null(valid_cols)) {
22+
valid_cols <- 1:n_features
23+
}
24+
25+
# Select sparse SNPs from valid columns only
26+
if (n_sparse > 0 && length(valid_cols) >= n_sparse) {
27+
sparse_indices <- sample(valid_cols, n_sparse)
2328

2429
# Draw effects from normal distribution with specified SD
2530
beta[sparse_indices] <- rnorm(n_sparse, mean = 0, sd = effect_sd)
@@ -123,10 +128,10 @@ simulate_infinitesimal_effects <- function(G, h2_infinitesimal, infinitesimal_in
123128
beta[infinitesimal_indices] <- rnorm(n_inf, mean = 0, sd = effect_sd)
124129

125130
# Scale to exactly match h2_infinitesimal
126-
inf_effects <- as.vector(G[, infinitesimal_indices] %*% beta[infinitesimal_indices])
131+
inf_effects <- as.vector(G[, infinitesimal_indices, drop = FALSE] %*% beta[infinitesimal_indices])
127132
current_var <- var(inf_effects)
128133

129-
if (current_var > 0) {
134+
if (current_var > 0 && !is.na(current_var)) {
130135
scaling_factor <- sqrt(h2_infinitesimal / current_var)
131136
beta[infinitesimal_indices] <- beta[infinitesimal_indices] * scaling_factor
132137
}
@@ -166,47 +171,47 @@ is_causal_power <- function(G, beta, residual_variance, power = 0.80) {
166171
}
167172

168173
###############################################################################
169-
#' Generate Cis-QTL Data with Multiple Genetic Architecture Components
170-
#'
171-
#' This function generates simulated cis-eQTL data with a partitioned
172-
#' genetic architecture that enforces strict effect size hierarchies:
173-
#' |sparse| > |oligogenic| >> |infinitesimal|
174+
#' Generate eQTL Data with Multiple Genetic Architecture Components
174175
#'
175-
#' Originally developed for the "susieR 2.0" manuscript, McCreight et al (2025).
176+
#' This function generates simulated gene expression data
176177
#'
177-
#' @param G Genotype matrix.
178+
#' @param G Standardized genotype matrix (samples x SNPs).
178179
#' @param h2g Total SNP heritability (proportion of variance explained by genotyped SNPs).
179180
#' @param prop_h2_sparse Proportion of h2g explained by sparse effects.
180181
#' @param prop_h2_oligogenic Proportion of h2g explained by oligogenic effects.
181182
#' @param prop_h2_infinitesimal Proportion of h2g explained by infinitesimal effects.
182183
#' @param n_sparse Number of sparse SNPs.
183184
#' @param n_oligogenic Number of oligogenic SNPs to simulate.
185+
#' @param n_inf Number of infinitesimal SNPs to simulate. If NULL (default), all remaining SNPs after sparse and oligogenic selection receive infinitesimal effects.
184186
#' @param mixture_props Mixture proportions for oligogenic effects (must sum to 1). Default c(0.75, 0.25) means 75% smaller effects, 25% larger effects.
185187
#' @param sparse_sd Standard deviation for drawing sparse effects (default 0.5).
186188
#' @param oligo_sds Standard deviations for oligogenic mixture components (default c(0.05, 0.15)).
187189
#' @param inf_sd Standard deviation for drawing infinitesimal effects (default 0.01).
188190
#' @param standardize Logical; if TRUE, the genotype matrix will be standardized.
189-
#' @param independent Logical; if TRUE, ensures all sparse and oligogenic SNPs have |r| < 0.15 with each other (default FALSE).
190-
#' @param verbose Logical; if TRUE, prints progress messages including LD constraint attempts (default FALSE).
191+
#' @param independent Logical; if TRUE, ensures all sparse and oligogenic SNPs have |r| < ld_threshold with each other (default TRUE).
192+
#' @param ld_threshold Numeric; maximum allowed absolute correlation between causal variants when independent = TRUE (default 0.15).
193+
#' @param max_attempts Integer; maximum number of attempts to find SNPs satisfying LD constraints (default 200).
191194
#' @param seed Optional seed for reproducibility.
192195
#' @return A list containing the standardized genotype matrix, simulated phenotype,
193196
#' combined beta values, indices for each effect component, realized heritability estimates,
194197
#' effect size ranges, hierarchy validation results, and causal indices.
195198
#' @export
196199
generate_cis_qtl_data <- function(G,
197-
h2g = 0.30,
200+
h2g = 0.25,
198201
prop_h2_sparse = 0.50,
199-
prop_h2_oligogenic = 0.15,
200-
prop_h2_infinitesimal = 0.35,
202+
prop_h2_oligogenic = 0.35,
203+
prop_h2_infinitesimal = 0.15,
201204
n_sparse = 2,
202-
n_oligogenic = 10,
205+
n_oligogenic = 5,
206+
n_inf = 15,
203207
mixture_props = c(0.75, 0.25),
204208
sparse_sd = 0.5,
205209
oligo_sds = c(0.05, 0.15),
206210
inf_sd = 0.01,
207211
standardize = TRUE,
208212
independent = TRUE,
209-
verbose = FALSE,
213+
ld_threshold = 0.15,
214+
max_attempts = 200,
210215
seed = NULL) {
211216
# Input validation
212217
if (abs(prop_h2_sparse + prop_h2_oligogenic + prop_h2_infinitesimal - 1) > 1e-6) {
@@ -216,13 +221,6 @@ generate_cis_qtl_data <- function(G,
216221
stop("mixture_props must sum to 1.")
217222
}
218223

219-
if (!is.null(seed)) set.seed(seed)
220-
221-
# Standardize genotype matrix if requested
222-
if (standardize) {
223-
G <- scale(G)
224-
}
225-
226224
n_samples <- nrow(G)
227225
n_features <- ncol(G)
228226

@@ -231,47 +229,52 @@ generate_cis_qtl_data <- function(G,
231229
h2_oligogenic <- h2g * prop_h2_oligogenic
232230
h2_infinitesimal <- h2g * prop_h2_infinitesimal
233231

232+
# All columns are valid for selection
233+
valid_cols <- 1:n_features
234+
234235
# Setup for LD constraint checking
235-
max_attempts <- if (independent) 200 else 1
236+
max_attempts <- if (independent) max_attempts else 1
236237
attempt <- 0
237238
ld_satisfied <- FALSE
238239

239-
# Main data generation loop
240-
if (verbose && independent) {
241-
cat("Attempting to satisfy LD constraints:\n")
242-
cat(" - Sparse SNPs: |r| < 0.15\n")
243-
cat(" - Oligogenic SNPs: |r| < 0.15\n")
244-
cat(" - Sparse-Oligogenic cross: |r| < 0.15\n")
245-
cat(" (Safeguard: will accept results after 200 attempts)\n")
240+
# Set initial seed if provided
241+
if (!is.null(seed)) {
242+
set.seed(seed)
246243
}
247244

248245
while (!ld_satisfied && attempt < max_attempts) {
249246
attempt <- attempt + 1
250247

251-
if (verbose && independent && attempt > 1) {
252-
cat(" Attempt", attempt, "of", max_attempts, "...\n")
253-
}
254-
255248
# Set seed for this attempt (for reproducibility)
256249
if (!is.null(seed) && attempt > 1) {
257250
set.seed(seed * 1000 + attempt)
258251
}
259252

260253
# 1. Sparse Effects using target-based scaling
261-
sparse_res <- simulate_sparse_effects(G, h2_sparse, n_sparse, sparse_sd)
254+
sparse_res <- simulate_sparse_effects(G, h2_sparse, n_sparse, sparse_sd, valid_cols)
262255
beta_sparse <- sparse_res$beta
263256
sparse_indices <- sparse_res$sparse_indices
264257

265258
# 2. Oligogenic Effects using target-based scaling
266-
non_sparse_indices <- setdiff(1:n_features, sparse_indices)
259+
non_sparse_indices <- setdiff(valid_cols, sparse_indices)
267260
oligo_res <- simulate_oligogenic_effects(G, h2_oligogenic, n_oligogenic,
268261
mixture_props, non_sparse_indices,
269262
oligo_sds)
270263
beta_oligo <- oligo_res$beta
271264
oligogenic_indices <- oligo_res$oligogenic_indices
272265

273266
# 3. Infinitesimal Effects using target-based scaling
274-
infinitesimal_indices <- setdiff(non_sparse_indices, oligogenic_indices)
267+
infinitesimal_pool <- setdiff(non_sparse_indices, oligogenic_indices)
268+
269+
# Select n_inf SNPs from the infinitesimal pool if n_inf is specified
270+
if (!is.null(n_inf)) {
271+
n_inf_actual <- min(n_inf, length(infinitesimal_pool))
272+
infinitesimal_indices <- sample(infinitesimal_pool, n_inf_actual)
273+
} else {
274+
# Default behavior: all remaining SNPs get infinitesimal effects
275+
infinitesimal_indices <- infinitesimal_pool
276+
}
277+
275278
beta_inf <- simulate_infinitesimal_effects(G, h2_infinitesimal, infinitesimal_indices,
276279
inf_sd)
277280

@@ -289,80 +292,42 @@ generate_cis_qtl_data <- function(G,
289292
if (independent) {
290293
ld_satisfied <- TRUE # Assume satisfied unless violations found
291294

292-
# Check sparse SNPs: |r| < 0.15
295+
# Check sparse SNPs: |r| < ld_threshold
293296
if (length(sparse_indices) > 1) {
294297
sparse_cor <- cor(G[, sparse_indices, drop = FALSE])
295-
high_ld_sparse <- which(abs(sparse_cor) >= 0.15 & upper.tri(sparse_cor, diag = FALSE), arr.ind = TRUE)
296-
297-
if (nrow(high_ld_sparse) > 0) {
298-
ld_satisfied <- FALSE
299-
}
298+
high_ld_sparse <- which(abs(sparse_cor) >= ld_threshold & upper.tri(sparse_cor, diag = FALSE), arr.ind = TRUE)
299+
if (nrow(high_ld_sparse) > 0) ld_satisfied <- FALSE
300300
}
301301

302-
# Check oligogenic SNPs: |r| < 0.15 (changed from 0.30)
302+
# Check oligogenic SNPs: |r| < ld_threshold
303303
if (ld_satisfied && length(oligogenic_indices) > 1) {
304304
oligo_cor <- cor(G[, oligogenic_indices, drop = FALSE])
305-
high_ld_oligo <- which(abs(oligo_cor) >= 0.15 & upper.tri(oligo_cor, diag = FALSE), arr.ind = TRUE)
306-
307-
if (nrow(high_ld_oligo) > 0) {
308-
ld_satisfied <- FALSE
309-
}
305+
high_ld_oligo <- which(abs(oligo_cor) >= ld_threshold & upper.tri(oligo_cor, diag = FALSE), arr.ind = TRUE)
306+
if (nrow(high_ld_oligo) > 0) ld_satisfied <- FALSE
310307
}
311308

312-
# Check between sparse and oligogenic: |r| < 0.15
309+
# Check between sparse and oligogenic: |r| < ld_threshold
313310
if (ld_satisfied && length(sparse_indices) > 0 && length(oligogenic_indices) > 0) {
314311
cross_cor <- cor(G[, sparse_indices, drop = FALSE], G[, oligogenic_indices, drop = FALSE])
315-
high_ld_cross <- which(abs(cross_cor) >= 0.15, arr.ind = TRUE)
316-
317-
if (nrow(high_ld_cross) > 0) {
318-
ld_satisfied <- FALSE
319-
}
312+
high_ld_cross <- which(abs(cross_cor) >= ld_threshold, arr.ind = TRUE)
313+
if (nrow(high_ld_cross) > 0) ld_satisfied <- FALSE
320314
}
321315
} else {
322316
# If not independent, accept
323317
ld_satisfied <- TRUE
324318
}
325319
}
326320

327-
# Report LD constraint results
321+
# Warn if LD constraints not satisfied
328322
if (independent && !ld_satisfied) {
329-
msg <- paste0("Failed to satisfy LD constraints after ",
330-
max_attempts, " attempts. Returning last generated data with LD violations.")
331-
warning(msg)
332-
if (verbose) {
333-
cat("\n", msg, "\n")
334-
# Report actual LD values
335-
if (length(sparse_indices) > 1) {
336-
sparse_cor <- cor(G[, sparse_indices, drop = FALSE])
337-
max_sparse_cor <- max(abs(sparse_cor[upper.tri(sparse_cor)]))
338-
cat(" Final max |r| among sparse SNPs:", round(max_sparse_cor, 4),
339-
ifelse(max_sparse_cor < 0.15, "", "✗ (constraint: < 0.15)"), "\n")
340-
}
341-
if (length(oligogenic_indices) > 1) {
342-
oligo_cor <- cor(G[, oligogenic_indices, drop = FALSE])
343-
max_oligo_cor <- max(abs(oligo_cor[upper.tri(oligo_cor)]))
344-
cat(" Final max |r| among oligogenic SNPs:", round(max_oligo_cor, 4),
345-
ifelse(max_oligo_cor < 0.15, "", "✗ (constraint: < 0.15)"), "\n")
346-
}
347-
if (length(sparse_indices) > 0 && length(oligogenic_indices) > 0) {
348-
cross_cor <- cor(G[, sparse_indices, drop = FALSE], G[, oligogenic_indices, drop = FALSE])
349-
max_cross_cor <- max(abs(cross_cor))
350-
cat(" Final max |r| between sparse and oligogenic:", round(max_cross_cor, 4),
351-
ifelse(max_cross_cor < 0.15, "", "✗ (constraint: < 0.15)"), "\n")
352-
}
353-
}
354-
} else if (verbose && independent) {
355-
if (attempt == 1) {
356-
cat("✓ LD constraints satisfied on first attempt!\n")
357-
} else {
358-
cat("✓ LD constraints satisfied after", attempt, "attempts.\n")
359-
}
323+
warning(paste0("Failed to satisfy LD constraints after ", max_attempts,
324+
" attempts. Returning last generated data with LD violations."))
360325
}
361326

362-
# Calculate causal indices using power-based identification
327+
# Calculate causal indices using power-based identification (non-central chi-square with Bonferroni correction)
363328
causal_indices <- is_causal_power(G, beta, var_epsilon, power = 0.80)
364329

365-
# Calculate empirical heritability for each component (final data)
330+
# Calculate empirical heritability for each component
366331
var_y <- var(y)
367332
h2_sparse_actual <- var(as.vector(G[, sparse_indices] %*% beta[sparse_indices])) / var_y
368333
h2_oligogenic_actual <- var(as.vector(G[, oligogenic_indices] %*% beta[oligogenic_indices])) / var_y
@@ -371,8 +336,7 @@ generate_cis_qtl_data <- function(G,
371336

372337
return(list(
373338
G = G,
374-
y.ori = y,
375-
y = scale(y),
339+
y = y,
376340
beta = beta,
377341
h2g = h2g_actual,
378342
h2_sparse = h2_sparse_actual,

0 commit comments

Comments
 (0)