1-
2- # sim_geno_functions.R
3-
41# ' Simulate Independent Genotypes
52# '
63# ' @param n Sample size.
74# ' @param p Number of SNPs.
85# ' @param min_maf Minimum minor allele frequency.
96# ' @param max_maf Maximum minor allele frequency.
10- # ' @param scale Logical, whether to scale the data using scale_faster .
7+ # ' @param scale Logical, whether to scale the data.
118# ' @return A matrix of genotypes.
129# ' @examples
1310# ' sim_geno_indep(n = 100, p = 10, min_maf = 0.01, max_maf = 0.4, scale = TRUE)
@@ -22,7 +19,7 @@ sim_geno_indep <- function(n, p, min_maf = 0.01, max_maf = 0.4, scale = FALSE) {
2219 colnames(G ) <- paste0(" variant" , 1 : p )
2320
2421 if (scale ) {
25- G <- scale_faster (G )
22+ G <- scale (G )
2623 }
2724
2825 return (G )
@@ -35,13 +32,14 @@ sim_geno_indep <- function(n, p, min_maf = 0.01, max_maf = 0.4, scale = FALSE) {
3532# ' @param min_maf Minimum minor allele frequency.
3633# ' @param max_maf Maximum minor allele frequency.
3734# ' @param lambda Regularization parameter.
38- # ' @param type Type of genotype to simulate, either "continuous" or "discrete".
39- # ' @param scale Logical, whether to scale the data using scale_faster.
35+ # ' @param is.discrete Logical, whether to generate discrete (TRUE) or continuous (FALSE) genotypes.
36+ # ' @param scale Logical, whether to scale the data.
37+ # ' @param tol Tolerance for checking diagonal elements of LD matrix.
4038# ' @return A matrix of genotypes.
4139# ' @examples
42- # ' sim_geno_LD(n = 100, LD = matrix(runif(100), 10, 10), min_maf = 0.01, max_maf = 0.4, lambda = 1e-3, type = "continuous" , scale = TRUE)
40+ # ' sim_geno_LD(n = 100, LD = matrix(runif(100), 10, 10), min_maf = 0.01, max_maf = 0.4, lambda = 1e-3, is.discrete = FALSE , scale = TRUE)
4341# ' @export
44- sim_geno_LD <- function (n , LD , min_maf = 0.01 , max_maf = 0.4 , lambda = 1e-3 , is.discrete = FALSE , scale = FALSE ) {
42+ sim_geno_LD <- function (n , LD , min_maf = 0.01 , max_maf = 0.4 , lambda = 1e-3 , is.discrete = FALSE , scale = FALSE , tol = sqrt( .Machine $ double.eps ) ) {
4543 if (missing(n )) stop(" Please provide the sample size" )
4644 if (is.null(LD )) stop(" Please provide LD matrix!" )
4745
@@ -58,7 +56,7 @@ sim_geno_LD <- function(n, LD, min_maf = 0.01, max_maf = 0.4, lambda = 1e-3, is.
5856 }
5957
6058 if (scale ) {
61- G <- scale_faster (G )
59+ G <- scale (G )
6260 }
6361
6462 colnames(G ) <- paste0(" variant" , 1 : p )
@@ -70,7 +68,7 @@ sim_geno_LD <- function(n, LD, min_maf = 0.01, max_maf = 0.4, lambda = 1e-3, is.
7068# ' @param n Sample size.
7169# ' @param file_path Path to the UK Biobank file.
7270# ' @param min_maf Minimum minor allele frequency.
73- # ' @param scale Logical, whether to scale the data using scale_faster .
71+ # ' @param scale Logical, whether to scale the data.
7472# ' @return A matrix of genotypes.
7573# ' @examples
7674# ' sim_geno_UKB(n = 100, file_path = "path/to/UKB/file", min_maf = 0.01, scale = TRUE)
@@ -83,26 +81,31 @@ sim_geno_UKB <- function(n, file_path, min_maf = 0.01, scale = FALSE) {
8381 G <- process_ukb(file_path , n , min_maf )
8482
8583 if (scale ) {
86- G <- scale_faster (G )
84+ G <- scale (G )
8785 }
8886
89- colnames(G ) <- paste0(" variant" , 1 : p )
87+ colnames(G ) <- paste0(" variant" , 1 : ncol( G ) )
9088 return (G )
9189}
9290
91+ # Safe multivariate normal random generation
92+ # ' @importFrom mvtnorm rmvnorm
93+ # ' @keywords internal
9394safe_rmvnorm <- function (n , p , LD , maf , lambda = 1e-3 ) {
9495
9596 var_g <- diag(sqrt(2 * maf * (1 - maf )))
9697 Sigma <- var_g %*% LD %*% var_g
97- G <- try(mvtnorm :: rmvnorm(n , mean = rep(0 , p ), sigma = Sigma ), silent = TRUE )
98+ G <- try(rmvnorm(n , mean = rep(0 , p ), sigma = Sigma ), silent = TRUE )
9899 if (any(class(G ) == " try-error" )) {
99100 Sigma <- get_ld_pd(Sigma , lambda = lambda )
100- G <- mvtnorm :: rmvnorm(n , mean = rep(0 , p ), sigma = Sigma )
101+ G <- rmvnorm(n , mean = rep(0 , p ), sigma = Sigma )
101102 }
102103 return (G )
103104
104105}
105106
107+ # Convert continuous genotypes to discrete (binomial)
108+ # ' @keywords internal
106109binomialize_genotype <- function (G , p , maf ) {
107110
108111 sapply(1 : p , function (i ) {
@@ -116,9 +119,13 @@ binomialize_genotype <- function(G, p, maf) {
116119
117120}
118121
122+ # Process UK Biobank genotype file
123+ # ' @importFrom BEDMatrix BEDMatrix
124+ # ' @importFrom data.table fread
125+ # ' @keywords internal
119126process_ukb <- function (file.path , n , min_maf = 0.01 ) {
120127
121- G <- BEDMatrix :: BEDMatrix (file.path )
128+ G <- BEDMatrix(file.path )
122129 G <- data.matrix(G [1 : n ,])
123130 G <- apply(G , 2 , function (g ) {
124131 tmp <- which(is.na(g ))
@@ -127,7 +134,7 @@ process_ukb <- function(file.path, n, min_maf = 0.01) {
127134 })
128135 maf <- colMeans(G , na.rm = TRUE ) / 2
129136 snpname <- strsplit(file.path , split = " \\ .bed" )[[1 ]][1 ]
130- snp.names <- unlist(data.table :: fread(paste0(snpname , " .bim" ))[,2 ])
137+ snp.names <- unlist(fread(paste0(snpname , " .bim" ))[,2 ])
131138 colnames(G ) <- snp.names
132139 poss <- which(maf < = min_maf )
133140 if (length(poss ) != 0 ) {
@@ -138,6 +145,8 @@ process_ukb <- function(file.path, n, min_maf = 0.01) {
138145}
139146
140147
148+ # Generate SNP with specified LD
149+ # ' @keywords internal
141150generate_snp_LD <- function (g , ld , maf ) {
142151
143152 # - change genotype to haplotype
0 commit comments