|
| 1 | +#' Ancestry-informed individual-variant analysis using score test |
| 2 | +#' |
| 3 | +#' The \code{AI_Individual_Analysis} function takes in chromosome, an user-defined variant list, |
| 4 | +#' the object of opened annotated GDS file, and the object from fitting the null model to analyze the association between a |
| 5 | +#' quantitative/dichotomous phenotype and each individual variant by using score test. |
| 6 | +#' The results of the ancestry-informed analysis correspond to ensemble p-values across base tests, |
| 7 | +#' with the option to return a list of base weights and p-values for each base test. |
| 8 | +#' @param chr chromosome. |
| 9 | +#' @param individual_results the data frame of (significant) individual variants of interest for ancestry-informed analysis. |
| 10 | +#' The first 4 columns should correspond to chromosome (CHR), position (POS), reference allele (REF), and alternative allele (ALT). |
| 11 | +#' @param genofile an object of opened annotated GDS (aGDS) file. |
| 12 | +#' @param obj_nullmodel an object from fitting the null model, which is either the output from \code{\link{fit_nullmodel}} function with two or more specified ancestries in \code{pop.groups}, |
| 13 | +#' or the output from \code{\link{fit_nullmodel}} function transformed using the \code{\link{staar2aistaar_nullmodel}} function. |
| 14 | +#' @param QC_label channel name of the QC label in the GDS/aGDS file (default = "annotation/filter"). |
| 15 | +#' @param variant_type type of variant included in the analysis. Choices include "variant", "SNV", or "Indel" (default = "variant"). |
| 16 | +#' @param geno_missing_imputation method of handling missing genotypes. Either "mean" or "minor" (default = "mean"). |
| 17 | +#' @param find_weight logical: should the ancestry group-specific weights and weighting scenario-specific p-values for each base test be saved as output (default = FALSE). |
| 18 | +#' @return A data frame containing the score test p-value and the estimated effect size of the minor allele for each individual variant in the given genetic region. |
| 19 | +#' The first 4 columns correspond to chromosome (CHR), position (POS), reference allele (REF), and alternative allele (ALT). |
| 20 | +#' If \code{find_weight} is TRUE, returns a list containing the ancestry-informed score test p-values, estimated effect sizes with corresponding variant characteristics, |
| 21 | +#' as well as the ensemble weights under two sampling scenarios and p-values under scenarios 1, 2, and combined for each base test. |
| 22 | +#' @references Chen, H., et al. (2016). Control for population structure and relatedness for binary traits |
| 23 | +#' in genetic association studies via logistic mixed models. \emph{The American Journal of Human Genetics}, \emph{98}(4), 653-666. |
| 24 | +#' (\href{https://doi.org/10.1016/j.ajhg.2016.02.012}{pub}) |
| 25 | +#' @references Li, Z., Li, X., et al. (2022). A framework for detecting |
| 26 | +#' noncoding rare-variant associations of large-scale whole-genome sequencing |
| 27 | +#' studies. \emph{Nature Methods}, \emph{19}(12), 1599-1611. |
| 28 | +#' (\href{https://doi.org/10.1038/s41592-022-01640-x}{pub}) |
| 29 | +#' @export |
| 30 | + |
| 31 | +AI_Individual_Analysis <- function(chr,individual_results, genofile, obj_nullmodel, QC_label="annotation/filter", |
| 32 | + variant_type=c("variant","SNV","Indel"),geno_missing_imputation=c("mean","minor"), |
| 33 | + find_weight = TRUE){ |
| 34 | + |
| 35 | + ## evaluate choices |
| 36 | + variant_type <- match.arg(variant_type) |
| 37 | + geno_missing_imputation <- match.arg(geno_missing_imputation) |
| 38 | + |
| 39 | + individual_results_chr <- individual_results[individual_results$CHR == chr, c("CHR", "POS", "REF", "ALT")] |
| 40 | + |
| 41 | + ## Null Model |
| 42 | + phenotype.id <- as.character(obj_nullmodel$id_include) |
| 43 | + samplesize <- length(phenotype.id) |
| 44 | + |
| 45 | + if(!is.null(obj_nullmodel$use_SPA)) |
| 46 | + { |
| 47 | + use_SPA <- obj_nullmodel$use_SPA |
| 48 | + }else |
| 49 | + { |
| 50 | + use_SPA <- FALSE |
| 51 | + } |
| 52 | + if(use_SPA) |
| 53 | + { |
| 54 | + return(NULL) |
| 55 | + } |
| 56 | + |
| 57 | + ## residuals and cov |
| 58 | + residuals.phenotype <- as.vector(obj_nullmodel$scaled.residuals) |
| 59 | + ### dense GRM |
| 60 | + if(!obj_nullmodel$sparse_kins) |
| 61 | + { |
| 62 | + P <- obj_nullmodel$P |
| 63 | + } |
| 64 | + |
| 65 | + ### sparse GRM |
| 66 | + if(obj_nullmodel$sparse_kins) |
| 67 | + { |
| 68 | + Sigma_i <- obj_nullmodel$Sigma_i |
| 69 | + Sigma_iX <- as.matrix(obj_nullmodel$Sigma_iX) |
| 70 | + cov <- obj_nullmodel$cov |
| 71 | + } |
| 72 | + |
| 73 | + ####### Obtain Genotype Information from Genofiles ####### |
| 74 | + genotype <- char <- c() |
| 75 | + |
| 76 | + ## get SNV id |
| 77 | + filter <- seqGetData(genofile, QC_label) |
| 78 | + if(variant_type=="variant") |
| 79 | + { |
| 80 | + SNVlist <- filter == "PASS" |
| 81 | + } |
| 82 | + |
| 83 | + if(variant_type=="SNV") |
| 84 | + { |
| 85 | + SNVlist <- (filter == "PASS") & isSNV(genofile) |
| 86 | + } |
| 87 | + |
| 88 | + if(variant_type=="Indel") |
| 89 | + { |
| 90 | + SNVlist <- (filter == "PASS") & (!isSNV(genofile)) |
| 91 | + } |
| 92 | + |
| 93 | + variant.id <- seqGetData(genofile, "variant.id") |
| 94 | + is.in <- SNVlist |
| 95 | + SNV.id <- variant.id[is.in] |
| 96 | + |
| 97 | + seqSetFilter(genofile,variant.id=SNV.id,sample.id=phenotype.id) |
| 98 | + position <- as.numeric(seqGetData(genofile, "position")) |
| 99 | + |
| 100 | + #further subset by position, which may not be unique - uniqueness further identified in matching step |
| 101 | + is.in <- position %in% individual_results_chr$POS |
| 102 | + |
| 103 | + seqSetFilter(genofile,variant.id=SNV.id[which(is.in)],sample.id=phenotype.id) |
| 104 | + CHR <- as.numeric(seqGetData(genofile, "chromosome")) |
| 105 | + POS <- as.numeric(seqGetData(genofile, "position")) |
| 106 | + REF <- as.character(seqGetData(genofile, "$ref")) |
| 107 | + ALT <- as.character(seqGetData(genofile, "$alt")) |
| 108 | + N <- rep(samplesize,length(CHR)) |
| 109 | + |
| 110 | + ## all variant identifying information from genofile |
| 111 | + ref_group <- data.frame(CHR=CHR,POS=POS,REF=REF,ALT=ALT) |
| 112 | + ref_group$id <- rownames(ref_group) |
| 113 | + |
| 114 | + ## match variant information in provided data to those in genofile |
| 115 | + individual_results_chr <- dplyr::inner_join(individual_results_chr, ref_group, by = c("CHR", "POS", "REF", "ALT")) |
| 116 | + |
| 117 | + id.genotype <- seqGetData(genofile,"sample.id") |
| 118 | + |
| 119 | + id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype))) |
| 120 | + phenotype.id.merge <- data.frame(phenotype.id) |
| 121 | + phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype")) |
| 122 | + id.genotype.match <- phenotype.id.merge$index |
| 123 | + |
| 124 | + Geno <- seqGetData(genofile, "$dosage") |
| 125 | + Geno <- Geno[id.genotype.match,,drop=FALSE] |
| 126 | + |
| 127 | + if(geno_missing_imputation=="mean") |
| 128 | + { |
| 129 | + Geno <- matrix_flip_mean(Geno) |
| 130 | + } |
| 131 | + if(geno_missing_imputation=="minor") |
| 132 | + { |
| 133 | + Geno <- matrix_flip_minor(Geno) |
| 134 | + } |
| 135 | + |
| 136 | + ## subset variants of interest on indices unique to variants in genofile |
| 137 | + index <- as.numeric(individual_results_chr$id) |
| 138 | + Geno_chr <- Geno$Geno[,index] |
| 139 | + |
| 140 | + CHR_chr <- CHR[index] |
| 141 | + position_chr <- POS[index] |
| 142 | + REF_chr <- REF[index] |
| 143 | + ALT_chr <- ALT[index] |
| 144 | + N_chr <- N[index] |
| 145 | + Geno_chr <- as.matrix(Geno_chr,ncol=1) |
| 146 | + |
| 147 | + genotype <- cbind(genotype, Geno_chr) |
| 148 | + char <- rbind(char, cbind(CHR_chr, position_chr, REF_chr, ALT_chr, N_chr)) |
| 149 | + |
| 150 | + ####### AI-Individual Analysis ####### |
| 151 | + genotype_ref <- genotype |
| 152 | + B <- dim(obj_nullmodel$pop_weights_1_1)[2] |
| 153 | + |
| 154 | + n_pop <- length(unique(obj_nullmodel$pop.groups)) |
| 155 | + pop <- obj_nullmodel$pop.groups |
| 156 | + indices <- list() |
| 157 | + a_p <- matrix(0, nrow = ncol(genotype), ncol = n_pop) |
| 158 | + |
| 159 | + for(i in 1:n_pop) |
| 160 | + { |
| 161 | + eth <- unique(pop)[i] |
| 162 | + indices[[i]] <- which(pop %in% eth) |
| 163 | + a_p[,i] <- apply(as.matrix(genotype[indices[[i]],]), 2, function(x){min(mean(x)/2, 1-mean(x)/2)}) |
| 164 | + } |
| 165 | + |
| 166 | + a_p <- ifelse(a_p > 0, dbeta(a_p,1,25), a_p) |
| 167 | + |
| 168 | + w_b_1 <- w_b_2 <- matrix(0, nrow = ncol(genotype), ncol = n_pop) |
| 169 | + weight_all_1 <- weight_all_2 <- array(NA, dim = c(n_pop,B,ncol(genotype))) |
| 170 | + pvalues_1_all <- pvalues_2_all <- pvalues_12_all <- c() |
| 171 | + pvalues_aggregate_all <- pvalues_aggregate_s1_all <- pvalues_aggregate_s2_all <- rep(NA, ncol(genotype)) |
| 172 | + |
| 173 | + for(g in 1:ncol(genotype)) |
| 174 | + { |
| 175 | + pvalues_1_tot <- pvalues_2_tot <- matrix(NA,nrow = ncol(genotype), ncol = B) |
| 176 | + genotype_1_all <- genotype_2_all <- vector("list", B) |
| 177 | + |
| 178 | + for(b in 1:B) |
| 179 | + { |
| 180 | + w_b_1 <- matrix(rep(obj_nullmodel$pop_weights_1_1[,b], ncol(genotype)),nrow = ncol(genotype), |
| 181 | + ncol = n_pop, byrow = TRUE)[g,] |
| 182 | + w_b_2 <- (a_p%*%diag(obj_nullmodel$pop_weights_1_25[,b]))[g,] |
| 183 | + |
| 184 | + if(find_weight == T) |
| 185 | + { |
| 186 | + weight_all_1[,b,g] <- w_b_1 |
| 187 | + weight_all_2[,b,g] <- w_b_2 |
| 188 | + } |
| 189 | + |
| 190 | + genotype_1 <- genotype_2 <- matrix(0, nrow = nrow(genotype), ncol = 1) |
| 191 | + |
| 192 | + for(i in 1:n_pop) |
| 193 | + { |
| 194 | + eth <- unique(pop)[i] |
| 195 | + eth_wt_1 <- w_b_1[i] |
| 196 | + eth_wt_2 <- w_b_2[i] |
| 197 | + genotype_1[indices[[i]],] <- t(t(genotype[indices[[i]],g])*as.vector(eth_wt_1)) |
| 198 | + genotype_2[indices[[i]],] <- t(t(genotype[indices[[i]],g])*as.vector(eth_wt_2)) |
| 199 | + } |
| 200 | + |
| 201 | + genotype_1_all[[b]] <- genotype_1 |
| 202 | + genotype_2_all[[b]] <- genotype_2 |
| 203 | + } |
| 204 | + |
| 205 | + genotype_1_all <- do.call(cbind, genotype_1_all) |
| 206 | + genotype_2_all <- do.call(cbind, genotype_2_all) |
| 207 | + |
| 208 | + if(obj_nullmodel$sparse_kins) |
| 209 | + { |
| 210 | + Score_test1 <- Individual_Score_Test(genotype_1_all, Sigma_i, Sigma_iX, cov, residuals.phenotype) |
| 211 | + Score_test2 <- Individual_Score_Test(genotype_2_all, Sigma_i, Sigma_iX, cov, residuals.phenotype) |
| 212 | + } |
| 213 | + else if(!obj_nullmodel$sparse_kins) |
| 214 | + { |
| 215 | + Score_test1 <- Individual_Score_Test_denseGRM(genotype_1_all, P, residuals.phenotype) |
| 216 | + Score_test2 <- Individual_Score_Test_denseGRM(genotype_2_all, P, residuals.phenotype) |
| 217 | + } |
| 218 | + |
| 219 | + pvalues_1_tot <- t(exp(-Score_test1$pvalue_log)) |
| 220 | + pvalues_2_tot <- t(exp(-Score_test2$pvalue_log)) |
| 221 | + |
| 222 | + obj_1 <- matrix(pvalues_1_tot,ncol = 1, nrow = B, byrow = TRUE) |
| 223 | + obj_2 <- matrix(pvalues_2_tot,ncol = 1, nrow = B, byrow = TRUE) |
| 224 | + |
| 225 | + obj_1 <- t(obj_1) |
| 226 | + obj_2 <- t(obj_2) |
| 227 | + |
| 228 | + pvalues_tot <- cbind(obj_1,obj_2) |
| 229 | + pvalues_aggregate <- apply(pvalues_tot,1,function(x){CCT(x)}) |
| 230 | + |
| 231 | + if(find_weight == T) |
| 232 | + { |
| 233 | + pvalues_aggregate_s1 <- apply(obj_1,1,function(x){CCT(x)}) |
| 234 | + pvalues_aggregate_s2 <- apply(obj_2,1,function(x){CCT(x)}) |
| 235 | + pvalues_12_weight <- NULL |
| 236 | + |
| 237 | + for(i in 1:B) |
| 238 | + { |
| 239 | + pvalues_12_weight <- cbind(pvalues_12_weight,CCT(pvalues_tot[c(i,i+B)])) |
| 240 | + } |
| 241 | + |
| 242 | + pvalues_1_all <- rbind(pvalues_1_all,t(exp(-Score_test1$pvalue_log))) |
| 243 | + pvalues_2_all <- rbind(pvalues_2_all,t(exp(-Score_test2$pvalue_log))) |
| 244 | + pvalues_aggregate_s1_all[g] <- pvalues_aggregate_s1 |
| 245 | + pvalues_aggregate_s2_all[g] <- pvalues_aggregate_s2 |
| 246 | + pvalues_12_all <- rbind(pvalues_12_all, pvalues_12_weight) |
| 247 | + } |
| 248 | + |
| 249 | + pvalues_aggregate_all[g] <- pvalues_aggregate |
| 250 | + } |
| 251 | + |
| 252 | + if(find_weight == TRUE) |
| 253 | + { |
| 254 | + dimnames(weight_all_1)[[1]] <- dimnames(weight_all_2)[[1]] <- unique(obj_nullmodel$pop.groups) |
| 255 | + dimnames(weight_all_1)[[2]] <- dimnames(weight_all_2)[[2]] <- seq(0,c(B-1)) |
| 256 | + dimnames(weight_all_1)[[3]] <- dimnames(weight_all_2)[[3]] <- paste0(individual_results_chr$CHR, "_", |
| 257 | + individual_results_chr$POS, "_", |
| 258 | + individual_results_chr$REF,"_", |
| 259 | + individual_results_chr$ALT) |
| 260 | + |
| 261 | + rownames(pvalues_1_all) <- rownames(pvalues_2_all) <- rownames(pvalues_12_all) <- paste0(individual_results_chr$CHR, "_", |
| 262 | + individual_results_chr$POS, "_", |
| 263 | + individual_results_chr$REF,"_", |
| 264 | + individual_results_chr$ALT) |
| 265 | + colnames(pvalues_1_all) <- colnames(pvalues_2_all) <- colnames(pvalues_12_all) <- seq(0,c(B-1)) |
| 266 | + } |
| 267 | + |
| 268 | + if(find_weight == TRUE) |
| 269 | + { |
| 270 | + results <- list(data.frame(CHR=CHR_chr,POS=position_chr,REF=REF_chr,ALT=ALT_chr,N=N_chr, |
| 271 | + pvalue=pvalues_aggregate_all,pvalue_s1=pvalues_aggregate_s1_all,pvalue_s2=pvalues_aggregate_s2_all, |
| 272 | + pvalue_log10=-log10(pvalues_aggregate_all)), |
| 273 | + weight_all_1=weight_all_1, weight_all_2=weight_all_2, results_weight=pvalues_12_all, |
| 274 | + results_weight1=pvalues_1_all, results_weight2=pvalues_2_all) |
| 275 | + }else{ |
| 276 | + results <- data.frame(CHR=CHR_chr,POS=position_chr,REF=REF_chr,ALT=ALT_chr,N=N_chr, |
| 277 | + pvalue=pvalues_aggregate_all,pvalue_log10=-log10(pvalues_aggregate_all)) |
| 278 | + } |
| 279 | + |
| 280 | + seqResetFilter(genofile) |
| 281 | + |
| 282 | + return(results) |
| 283 | + |
| 284 | +} |
0 commit comments