xihaoli
diff --git a/‎DESCRIPTION‎
Lines changed: 4 additions & 4 deletions b/‎DESCRIPTION‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎NAMESPACE‎
Lines changed: 4 additions & 2 deletions b/‎NAMESPACE‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎R/AI_Individual_Analysis.R‎
Lines changed: 284 additions & 0 deletions b/‎R/AI_Individual_Analysis.R‎
Lines changed: 284 additions & 0 deletions
@@ -1,10 +1,10 @@
 Package: STAARpipeline
 Type: Package
 Title: STAARpipeline for Analyzing Whole-Genome/Whole-Exome Sequencing Data
-Version: 0.9.7.2
-Date: 2024-11-17
-Author: Xihao Li [aut, cre], Zilin Li [aut, cre], Sheila M. Gaynor [aut], Han Chen [aut]
-Maintainer: Xihao Li <xihaoli@unc.edu>, Zilin Li <lizl@nenu.edu.cn>
+Version: 0.9.8
+Date: 2025-02-07
+Author: Xihao Li [aut, cre], Zilin Li [aut, cre], Wenbo Wang [aut], Sheila M. Gaynor [aut], Han Chen [aut]
+Maintainer: Xihao Li <xihaoli@unc.edu>, Zilin Li <lizl@nenu.edu.cn>, Wenbo Wang <wenbo@live.unc.edu>
 Description: An R package for performing STAARpipeline in analyzing whole-genome/whole-exome sequencing data.
 License: GPL-3
 Copyright: See COPYRIGHTS for details.
 
@@ -1,6 +1,6 @@
 exportPattern("^[[:alpha:]]+")
 import(Rcpp, STAAR, MultiSTAAR, SCANG, dplyr, SeqArray, TxDb.Hsapiens.UCSC.hg38.knownGene, GMMAT, Matrix, methods)
-importFrom("stats", "pchisq", "lm", "model.matrix", "quantile", "rnorm", "binomial")
+importFrom("stats", "pchisq", "lm", "model.matrix", "quantile", "rnorm", "binomial", "dbeta")
 importFrom("SeqVarTools", "isSNV")
 importFrom("GenomicFeatures", "genes", "promoters")
 importFrom("GENESIS", "makeSparseMatrix")
@@ -11,7 +11,9 @@ useDynLib(STAARpipeline, .registration = TRUE)
 export(fit_nullmodel,
        genesis2staar_nullmodel,
        staar2scang_nullmodel,
-       Individual_Analysis,Individual_Analysis_cond,Individual_Analysis_cond_spa,
+       staar2aistaar_nullmodel,
+       Individual_Analysis,AI_Individual_Analysis,
+       Individual_Analysis_cond,Individual_Analysis_cond_spa,
        Gene_Centric_Coding,Gene_Centric_Coding_cond,Gene_Centric_Coding_cond_spa,
        Gene_Centric_Noncoding,Gene_Centric_Noncoding_cond,Gene_Centric_Noncoding_cond_spa,
        ncRNA,ncRNA_cond,ncRNA_cond_spa,
 
@@ -0,0 +1,284 @@
+#' Ancestry-informed individual-variant analysis using score test
+#'
+#' The \code{AI_Individual_Analysis} function takes in chromosome, an user-defined variant list,
+#' the object of opened annotated GDS file, and the object from fitting the null model to analyze the association between a
+#' quantitative/dichotomous phenotype and each individual variant by using score test.
+#' The results of the ancestry-informed analysis correspond to ensemble p-values across base tests,
+#' with the option to return a list of base weights and p-values for each base test.
+#' @param chr chromosome.
+#' @param individual_results the data frame of (significant) individual variants of interest for ancestry-informed analysis.
+#' The first 4 columns should correspond to chromosome (CHR), position (POS), reference allele (REF), and alternative allele (ALT).
+#' @param genofile an object of opened annotated GDS (aGDS) file.
+#' @param obj_nullmodel an object from fitting the null model, which is either the output from \code{\link{fit_nullmodel}} function with two or more specified ancestries in \code{pop.groups},
+#' or the output from \code{\link{fit_nullmodel}} function transformed using the \code{\link{staar2aistaar_nullmodel}} function.
+#' @param QC_label channel name of the QC label in the GDS/aGDS file (default = "annotation/filter").
+#' @param variant_type type of variant included in the analysis. Choices include "variant", "SNV", or "Indel" (default = "variant").
+#' @param geno_missing_imputation method of handling missing genotypes. Either "mean" or "minor" (default = "mean").
+#' @param find_weight logical: should the ancestry group-specific weights and weighting scenario-specific p-values for each base test be saved as output (default = FALSE).
+#' @return A data frame containing the score test p-value and the estimated effect size of the minor allele for each individual variant in the given genetic region.
+#' The first 4 columns correspond to chromosome (CHR), position (POS), reference allele (REF), and alternative allele (ALT).
+#' If \code{find_weight} is TRUE, returns a list containing the ancestry-informed score test p-values, estimated effect sizes with corresponding variant characteristics,
+#' as well as the ensemble weights under two sampling scenarios and p-values under scenarios 1, 2, and combined for each base test.
+#' @references Chen, H., et al. (2016). Control for population structure and relatedness for binary traits
+#' in genetic association studies via logistic mixed models. \emph{The American Journal of Human Genetics}, \emph{98}(4), 653-666.
+#' (\href{https://doi.org/10.1016/j.ajhg.2016.02.012}{pub})
+#' @references Li, Z., Li, X., et al. (2022). A framework for detecting
+#' noncoding rare-variant associations of large-scale whole-genome sequencing
+#' studies. \emph{Nature Methods}, \emph{19}(12), 1599-1611.
+#' (\href{https://doi.org/10.1038/s41592-022-01640-x}{pub})
+#' @export
+
+AI_Individual_Analysis <- function(chr,individual_results, genofile, obj_nullmodel, QC_label="annotation/filter",
+                                   variant_type=c("variant","SNV","Indel"),geno_missing_imputation=c("mean","minor"),
+                                   find_weight = TRUE){
+
+	## evaluate choices
+	variant_type <- match.arg(variant_type)
+	geno_missing_imputation <- match.arg(geno_missing_imputation)
+
+	individual_results_chr <- individual_results[individual_results$CHR == chr, c("CHR", "POS", "REF", "ALT")]
+
+	## Null Model
+	phenotype.id <- as.character(obj_nullmodel$id_include)
+	samplesize <- length(phenotype.id)
+
+	if(!is.null(obj_nullmodel$use_SPA))
+	{
+	  use_SPA <- obj_nullmodel$use_SPA
+	}else
+	{
+	  use_SPA <- FALSE
+	}
+	if(use_SPA)
+	{
+	  return(NULL)
+	}
+
+	## residuals and cov
+	residuals.phenotype <- as.vector(obj_nullmodel$scaled.residuals)
+	### dense GRM
+	if(!obj_nullmodel$sparse_kins)
+	{
+		P <- obj_nullmodel$P
+	}
+
+	### sparse GRM
+	if(obj_nullmodel$sparse_kins)
+	{
+		Sigma_i <- obj_nullmodel$Sigma_i
+		Sigma_iX <- as.matrix(obj_nullmodel$Sigma_iX)
+		cov <- obj_nullmodel$cov
+	}
+
+	####### Obtain Genotype Information from Genofiles #######
+	genotype <- char <- c()
+
+	## get SNV id
+	filter <- seqGetData(genofile, QC_label)
+	if(variant_type=="variant")
+	{
+	  SNVlist <- filter == "PASS"
+	}
+
+	if(variant_type=="SNV")
+	{
+	  SNVlist <- (filter == "PASS") & isSNV(genofile)
+	}
+
+	if(variant_type=="Indel")
+	{
+	  SNVlist <- (filter == "PASS") & (!isSNV(genofile))
+	}
+
+	variant.id <- seqGetData(genofile, "variant.id")
+	is.in <- SNVlist
+	SNV.id <- variant.id[is.in]
+
+	seqSetFilter(genofile,variant.id=SNV.id,sample.id=phenotype.id)
+	position <- as.numeric(seqGetData(genofile, "position"))
+
+	#further subset by position, which may not be unique - uniqueness further identified in matching step
+	is.in <- position %in% individual_results_chr$POS
+
+	seqSetFilter(genofile,variant.id=SNV.id[which(is.in)],sample.id=phenotype.id)
+	CHR <- as.numeric(seqGetData(genofile, "chromosome"))
+	POS <- as.numeric(seqGetData(genofile, "position"))
+	REF <- as.character(seqGetData(genofile, "$ref"))
+	ALT <- as.character(seqGetData(genofile, "$alt"))
+	N <- rep(samplesize,length(CHR))
+
+	## all variant identifying information from genofile
+	ref_group <- data.frame(CHR=CHR,POS=POS,REF=REF,ALT=ALT)
+	ref_group$id <- rownames(ref_group)
+
+	## match variant information in provided data to those in genofile
+	individual_results_chr <- dplyr::inner_join(individual_results_chr, ref_group, by = c("CHR", "POS", "REF", "ALT"))
+
+	id.genotype <- seqGetData(genofile,"sample.id")
+
+	id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
+	phenotype.id.merge <- data.frame(phenotype.id)
+	phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
+	id.genotype.match <- phenotype.id.merge$index
+
+	Geno <- seqGetData(genofile, "$dosage")
+	Geno <- Geno[id.genotype.match,,drop=FALSE]
+
+	if(geno_missing_imputation=="mean")
+	{
+		Geno <- matrix_flip_mean(Geno)
+	}
+	if(geno_missing_imputation=="minor")
+	{
+		Geno <- matrix_flip_minor(Geno)
+	}
+
+	## subset variants of interest on indices unique to variants in genofile
+	index <- as.numeric(individual_results_chr$id)
+	Geno_chr <- Geno$Geno[,index]
+
+	CHR_chr <- CHR[index]
+	position_chr <- POS[index]
+	REF_chr <- REF[index]
+	ALT_chr <- ALT[index]
+	N_chr <- N[index]
+	Geno_chr <- as.matrix(Geno_chr,ncol=1)
+
+	genotype <- cbind(genotype, Geno_chr)
+	char <- rbind(char, cbind(CHR_chr, position_chr, REF_chr, ALT_chr, N_chr))
+
+	####### AI-Individual Analysis #######
+	genotype_ref <- genotype
+	B <- dim(obj_nullmodel$pop_weights_1_1)[2]
+
+	n_pop <- length(unique(obj_nullmodel$pop.groups))
+	pop <- obj_nullmodel$pop.groups
+	indices <- list()
+	a_p <- matrix(0, nrow = ncol(genotype), ncol = n_pop)
+
+	for(i in 1:n_pop)
+	{
+		eth <-  unique(pop)[i]
+		indices[[i]] <- which(pop %in% eth)
+		a_p[,i] <- apply(as.matrix(genotype[indices[[i]],]), 2, function(x){min(mean(x)/2, 1-mean(x)/2)})
+	}
+
+	a_p <- ifelse(a_p > 0, dbeta(a_p,1,25), a_p)
+
+	w_b_1 <- w_b_2 <- matrix(0, nrow = ncol(genotype), ncol = n_pop)
+	weight_all_1 <- weight_all_2 <- array(NA, dim = c(n_pop,B,ncol(genotype)))
+	pvalues_1_all <- pvalues_2_all <- pvalues_12_all <- c()
+	pvalues_aggregate_all <- pvalues_aggregate_s1_all <- pvalues_aggregate_s2_all <- rep(NA, ncol(genotype))
+
+	for(g in 1:ncol(genotype))
+	{
+		pvalues_1_tot <- pvalues_2_tot <- matrix(NA,nrow = ncol(genotype), ncol = B)
+		genotype_1_all <- genotype_2_all <- vector("list", B)
+
+		for(b in 1:B)
+		{
+			w_b_1 <- matrix(rep(obj_nullmodel$pop_weights_1_1[,b], ncol(genotype)),nrow = ncol(genotype),
+							ncol = n_pop, byrow = TRUE)[g,]
+			w_b_2 <- (a_p%*%diag(obj_nullmodel$pop_weights_1_25[,b]))[g,]
+
+			if(find_weight == T)
+			{
+				weight_all_1[,b,g] <- w_b_1
+				weight_all_2[,b,g] <- w_b_2
+			}
+
+			genotype_1 <- genotype_2 <- matrix(0, nrow = nrow(genotype), ncol = 1)
+
+			for(i in 1:n_pop)
+			{
+				eth <- unique(pop)[i]
+				eth_wt_1 <- w_b_1[i]
+				eth_wt_2 <- w_b_2[i]
+				genotype_1[indices[[i]],] <- t(t(genotype[indices[[i]],g])*as.vector(eth_wt_1))
+				genotype_2[indices[[i]],] <- t(t(genotype[indices[[i]],g])*as.vector(eth_wt_2))
+			}
+
+			genotype_1_all[[b]] <- genotype_1
+			genotype_2_all[[b]] <- genotype_2
+		}
+
+		genotype_1_all <- do.call(cbind, genotype_1_all)
+		genotype_2_all <- do.call(cbind, genotype_2_all)
+
+		if(obj_nullmodel$sparse_kins)
+		{
+			Score_test1 <- Individual_Score_Test(genotype_1_all, Sigma_i, Sigma_iX, cov, residuals.phenotype)
+			Score_test2 <- Individual_Score_Test(genotype_2_all, Sigma_i, Sigma_iX, cov, residuals.phenotype)
+		}
+		else if(!obj_nullmodel$sparse_kins)
+		{
+			Score_test1 <- Individual_Score_Test_denseGRM(genotype_1_all, P, residuals.phenotype)
+			Score_test2 <- Individual_Score_Test_denseGRM(genotype_2_all, P, residuals.phenotype)
+		}
+
+		pvalues_1_tot <- t(exp(-Score_test1$pvalue_log))
+		pvalues_2_tot <- t(exp(-Score_test2$pvalue_log))
+
+		obj_1 <- matrix(pvalues_1_tot,ncol = 1, nrow = B, byrow = TRUE)
+		obj_2 <- matrix(pvalues_2_tot,ncol = 1, nrow = B, byrow = TRUE)
+
+		obj_1 <- t(obj_1)
+		obj_2 <- t(obj_2)
+
+		pvalues_tot <- cbind(obj_1,obj_2)
+		pvalues_aggregate <- apply(pvalues_tot,1,function(x){CCT(x)})
+
+		if(find_weight == T)
+		{
+			pvalues_aggregate_s1 <- apply(obj_1,1,function(x){CCT(x)})
+			pvalues_aggregate_s2 <- apply(obj_2,1,function(x){CCT(x)})
+			pvalues_12_weight <- NULL
+
+			for(i in 1:B)
+			{
+				pvalues_12_weight <-  cbind(pvalues_12_weight,CCT(pvalues_tot[c(i,i+B)]))
+			}
+
+			pvalues_1_all <- rbind(pvalues_1_all,t(exp(-Score_test1$pvalue_log)))
+			pvalues_2_all <- rbind(pvalues_2_all,t(exp(-Score_test2$pvalue_log)))
+			pvalues_aggregate_s1_all[g] <- pvalues_aggregate_s1
+			pvalues_aggregate_s2_all[g] <- pvalues_aggregate_s2
+			pvalues_12_all <- rbind(pvalues_12_all, pvalues_12_weight)
+		}
+
+		pvalues_aggregate_all[g] <- pvalues_aggregate
+	}
+
+	if(find_weight == TRUE)
+	{
+		dimnames(weight_all_1)[[1]] <- dimnames(weight_all_2)[[1]] <- unique(obj_nullmodel$pop.groups)
+		dimnames(weight_all_1)[[2]] <- dimnames(weight_all_2)[[2]] <- seq(0,c(B-1))
+		dimnames(weight_all_1)[[3]] <- dimnames(weight_all_2)[[3]] <- paste0(individual_results_chr$CHR, "_",
+		                                                                     individual_results_chr$POS, "_",
+		                                                                     individual_results_chr$REF,"_",
+		                                                                     individual_results_chr$ALT)
+
+		rownames(pvalues_1_all) <- rownames(pvalues_2_all) <- rownames(pvalues_12_all) <- paste0(individual_results_chr$CHR, "_",
+		                                                                                         individual_results_chr$POS, "_",
+		                                                                                         individual_results_chr$REF,"_",
+		                                                                                         individual_results_chr$ALT)
+		colnames(pvalues_1_all) <- colnames(pvalues_2_all) <- colnames(pvalues_12_all) <- seq(0,c(B-1))
+	}
+
+	if(find_weight == TRUE)
+	{
+		results <- list(data.frame(CHR=CHR_chr,POS=position_chr,REF=REF_chr,ALT=ALT_chr,N=N_chr,
+		                           pvalue=pvalues_aggregate_all,pvalue_s1=pvalues_aggregate_s1_all,pvalue_s2=pvalues_aggregate_s2_all,
+		                           pvalue_log10=-log10(pvalues_aggregate_all)),
+		                weight_all_1=weight_all_1, weight_all_2=weight_all_2, results_weight=pvalues_12_all,
+		                results_weight1=pvalues_1_all, results_weight2=pvalues_2_all)
+	}else{
+		results <- data.frame(CHR=CHR_chr,POS=position_chr,REF=REF_chr,ALT=ALT_chr,N=N_chr,
+		                      pvalue=pvalues_aggregate_all,pvalue_log10=-log10(pvalues_aggregate_all))
+	}
+
+	seqResetFilter(genofile)
+
+	return(results)
+
+}