Skip to content

Commit 4a838d9

Browse files
committed
STAARpipeline v0.9.8
STAARpipeline v0.9.8
1 parent e2b6ec2 commit 4a838d9

31 files changed

+1848
-177
lines changed

DESCRIPTION

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
Package: STAARpipeline
22
Type: Package
33
Title: STAARpipeline for Analyzing Whole-Genome/Whole-Exome Sequencing Data
4-
Version: 0.9.7.2
5-
Date: 2024-11-17
6-
Author: Xihao Li [aut, cre], Zilin Li [aut, cre], Sheila M. Gaynor [aut], Han Chen [aut]
7-
Maintainer: Xihao Li <xihaoli@unc.edu>, Zilin Li <lizl@nenu.edu.cn>
4+
Version: 0.9.8
5+
Date: 2025-02-07
6+
Author: Xihao Li [aut, cre], Zilin Li [aut, cre], Wenbo Wang [aut], Sheila M. Gaynor [aut], Han Chen [aut]
7+
Maintainer: Xihao Li <xihaoli@unc.edu>, Zilin Li <lizl@nenu.edu.cn>, Wenbo Wang <wenbo@live.unc.edu>
88
Description: An R package for performing STAARpipeline in analyzing whole-genome/whole-exome sequencing data.
99
License: GPL-3
1010
Copyright: See COPYRIGHTS for details.

NAMESPACE

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
exportPattern("^[[:alpha:]]+")
22
import(Rcpp, STAAR, MultiSTAAR, SCANG, dplyr, SeqArray, TxDb.Hsapiens.UCSC.hg38.knownGene, GMMAT, Matrix, methods)
3-
importFrom("stats", "pchisq", "lm", "model.matrix", "quantile", "rnorm", "binomial")
3+
importFrom("stats", "pchisq", "lm", "model.matrix", "quantile", "rnorm", "binomial", "dbeta")
44
importFrom("SeqVarTools", "isSNV")
55
importFrom("GenomicFeatures", "genes", "promoters")
66
importFrom("GENESIS", "makeSparseMatrix")
@@ -11,7 +11,9 @@ useDynLib(STAARpipeline, .registration = TRUE)
1111
export(fit_nullmodel,
1212
genesis2staar_nullmodel,
1313
staar2scang_nullmodel,
14-
Individual_Analysis,Individual_Analysis_cond,Individual_Analysis_cond_spa,
14+
staar2aistaar_nullmodel,
15+
Individual_Analysis,AI_Individual_Analysis,
16+
Individual_Analysis_cond,Individual_Analysis_cond_spa,
1517
Gene_Centric_Coding,Gene_Centric_Coding_cond,Gene_Centric_Coding_cond_spa,
1618
Gene_Centric_Noncoding,Gene_Centric_Noncoding_cond,Gene_Centric_Noncoding_cond_spa,
1719
ncRNA,ncRNA_cond,ncRNA_cond_spa,

R/AI_Individual_Analysis.R

Lines changed: 284 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,284 @@
1+
#' Ancestry-informed individual-variant analysis using score test
2+
#'
3+
#' The \code{AI_Individual_Analysis} function takes in chromosome, an user-defined variant list,
4+
#' the object of opened annotated GDS file, and the object from fitting the null model to analyze the association between a
5+
#' quantitative/dichotomous phenotype and each individual variant by using score test.
6+
#' The results of the ancestry-informed analysis correspond to ensemble p-values across base tests,
7+
#' with the option to return a list of base weights and p-values for each base test.
8+
#' @param chr chromosome.
9+
#' @param individual_results the data frame of (significant) individual variants of interest for ancestry-informed analysis.
10+
#' The first 4 columns should correspond to chromosome (CHR), position (POS), reference allele (REF), and alternative allele (ALT).
11+
#' @param genofile an object of opened annotated GDS (aGDS) file.
12+
#' @param obj_nullmodel an object from fitting the null model, which is either the output from \code{\link{fit_nullmodel}} function with two or more specified ancestries in \code{pop.groups},
13+
#' or the output from \code{\link{fit_nullmodel}} function transformed using the \code{\link{staar2aistaar_nullmodel}} function.
14+
#' @param QC_label channel name of the QC label in the GDS/aGDS file (default = "annotation/filter").
15+
#' @param variant_type type of variant included in the analysis. Choices include "variant", "SNV", or "Indel" (default = "variant").
16+
#' @param geno_missing_imputation method of handling missing genotypes. Either "mean" or "minor" (default = "mean").
17+
#' @param find_weight logical: should the ancestry group-specific weights and weighting scenario-specific p-values for each base test be saved as output (default = FALSE).
18+
#' @return A data frame containing the score test p-value and the estimated effect size of the minor allele for each individual variant in the given genetic region.
19+
#' The first 4 columns correspond to chromosome (CHR), position (POS), reference allele (REF), and alternative allele (ALT).
20+
#' If \code{find_weight} is TRUE, returns a list containing the ancestry-informed score test p-values, estimated effect sizes with corresponding variant characteristics,
21+
#' as well as the ensemble weights under two sampling scenarios and p-values under scenarios 1, 2, and combined for each base test.
22+
#' @references Chen, H., et al. (2016). Control for population structure and relatedness for binary traits
23+
#' in genetic association studies via logistic mixed models. \emph{The American Journal of Human Genetics}, \emph{98}(4), 653-666.
24+
#' (\href{https://doi.org/10.1016/j.ajhg.2016.02.012}{pub})
25+
#' @references Li, Z., Li, X., et al. (2022). A framework for detecting
26+
#' noncoding rare-variant associations of large-scale whole-genome sequencing
27+
#' studies. \emph{Nature Methods}, \emph{19}(12), 1599-1611.
28+
#' (\href{https://doi.org/10.1038/s41592-022-01640-x}{pub})
29+
#' @export
30+
31+
AI_Individual_Analysis <- function(chr,individual_results, genofile, obj_nullmodel, QC_label="annotation/filter",
32+
variant_type=c("variant","SNV","Indel"),geno_missing_imputation=c("mean","minor"),
33+
find_weight = TRUE){
34+
35+
## evaluate choices
36+
variant_type <- match.arg(variant_type)
37+
geno_missing_imputation <- match.arg(geno_missing_imputation)
38+
39+
individual_results_chr <- individual_results[individual_results$CHR == chr, c("CHR", "POS", "REF", "ALT")]
40+
41+
## Null Model
42+
phenotype.id <- as.character(obj_nullmodel$id_include)
43+
samplesize <- length(phenotype.id)
44+
45+
if(!is.null(obj_nullmodel$use_SPA))
46+
{
47+
use_SPA <- obj_nullmodel$use_SPA
48+
}else
49+
{
50+
use_SPA <- FALSE
51+
}
52+
if(use_SPA)
53+
{
54+
return(NULL)
55+
}
56+
57+
## residuals and cov
58+
residuals.phenotype <- as.vector(obj_nullmodel$scaled.residuals)
59+
### dense GRM
60+
if(!obj_nullmodel$sparse_kins)
61+
{
62+
P <- obj_nullmodel$P
63+
}
64+
65+
### sparse GRM
66+
if(obj_nullmodel$sparse_kins)
67+
{
68+
Sigma_i <- obj_nullmodel$Sigma_i
69+
Sigma_iX <- as.matrix(obj_nullmodel$Sigma_iX)
70+
cov <- obj_nullmodel$cov
71+
}
72+
73+
####### Obtain Genotype Information from Genofiles #######
74+
genotype <- char <- c()
75+
76+
## get SNV id
77+
filter <- seqGetData(genofile, QC_label)
78+
if(variant_type=="variant")
79+
{
80+
SNVlist <- filter == "PASS"
81+
}
82+
83+
if(variant_type=="SNV")
84+
{
85+
SNVlist <- (filter == "PASS") & isSNV(genofile)
86+
}
87+
88+
if(variant_type=="Indel")
89+
{
90+
SNVlist <- (filter == "PASS") & (!isSNV(genofile))
91+
}
92+
93+
variant.id <- seqGetData(genofile, "variant.id")
94+
is.in <- SNVlist
95+
SNV.id <- variant.id[is.in]
96+
97+
seqSetFilter(genofile,variant.id=SNV.id,sample.id=phenotype.id)
98+
position <- as.numeric(seqGetData(genofile, "position"))
99+
100+
#further subset by position, which may not be unique - uniqueness further identified in matching step
101+
is.in <- position %in% individual_results_chr$POS
102+
103+
seqSetFilter(genofile,variant.id=SNV.id[which(is.in)],sample.id=phenotype.id)
104+
CHR <- as.numeric(seqGetData(genofile, "chromosome"))
105+
POS <- as.numeric(seqGetData(genofile, "position"))
106+
REF <- as.character(seqGetData(genofile, "$ref"))
107+
ALT <- as.character(seqGetData(genofile, "$alt"))
108+
N <- rep(samplesize,length(CHR))
109+
110+
## all variant identifying information from genofile
111+
ref_group <- data.frame(CHR=CHR,POS=POS,REF=REF,ALT=ALT)
112+
ref_group$id <- rownames(ref_group)
113+
114+
## match variant information in provided data to those in genofile
115+
individual_results_chr <- dplyr::inner_join(individual_results_chr, ref_group, by = c("CHR", "POS", "REF", "ALT"))
116+
117+
id.genotype <- seqGetData(genofile,"sample.id")
118+
119+
id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
120+
phenotype.id.merge <- data.frame(phenotype.id)
121+
phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
122+
id.genotype.match <- phenotype.id.merge$index
123+
124+
Geno <- seqGetData(genofile, "$dosage")
125+
Geno <- Geno[id.genotype.match,,drop=FALSE]
126+
127+
if(geno_missing_imputation=="mean")
128+
{
129+
Geno <- matrix_flip_mean(Geno)
130+
}
131+
if(geno_missing_imputation=="minor")
132+
{
133+
Geno <- matrix_flip_minor(Geno)
134+
}
135+
136+
## subset variants of interest on indices unique to variants in genofile
137+
index <- as.numeric(individual_results_chr$id)
138+
Geno_chr <- Geno$Geno[,index]
139+
140+
CHR_chr <- CHR[index]
141+
position_chr <- POS[index]
142+
REF_chr <- REF[index]
143+
ALT_chr <- ALT[index]
144+
N_chr <- N[index]
145+
Geno_chr <- as.matrix(Geno_chr,ncol=1)
146+
147+
genotype <- cbind(genotype, Geno_chr)
148+
char <- rbind(char, cbind(CHR_chr, position_chr, REF_chr, ALT_chr, N_chr))
149+
150+
####### AI-Individual Analysis #######
151+
genotype_ref <- genotype
152+
B <- dim(obj_nullmodel$pop_weights_1_1)[2]
153+
154+
n_pop <- length(unique(obj_nullmodel$pop.groups))
155+
pop <- obj_nullmodel$pop.groups
156+
indices <- list()
157+
a_p <- matrix(0, nrow = ncol(genotype), ncol = n_pop)
158+
159+
for(i in 1:n_pop)
160+
{
161+
eth <- unique(pop)[i]
162+
indices[[i]] <- which(pop %in% eth)
163+
a_p[,i] <- apply(as.matrix(genotype[indices[[i]],]), 2, function(x){min(mean(x)/2, 1-mean(x)/2)})
164+
}
165+
166+
a_p <- ifelse(a_p > 0, dbeta(a_p,1,25), a_p)
167+
168+
w_b_1 <- w_b_2 <- matrix(0, nrow = ncol(genotype), ncol = n_pop)
169+
weight_all_1 <- weight_all_2 <- array(NA, dim = c(n_pop,B,ncol(genotype)))
170+
pvalues_1_all <- pvalues_2_all <- pvalues_12_all <- c()
171+
pvalues_aggregate_all <- pvalues_aggregate_s1_all <- pvalues_aggregate_s2_all <- rep(NA, ncol(genotype))
172+
173+
for(g in 1:ncol(genotype))
174+
{
175+
pvalues_1_tot <- pvalues_2_tot <- matrix(NA,nrow = ncol(genotype), ncol = B)
176+
genotype_1_all <- genotype_2_all <- vector("list", B)
177+
178+
for(b in 1:B)
179+
{
180+
w_b_1 <- matrix(rep(obj_nullmodel$pop_weights_1_1[,b], ncol(genotype)),nrow = ncol(genotype),
181+
ncol = n_pop, byrow = TRUE)[g,]
182+
w_b_2 <- (a_p%*%diag(obj_nullmodel$pop_weights_1_25[,b]))[g,]
183+
184+
if(find_weight == T)
185+
{
186+
weight_all_1[,b,g] <- w_b_1
187+
weight_all_2[,b,g] <- w_b_2
188+
}
189+
190+
genotype_1 <- genotype_2 <- matrix(0, nrow = nrow(genotype), ncol = 1)
191+
192+
for(i in 1:n_pop)
193+
{
194+
eth <- unique(pop)[i]
195+
eth_wt_1 <- w_b_1[i]
196+
eth_wt_2 <- w_b_2[i]
197+
genotype_1[indices[[i]],] <- t(t(genotype[indices[[i]],g])*as.vector(eth_wt_1))
198+
genotype_2[indices[[i]],] <- t(t(genotype[indices[[i]],g])*as.vector(eth_wt_2))
199+
}
200+
201+
genotype_1_all[[b]] <- genotype_1
202+
genotype_2_all[[b]] <- genotype_2
203+
}
204+
205+
genotype_1_all <- do.call(cbind, genotype_1_all)
206+
genotype_2_all <- do.call(cbind, genotype_2_all)
207+
208+
if(obj_nullmodel$sparse_kins)
209+
{
210+
Score_test1 <- Individual_Score_Test(genotype_1_all, Sigma_i, Sigma_iX, cov, residuals.phenotype)
211+
Score_test2 <- Individual_Score_Test(genotype_2_all, Sigma_i, Sigma_iX, cov, residuals.phenotype)
212+
}
213+
else if(!obj_nullmodel$sparse_kins)
214+
{
215+
Score_test1 <- Individual_Score_Test_denseGRM(genotype_1_all, P, residuals.phenotype)
216+
Score_test2 <- Individual_Score_Test_denseGRM(genotype_2_all, P, residuals.phenotype)
217+
}
218+
219+
pvalues_1_tot <- t(exp(-Score_test1$pvalue_log))
220+
pvalues_2_tot <- t(exp(-Score_test2$pvalue_log))
221+
222+
obj_1 <- matrix(pvalues_1_tot,ncol = 1, nrow = B, byrow = TRUE)
223+
obj_2 <- matrix(pvalues_2_tot,ncol = 1, nrow = B, byrow = TRUE)
224+
225+
obj_1 <- t(obj_1)
226+
obj_2 <- t(obj_2)
227+
228+
pvalues_tot <- cbind(obj_1,obj_2)
229+
pvalues_aggregate <- apply(pvalues_tot,1,function(x){CCT(x)})
230+
231+
if(find_weight == T)
232+
{
233+
pvalues_aggregate_s1 <- apply(obj_1,1,function(x){CCT(x)})
234+
pvalues_aggregate_s2 <- apply(obj_2,1,function(x){CCT(x)})
235+
pvalues_12_weight <- NULL
236+
237+
for(i in 1:B)
238+
{
239+
pvalues_12_weight <- cbind(pvalues_12_weight,CCT(pvalues_tot[c(i,i+B)]))
240+
}
241+
242+
pvalues_1_all <- rbind(pvalues_1_all,t(exp(-Score_test1$pvalue_log)))
243+
pvalues_2_all <- rbind(pvalues_2_all,t(exp(-Score_test2$pvalue_log)))
244+
pvalues_aggregate_s1_all[g] <- pvalues_aggregate_s1
245+
pvalues_aggregate_s2_all[g] <- pvalues_aggregate_s2
246+
pvalues_12_all <- rbind(pvalues_12_all, pvalues_12_weight)
247+
}
248+
249+
pvalues_aggregate_all[g] <- pvalues_aggregate
250+
}
251+
252+
if(find_weight == TRUE)
253+
{
254+
dimnames(weight_all_1)[[1]] <- dimnames(weight_all_2)[[1]] <- unique(obj_nullmodel$pop.groups)
255+
dimnames(weight_all_1)[[2]] <- dimnames(weight_all_2)[[2]] <- seq(0,c(B-1))
256+
dimnames(weight_all_1)[[3]] <- dimnames(weight_all_2)[[3]] <- paste0(individual_results_chr$CHR, "_",
257+
individual_results_chr$POS, "_",
258+
individual_results_chr$REF,"_",
259+
individual_results_chr$ALT)
260+
261+
rownames(pvalues_1_all) <- rownames(pvalues_2_all) <- rownames(pvalues_12_all) <- paste0(individual_results_chr$CHR, "_",
262+
individual_results_chr$POS, "_",
263+
individual_results_chr$REF,"_",
264+
individual_results_chr$ALT)
265+
colnames(pvalues_1_all) <- colnames(pvalues_2_all) <- colnames(pvalues_12_all) <- seq(0,c(B-1))
266+
}
267+
268+
if(find_weight == TRUE)
269+
{
270+
results <- list(data.frame(CHR=CHR_chr,POS=position_chr,REF=REF_chr,ALT=ALT_chr,N=N_chr,
271+
pvalue=pvalues_aggregate_all,pvalue_s1=pvalues_aggregate_s1_all,pvalue_s2=pvalues_aggregate_s2_all,
272+
pvalue_log10=-log10(pvalues_aggregate_all)),
273+
weight_all_1=weight_all_1, weight_all_2=weight_all_2, results_weight=pvalues_12_all,
274+
results_weight1=pvalues_1_all, results_weight2=pvalues_2_all)
275+
}else{
276+
results <- data.frame(CHR=CHR_chr,POS=position_chr,REF=REF_chr,ALT=ALT_chr,N=N_chr,
277+
pvalue=pvalues_aggregate_all,pvalue_log10=-log10(pvalues_aggregate_all))
278+
}
279+
280+
seqResetFilter(genofile)
281+
282+
return(results)
283+
284+
}

0 commit comments

Comments
 (0)