Skip to content

Commit a4aa798

Browse files
committed
Initial commit
0 parents  commit a4aa798

File tree

19 files changed

+924
-0
lines changed

19 files changed

+924
-0
lines changed

.Rbuildignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
^.*\.Rproj$
2+
^\.Rproj\.user$
3+
^_pkgdown\.yml$
4+
^docs$
5+
^pkgdown$

.gitignore

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
.Rproj.user
2+
.Rhistory
3+
.RData
4+
.Ruserdata
5+
.Rdata
6+
.httr-oauth
7+
.DS_Store
8+
.quarto
9+
docs

DESCRIPTION

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
Package: FLASHMM
2+
Title: Fast and Scalable Single Cell Differential Expression Analysis using Mixed-effects Models
3+
Version: 0.1.0
4+
Authors@R: c(
5+
person("Changjiang", "Xu", email = "[email protected]", role = c("aut", "cre")),
6+
person("Gary", "Bader", email = "[email protected]", role = "aut"))
7+
Description: More about what it does (maybe more than one line)
8+
Use four spaces when indenting paragraphs within the Description.
9+
License: GPL-3
10+
Encoding: UTF-8
11+
LazyData: true
12+
RoxygenNote: 7.3.2
13+
Imports:
14+
MASS,
15+
stats
16+
Suggests:
17+
Matrix,
18+
testthat (>= 3.0.0)
19+
Config/testthat/edition: 3

FLASHMM.Rproj

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
Version: 1.0
2+
3+
RestoreWorkspace: Default
4+
SaveWorkspace: Default
5+
AlwaysSaveHistory: Default
6+
7+
EnableCodeIndexing: Yes
8+
UseSpacesForTab: Yes
9+
NumSpacesForTab: 2
10+
Encoding: UTF-8
11+
12+
RnwWeave: Sweave
13+
LaTeX: pdfLaTeX
14+
15+
AutoAppendNewline: Yes
16+
StripTrailingWhitespace: Yes
17+
18+
BuildType: Package
19+
PackageUseDevtools: Yes
20+
PackageInstallArgs: --no-multiarch --with-keep.source
21+
PackageRoxygenize: rd,collate,namespace,vignette

NAMESPACE

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
exportPattern("^[[:alpha:]]+")
2+
import("stats")
3+
importFrom("MASS","ginv")

R/lmm.R

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
#' Fitting Linear Mixed-effects Models
2+
#'
3+
#' @description lmm is used to fit linear mixed-effects models (LMM) based on summary-level data. The LMM parameters are estimated by restricted maximum likelihood (REML) with Fisher scoring (FS) gradient descent algorithm.
4+
#'
5+
#' @param XX t(X)\%*\%X, X is a design matrix for fixed effects.
6+
#' @param XY t(Y\%*\%X), Y is a features-by-samples matrix of observed responses (genes-by-cells expression matrix for scRNA-seq).
7+
#' @param ZX t(Z)\%*\%X, Z = [Z1, ..., Zk], a design matrix for k random factors (variables).
8+
#' @param ZY t(Y\%*\%Z).
9+
#' @param ZZ t(Z)\%*\%Z.
10+
#' @param Ynorm Norms for features (each row in Y), that is, Ynorm = rowSums(Y*Y).
11+
#' @param n Numbers of samples (cells in scRNA-seq), nrow(X).
12+
#' @param summary.stats A list of the summary statistics: XX, XY, ZX, ZY, ZZ, Ynorm and n, which can be computed by csslmm function.
13+
#' @param d A vector of (m1,...,mk), mi = ncol(Zi), number of columns in Zi. m1 + ... + mk = ncol(Z), number of columns in Z.
14+
#' @param theta0 A vector of initial values of the variance components, (s1, ...,sk, s_(k+1)), si = sigma_i^2, the variance component of the i-th type random effects. s_(k+1) = sigma^2, the variance component of model residual error.
15+
#' @param method The REML with Fisher scoring (FS) iterative algorithm, REML-FS.
16+
#' @param max.iter The maximal number of iterations for the iterative algorithm.
17+
#' @param epsilon Positive convergence tolerance. If the absolute value of the first partial derivative of log likelihood is less than epsilon, the iterations converge.
18+
#' @param output.cov If TRUE, output the covariance matrices for the estimated coefficients, which are needed for testing contrasts.
19+
#' @param output.RE If TRUE, output the best linear unbiased prediction (BLUP) of the random effects.
20+
#'
21+
#' @return A list containing the following components:
22+
#' @return dlogL First partial derivatives of log-likelihoods for each feature (gene).
23+
#' @return niter Nmbers of iterations for each feature (gene).
24+
#' @return coef A matrix of estimated coefficients (fixed effects), each column corresponds to a feature (gene) and each row one covariate.
25+
#' @return se A matrix of the standard errors of the estimated coefficients.
26+
#' @return t A matrix of t-values for the fixed effects, equal to coef/se.
27+
#' @return df Degrees of freedom.
28+
#' @return p A matrix of two-sided p-values for the fixed effects.
29+
#' @return cov A array of covariance matrices of the estimated coefficients (fixed effects).
30+
#' @return theta A matrix of the estimated variance components, each column corresponds to a feature (gene) and each row one variance component. The last row is the variance component of the residual error.
31+
#' @return se.theta Standard errors of the estimated theta.
32+
#' @return RE A matrix of the best linear unbiased prediction (BLUP) of random effects.
33+
#'
34+
#' @export
35+
lmm <- function(XX, XY, ZX, ZY, ZZ, Ynorm, n, summary.stats = NULL, d, theta0 = NULL, method = "REML-FS", max.iter = 50, epsilon = 1e-5, output.cov = TRUE, output.RE = FALSE)
36+
{
37+
if (!is.null(summary.stats)){
38+
stopifnot(all(c("XX", "XY", "ZX", "ZY", "ZZ", "Ynorm", "n") %in% names(summary.stats)))
39+
XX <- summary.stats$XX
40+
XY <- summary.stats$XY
41+
ZX <- summary.stats$ZX
42+
ZY <- summary.stats$ZY
43+
ZZ <- summary.stats$ZZ
44+
Ynorm <- summary.stats$Ynorm
45+
n <- summary.stats$n
46+
}
47+
48+
stopifnot(!any(is.na(XY)), !any(is.na(ZX)), !any(is.na(ZY)))
49+
p <- ncol(ZX)
50+
k <- length(d)
51+
52+
stopifnot(sum(d) == ncol(ZZ))
53+
54+
XXinv <- try(chol2inv(chol(XX)), silent = TRUE)
55+
if (inherits(XXinv, "try-error")) {
56+
stop("XX is not positive-definite or X is not full column rank.")
57+
}
58+
59+
xxz <- XXinv%*%t(ZX)
60+
zrz <- ZZ - ZX%*%(XXinv%*%t(ZX))
61+
zry <- ZY - ZX%*%(XXinv%*%XY)
62+
yry <- Ynorm - colSums(XY*(XXinv%*%XY))
63+
64+
niter <- NULL
65+
dlogL <- NULL
66+
theta <- matrix(nrow = k + 1, ncol = ncol(XY), dimnames = list(paste0("var", c(1:k, 0)), colnames(XY)))
67+
setheta <- theta
68+
RE <- matrix(nrow = nrow(ZY), ncol = ncol(XY), dimnames = dimnames(ZY))
69+
beta <- matrix(nrow = nrow(XY), ncol = ncol(XY), dimnames = dimnames(XY))
70+
sebeta <- beta
71+
covbeta <- array(dim = c(nrow(XY), nrow(XY), ncol(XY)),
72+
dimnames = list(rownames(XY), rownames(XY), colnames(XY)))
73+
74+
for (jy in 1:ncol(ZY)) {
75+
if (is.null(theta0)) {
76+
s <- c(rep(0, k), yry[jy]/(n-p))
77+
} else s <- theta0
78+
79+
vest <- varest(xxz, zrz, zryj = zry[, jy], yryj = yry[jy], n = n, d = d, s = s, max.iter = max.iter, epsilon = epsilon)
80+
s <- vest$s
81+
dl <- vest$dl
82+
iter <- vest$iter
83+
Minv <- vest$Minv
84+
85+
if (max(abs(dl)) > epsilon) {
86+
warningText <- paste0("The first derivatives of log likelihood for Y", jy)
87+
dlText <- paste0(ifelse(abs(dl) > 1e-3, round(dl, 4), format(dl, digits = 3, scientific = TRUE)), collapse = ", ")
88+
warning(paste0(warningText, ": ", dlText, ", doesn't reach epsilon ", epsilon))
89+
}
90+
91+
sr <- s[1:k]/s[k+1]
92+
M <- solve(sweep(ZZ, 1, STATS = rep(sr, times = d), FUN = "*") + diag(sum(d)))
93+
M <- sweep(M, 2, STATS = rep(sr, times = d), FUN = "*")
94+
xvx <- XXinv + xxz%*%(ginv(diag(sum(d)) - M%*%(ZX%*%xxz))%*%(M%*%t(xxz)))
95+
xvy <- XY[, jy] - t(ZX)%*%(M%*%ZY[, jy])
96+
b <- xvx%*%xvy
97+
covbeta[,,jy] <- (xvx + t(xvx))*(s[k+1]/2)
98+
99+
RE[, jy] <- M%*%(ZY[, jy] - ZX%*%b)
100+
101+
niter <- c(niter, iter)
102+
theta[, jy] <- s
103+
setheta[, jy] <- sqrt(diag(Minv))
104+
beta[, jy] <- b
105+
dlogL <- cbind(dlogL, dl)
106+
sebeta[, jy] <- sqrt(diag(as.matrix(covbeta[,,jy])))
107+
}
108+
tval <- beta/sebeta
109+
pval <- 2 * pt(-abs(tval), df = n-p)
110+
111+
if (!output.cov) covbeta <- NULL
112+
if (!output.RE) RE <- NULL
113+
114+
list(method = method, dlogL = dlogL, niter = niter, coef = beta, se = sebeta, t = tval, p = pval, cov = covbeta, df = n-p, theta = theta, se.theta = setheta, RE = RE)
115+
}
116+
117+
118+
#' A internal function to estimate variance components for one feature (gene).
119+
#'
120+
#' @description This function is used internally (inside lmm).
121+
#'
122+
#' @param xxz XXinv\%*\%t(ZX), where XXinv is the inverse of XX and ZX = t(Z)\%*\%X.
123+
#' @param zrz ZZ - ZX\%*\%(XXinv\%*\%t(ZX)), ZZ = t(Z)\%*\%Z.
124+
#' @param zryj zry[, j], where zry = ZY - ZX\%*\%(XXinv\%*\%XY)
125+
#' @param yryj yry[j], where yry = Ynorm - colSums(XY*(XXinv\%*\%XY))
126+
#' @param n Numbers of samples (cells in scRNA-seq).
127+
#' @param d A vector of (m1,...,mk), mi = ncol(Zi), number of columns in Zi.
128+
#' @param s A vector of initial values of the variance components, (s1, ...,sk, s_(k+1)).
129+
#' @param max.iter The maximal number of iterations.
130+
#' @param epsilon Positive convergence tolerance.
131+
#'
132+
#' @return A list consisting of
133+
#' estimates of variance components (s),
134+
#' first partial derivatives of log-likehood (dl),
135+
#' number of iterations (iter), and
136+
#' inverse of Fisher information matrix (Minv).
137+
#' @keywords internal
138+
varest <- function(xxz, zrz, zryj, yryj, n, d, s, max.iter = 50, epsilon = 1e-5)
139+
{
140+
p <- nrow(xxz)
141+
k <- length(d)
142+
143+
dl <- 100
144+
iter <- 0
145+
while ((max(abs(dl)) > epsilon) & (iter < max.iter)){
146+
iter <- iter + 1
147+
148+
fs <- matrix(NA, k+1, k+1)
149+
dl <- rep(NA, k+1)
150+
151+
sr <- s[1:k]/s[k+1]
152+
M <- solve(sweep(zrz, 1, STATS = rep(sr, times = d), FUN = "*") + diag(sum(d)))
153+
ZRZ <- zrz%*%M
154+
ZR2Z <- ZRZ%*%M
155+
yRZ <- t(zryj)%*%M
156+
157+
mi <- 0
158+
for (i in 1:k){
159+
ik <- (mi+1):(mi+d[i])
160+
dl[i] <- (sum((yRZ[ik])^2)/s[k+1]^2 - sum(diag(ZRZ[ik, ik, drop = FALSE]))/s[k+1])/2
161+
162+
mj <- 0
163+
for (j in 1:i){
164+
ji <- (mj+1):(mj+d[j])
165+
fs[i, j] <- sum((ZRZ[ji, ik])^2)/s[k+1]^2/2
166+
fs[j, i] <- fs[i, j]
167+
mj <- mj + d[j]
168+
}
169+
170+
j <- k+1
171+
fs[i, j] <- sum(diag(ZR2Z[ik, ik, drop = FALSE]))/s[k+1]^2/2
172+
fs[j, i] <- fs[i, j]
173+
mi <- mi + d[i]
174+
}
175+
176+
i <- k+1
177+
fs[i, i] <- (n - p - sum(d) + sum(t(M)*M))/s[k+1]^2/2
178+
179+
yR2y <- yryj - sum(((t(M) + diag(sum(d)))%*%zryj)*(M%*%(rep(sr, times = d)*zryj)))
180+
dl[i] <- (yR2y/s[k+1]^2 - (n-p-sum(d)+sum(diag(M)))/s[k+1])/2
181+
182+
Minv <- ginv(fs)
183+
s <- s + Minv%*%dl
184+
}
185+
186+
list(s = c(s), dl = dl, iter = iter, Minv = Minv)
187+
}

R/lmmfit.R

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#' Fitting Linear Mixed-effects Models
2+
#'
3+
#' @description lmmfit, a wrapper function of lmm, fits linear mixed-effects models (LMM) by sample-level data. The LMM parameters are estimated by restricted maximum likelihood (REML) with Fisher scoring (FS) gradient descent algorithm.
4+
#'
5+
#' @param X A design matrix for fixed effects, with rows corresponding to the columns of Y.
6+
#' @param Y A features-by-samples matrix of responses (genes-by-cells matrix of gene expressions for scRNA-seq).
7+
#' @param Z A design matrix for random effects, with rows corresponding to the columns of Y. Z = [Z1, ..., Zk], and Zi, i=1,...,k, is the design matrix for the i-th type random factor.
8+
#' @param d A vector of (m1,...,mk), mi = ncol(Zi), number of columns in Zi. m1 + ... + mk = ncol(Z), number of columns in Z.
9+
#' @param theta0 A vector of initial values of the variance components, (s1, ...,sk, s_(k+1)), si = sigma_i^2, the variance component of the i-th type random effects. s_(k+1) = sigma^2, the variance component of model residual error.
10+
#' @param method The REML with Fisher scoring (FS) iterative algorithm, REML-FS.
11+
#' @param max.iter The maximal number of iterations for the iterative algorithm.
12+
#' @param epsilon Positive convergence tolerance. If the absolute value of the first partial derivative of log likelihood is less than epsilon, the iterations converge.
13+
#' @param output.cov If TRUE, output the covariance matrices for the estimated coefficients, which are needed for testing contrasts.
14+
#' @param output.RE If TRUE, output the best linear unbiased prediction (BLUP) of the random effects.
15+
#'
16+
#' @return A list containing the following components:
17+
#' @return dlogL First partial derivatives of log-likelihoods for each feature (gene).
18+
#' @return niter Nmbers of iterations for each feature (gene).
19+
#' @return coef A matrix of estimated coefficients (fixed effects), each column corresponds to a feature (gene) and each row one covariate.
20+
#' @return se A matrix of the standard errors of the estimated coefficients.
21+
#' @return t A matrix of t-values for the fixed effects, equal to coef/se.
22+
#' @return df Degrees of freedom.
23+
#' @return p A matrix of two-sided p-values for the fixed effects.
24+
#' @return cov A array of covariance matrices of the estimated coefficients (fixed effects).
25+
#' @return theta A matrix of the estimated variance components, each column corresponds to a feature (gene) and each row one variance component. The last row is the variance component of the residual error.
26+
#' @return se.theta Standard errors of the estimated theta.
27+
#' @return RE A matrix of the best linear unbiased prediction (BLUP) of random effects.
28+
#'
29+
#' @export
30+
lmmfit <- function(Y, X, Z, d, theta0 = NULL, method = "REML-FS", max.iter = 50, epsilon = 1e-5, output.cov = TRUE, output.RE = FALSE)
31+
{
32+
stopifnot(!any(is.na(Y)), !any(is.na(X)), !any(is.na(Z)))
33+
stopifnot(ncol(Y) == nrow(X), ncol(Y) == nrow(Z))
34+
35+
n <- nrow(X)
36+
XX <- t(X)%*%X
37+
XY <- t(Y%*%X)
38+
ZX <- t(Z)%*%X
39+
ZY <- t(Y%*%Z)
40+
ZZ <- t(Z)%*%Z
41+
Ynorm <- rowSums(Y*Y)
42+
43+
lmm(XX, XY, ZX, ZY, ZZ, Ynorm = Ynorm, n = n, d = d, theta0 = theta0, method = method, max.iter = max.iter, epsilon = epsilon, output.cov = output.cov, output.RE = output.RE)
44+
}

R/lmmtest.R

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#' Testing Fixed Effects and Contrasts of the Fixed Effects
2+
#'
3+
#' @description lmmtest is used to test fixed effects or contrasts of fixed effects.
4+
5+
#' @param fit A result of lmmfit/lmm, which contains
6+
#' coef (estimates of fixed effects), a matrix with rows representing the fixed effects and columns the different response variables in the model,
7+
#' cov (covariance matrix of the fixed effects), an array of three dimmesions for different response variables in the model,
8+
#' df (residual degree of freedom in the linear model).
9+
#' @param index A vector of integers or characters indicating which fixed effects are to be tested. By default index consists of all of the fixed effects. Ignored if contrast is not NULL.
10+
#' @param contrast A matrix with columns corresponding to contrasts of the fixed effects to be tested.
11+
#' @param alternative A character string specifying the alternative hypothesis, one of "two.sided", "greater" or "less".
12+
#'
13+
#' @return A matrix of t-values and p-values, in which the rows correspond to the features (genes) and the columns the fixed effects (covariates). .
14+
#'
15+
#' @export
16+
lmmtest <- function(fit, index, contrast = NULL, alternative = c("two.sided", "less", "greater")){
17+
alternative <- match.arg(alternative)
18+
if (is.null(contrast)){
19+
if (missing(index)) index <- 1:nrow(fit$coef)
20+
contrast <- diag(nrow(fit$coef))
21+
colnames(contrast) <- rownames(fit$coef)
22+
contrast <- contrast[, index, drop = FALSE]
23+
}
24+
25+
tval <- t(contrast)%*%fit$coef
26+
for (j in 1:ncol(fit$coef)){
27+
tval[, j] <- tval[, j]/sqrt(diag(t(contrast)%*%fit$cov[,,j]%*%contrast))
28+
}
29+
30+
df <- fit$df
31+
if (alternative == "less") {
32+
pval <- pt(tval, df)
33+
} else if (alternative == "greater") {
34+
pval <- pt(tval, df, lower.tail = FALSE)
35+
} else pval <- 2 * pt(-abs(tval), df)
36+
37+
rownames(tval) <- paste0(rownames(tval), "_t")
38+
rownames(pval) <- paste0(rownames(pval), "_pvalue")
39+
40+
cbind(t(tval), t(pval))
41+
}

0 commit comments

Comments
 (0)