BaderLab
diff --git a/‎.Rbuildignore‎
Lines changed: 5 additions & 0 deletions b/‎.Rbuildignore‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 9 additions & 0 deletions b/‎.gitignore‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎DESCRIPTION‎
Lines changed: 19 additions & 0 deletions b/‎DESCRIPTION‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎FLASHMM.Rproj‎
Lines changed: 21 additions & 0 deletions b/‎FLASHMM.Rproj‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎NAMESPACE‎
Lines changed: 3 additions & 0 deletions b/‎NAMESPACE‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎R/lmm.R‎
Lines changed: 187 additions & 0 deletions b/‎R/lmm.R‎
Lines changed: 187 additions & 0 deletions
diff --git a/‎R/lmmfit.R‎
Lines changed: 44 additions & 0 deletions b/‎R/lmmfit.R‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎R/lmmtest.R‎
Lines changed: 41 additions & 0 deletions b/‎R/lmmtest.R‎
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,5 @@
+^.*\.Rproj$
+^\.Rproj\.user$
+^_pkgdown\.yml$
+^docs$
+^pkgdown$
@@ -0,0 +1,9 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
+.Rdata
+.httr-oauth
+.DS_Store
+.quarto
+docs
@@ -0,0 +1,19 @@
+Package: FLASHMM
+Title: Fast and Scalable Single Cell Differential Expression Analysis using Mixed-effects Models
+Version: 0.1.0
+Authors@R: c(
+  person("Changjiang", "Xu", email = "[email protected]", role = c("aut", "cre")),
+  person("Gary", "Bader", email = "[email protected]", role = "aut"))
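+Description: Fits linear mixed-effects models (LMM) by restricted maximum
+    likelihood (REML) with Fisher scoring, using either sample-level data or
+    precomputed summary-level statistics, and tests fixed effects and their
+    contrasts. Intended for fast and scalable differential expression analysis
+    of single-cell RNA-seq data.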
+License: GPL-3 
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 7.3.2
+Imports: 
+    MASS,
+    stats
+Suggests: 
+    Matrix,
+    testthat (>= 3.0.0)
+Config/testthat/edition: 3
@@ -0,0 +1,21 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
+PackageRoxygenize: rd,collate,namespace,vignette
@@ -0,0 +1,3 @@
+exportPattern("^[[:alpha:]]+")
+import("stats")
+importFrom("MASS","ginv")
@@ -0,0 +1,187 @@
+#' Fitting Linear Mixed-effects Models
+#'
+#' @description lmm is used to fit linear mixed-effects models (LMM) based on summary-level data. The LMM parameters are estimated by restricted maximum likelihood (REML) with the Fisher scoring (FS) iterative algorithm.
+#'
+#' @param XX t(X)\%*\%X, X is a design matrix for fixed effects.
+#' @param XY t(Y\%*\%X), Y is a features-by-samples matrix of observed responses (genes-by-cells expression matrix for scRNA-seq).
+#' @param ZX t(Z)\%*\%X, Z = [Z1, ..., Zk],  a design matrix for k random factors (variables).
+#' @param ZY t(Y\%*\%Z).
+#' @param ZZ t(Z)\%*\%Z.
+#' @param Ynorm Norms for features (each row in Y), that is, Ynorm = rowSums(Y*Y).
+#' @param n Numbers of samples (cells in scRNA-seq), nrow(X).
+#' @param summary.stats A list of the summary statistics: XX, XY, ZX, ZY, ZZ, Ynorm and n, which can be computed by csslmm function.
+#' @param d A vector of (m1,...,mk), mi = ncol(Zi), number of columns in Zi. m1 + ... + mk = ncol(Z), number of columns in Z.
+#' @param theta0 A vector of initial values of the variance components, (s1, ...,sk, s_(k+1)), si = sigma_i^2, the variance component of the i-th type random effects. s_(k+1) = sigma^2, the variance component of model residual error.
+#' @param method The REML with Fisher scoring (FS) iterative algorithm, REML-FS.
+#' @param max.iter The maximal number of iterations for the iterative algorithm.
+#' @param epsilon Positive convergence tolerance. If the absolute value of the first partial derivative of log likelihood is less than epsilon, the iterations converge.
+#' @param output.cov If TRUE, output the covariance matrices for the estimated coefficients, which are needed for testing contrasts.
+#' @param output.RE If TRUE, output the best linear unbiased prediction (BLUP) of the random effects.
+#'
+#' @return A list containing the following components:
+#' @return dlogL First partial derivatives of log-likelihoods for each feature (gene).
+#' @return niter Numbers of iterations for each feature (gene).
+#' @return coef A matrix of estimated coefficients (fixed effects), each column corresponds to a feature (gene) and each row one covariate.
+#' @return se A matrix of the standard errors of the estimated coefficients.
+#' @return t A matrix of t-values for the fixed effects, equal to coef/se.
+#' @return df Degrees of freedom.
+#' @return p A matrix of two-sided p-values for the fixed effects.
+#' @return cov An array of covariance matrices of the estimated coefficients (fixed effects).
+#' @return theta A matrix of the estimated variance components, each column corresponds to a feature (gene) and each row one variance component. The last row is the variance component of the residual error.
+#' @return se.theta Standard errors of the estimated theta.
+#' @return RE A matrix of the best linear unbiased prediction (BLUP) of random effects.
+#'
+#' @export
+lmm <- function(XX, XY, ZX, ZY, ZZ, Ynorm, n, summary.stats = NULL, d, theta0 = NULL, method = "REML-FS", max.iter = 50, epsilon = 1e-5, output.cov = TRUE, output.RE = FALSE)
+{
+  # When a summary.stats list is supplied, it takes precedence over the
+  # individual XX, XY, ZX, ZY, ZZ, Ynorm and n arguments.
+  if (!is.null(summary.stats)) {
+    stopifnot(all(c("XX", "XY", "ZX", "ZY", "ZZ", "Ynorm", "n") %in% names(summary.stats)))
+    XX <- summary.stats$XX
+    XY <- summary.stats$XY
+    ZX <- summary.stats$ZX
+    ZY <- summary.stats$ZY
+    ZZ <- summary.stats$ZZ
+    Ynorm <- summary.stats$Ynorm
+    n <- summary.stats$n
+  }
+
+  stopifnot(!any(is.na(XY)), !any(is.na(ZX)), !any(is.na(ZY)))
+  p <- ncol(ZX)     # number of fixed effects (columns of X)
+  k <- length(d)    # number of random factors
+  ny <- ncol(XY)    # number of features (genes)
+
+  stopifnot(sum(d) == ncol(ZZ))
+  # XY and ZY must describe the same set of features; otherwise some output
+  # columns would silently stay NA.
+  stopifnot(ncol(ZY) == ny)
+  # One initial value per variance component: k random factors + residual.
+  stopifnot(is.null(theta0) || length(theta0) == k + 1)
+
+  XXinv <- try(chol2inv(chol(XX)), silent = TRUE)
+  if (inherits(XXinv, "try-error")) {
+    stop("XX is not positive-definite or X is not full column rank.")
+  }
+
+  # Quantities shared by all features; the products with XXinv are computed
+  # once and reused.
+  xxz <- XXinv%*%t(ZX)
+  xxy <- XXinv%*%XY
+  zrz <- ZZ - ZX%*%xxz
+  zry <- ZY - ZX%*%xxy
+  yry <- Ynorm - colSums(XY*xxy)
+  zxxz <- ZX%*%xxz
+  txxz <- t(xxz)
+  Id <- diag(sum(d))
+
+  # Preallocate every per-feature output instead of growing it in the loop.
+  theta <- matrix(nrow = k + 1, ncol = ny, dimnames = list(paste0("var", c(1:k, 0)), colnames(XY)))
+  setheta <- theta
+  # dlogL rows follow the variance components and columns the features,
+  # exactly like theta.
+  dlogL <- theta
+  niter <- numeric(ny)
+  RE <- matrix(nrow = nrow(ZY), ncol = ny, dimnames = dimnames(ZY))
+  beta <- matrix(nrow = nrow(XY), ncol = ny, dimnames = dimnames(XY))
+  sebeta <- beta
+  covbeta <- array(dim = c(nrow(XY), nrow(XY), ny),
+    dimnames = list(rownames(XY), rownames(XY), colnames(XY)))
+
+  for (jy in seq_len(ny)) {
+    # Default start: zero random-effect variances and the residual variance
+    # of the fixed-effects-only fit.
+    if (is.null(theta0)) {
+      s <- c(rep(0, k), yry[jy]/(n-p))
+    } else {
+      s <- theta0
+    }
+
+    vest <- varest(xxz, zrz, zryj = zry[, jy], yryj = yry[jy], n = n, d = d, s = s, max.iter = max.iter, epsilon = epsilon)
+    s <- vest$s
+    dl <- vest$dl
+
+    if (max(abs(dl)) > epsilon) {
+      warningText <- paste0("The first derivatives of log likelihood for Y", jy)
+      dlText <- paste0(ifelse(abs(dl) > 1e-3, round(dl, 4), format(dl, digits = 3, scientific = TRUE)), collapse = ", ")
+      warning(paste0(warningText, ": ", dlText, ", doesn't reach epsilon ", epsilon))
+    }
+
+    # Variance ratios relative to the residual variance, expanded so that
+    # every column of Z carries the ratio of its random factor.
+    sr <- s[1:k]/s[k+1]
+    srd <- rep(sr, times = d)
+    M <- solve(sweep(ZZ, 1, STATS = srd, FUN = "*") + Id)
+    M <- sweep(M, 2, STATS = srd, FUN = "*")
+    xvx <- XXinv + xxz%*%(ginv(Id - M%*%zxxz)%*%(M%*%txxz))
+    xvy <- XY[, jy] - t(ZX)%*%(M%*%ZY[, jy])
+    b <- xvx%*%xvy
+    # Symmetrise before scaling by the residual variance.
+    covbeta[,,jy] <- (xvx + t(xvx))*(s[k+1]/2)
+
+    RE[, jy] <- M%*%(ZY[, jy] - ZX%*%b)
+
+    niter[jy] <- vest$iter
+    theta[, jy] <- s
+    setheta[, jy] <- sqrt(diag(vest$Minv))
+    beta[, jy] <- b
+    dlogL[, jy] <- dl
+    sebeta[, jy] <- sqrt(diag(as.matrix(covbeta[,,jy])))
+  }
+
+  tval <- beta/sebeta
+  pval <- 2 * pt(-abs(tval), df = n-p)
+
+  if (!output.cov) covbeta <- NULL
+  if (!output.RE) RE <- NULL
+
+  # `method` is only echoed back in the result; it does not alter the fit.
+  list(method = method, dlogL = dlogL, niter = niter, coef = beta, se = sebeta, t = tval, p = pval, cov = covbeta, df = n-p, theta = theta, se.theta = setheta, RE = RE)
+}
+
+
+#' An internal function to estimate variance components for one feature (gene).
+#'
+#' @description This function is used internally (inside lmm).
+#'
+#' @param xxz XXinv\%*\%t(ZX), where XXinv is the inverse of XX and ZX = t(Z)\%*\%X.
+#' @param zrz ZZ - ZX\%*\%(XXinv\%*\%t(ZX)), ZZ = t(Z)\%*\%Z.
+#' @param zryj zry[, j], where zry = ZY - ZX\%*\%(XXinv\%*\%XY)
+#' @param yryj yry[j], where yry = Ynorm - colSums(XY*(XXinv\%*\%XY))
+#' @param n Numbers of samples (cells in scRNA-seq).
+#' @param d A vector of (m1,...,mk), mi = ncol(Zi), number of columns in Zi.
+#' @param s A vector of initial values of the variance components, (s1, ...,sk, s_(k+1)).
+#' @param max.iter The maximal number of iterations.
+#' @param epsilon Positive convergence tolerance.
+#'
+#' @return A list consisting of
+#' estimates of variance components (s),
+#' first partial derivatives of log-likelihood (dl),
+#' number of iterations (iter), and
+#' inverse of Fisher information matrix (Minv).
+#' @keywords internal
+varest <- function(xxz, zrz, zryj, yryj, n, d, s, max.iter = 50, epsilon = 1e-5)
+{
+  p <- nrow(xxz)
+  k <- length(d)
+
+  # Without at least one scoring step neither dl nor Minv would exist.
+  if (max.iter < 1) {
+    stop("max.iter must be at least 1.")
+  }
+  # One value per variance component: k random factors + residual.
+  stopifnot(length(s) == k + 1)
+
+  Id <- diag(sum(d))
+
+  # dl starts at a sentinel so that the first iteration always runs.
+  dl <- 100
+  iter <- 0
+  while ((max(abs(dl)) > epsilon) && (iter < max.iter)) {
+    iter <- iter + 1
+
+    # Fisher information (fs) and score vector (dl) for the current s.
+    fs <- matrix(NA, k+1, k+1)
+    dl <- rep(NA, k+1)
+
+    s0 <- s[k+1]    # residual variance component
+    sr <- s[1:k]/s0
+    srd <- rep(sr, times = d)
+    M <- solve(sweep(zrz, 1, STATS = srd, FUN = "*") + Id)
+    ZRZ <- zrz%*%M
+    ZR2Z <- ZRZ%*%M
+    yRZ <- t(zryj)%*%M
+
+    # mi and mj are the column offsets of the i-th / j-th random factor in Z.
+    mi <- 0
+    for (i in seq_len(k)) {
+      ik <- (mi+1):(mi+d[i])
+      dl[i] <- (sum((yRZ[ik])^2)/s0^2 - sum(diag(ZRZ[ik, ik, drop = FALSE]))/s0)/2
+
+      # Fill the lower triangle and mirror it.
+      mj <- 0
+      for (j in seq_len(i)) {
+        ji <- (mj+1):(mj+d[j])
+        fs[i, j] <- sum((ZRZ[ji, ik])^2)/s0^2/2
+        fs[j, i] <- fs[i, j]
+        mj <- mj + d[j]
+      }
+
+      # Cross term between the i-th random factor and the residual variance.
+      j <- k+1
+      fs[i, j] <- sum(diag(ZR2Z[ik, ik, drop = FALSE]))/s0^2/2
+      fs[j, i] <- fs[i, j]
+      mi <- mi + d[i]
+    }
+
+    # Residual-variance entries.
+    i <- k+1
+    fs[i, i] <- (n - p - sum(d) + sum(t(M)*M))/s0^2/2
+
+    yR2y <- yryj - sum(((t(M) + Id)%*%zryj)*(M%*%(srd*zryj)))
+    dl[i] <-  (yR2y/s0^2 - (n-p-sum(d)+sum(diag(M)))/s0)/2
+
+    # Scoring update; the generalised inverse tolerates a singular fs.
+    Minv <- ginv(fs)
+    s <- s + Minv%*%dl
+  }
+
+  list(s = c(s), dl = dl, iter = iter, Minv = Minv)
+}
@@ -0,0 +1,44 @@
+#' Fitting Linear Mixed-effects Models
+#'
+#' @description lmmfit, a wrapper function of lmm, fits linear mixed-effects models (LMM) by sample-level data. The LMM parameters are estimated by restricted maximum likelihood (REML) with the Fisher scoring (FS) iterative algorithm.
+#'
+#' @param X A design matrix for fixed effects, with rows corresponding to the columns of Y.
+#' @param Y A features-by-samples matrix of responses (genes-by-cells matrix of gene expressions for scRNA-seq).
+#' @param Z A design matrix for random effects, with rows corresponding to the columns of Y. Z = [Z1, ..., Zk], and Zi, i=1,...,k, is the design matrix for the i-th type random factor.
+#' @param d A vector of (m1,...,mk), mi = ncol(Zi), number of columns in Zi. m1 + ... + mk = ncol(Z), number of columns in Z.
+#' @param theta0 A vector of initial values of the variance components, (s1, ...,sk, s_(k+1)), si = sigma_i^2, the variance component of the i-th type random effects. s_(k+1) = sigma^2, the variance component of model residual error.
+#' @param method The REML with Fisher scoring (FS) iterative algorithm, REML-FS.
+#' @param max.iter The maximal number of iterations for the iterative algorithm.
+#' @param epsilon Positive convergence tolerance. If the absolute value of the first partial derivative of log likelihood is less than epsilon, the iterations converge.
+#' @param output.cov If TRUE, output the covariance matrices for the estimated coefficients, which are needed for testing contrasts.
+#' @param output.RE If TRUE, output the best linear unbiased prediction (BLUP) of the random effects.
+#'
+#' @return A list containing the following components:
+#' @return dlogL First partial derivatives of log-likelihoods for each feature (gene).
+#' @return niter Numbers of iterations for each feature (gene).
+#' @return coef A matrix of estimated coefficients (fixed effects), each column corresponds to a feature (gene) and each row one covariate.
+#' @return se A matrix of the standard errors of the estimated coefficients.
+#' @return t A matrix of t-values for the fixed effects, equal to coef/se.
+#' @return df Degrees of freedom.
+#' @return p A matrix of two-sided p-values for the fixed effects.
+#' @return cov An array of covariance matrices of the estimated coefficients (fixed effects).
+#' @return theta A matrix of the estimated variance components, each column corresponds to a feature (gene) and each row one variance component. The last row is the variance component of the residual error.
+#' @return se.theta Standard errors of the estimated theta.
+#' @return RE A matrix of the best linear unbiased prediction (BLUP) of random effects.
+#'
+#' @export
+lmmfit <- function(Y, X, Z, d, theta0 = NULL, method = "REML-FS", max.iter = 50, epsilon = 1e-5, output.cov = TRUE, output.RE = FALSE)
+{
+  # Reject missing values and mismatched dimensions before any algebra.
+  stopifnot(
+    !any(is.na(Y)), !any(is.na(X)), !any(is.na(Z)),
+    ncol(Y) == nrow(X), ncol(Y) == nrow(Z)
+  )
+
+  # Reduce the sample-level data to the summary statistics consumed by lmm.
+  Xt <- t(X)
+  Zt <- t(Z)
+  stats <- list(
+    XX = Xt%*%X,
+    XY = t(Y%*%X),
+    ZX = Zt%*%X,
+    ZY = t(Y%*%Z),
+    ZZ = Zt%*%Z,
+    Ynorm = rowSums(Y*Y),
+    n = nrow(X)
+  )
+
+  lmm(
+    summary.stats = stats, d = d, theta0 = theta0, method = method,
+    max.iter = max.iter, epsilon = epsilon,
+    output.cov = output.cov, output.RE = output.RE
+  )
+}
@@ -0,0 +1,41 @@
+#' Testing Fixed Effects and Contrasts of the Fixed Effects
+#'
+#' @description lmmtest is used to test fixed effects or contrasts of fixed effects.
+#'
+#' @param fit A result of lmmfit/lmm, which contains
+#' coef (estimates of fixed effects), a matrix with rows representing the fixed effects and columns the different response variables in the model,
+#' cov (covariance matrix of the fixed effects), an array of three dimensions for different response variables in the model,
+#' df (residual degree of freedom in the linear model).
+#' @param index A vector of integers or characters indicating which fixed effects are to be tested. By default index consists of all of the fixed effects. Ignored if contrast is not NULL.
+#' @param contrast A matrix with columns corresponding to contrasts of the fixed effects to be tested.
+#' @param alternative A character string specifying the alternative hypothesis, one of "two.sided", "greater" or "less".
+#'
+#' @return A matrix of t-values and p-values, in which the rows correspond to the features (genes) and the columns the fixed effects (covariates).
+#'
+#' @export
+lmmtest <- function(fit, index, contrast = NULL, alternative = c("two.sided", "less", "greater")){
+  alternative <- match.arg(alternative)
+
+  # Exact-name extraction avoids `$` partial matching on the fit list.
+  coefs <- fit[["coef"]]
+  covs <- fit[["cov"]]
+  if (is.null(covs)) {
+    stop("fit$cov is NULL; refit the model with output.cov = TRUE.")
+  }
+  ncoef <- nrow(coefs)
+
+  if (is.null(contrast)) {
+    # Default: test the selected fixed effects one by one.
+    if (missing(index)) index <- seq_len(ncoef)
+    contrast <- diag(ncoef)
+    colnames(contrast) <- rownames(coefs)
+    # Unnamed coefficients get positional labels so the output columns can
+    # still be told apart.
+    if (is.null(colnames(contrast))) {
+      colnames(contrast) <- paste0("coef", seq_len(ncoef))
+    }
+    contrast <- contrast[, index, drop = FALSE]
+  } else {
+    # A single contrast given as a plain vector becomes a one-column matrix.
+    contrast <- as.matrix(contrast)
+    stopifnot(nrow(contrast) == ncoef)
+    # Unnamed contrasts previously failed when the "_t" suffix was attached
+    # to missing row names; give them default labels instead.
+    if (is.null(colnames(contrast))) {
+      colnames(contrast) <- paste0("contrast", seq_len(ncol(contrast)))
+    }
+  }
+
+  # Contrast estimates, then divided feature by feature by their standard
+  # errors sqrt(diag(t(C) %*% cov_j %*% C)).
+  tval <- t(contrast)%*%coefs
+  for (j in seq_len(ncol(coefs))) {
+    # Rebuild a square matrix: with one coefficient the slice drops to a scalar.
+    covj <- matrix(covs[,,j], ncoef, ncoef)
+    tval[, j] <- tval[, j]/sqrt(diag(t(contrast)%*%covj%*%contrast))
+  }
+
+  df <- fit[["df"]]
+  if (alternative == "less") {
+    pval <- pt(tval, df)
+  } else if (alternative == "greater") {
+    pval <- pt(tval, df, lower.tail = FALSE)
+  } else {
+    pval <- 2 * pt(-abs(tval), df)
+  }
+
+  rownames(pval) <- paste0(rownames(tval), "_pvalue")
+  rownames(tval) <- paste0(rownames(tval), "_t")
+
+  # Features in rows; t-value columns first, then p-value columns.
+  cbind(t(tval), t(pval))
+}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+exportPattern("^[[:alpha:]]+")`
	`2`	`+import("stats")`
	`3`	`+importFrom("MASS","ginv")`