CSB5
diff --git a/‎README.md‎
Lines changed: 23 additions & 11 deletions b/‎README.md‎
Lines changed: 23 additions & 11 deletions
diff --git a/‎emFunctions.r‎
Lines changed: 51 additions & 13 deletions b/‎emFunctions.r‎
Lines changed: 51 additions & 13 deletions
@@ -1,7 +1,10 @@
 # BEEM
-BEEM is an approach to infer models for microbial community dynamics based on metagenomic sequencing data (16S or shotgun-metagenomics). It is based on the commonly used [generalized Lotka-Volterra modelling](https://en.wikipedia.org/wiki/Generalized_Lotka–Volterra_equation) (gLVM) framework. BEEM uses an iterative EM-like algorithm to simultaneously infer scaling factors (microbial biomass) and model parameters (microbial growth rate and interaction terms) and can thus work directly with the relative abundance values that are obtained with metagenomic sequencing. A preprint describing this work will be posted on bioRxiv soon.
 
-Note: BEEM stands for **B**iomass **E**stimation and model inference with an **E**xpectation **M**aximization-like algorithm. 
+<img src="logo.png" height="200" align="right" />
+
+BEEM is an approach to infer models for microbial community dynamics based on metagenomic sequencing data (16S or shotgun-metagenomics). It is based on the commonly used [generalized Lotka-Volterra modelling](https://en.wikipedia.org/wiki/Generalized_Lotka–Volterra_equation) (gLVM) framework. BEEM uses an iterative EM-like algorithm to simultaneously infer scaling factors (microbial biomass) and model parameters (microbial growth rate and interaction terms) from **longitudinal** data and can thus work directly with the relative abundance values that are obtained with metagenomic sequencing.
+
+Note: BEEM stands for **B**iomass **E**stimation and model inference with an **E**xpectation **M**aximization-like algorithm. We have now extended the BEEM framework to be able to work with cross-sectional data (BEEM-static, check out our R package [here](https://github.com/CSB5/BEEM-static)).
 
 ## Dependencies
 
@@ -12,10 +15,19 @@ BEEM was written in R (>3.3.1) and requires the following packages:
  - pspline
  - monomvn
 
-BEEM scripts can be loaded with the following command in R:
+The BEEM functions can be loaded in R directly with the following commands:
+
+```r
+## Load the BEEM functions straight from GitHub. The downloaded code is
+## executed, so it is fetched over HTTPS with certificate verification left
+## on (do not disable `ssl.verifypeer` for code that will be run).
+source("https://raw.githubusercontent.com/CSB5/BEEM/master/emFunctions.r")
+```
+
+Alternatively, the repository (together with the example data) can be cloned or downloaded. The functions are then loaded with the following command in R:
+
 ```r
-source('path/to/this/repo/emFunctions.r')
+source('local/path/to/beem/emFunctions.r')
 ```
+
 ## Input data
 
 The input files for BEEM should have the same format as described in the manual for [MDSINE](https://bitbucket.org/MDSINE/mdsine/). The following two files are required by BEEM:
@@ -41,21 +53,21 @@ We have provided several sample input files that were also analyzed in our manus
 
 #### Data from [Props et al. (2016)](https://www.nature.com/articles/ismej2016117)
 
- - OTU count `table: isme_analysis/counts.sel.txt`
- - Metadata: `isme_analysis/metadata.sel.txt`
+ - OTU count table: `props_et_al_analysis/counts.sel.txt`
+ - Metadata: `props_et_al_analysis/metadata.sel.txt`
 
 #### Data from [Gibbons et al. (2017)](http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005364)
 
- - OTU count table: `time_series_analysis/{DA,DB,M3,F4}.counts.txt`
- - Metadata: `time_series_analysis/{DA,DB,M3,F4}.metadata.txt`
+ - OTU count table: `gibbons_et_al_analysis/{DA,DB,M3,F4}.counts.txt`
+ - Metadata: `gibbons_et_al_analysis/{DA,DB,M3,F4}.metadata.txt`
 
 ## Usage
 
 ### Basic Usage (R commands)
 
 ```r
 ## Load functions
-source('path/to/this/repo/emFunctions.r')
+source('emFunctions.r')
 ## Read inputs
 counts <- read.table('counts.txt', head=F, row.names=1)
 metadata <- read.table('metadata.txt', head=T)
@@ -79,10 +91,10 @@ BEEM estimated parameters is an R `data.frame` (a table) with the following colu
 
 ### Analyses in the manuscript
 
-The commands for reproducing the analysis reportd in the manuscript are presented as two jupyter notebooks: (1) [notebook for Props et. al.](https://github.com/CSB5/BEEM/blob/master/isme.ipynb) and (2) [notebook for Gibbons et. al.](https://github.com/CSB5/BEEM/blob/master/time_series_meta.ipynb).
+The commands for reproducing the analysis reported in the manuscript are presented as two Jupyter notebooks: (1) [notebook for Props et al.](https://github.com/CSB5/BEEM/blob/master/props_et_al.ipynb) and (2) [notebook for Gibbons et al.](https://github.com/CSB5/BEEM/blob/master/gibbons_et_al.ipynb).
 
 ## Citation
-C Li, L Tucker-Kellogg & N Nagarajan. (2018). System	Biology	Modeling	with Compositional Microbiome	Data	Reveals Personalized	Gut	Microbial	Dynamics	and	Keystone	Species. [*BioRxiv*](https://www.biorxiv.org/content/early/2018/03/27/288803).
+C Li, L Tucker-Kellogg & N Nagarajan. (2018). An expectation-maximization-like algorithm enables accurate ecological modeling using longitudinal metagenome sequencing data. [*BioRxiv*](https://www.biorxiv.org/content/early/2018/07/17/288803).
 
 ## Contact
 Please direct any questions or feedback to Chenhao Li (lich@gis.a-star.edu.sg) and Niranjan Nagarajan (nagarajann@gis.a-star.edu.sg).
@@ -536,7 +536,7 @@ NORM <- function(tss, gradients, perturbInd, metadata, rmSp, params, ncpu=10, sc
         list(w.tmp, mse)
     }
 
-    mse.mat <- (matrix(unlist((w.list[[2]])), p-1)) ##* apply(abs(Xs),2,function(x)x/sum(x))
+    mse.mat <- (matrix(unlist((w.list[[2]])), nrow(tss)-1)) ##* apply(abs(Xs),2,function(x)x/sum(x))
     w <- unlist(w.list[[1]])
 
     mse <- mean(c(mse.mat), na.rm = TRUE)
@@ -594,8 +594,10 @@ suggestRefs <- function(dat, meta, scaling=1){
         cv <- apply(cv,2,mean)
     }
     message("The following species is recommended as the reference:")
-    message(names(sort(cv[!(fil1 | fil2 | fil3)])[1]))
-    data.frame(cv=cv, index=1:length(cv), hasZero=fil1, isTooHigh=fil3, isTooLow=fil2)[order(cv),]
+    sel <- names(sort(cv[!(fil1 | fil2 | fil3)])[1])
+    message(sel)
+    list(table=data.frame(cv=cv, index=1:length(cv), hasZero=fil1, isTooHigh=fil3, isTooLow=fil2)[order(cv),],
+         selected=which(sps==sel))
 }
 
 #' Function to estimate biomass and parameters simultaneously
@@ -623,13 +625,13 @@ EM <- function(dat, meta, forceBreak=NULL, useSpline=TRUE,
     }    
     refRank <- suggestRefs(dat, meta)
     if(is.null(refSp)){
-        message("No input for reference species, selecting one with the lowest coefficient of variation...")
-        refSp <- refRank$index[1]
+        message("BEEM selecting reference species as default...")
+        refSp <- refRank$selected
     }
     if(sum(dat[refSp,]==0)>0){
         message("[!]: The reference species has zero abundance in some samples. This will treated as non-zeros by adding a pseudo count.")
     }
-    if(refRank[refSp, 1]>0.9) {
+    if(refRank$table[rownames(dat)[refSp], 1]>0.9) {
         message("[!]: The reference species has high CV (>90%). Parameter estiamtes might be inaccurate (check the trace of weighted mse for convergence).")
     }
     message(paste0("Reference species: ",  rownames(dat)[refSp]))
@@ -714,10 +716,10 @@ EM <- function(dat, meta, forceBreak=NULL, useSpline=TRUE,
 #' @param biomass biomass data following MDSINE's biomass data format
 #' @param forceBreak force to break the trajectory to handle pulsed perturbation (or species invasion) (default: NULL)
 #' @param dev deviation (dev * mad) from the median to be considered as outliers (default:Inf, no filtering)
-#' @param ncpu maximal number of CPUs used (default:10)
+#' @param ncpu maximal number of CPUs used (default:4)
 #' @param infer_flag run inference (default:TRUE)
 param.infer <- function(dat, metadata, biomass,
-                        forceBreak=NULL, dev=Inf, ncpu=10, infer_flag=TRUE){
+                        forceBreak=NULL, dev=Inf, ncpu=4, infer_flag=TRUE){
     registerDoMC(ncpu)
     log.transform <- function(x){
         tmp <- log(x)
@@ -756,6 +758,36 @@ param.infer <- function(dat, metadata, biomass,
     BLASSO(X=Xs, P=NULL, Ys=Ys, Fs=isOutlier, ncpu=ncpu, rmSp=0, vnames=rownames(dat))
 }
 
+#' Diagnose the EM process
+#'
+#' Inspects the trace of the weighted MSE stored in the BEEM output and warns
+#' when the run looks unreliable. Only the tail of the trace (everything after
+#' roughly the first 1/25 of the iterations) is used for the fit-quality and
+#' convergence checks.
+#' @param beem.obj BEEM output list (only `trace.mse.weighted` is read here)
+#' @param counts counts data following MDSINE's OTU table
+#' @return 0 when no problem is detected; NA (after a warning) when the
+#'   optimization, the fit or the convergence looks poor
+inspectEM <- function(beem.obj, counts){
+    if(NROW(counts) <7){
+        warning('You have less than 7 species. The estimation of parameters might be inaccurate.')
+    }
+    trace.mse.weighted <- beem.obj$trace.mse.weighted
+    n.iter <- length(trace.mse.weighted)
+    ## round(n.iter/25) is 0 for traces shorter than 13 iterations. Index 0 is
+    ## silently dropped by `[`, which would leave the subset one element
+    ## shorter than idx and make cor()/lm() fail, so start at 1 at the earliest.
+    idx <- max(1, round(n.iter/25)):n.iter
+    tail.mse <- trace.mse.weighted[idx]
+    ## NOTE(review): the minimum of the whole trace can never exceed one of its
+    ## own elements, so this branch looks unreachable -- confirm the intended
+    ## comparison against the first (CSS) iteration.
+    if(min(trace.mse.weighted) > trace.mse.weighted[2]){
+        warning('Optimization failed.') ## worse fit than CSS (first iteration)
+        return(NA)
+    }
+    if( sum(tail.mse > 1e-5)/length(idx) > 0.3 ){
+        warning('Poor fitting detected.') ## too many iterations with large MSE
+        return(NA)
+    }
+    ## cor() returns NA (with a warning) when the tail is constant; treat that
+    ## as "not decreasing" instead of letting if() fail on a missing value.
+    rho <- suppressWarnings(cor(tail.mse, seq_along(idx), method='spearman'))
+    if( !is.na(rho) && rho < -0.5){
+        ## decreasing error
+        return(0)
+    }
+    tail.mad <- mad(tail.mse)
+    if(isTRUE(tail.mad == 0)){
+        ## at least half of the tail equals its median: the trace is flat and
+        ## the MAD-scaled residuals below would be undefined (division by zero)
+        return(0)
+    }
+    fit <- lm(tail.mse/tail.mad ~ idx)
+    if(sqrt(median(resid(fit)^2)) > 0.5 && coef(fit)[2]>0 ) {
+        warning('Poor convergence detected.') ## not smooth enough
+        return(NA)
+    }
+    return(0)
+}
+
+
 #' Inferring biomass from BEEM results
 #' @param beem.obj BEEM output list
 biomassFromEM <- function(beem.obj){
@@ -770,13 +802,12 @@ biomassFromEM <- function(beem.obj){
 #' @param beem.obj BEEM output list
 #' @param counts counts data following MDSINE's OTU table
 #' @param metadata metadata following MDSINE's metadata format
+#' @param sparse use the sparse mode to estimate the parameters (default: TRUE)
 #' @param forceBreak force to break the trajectory to handle pulsed perturbation (or species invasion) (default: NULL)
-#' @param ncpu maximal number of CPUs used (default:10)
+#' @param ncpu maximal number of CPUs used (default:4)
 #' @param enforceLogistic re-estimate the self-interaction parameters (enforce to negative values)
-paramFromEM <- function(beem.obj, counts, metadata, forceBreak=NULL, ncpu=10, enforceLogistic=FALSE){
-    if(NROW(counts) <7){
-        warning('You have less than 7 species. The estimation of parameters might be inaccurate.')
-    }
+paramFromEM <- function(beem.obj, counts, metadata, sparse=TRUE, forceBreak=NULL, ncpu=4, enforceLogistic=FALSE){
+    inspectEM(beem.obj, counts)
     registerDoMC(ncpu)
     trace.mse <- beem.obj$trace.mse
     min.mse <- min(trace.mse)
@@ -791,6 +822,13 @@ paramFromEM <- function(beem.obj, counts, metadata, forceBreak=NULL, ncpu=10, en
         beem.biomass <- apply(beem.obj$trace.biomass[,em.idx],1,median)
         beem.param <- apply(beem.obj$trace.params[,em.idx],1,median)
     }
+
+    if(!sparse){
+        message('Re-estimating parameters with the non-sparse mode...')
+        return(param.infer(dat=counts, metadata=metadata, biomass=beem.biomass,
+                       forceBreak=forceBreak, ncpu=ncpu)$mdsine)
+    }
+
     p <- nrow(counts)
     ## solve for interaction matrix
     beem.a <- beem.param[1:p]