smithlabcode
diff --git a/‎DESCRIPTION‎
Lines changed: 3 additions & 3 deletions b/‎DESCRIPTION‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎NAMESPACE‎
Lines changed: 1 addition & 0 deletions b/‎NAMESPACE‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎R/kmer.R‎
Lines changed: 10 additions & 10 deletions b/‎R/kmer.R‎
Lines changed: 10 additions & 10 deletions
diff --git a/‎R/sequencing.R‎
Lines changed: 4 additions & 5 deletions b/‎R/sequencing.R‎
Lines changed: 4 additions & 5 deletions
diff --git a/‎inst/CITATION‎
Lines changed: 5 additions & 5 deletions b/‎inst/CITATION‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎man/Dickens.Rd‎
Lines changed: 1 addition & 1 deletion b/‎man/Dickens.Rd‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎man/ShakespeareWordHist.Rd‎ ‎man/Shakespeare.Rd‎man/ShakespeareWordHist.Rd renamed to man/Shakespeare.Rd b/‎man/ShakespeareWordHist.Rd‎ ‎man/Shakespeare.Rd‎man/ShakespeareWordHist.Rd renamed to man/Shakespeare.Rd
diff --git a/‎man/Twitter.Rd‎
Lines changed: 1 addition & 1 deletion b/‎man/Twitter.Rd‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎man/WillButterfly.Rd‎
Lines changed: 1 addition & 1 deletion b/‎man/WillButterfly.Rd‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎man/bbc.rSAC.Rd‎
Lines changed: 5 additions & 1 deletion b/‎man/bbc.rSAC.Rd‎
Lines changed: 5 additions & 1 deletion
@@ -1,11 +1,11 @@
 Package: preseqR
 Type: Package
-Title: Predicting the Number of Species in a Random Sample
+Title: Predicting Species Accumulation Curves
 Version: 4.0.0
-Date: 2017-12-26
+Date: 2018-06-27
 Author: Chao Deng, Timothy Daley and Andrew D. Smith
 Maintainer: Chao Deng <[email protected]>
-Description: The relation between the number of species and the number of individuals in a random sample is a classic problem back to Fisher (1943) <doi:10.2307/1411>. We generalize this problem to predict the number of species represented at least r times in a random sample. In particular when r=1, it becomes the classic problem. We use a mixture of Poisson processes to model sampling procedures and apply an empirical Bayes approach to obtain a rational function estimator. The approach can be applied to assess the quality of DNA sequencing libraries and optimize depths of sequencing experiments. For more information on 'preseqR', see Deng C, Daley T and Smith AD (2015) <doi:10.1007/s40484-015-0049-7> and Deng C and Smith AD (2016) <arXiv:1607.02804v2>.
+Description: Originally developed as an R version of Preseq <doi:10.1038/nmeth.2375>, the package has extended its functionality to predict the r-species accumulation curve (r-SAC), which is the number of species represented at least r times as a function of the sampling effort. When r = 1, the curve is known as the species accumulation curve, or the library complexity curve in high-throughput genomic sequencing. The package includes both parametric and nonparametric methods, as described by Deng C, et al. (2018) <arXiv:1607.02804v3>. 
 License: GPL-3
 Imports:
   polynom, graphics, stats
@@ -9,6 +9,7 @@ export(preseqR.interpolate.rSAC)
 export(preseqR.rSAC)
 export(preseqR.rSAC.bootstrap)
 export(ds.rSAC)
+export(ds.rSAC.bootstrap)
 export(ztnb.rSAC)
 export(ztp.rSAC)
 export(bbc.rSAC)
 
@@ -26,18 +26,18 @@ kmer.frac <- function(n, r=2, mt=20) {
 
 ## the fraction of k-mers represented at least r times as a function of 
 ## sample sizes
-kmer.frac.curve <- function(n, k, read.len, seq.gb, r=2, mt=20) {
+kmer.frac.curve <- function(n, k, read.len, seq, r=2, mt=20) {
   f <- kmer.frac(n, r=r, mt=mt)
   if (is.null(f))
     return(NULL)
   n[, 2] <- as.numeric(n[, 2])
   N <- n[, 1] %*% n[, 2]
   ## average number of k-mers per read
   m <- read.len - k + 1
-  unit.gb <- N / m * read.len / 1e9
-  seq.effort <- seq.gb / unit.gb
-  result <- matrix(c(seq.gb, f(seq.effort)), ncol=2, byrow=FALSE)
-  colnames(result) <- c("bases(GB)", paste("frac(X>=", r, ")", sep=""))
+  unit <- N / m * read.len
+  seq.effort <- seq / unit
+  result <- matrix(c(seq, f(seq.effort)), ncol=2, byrow=FALSE)
+  colnames(result) <- c("bases", paste("frac(X>=", r, ")", sep=""))
   return(result)
 }
 
@@ -50,7 +50,7 @@ kmer.frac.bootstrap <- function(n, r=2, mt=20, times=30, conf=0.95) {
 
 ## the fraction of k-mers represented at least r times as a function of 
 ## sample sizes
-kmer.frac.curve.bootstrap <- function(n, k, read.len, seq.gb, r=2, mt=20,
+kmer.frac.curve.bootstrap <- function(n, k, read.len, seq, r=2, mt=20,
                                       times=30, conf=0.95)
 {
   f <- kmer.frac.bootstrap(n, r=r, mt=mt, times=times, conf=conf)
@@ -60,11 +60,11 @@ kmer.frac.curve.bootstrap <- function(n, k, read.len, seq.gb, r=2, mt=20,
   N <- n[, 1] %*% n[, 2]
   ## average number of k-mers per read
   m <- read.len - k + 1
-  unit.gb <- N / m * read.len / 1e9
-  seq.effort <- seq.gb / unit.gb
-  result <- matrix(c(seq.gb, f$f(seq.effort), f$lb(seq.effort), 
+  unit <- N / m * read.len
+  seq.effort <- seq / unit
+  result <- matrix(c(seq, f$f(seq.effort), f$lb(seq.effort), 
                      f$ub(seq.effort)), ncol=4, byrow=FALSE)
-  colnames(result) <- c("bases(GB)", paste("frac(X>=", r, ")", sep=""), 
+  colnames(result) <- c("bases", paste("frac(X>=", r, ")", sep=""), 
                         "lb", "ub")
   return(result)
 }
@@ -20,8 +20,7 @@
 
 ## predict the optimal number of sequenced bases using cost-benefit ratio
 preseqR.optimal.sequencing <- function(
-  n, efficiency=0.05, bin=1e8, r=1, mt=20, size=SIZE.INIT,
-  mu=MU.INIT, times=30, conf=0.95)
+  n, efficiency=0.05, bin=1e8, r=1, mt=20, times=30, conf=0.95)
 {
   find.start <- function(f, N, bin, efficiency) {
     y = sapply(1:100, function(x) (f(x + bin / N) - f(x)) / bin - efficiency)
@@ -36,8 +35,8 @@ preseqR.optimal.sequencing <- function(
   N <- n[, 1] %*% n[, 2]
 
   ## r-species accumulation curve as a function of relative sample size
-  f.rSAC <- preseqR.rSAC.bootstrap(
-    n=n, r=r, mt=mt, size=size, mu=mu,times=times, conf=conf)
+  f.rSAC <- ds.rSAC.bootstrap(
+    n=n, r=r, mt=mt, times=times, conf=conf)
 
   ## hint: using r-SAC as a function of the number of sequenced bases
   f <- f.rSAC$f
@@ -73,7 +72,7 @@ preseqR.optimal.sequencing <- function(
 ## the function is designed for EXOME sequencing, where aligned reads that
 ## map to the same location are removed to avoid potential duplicates
 preseqR.rSAC.sequencing.rmdup <- function(
-  n_base, n_read, r=1, mt=20, times=100, conf=0.95)
+  n_base, n_read, r=1, mt=20, times=30, conf=0.95)
 {
   checking.hist(n_read)
   checking.hist(n_base)
 
@@ -19,14 +19,14 @@ citEntry(entry = "article",
 
 citEntry(entry = "article",
   title        = "Estimating the number of species to attain sufficient representation in a random sample",
-  author       = personList(as.person("Chao Deng"), as.person("Andrew D. Smith")),
+  author       = personList(as.person("Chao Deng"), as.person("Timothy Daley"), as.person("Peter Calabrese"), as.person("Jie Ren"), as.person("Andrew D. Smith")),
   journal      = "arXiv",
-  year         = "2016",
-  url          = "https://arxiv.org/abs/1607.02804v2",
+  year         = "2018",
+  url          = "https://arxiv.org/abs/1607.02804v3",
 
   textVersion  =
-  paste("Deng C and Smith AD (2016).",
+  paste("Deng C, Daley T, Calabrese P, Ren J and Smith AD (2018).",
         "Estimating the number of species to attain sufficient representation in a random sample.",
         "arXiv preprint.",
-        "URL https://arxiv.org/abs/1607.02804v2.")
+        "URL https://arxiv.org/abs/1607.02804v3.")
 )
@@ -8,7 +8,7 @@
 \details{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{N_j}, the number of unique words appeared \eqn{j}
+    is \eqn{N_j}, the number of unique words that appeared exactly \eqn{j}
     times in a collection of Charles Dickens.
 }
 
 
@@ -7,7 +7,7 @@
 \details{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{n_j}, the number of users with \eqn{j} followers.
+    is \eqn{n_j}, the number of users with exactly \eqn{j} followers.
 }
 
 \references{
 
@@ -13,7 +13,7 @@ Animal Population, Journal of Animal Ecology, 12, 42-58, Table 3.
 \details{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{n_j}, the number of butterflies captured \eqn{j}
+    is \eqn{n_j}, the number of butterflies captured exactly \eqn{j}
     times in the sample.
 }  
 
 
@@ -18,7 +18,7 @@ bbc.rSAC(n, r=1)
   \item{n}{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{N_j}, the number of species with each species represented \eqn{j}
+    is \eqn{N_j}, the number of species with each species represented exactly \eqn{j}
     times in the initial sample. The first column must be sorted in an
     ascending order.
   }
@@ -41,6 +41,10 @@ bbc.rSAC(n, r=1)
 Boneh, S., Boneh, A., & Caron, R. J. (1998). Estimating the prediction function
 and the number of unseen species in sampling with replacement.
 Journal of the American Statistical Association, 93(441), 372-379.
+
+Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2018). Estimating
+the number of species to attain sufficient representation in a random sample.
+arXiv preprint arXiv:1607.02804v3.
 }
 
 \examples{
Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@`
`8`	`8`	`\details{`
`9`	`9`	`A two-column matrix.`
`10`	`10`	`The first column is the frequency \eqn{j = 1,2,\dots}; and the second column`
`11`		`- is \eqn{N_j}, the number of unique words appeared \eqn{j}`
	`11`	`+ is \eqn{N_j}, the number of unique words that appeared exactly \eqn{j}`
`12`	`12`	`times in a collection of Charles Dickens.`
`13`	`13`	`}`
`14`	`14`
Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@`
`7`	`7`	`\details{`
`8`	`8`	`A two-column matrix.`
`9`	`9`	`The first column is the frequency \eqn{j = 1,2,\dots}; and the second column`
`10`		`- is \eqn{n_j}, the number of users with \eqn{j} followers.`
	`10`	`+ is \eqn{n_j}, the number of users with exactly \eqn{j} followers.`
`11`	`11`	`}`
`12`	`12`
`13`	`13`	`\references{`
Original file line number	Diff line number	Diff line change
`@@ -13,7 +13,7 @@ Animal Population, Journal of Animal Ecology, 12, 42-58, Table 3.`
`13`	`13`	`\details{`
`14`	`14`	`A two-column matrix.`
`15`	`15`	`The first column is the frequency \eqn{j = 1,2,\dots}; and the second column`
`16`		`- is \eqn{n_j}, the number of butterflies captured \eqn{j}`
	`16`	`+ is \eqn{n_j}, the number of butterflies captured exactly \eqn{j}`
`17`	`17`	`times in the sample.`
`18`	`18`	`}`
`19`	`19`