DavisLaboratory
diff --git a/‎DESCRIPTION‎
Lines changed: 1 addition & 1 deletion b/‎DESCRIPTION‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎NAMESPACE‎
Lines changed: 3 additions & 0 deletions b/‎NAMESPACE‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎R/CPD.R‎
Lines changed: 2 additions & 2 deletions b/‎R/CPD.R‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎R/KNC.R‎
Lines changed: 2 additions & 2 deletions b/‎R/KNC.R‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎R/KNN.R‎
Lines changed: 2 additions & 2 deletions b/‎R/KNN.R‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎R/computeStructuralMetrics.R‎
Lines changed: 61 additions & 22 deletions b/‎R/computeStructuralMetrics.R‎
Lines changed: 61 additions & 22 deletions
diff --git a/‎R/findVariableFeatures.R‎
Lines changed: 25 additions & 0 deletions b/‎R/findVariableFeatures.R‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎R/msImpute.R‎
Lines changed: 11 additions & 7 deletions b/‎R/msImpute.R‎
Lines changed: 11 additions & 7 deletions
diff --git a/‎R/scaleData.R‎
Lines changed: 10 additions & 6 deletions b/‎R/scaleData.R‎
Lines changed: 10 additions & 6 deletions
diff --git a/‎R/selectFeatures.R‎
Lines changed: 10 additions & 5 deletions b/‎R/selectFeatures.R‎
Lines changed: 10 additions & 5 deletions
@@ -1,7 +1,7 @@
 Package: msImpute
 Type: Package
 Title: Peptide imputation in label-free proteomics
-Version: 1.2.0
+Version: 1.3.0
 Authors@R: 
     person(given = "Soroor",
            family = "Hediyeh-zadeh",
 
@@ -5,8 +5,11 @@ export(KNC)
 export(KNN)
 export(betweenness)
 export(computeStructuralMetrics)
+export(findVariableFeatures)
 export(gromov_wasserstein)
 export(msImpute)
 export(scaleData)
 export(selectFeatures)
 export(withinness)
+importFrom(scran,decomposeVar)
+importFrom(scran,trendVar)
@@ -4,8 +4,8 @@
 #' CPD quantifies preservation of the global structure after imputation.
 #' Requires complete datasets - for developers/use in benchmark studies only.
 #'
-#' @param xorigin numeric matrix. The original data. Can not contain missing values.
-#' @param ximputed numeric matrix. The imputed data. Can not contain missing values.
+#' @param xorigin numeric matrix. The original log-intensity data. Can not contain missing values.
+#' @param ximputed numeric matrix. The imputed log-intensity data. Can not contain missing values.
 #'
 #' @return numeric
 #'
 
@@ -4,8 +4,8 @@
 #' quantifies preservation of the mesoscopic structure after imputation.
 #' Requires complete datasets - for developers/use in benchmark studies only.
 #'
-#' @param xorigin numeric matrix. The original data. Can contain missing values.
-#' @param ximputed numeric matrix. The imputed data.
+#' @param xorigin numeric matrix. The original log-intensity data. Can contain missing values.
+#' @param ximputed numeric matrix. The imputed log-intensity data.
 #' @param class factor. A vector of length number of columns (samples) in the data specifying the class/label (i.e. experimental group) of each sample.
 #' @param k  number of nearest class means. default to k=3.
 #'
 
@@ -4,8 +4,8 @@
 #' KNN quantifies preservation of the local, or microscopic structure.
 #' Requires complete datasets - for developers/use in benchmark studies only.
 #'
-#' @param xorigin numeric matrix. The original data. Can not contain missing values.
-#' @param ximputed numeric matrix. The imputed data. Can not contain missing values.
+#' @param xorigin numeric matrix. The original log-intensity data. Can not contain missing values.
+#' @param ximputed numeric matrix. The imputed log-intensity data. Can not contain missing values.
 #' @param k  number of nearest neighbours. default to k=3.
 #'
 #' @return numeric  The proportion of preserved k-nearest neighbours in imputed data.
 
@@ -1,12 +1,13 @@
 #' Metrics for the assessment of post-imputation structural preservation
 #'
 #' For an imputed dataset, it computes within phenotype/experimental condition similarity (i.e. preservation of local structures),
-#' between phenotype distances (preservation of global structures), and the Gromov-Wasserstein (GW) distance between original and
+#' between phenotype distances (preservation of global structures), and the Gromov-Wasserstein (GW) distance between original (source) and
 #' imputed data.
 #'
-#' @param x numeric matrix. An imputed data matrix.
+#' @param x numeric matrix. An imputed data matrix of log-intensity.
 #' @param group factor. A vector of biological groups, experimental conditions or phenotypes (e.g. control, treatment).
-#' @param xna numeric matrix. Data matrix with missing values (i.e. the original intensity matrix with NAs)
+#' @param y numeric matrix. The source data (i.e. the original log-intensity matrix), preferably subset to highly variable peptides (see \code{findVariableFeatures}).
+#' @param k numeric. Number of Principal Components used to compute the GW distance. Defaults to 2.
 #'
 #' @details For each group of experimental conditions (e.g. treatment and control), the group centroid is calculated as the average
 #' of observed peptide intensities. Withinness for each group is computed as sum of the squared distances between samples in that group and
@@ -16,15 +17,21 @@
 #' The GW metric considers preservation of both local and global structures simultaneously. A small GW distance suggests that
 #' imputation has introduced small distortions to global and local structures overall, whereas a large distance implies significant
 #' distortions. When comparing two or more imputation methods, the optimal method is the method with smallest GW distance.
-#' To compute the GW distance, the missing values in each column of \code{xna} are replaced by mean of observed values in that column.
-#' This is equivalent to imputation by KNN, where k is set to the total number of identified peptides (i.e. number of rows in the input matrix).
-#' GW distance estimation requires \code{python}. See example.
-#' All metrics are on log scale.
+#' The GW distance is computed on Principal Components (PCs) of the source and imputed data, instead of peptides. Principal components capture the
+#' geometry of the data, hence GW computed on PCs is a better measure of preservation of local and global structures. The PCs in the source data are
+#' recommended to be computed on peptides with high biological variance. Users are therefore advised to subset the source data to highly variable peptides (HVPs)
+#' (see \code{findVariableFeatures}). Because HVPs have high biological variance, they are likely to carry enough information to discriminate samples
+#' from different experimental groups. PCs computed on those peptides should therefore be representative of the original source data with missing values.
+#' If the samples cluster by experimental group in the first couple of PCs, then a choice of k=2 is reasonable. If the desired separation/clustering of samples
+#' occurs in later PCs (i.e. the first few PCs are dominated by batches or unwanted variability), then it is recommended to use a larger number of PCs to compute the
+#' GW metric. If you are interested in how well the imputed data represent the original data in all possible dimensions, then set k to the number of samples
+#' in the data (i.e. the number of columns in the intensity matrix).
+#' GW distance estimation requires \code{python}. See example. All metrics are on log scale.
 #'
 #'
 #' @return list of three metrics: withinness (sum of squared distances within a phenotype group),
 #' betweenness (sum of squared distances between the phenotypes), and gromov-wasserstein distance (if \code{y} is not NULL).
-#' All metrics are on log scale.
+#' If \code{group} is NULL, only the GW distance is returned. All metrics are on log scale.
 #'
 #'
 #' @examples
@@ -49,28 +56,35 @@
 #' # you can then run the computeStructuralMetrics() function.
 #' # Note that the reticulate package should be loaded before loading msImpute.
 #' set.seed(101)
-#' n=200
-#' p=100
-#' J=50
+#' n=12000
+#' p=10
+#' J=5
 #' np=n*p
 #' missfrac=0.3
-#' x=matrix(rnorm(n*J),n,J)%*%matrix(rnorm(J*p),J,p)+matrix(rnorm(np),n,p)/5
+#' x=matrix(rnorm(n*J,mean = 5,sd = 0.2),n,J)%*%matrix(rnorm(J*p, mean = 5,sd = 0.2),J,p)+
+#'   matrix(rnorm(np,mean = 5,sd = 0.2),n,p)/5
 #' ix=seq(np)
 #' imiss=sample(ix,np*missfrac,replace=FALSE)
 #' xna=x
 #' xna[imiss]=NA
+#' keep <- (rowSums(!is.na(xna)) >= 4)
+#' xna <- xna[keep,]
+#' rownames(xna) <- 1:nrow(xna)
 #' y <- xna
 #' xna <- scaleData(xna)
 #' xcomplete <- msImpute(object=xna)
-#' G <- as.factor(sample(1:5, 100, replace = TRUE))
-#' computeStructuralMetrics(xcomplete, G, y)
+#' G <- as.factor(sample(1:3, p, replace = TRUE))
+#' top.hvp <- findVariableFeatures(y)
+#' computeStructuralMetrics(xcomplete, G, y[rownames(top.hvp)[1:50],], k = 2)
 #' @export
-computeStructuralMetrics <- function(x, group, xna = NULL){
-  out <- list(withinness = log(withinness(x, group)),
-       betweenness = log(betweenness(x,group)))
+computeStructuralMetrics <- function(x, group=NULL, y = NULL, k=2){
+  # Computes log withinness/betweenness of the imputed matrix `x` when `group`
+  # is supplied, and the Gromov-Wasserstein distance between `x` and the source
+  # matrix `y` (on `k` principal components) when `y` is supplied.
+  # Returns a named list holding whichever of these metrics were requested.
+
+  # Start from an empty list: when `group` is NULL the GW branch below must
+  # still have an `out` object to append to, otherwise the sub-assignment
+  # fails with "object 'out' not found".
+  out <- list()
+  if(!is.null(group)){
+    out[['withinness']] <- log(withinness(x, group))
+    out[['betweenness']] <- log(betweenness(x, group))
+  }
 
-  if(!is.null(xna)){
-    GW <- gromov_wasserstein(xna, x)
+  if(!is.null(y)){
+    GW <- gromov_wasserstein(x, y, k=k)
     out[['gw_dist']] <- GW[[2]]$gw_dist
   }
   return(out)
@@ -101,8 +115,33 @@ betweenness <- function(x, class_label){
 
 
 #' @export
-gromov_wasserstein <- function(xna, ximputed){
+gromov_wasserstein <- function(x, y, k, min.mean = 0.1){
+  # Gromov-Wasserstein distance between imputed data `x` and source data `y`,
+  # computed on the first `k` principal components of each matrix (samples are
+  # the observations). Peptides of `x` with near-zero spread or a mean below
+  # `min.mean` are dropped before PCA. Returns whatever `gw` returns; `gw` is
+  # expected to be defined by the sourced python/gw.py.
+  if (k < 1) stop("Number of Principal Components must be at least 1.")
+  if (k > ncol(x)) stop("Number of Principal Components cannot be greater than number of columns (samples) in the data.")
+  if (any(!is.finite(x))) stop("Non-finite values (NA, Inf, NaN) encountered in imputed data")
+  if (any(!is.finite(y))) stop("Non-finite values (NA, Inf, NaN) encountered in source data")
+
+  means <- rowMeans(x)
+  sds <- matrixStats::rowSds(x)
+
+  # Filtering out zero-variance and low-abundance peptides
+  is.okay <- !is.na(sds) & sds > 1e-8 & means >= min.mean
+
+  # prcomp returns at most min(samples, peptides) components, so both inputs
+  # must be able to supply k of them before the score matrices are subset.
+  if (k > min(sum(is.okay), nrow(y), ncol(y))) {
+    stop("Number of Principal Components cannot be greater than the number of retained peptides or samples in the imputed or source data.")
+  }
+
+  xt <- t(x)
+  yt <- t(y)
+
+  # compute PCA
+  xt_pca <- prcomp(xt[, is.okay, drop = FALSE], scale. = TRUE, center = TRUE)
+  yt_pca <- prcomp(yt, scale. = TRUE, center = TRUE)
+
+  # drop = FALSE keeps a samples-by-k matrix even when k = 1
+  C1 <- yt_pca$x[, seq_len(k), drop = FALSE]
+  C2 <- xt_pca$x[, seq_len(k), drop = FALSE]
+
+
+  cat("Computing GW distance using k=", k, "Principal Components\n")
   reticulate::source_python(system.file("python", "gw.py", package = "msImpute"))
-  xna <- apply(xna, 2, FUN=function(x) {x[is.na(x)] <- mean(x, na.rm=TRUE); return(x)})
-  return(gw(t(xna), t(ximputed), ncol(xna)))
+  return(gw(C1,C2, ncol(x)))
 }
+
+
+
+
@@ -0,0 +1,25 @@
+#' Find highly variable peptides
+#'
+#' For each peptide, the total variance is decomposed into biological and technical variance using package \code{scran}.
+#'
+#' @param y numeric matrix giving log-intensity. Can contain NA values.
+#' @param suppress_plot logical. If \code{FALSE} (the default), the total variance is plotted against the mean intensity
+#' and the fitted technical trend is overlaid in red. Set to \code{TRUE} to skip plotting.
+#'
+#' @return A data frame where rows are peptides and columns contain estimates of biological and technical variances.
+#' Peptides are sorted by decreasing biological variance.
+#'
+#' @details A loess trend is fitted to total sample variances and mean intensities. For each peptide, the biological variance is then
+#' computed by subtracting the technical variance estimated by the loess fit from the total sample variance.
+#'
+#' @seealso computeStructuralMetrics
+#'
+#' @export
+#' @importFrom scran trendVar decomposeVar
+findVariableFeatures <- function(y, suppress_plot = FALSE){
+  fit <- trendVar(y)
+  results <- decomposeVar(y, fit)
+  # Plotting is optional so the function can be used non-interactively
+  if (!suppress_plot) {
+    plot(results$mean, results$total)
+    o <- order(results$mean)
+    lines(results$mean[o], results$tech[o], col="red", lwd=2)
+  }
+  results <- as.data.frame(results)
+  # Rank peptides from highest to lowest biological variance
+  top.dec <- results[order(results$bio, decreasing=TRUE), , drop = FALSE]
+  return(top.dec)
+}
@@ -8,7 +8,7 @@
 #' \code{msImpute} operates on the softImpute-ALS algorithm.
 #' For more details on the underlying algorithm, please see \code{\link[softImpute]{softImpute}} package.
 #'
-#' @param object Numeric matrix  where missing values are denoted by NA. Rows are peptides, columns are samples.
+#' @param object Numeric matrix giving log-intensity where missing values are denoted by NA. Rows are peptides, columns are samples.
 #' @param rank.max Numeric. This restricts the rank of the solution. is set to min(dim(\code{object})-1) by default.
 #' @param lambda Numeric. Nuclear-norm regularization parameter. Controls the low-rank property of the solution
 #' to the matrix completion problem. By default, it is determined at the scaling step. If set to zero
@@ -24,16 +24,19 @@
 #'
 #' @examples
 #' set.seed(101)
-#' n=200
-#' p=100
-#' J=50
+#' n=12000
+#' p=10
+#' J=5
 #' np=n*p
 #' missfrac=0.3
-#' x=matrix(rnorm(n*J),n,J)%*%matrix(rnorm(J*p),J,p)+matrix(rnorm(np),n,p)/5
+#' x=matrix(rnorm(n*J,mean = 5,sd = 0.2),n,J)%*%matrix(rnorm(J*p, mean = 5,sd = 0.2),J,p)+
+#'   matrix(rnorm(np,mean = 5,sd = 0.2),n,p)/5
 #' ix=seq(np)
 #' imiss=sample(ix,np*missfrac,replace=FALSE)
 #' xna=x
 #' xna[imiss]=NA
+#' keep <- (rowSums(!is.na(xna)) >= 4)
+#' xna <- xna[keep,]
 #' xna <- scaleData(xna)
 #' xcomplete <- msImpute(object=xna)
 #' @seealso selectFeatures, scaleData
@@ -50,20 +53,21 @@ msImpute <- function(object, rank.max = NULL, lambda = NULL, thresh = 1e-05,
   if(is(object, "matrix")) {
     x <- object
     xnas <- x
+    warning("Input is not scaled. Data scaling is recommended for msImpute optimal performance.")
     }
   # MAList object
   # or \code{MAList} object from \link{limma}
   # if(is(object,"MAList")) x <- object$E
 
-
+  if(any(is.nan(x) | is.infinite(x))) stop("Inf or NaN values encountered.")
   if(any(rowSums(!is.na(x)) <= 3)) stop("Peptides with excessive NAs are detected. Please revisit your fitering step. At least 4 non-missing measurements are required for any peptide.")
   if(any(x < 0, na.rm = TRUE)){
     warning("Negative values encountered in imputed data. Please consider revising filtering and/or normalisation steps.")
   }
   if(is.null(rank.max)) rank.max <- min(dim(x) - 1)
   cat("maximum rank is", rank.max, "\n")
   cat("computing lambda0 ... \n")
-  if(is.null(lambda)) lambda <- softImpute::lambda0(x)
+  if(is.null(lambda)) lambda <- softImpute::lambda0(xnas)
   cat("lambda0 is", lambda, "\n")
   cat("fit the low-rank model ... \n")
   fit <- softImpute::softImpute(xnas,rank=rank.max,lambda=lambda, type = "als", thresh = thresh,
 
@@ -1,7 +1,7 @@
 #' Standardize a matrix to have optionally row means zero and variances one, and/or column means zero and variances one.
 #'
 #'
-#' @param object numeric matrix where missing values are denoted by NA. Rows are peptides, columns are samples.
+#' @param object numeric matrix giving log-intensity where missing values are denoted by NA. Rows are peptides, columns are samples.
 #' @param maxit numeric. maximum iteration for the algorithm to converge (default to 20). When both row and column centering/scaling is requested, iteration may be necessary.
 #' @param thresh numeric. Convergence threshold (default to 1e-09).
 #' @param row.center logical. if row.center==TRUE (the default), row centering will be performed resulting in a matrix with row means zero. If row.center is a vector, it will be used to center the rows. If row.center=FALSE nothing is done.
@@ -12,23 +12,27 @@
 #'
 #' @details
 #' Standardizes rows and/or columns of a matrix with missing values, according to the \code{biScale} algorithm in Hastie et al. 2015.
+#' Data is assumed to be normalised and log-transformed.
 #'
 #' @return
 #' A list of two components: E and E.scaled. E contains the input matrix, E.scaled contains the scaled data
 #'
 #'
 #' @examples
 #' set.seed(101)
-#' n=200
-#' p=100
-#' J=50
+#' n=12000
+#' p=10
+#' J=5
 #' np=n*p
 #' missfrac=0.3
-#' x=matrix(rnorm(n*J),n,J)%*%matrix(rnorm(J*p),J,p)+matrix(rnorm(np),n,p)/5
+#' x=matrix(rnorm(n*J,mean = 5,sd = 0.2),n,J)%*%matrix(rnorm(J*p, mean = 5,sd = 0.2),J,p)+
+#'   matrix(rnorm(np,mean = 5,sd = 0.2),n,p)/5
 #' ix=seq(np)
 #' imiss=sample(ix,np*missfrac,replace=FALSE)
 #' xna=x
 #' xna[imiss]=NA
+#' keep <- (rowSums(!is.na(xna)) >= 4)
+#' xna <- xna[keep,]
 #' xna <- scaleData(xna)
 #' @seealso selectFeatures, msImpute
 #' @export
@@ -39,7 +43,7 @@ scaleData <- function(object, maxit = 20, thresh = 1e-09, row.center = TRUE, row
   }else{
     x <- object
   }
-
+  if(any(is.nan(x) | is.infinite(x))) stop("Inf or NaN values encountered.")
   if(any(rowSums(!is.na(x)) <= 3)) stop("Peptides with excessive NAs are detected. Please revisit your fitering step. At least 4 non-missing measurements are required for any peptide.")
   if(any(x < 0, na.rm = TRUE)){
     warning("Negative values encountered in imputed data. Please consider revisting the filtering and/or normalisation steps, if appropriate.")
 
@@ -4,7 +4,7 @@
 #' used to determine if data is Missing Not At Random (MNAR). Users should note that \code{msImpute} assumes peptides
 #' are Missing At Random (MAR).
 #'
-#' @param object Numeric matrix where missing values are denoted by NA.
+#' @param object Numeric matrix giving log-intensity where missing values are denoted by NA.
 #' Rows are peptides, columns are samples.
 #' @param n_features Numeric, number of features with high dropout rate. 500 by default.
 #' @param suppress_plot Logical show plot of dropouts vs abundances.
@@ -13,16 +13,19 @@
 #'
 #' @examples
 #' set.seed(101)
-#' n=800
-#' p=100
-#' J=50
+#' n=12000
+#' p=10
+#' J=5
 #' np=n*p
 #' missfrac=0.3
-#' x=matrix(rnorm(n*J),n,J)%*%matrix(rnorm(J*p),J,p)+matrix(rnorm(np),n,p)/5
+#' x=matrix(rnorm(n*J,mean = 5,sd = 0.2),n,J)%*%matrix(rnorm(J*p, mean = 5,sd = 0.2),J,p)+
+#'   matrix(rnorm(np,mean = 5,sd = 0.2),n,p)/5
 #' ix=seq(np)
 #' imiss=sample(ix,np*missfrac,replace=FALSE)
 #' xna=x
 #' xna[imiss]=NA
+#' keep <- (rowSums(!is.na(xna)) >= 4)
+#' xna <- xna[keep,]
 #' rownames(xna) <- 1:nrow(xna)
 #' hdp <- selectFeatures(xna, n_features=500,  suppress_plot=FALSE)
 #' # construct matrix M to capture missing entries
@@ -59,6 +62,8 @@ selectFeatures <- function(object, n_features=500, suppress_plot = FALSE) {
   }
 
   if(is.null(rownames(x))) stop("No row names in input. Please provide input with named rows.")
+  if(any(is.nan(x) | is.infinite(x))) stop("Inf or NaN values encountered.")
+
   AveExpr <- rowMeans(x, na.rm = TRUE)
   dropout <- rowMeans(is.na(x))