|
1 | 1 | #' Metrics for the assessment of post-imputation structural preservation |
2 | 2 | #' |
3 | 3 | #' For an imputed dataset, it computes within phenotype/experimental condition similarity (i.e. preservation of local structures), |
4 | | -#' between phenotype distances (preservation of global structures), and the Gromov-Wasserstein (GW) distance between original and |
| 4 | +#' between phenotype distances (preservation of global structures), and the Gromov-Wasserstein (GW) distance between original (source) and |
5 | 5 | #' imputed data. |
6 | 6 | #' |
7 | | -#' @param x numeric matrix. An imputed data matrix. |
| 7 | +#' @param x numeric matrix. An imputed data matrix of log-intensity. |
8 | 8 | #' @param group factor. A vector of biological groups, experimental conditions or phenotypes (e.g. control, treatment). |
9 | | -#' @param xna numeric matrix. Data matrix with missing values (i.e. the original intensity matrix with NAs) |
| 9 | +#' @param y numeric matrix. The source data (i.e. the original log-intensity matrix), preferably subsetted on highly variable peptides (see \code{findVariableFeatures}). |
| 10 | +#' @param k numeric. Number of Principal Components used to compute the GW distance. default to 2. |
10 | 11 | #' |
11 | 12 | #' @details For each group of experimental conditions (e.g. treatment and control), the group centroid is calculated as the average |
12 | 13 | #' of observed peptide intensities. Withinness for each group is computed as sum of the squared distances between samples in that group and |
|
16 | 17 | #' The GW metric considers preservation of both local and global structures simultaneously. A small GW distance suggests that |
17 | 18 | #' imputation has introduced small distortions to global and local structures overall, whereas a large distance implies significant |
18 | 19 | #' distortions. When comparing two or more imputation methods, the optimal method is the method with smallest GW distance. |
19 | | -#' To compute the GW distance, the missing values in each column of \code{xna} are replaced by mean of observed values in that column. |
20 | | -#' This is equivalent to imputation by KNN, where k is set to the total number of identified peptides (i.e. number of rows in the input matrix). |
21 | | -#' GW distance estimation requires \code{python}. See example. |
22 | | -#' All metrics are on log scale. |
| 20 | +#' The GW distance is computed on Principal Components (PCs) of the source and imputed data, instead of peptides. Principal components capture the |
| 21 | +#' geometry of the data, hence GW computed on PCs is a better measure of preservation of local and global structures. The PCs in the source data are |
| 22 | +#' recommended to be computed on peptides with high biological variance. Hence, users are recommended to subset the source data only on highly variable peptides (hvp) |
| 23 | +#' (see \code{findVariableFeatures}). Since the hvp peptides have high biological variance, they are likely to have enough information to discriminate samples |
| 24 | +#' from different experimental groups. Hence, PCs computed on those peptides should be representative of the original source data with missing values. |
| 25 | +#' If the samples cluster by experimental group in the first couple of PCs, then a choice of k=2 is reasonable. If the desired separation/clustering of samples |
| 26 | +#' occurs in later PCs (i.e. the first few PCs are dominated by batches or unwanted variability), then it is recommended to use a larger number of PCs to compute the |
| 27 | +#' GW metric. If you are interested in how well the imputed data represent the original data in all possible dimensions, then set k to the number of samples |
| 28 | +#' in the data (i.e. the number of columns in the intensity matrix). |
| 29 | +#' GW distance estimation requires \code{python}. See example. All metrics are on log scale. |
23 | 30 | #' |
24 | 31 | #' |
25 | 32 | #' @return list of three metrics: withinness (sum of squared distances within a phenotype group), |
26 | 33 | #' betweenness (sum of squared distances between the phenotypes), and gromov-wasserstein distance (if \code{xna} is not NULL). |
27 | | -#' All metrics are on log scale. |
| 34 | +#' if \code{group} is NULL only the GW distance is returned. All metrics are on log scale. |
28 | 35 | #' |
29 | 36 | #' |
30 | 37 | #' @examples |
|
49 | 56 | #' # you can then run the computeStructuralMetrics() function. |
50 | 57 | #' # Note that the reticulate package should be loaded before loading msImpute. |
51 | 58 | #' set.seed(101) |
52 | | -#' n=200 |
53 | | -#' p=100 |
54 | | -#' J=50 |
| 59 | +#' n=12000 |
| 60 | +#' p=10 |
| 61 | +#' J=5 |
55 | 62 | #' np=n*p |
56 | 63 | #' missfrac=0.3 |
57 | | -#' x=matrix(rnorm(n*J),n,J)%*%matrix(rnorm(J*p),J,p)+matrix(rnorm(np),n,p)/5 |
| 64 | +#' x=matrix(rnorm(n*J,mean = 5,sd = 0.2),n,J)%*%matrix(rnorm(J*p, mean = 5,sd = 0.2),J,p)+ |
| 65 | +#' matrix(rnorm(np,mean = 5,sd = 0.2),n,p)/5 |
58 | 66 | #' ix=seq(np) |
59 | 67 | #' imiss=sample(ix,np*missfrac,replace=FALSE) |
60 | 68 | #' xna=x |
61 | 69 | #' xna[imiss]=NA |
| 70 | +#' keep <- (rowSums(!is.na(xna)) >= 4) |
| 71 | +#' xna <- xna[keep,] |
| 72 | +#' rownames(xna) <- 1:nrow(xna) |
62 | 73 | #' y <- xna |
63 | 74 | #' xna <- scaleData(xna) |
64 | 75 | #' xcomplete <- msImpute(object=xna) |
65 | | -#' G <- as.factor(sample(1:5, 100, replace = TRUE)) |
66 | | -#' computeStructuralMetrics(xcomplete, G, y) |
| 76 | +#' G <- as.factor(sample(1:3, p, replace = TRUE)) |
| 77 | +#' top.hvp <- findVariableFeatures(y) |
| 78 | +#' computeStructuralMetrics(xcomplete, G, y[rownames(top.hvp)[1:50],], k = 2) |
67 | 79 | #' @export |
68 | | -computeStructuralMetrics <- function(x, group, xna = NULL){ |
69 | | - out <- list(withinness = log(withinness(x, group)), |
70 | | - betweenness = log(betweenness(x,group))) |
| 80 | +computeStructuralMetrics <- function(x, group=NULL, y = NULL, k=2){ |
| 81 | + if(!is.null(group)){ |
| 82 | + out <- list(withinness = log(withinness(x, group)), |
| 83 | + betweenness = log(betweenness(x,group))) |
| 84 | + } |
71 | 85 |
|
72 | | - if(!is.null(xna)){ |
73 | | - GW <- gromov_wasserstein(xna, x) |
| 86 | + if(!is.null(y)){ |
| 87 | + GW <- gromov_wasserstein(x, y, k=k) |
74 | 88 | out[['gw_dist']] <- GW[[2]]$gw_dist |
75 | 89 | } |
76 | 90 | return(out) |
@@ -101,8 +115,33 @@ betweenness <- function(x, class_label){ |
101 | 115 |
|
102 | 116 |
|
103 | 117 | #' @export |
104 | | -gromov_wasserstein <- function(xna, ximputed){ |
| 118 | +gromov_wasserstein <- function(x, y, k, min.mean = 0.1){ |
| 119 | + if (k > ncol(x)) stop("Number of Principal Components cannot be greater than number of columns (samples) in the data.") |
| 120 | + if (any(!is.finite(x))) stop("Non-finite values (NA, Inf, NaN) encountered in imputed data") |
| 121 | + if (any(!is.finite(y))) stop("Non-finite values (NA, Inf, NaN) encountered in source data") |
| 122 | + |
| 123 | + means <- rowMeans(x) |
| 124 | + vars <- matrixStats::rowSds(x) |
| 125 | + |
| 126 | + # Filtering out zero-variance and low-abundance peptides |
| 127 | + is.okay <- !is.na(vars) & vars > 1e-8 & means >= min.mean |
| 128 | + |
| 129 | + xt <- t(x) |
| 130 | + yt <- t(y) |
| 131 | + |
| 132 | + # compute PCA |
| 133 | + xt_pca <- prcomp(xt[,is.okay], scale. = TRUE, center = TRUE) |
| 134 | + yt_pca <- prcomp(yt, scale. = TRUE, center = TRUE) |
| 135 | + |
| 136 | + C1 <- yt_pca$x[,1:k] |
| 137 | + C2 <- xt_pca$x[,1:k] |
| 138 | + |
| 139 | + |
| 140 | + cat("Computing GW distance using k=", k, "Principal Components") |
105 | 141 | reticulate::source_python(system.file("python", "gw.py", package = "msImpute")) |
106 | | - xna <- apply(xna, 2, FUN=function(x) {x[is.na(x)] <- mean(x, na.rm=TRUE); return(x)}) |
107 | | - return(gw(t(xna), t(ximputed), ncol(xna))) |
| 142 | + return(gw(C1,C2, ncol(x))) |
108 | 143 | } |
| 144 | + |
| 145 | + |
| 146 | + |
| 147 | + |
0 commit comments