diff --git a/NEWS.md b/NEWS.md index 320b993..0eccc32 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,8 +2,7 @@ ## scp 1.19.2 -- Fix grep pattern in example for readSingleCellExperiment() (see - #94). +- feat: improved error message when pattern is not found in computeSCR ## scp 1.19.1 diff --git a/R/compute_metrics.R b/R/compute_metrics.R index 26f1c4d..f2a901c 100644 --- a/R/compute_metrics.R +++ b/R/compute_metrics.R @@ -1,14 +1,14 @@ ####---- Internal functions ----#### -## Internal function: compute coefficient of variation for each column, +## Internal function: compute coefficient of variation for each column, ## taking into account the grouping structure of the rows ## This code is inspired from MSnbase::rowsd ##' @importFrom stats median sd -.rowCV <- function(x, +.rowCV <- function(x, group, nobs = 2, - reorder = FALSE, + reorder = FALSE, na.rm = FALSE) { ## Check nobs if (nobs < 2) @@ -21,57 +21,57 @@ nna <- matrix(1, ncol = ncol(x), nrow = nrow(x)) } ## Get the number of non-NA observations per column per group - nna <- rowsum(nna, - group = group, - reorder = reorder, + nna <- rowsum(nna, + group = group, + reorder = reorder, na.rm = na.rm) ## Return NA if nna < nobs. This behaviour is similar to sd nna[nna < nobs] <- NA ## Compute mean - mean <- rowsum(x, - group = group, - reorder = reorder, + mean <- rowsum(x, + group = group, + reorder = reorder, na.rm = na.rm) / nna ## Compute variance - var <- rowsum(x * x, - group = group, - reorder = reorder, + var <- rowsum(x * x, + group = group, + reorder = reorder, na.rm = na.rm) / nna - mean^2L ## CV = sd / mean - sqrt(var * nna / (nna - 1L)) / mean + sqrt(var * nna / (nna - 1L)) / mean } ## Internal function: compute q-values from a vector of PEPs ## this is calc_fdr from SCoPE2 -## x: numeric() containing probabilities +## x: numeric() containing probabilities .pep2qvalue <- function(x) { (cumsum(x[order(x)]) / seq_along(x))[order(order(x))] } ## @param x A `SingleCellExperiment` object -## +## ## @param group A `factor()` that indicates how features (rows) should -## be grouped. The CVs are computed for each group separately. -## +## be grouped. The CVs are computed for each group separately. +## ## @rdname medianCVperCell featureCV <- function(x, group, na.rm = TRUE, norm = "none", nobs = 2, ...) { ## Check object if (!inherits(x, "SingleCellExperiment")) stop("'x' must inherit from a 'SingleCellExperiment'") - ## Optional normalization(s) + ## Optional normalization(s) if (identical(norm, "SCoPE2")) { xnorm <- .normalizeSCP(x, method = "div.median") - assay(x) <- sweep(assay(x), 1, + assay(x) <- sweep(assay(x), 1, rowMeans(assay(xnorm), na.rm = TRUE), "/") } else if (!identical(norm, "none")) { for(normi in norm) x <- .normalizeSCP(x, method = normi, ...) } ## Compute CVs - .rowCV(assay(x), - group = group, + .rowCV(assay(x), + group = group, nobs = nobs, - reorder = TRUE, + reorder = TRUE, na.rm = na.rm) } @@ -83,9 +83,9 @@ featureCV <- function(x, group, na.rm = TRUE, norm = "none", nobs = 2, ...) { } ## Internal function that computes missing value metrics given an -## identification matrix. -## @param x A matrix of logicals were TRUE indicates the feature i is -## found in sample j. +## identification matrix. +## @param x A matrix of logicals were TRUE indicates the feature i is +## found in sample j. .computeMissingValueMetrics <- function(x) { c( LocalSensitivityMean = mean(colSums(x != 0)), @@ -100,10 +100,10 @@ featureCV <- function(x, group, na.rm = TRUE, norm = "none", nobs = 2, ...) { ## of columns in a given identification matrix. Note that the Jaccard ## matrix is a symmetric matrix, so we only return the upper triangle. ## WARNING: for large number of cells, eg n > 10,000, this may become -## inefficient. When the time comes this problem is relevant, +## inefficient. When the time comes this problem is relevant, ## implement sub-sampling. -## @param x A matrix of logicals were TRUE indicates the feature i is -## found in sample j. +## @param x A matrix of logicals were TRUE indicates the feature i is +## found in sample j. .computeJaccardIndex <- function(x) { vectorSizes <- colSums(x) pwVectorSizes <- sapply(vectorSizes, function(xx) vectorSizes + xx) @@ -124,17 +124,17 @@ featureCV <- function(x, group, na.rm = TRUE, norm = "none", nobs = 2, ...) { } ## Internal function that computes the cumulative sensitivity curve -## given an identification matrix. The function will sample an +## given an identification matrix. The function will sample an ## increasing number of cells (depending on nsteps). Each sampling is -## repeated niters time. Optionally, if 'batch' is provided, the +## repeated niters time. Optionally, if 'batch' is provided, the ## function will aggregate columns within each batch. This is usefull -## when dealing with mulitplexed data were detection depends on the -## batch. -## @param x A matrix of logicals were TRUE indicates the feature i is -## found in sample j. +## when dealing with mulitplexed data were detection depends on the +## batch. +## @param x A matrix of logicals were TRUE indicates the feature i is +## found in sample j. .computeCumulativeSensitivityCurve <- function(x, batch = NULL, - niters = 10, + niters = 10, nsteps = 30) { out <- list() if (!is.null(batch)) { @@ -154,54 +154,54 @@ featureCV <- function(x, group, na.rm = TRUE, norm = "none", nobs = 2, ...) { ##' Compute the sample over carrier ratio (SCR) -##' +##' ##' The function computes the ratio of the intensities of sample ##' channels over the intentisty of the carrier channel for each ##' feature. The ratios are averaged within the assay. ##' ##' @param object A `QFeatures` object. -##' +##' ##' @param i A `character()` or `integer()` indicating for which ##' assay(s) the SCR needs to be computed. -##' +##' ##' @param colvar A `character(1)` indicating the variable to take ##' from `colData(object)` that gives the sample annotation. -##' +##' ##' @param samplePattern A `character(1)` pattern that matches the ##' sample encoding in `colvar`. -##' +##' ##' @param carrierPattern A `character(1)` pattern that matches the ##' carrier encoding in `colvar`. Only one match per assay is ##' allowed, otherwise only the first match is taken -##' -##' @param sampleFUN A `character(1)` or `function` that provides the +##' +##' @param sampleFUN A `character(1)` or `function` that provides the ##' summarization function to use (eg mean, sum, media, max, ...). -##' Only used when the pattern matches multiple samples. Default +##' Only used when the pattern matches multiple samples. Default ##' is `mean`. Note for custom function, `na.rm = TRUE` is passed -##' to `sampleFUN` to ignore missing values, make sure to provide +##' to `sampleFUN` to ignore missing values, make sure to provide ##' a function that accepts this argument. -##' -##' @param carrierFUN A `character(1)` or `function` that provides the +##' +##' @param carrierFUN A `character(1)` or `function` that provides the ##' summarization function to use (eg mean, sum, media, max, ...). -##' Only used when the pattern matches multiple carriers. Default +##' Only used when the pattern matches multiple carriers. Default ##' is the same function as `sampleFUN`. Note for custom function, -##' `na.rm = TRUE` is passed to `carrierFUN` to ignore missing -##' values, make sure to provide a function that accepts this +##' `na.rm = TRUE` is passed to `carrierFUN` to ignore missing +##' values, make sure to provide a function that accepts this ##' argument. -##' -##' @param rowDataName A `character(1)` giving the name of the new -##' variable in the `rowData` where the computed SCR will be +##' +##' @param rowDataName A `character(1)` giving the name of the new +##' variable in the `rowData` where the computed SCR will be ##' stored. The name cannot already exist in any of the assay ##' `rowData`. -##' +##' ##' @return A `QFeatures` object for which the `rowData` of the given -##' assay(s) is augmented with the mean SCR. +##' assay(s) is augmented with the mean SCR. ##' ##' @export ##' ##' @examples ##' data("scp1") -##' scp1 <- computeSCR(scp1, +##' scp1 <- computeSCR(scp1, ##' i = 1, ##' colvar = "SampleType", ##' carrierPattern = "Carrier", @@ -210,25 +210,27 @@ featureCV <- function(x, group, na.rm = TRUE, norm = "none", nobs = 2, ...) { ##' rowDataName = "MeanSCR") ##' ## Check results ##' rowData(scp1)[[1]][, "MeanSCR"] -##' -computeSCR <- function(object, - i, - colvar, - samplePattern, +##' +computeSCR <- function(object, + i, + colvar, + samplePattern, sampleFUN = "mean", carrierPattern, carrierFUN = sampleFUN, rowDataName = "SCR") { if (!inherits(object, "QFeatures")) stop("'object' must be a QFeatures object") - if (is.numeric(samplePattern)) + if (is.numeric(samplePattern)) warning("The pattern is numeric. This is only allowed for replicating ", "the SCoPE2 analysis and will later get defunct.") - if (any(unlist(rowDataNames(object)[i]) == rowDataName)) - stop("The rowData name '", rowDataName, "' already exists. ", + if (any(unlist(rowDataNames(object)[i]) == rowDataName)) + stop("The rowData name '", rowDataName, "' already exists. ", "Use another name or remove that column before running ", "this function.") - + i <- QFeatures:::.normIndex(object, i) + sampleNotFound <- character() + carrierNotFound <- character() ## Iterate over the different assay indices for (ii in i) { annot <- colData(object)[colnames(object[[ii]]), ][, colvar] @@ -238,11 +240,10 @@ computeSCR <- function(object, } else { sampIdx <- grep(samplePattern, annot) } - if (!length(sampIdx)) stop("No match found with 'samplePattern = ", - samplePattern, "'.") carrIdx <- grep(carrierPattern, annot) - if (!length(carrIdx)) stop("No match found with 'carrierPattern = ", - carrierPattern, "'.") + if (!length(sampIdx)) sampleNotFound <- c(sampleNotFound, ii) + if (!length(carrIdx)) carrierNotFound <- c(carrierNotFound, ii) + if (!length(sampIdx) || !length(carrIdx)) next ## Get sample data samp <- assay(object[[ii]])[, sampIdx, drop = FALSE] if (ncol(samp) > 1) @@ -254,76 +255,120 @@ computeSCR <- function(object, ## Compute ratio ratio <- unname(samp / carrier) ## Store ratio in rowData - rowData(object@ExperimentList@listData[[ii]])[, rowDataName] <- + rowData(object@ExperimentList@listData[[ii]])[, rowDataName] <- ratio - ## more efficient than + ## more efficient than ## rowData(object[[ii]])[, rowDataName] <- ratio } + .checkMissingSamples( + sampleNotFound, carrierNotFound, samplePattern, carrierPattern + ) object } +## Internal function that will throw an informative error if the +## carrier or the sample pattern (used for computing the SCR) are not +## found in an assay. +## +## The function throws an error if there is at least one of +## sampleNotFound, carrierNotFound is not empty. Otherwise, the +## function returns NULL. +## +## @param sampleNotFound A character() with the names of the sets for +## which the sample pattern is not found. The function will generate +## an error when it is not empty. +## @param carrierNotFound A character() with the names of the sets for +## which the carrier pattern is not found. The function will +## generate an error when it is not empty. +## @param samplePattern A character(1) providing the name of the +## sample pattern used to identify sample columns. This is used to +## generate an informative error message. +## @param carrierPattern A character(1) providing the name of the +## carrier pattern used to identify carrier columns. This is used to +## generate an informative error message. +.checkMissingSamples <- function(sampleNotFound, carrierNotFound, + samplePattern, carrierPattern) { + msg <- "" + if (length(sampleNotFound)) { + msg <- paste0( + msg, "No match found with 'samplePattern = \"", + samplePattern, "\"' for the following set(s):\n", + paste0(sampleNotFound, collapse = ", "), "\n" + ) + } + if (length(carrierNotFound)) { + msg <- paste0( + msg, "No match found with 'carrierPattern = \"", + carrierPattern, "\"' for the following set(s):\n", + paste0(carrierNotFound, collapse = ", "), "\n" + ) + } + if (msg != "") stop(msg) + NULL +} + ##' Compute q-values -##' -##' This function computes q-values from the posterior error +##' +##' This function computes q-values from the posterior error ##' probabilities (PEPs). The functions takes the PEPs from the given ##' assay's `rowData` and adds a new variable to it that contains the ##' computed q-values. ##' ##' @param object A `QFeatures` object -##' -##' @param i A `numeric()` or `character()` vector indicating from +##' +##' @param i A `numeric()` or `character()` vector indicating from ##' which assays the `rowData` should be taken. -##' -##' @param groupBy A `character(1)` indicating the variable name in -##' the `rowData` that contains the grouping variable, for +##' +##' @param groupBy A `character(1)` indicating the variable name in +##' the `rowData` that contains the grouping variable, for ##' instance to compute protein FDR. When `groupBy` is not missing, ##' the best feature approach is used to compute the PEP per group, -##' meaning that the smallest PEP is taken as the PEP of the group. -##' -##' @param PEP A `character(1)` indicating the variable names in the -##' `rowData` that contains the PEPs. Since, PEPs are probabilities, the +##' meaning that the smallest PEP is taken as the PEP of the group. +##' +##' @param PEP A `character(1)` indicating the variable names in the +##' `rowData` that contains the PEPs. Since, PEPs are probabilities, the ##' variable must be contained in (0, 1). ##' -##' @param rowDataName A `character(1)` giving the name of the new -##' variable in the `rowData` where the computed FDRs will be +##' @param rowDataName A `character(1)` giving the name of the new +##' variable in the `rowData` where the computed FDRs will be ##' stored. The name cannot already exist in any of the assay ##' `rowData`. ##' ##' @return A `QFeatures` object. -##' -##' @details -##' -##' The q-value of a feature (PSM, peptide, protein) is the minimum -##' FDR at which that feature will be selected upon filtering -##' (Savitski et al.). On the other hand, the feature PEP is the -##' probability that the feature is wrongly matched and hence can be -##' seen as a local FDR (Kall et al.). While filtering on PEP is -##' guaranteed to control for FDR, it is usually too conservative. -##' Therefore, we provide this function to convert PEP to q-values. -##' -##' We compute the q-value of a feature as the average of the PEPs -##' associated to PSMs that have equal or greater identification -##' confidence (so smaller PEP). See Kall et al. for a visual +##' +##' @details +##' +##' The q-value of a feature (PSM, peptide, protein) is the minimum +##' FDR at which that feature will be selected upon filtering +##' (Savitski et al.). On the other hand, the feature PEP is the +##' probability that the feature is wrongly matched and hence can be +##' seen as a local FDR (Kall et al.). While filtering on PEP is +##' guaranteed to control for FDR, it is usually too conservative. +##' Therefore, we provide this function to convert PEP to q-values. +##' +##' We compute the q-value of a feature as the average of the PEPs +##' associated to PSMs that have equal or greater identification +##' confidence (so smaller PEP). See Kall et al. for a visual ##' interpretation. -##' -##' We also allow inference of q-values at higher level, for instance -##' computing the protein q-values from PSM PEP. This can be performed -##' by supplying the `groupBy` argument. In this case, we adopt the -##' best feature strategy that will take the best (smallest) PEP for +##' +##' We also allow inference of q-values at higher level, for instance +##' computing the protein q-values from PSM PEP. This can be performed +##' by supplying the `groupBy` argument. In this case, we adopt the +##' best feature strategy that will take the best (smallest) PEP for ##' each group (Savitski et al.). -##' -##' @references -##' -##' Käll, Lukas, John D. Storey, Michael J. MacCoss, and William -##' Stafford Noble. 2008. “Posterior Error Probabilities and False +##' +##' @references +##' +##' Käll, Lukas, John D. Storey, Michael J. MacCoss, and William +##' Stafford Noble. 2008. “Posterior Error Probabilities and False ##' Discovery Rates: Two Sides of the Same Coin.” Journal of Proteome ##' Research 7 (1): 40–44. -##' -##' Savitski, Mikhail M., Mathias Wilhelm, Hannes Hahne, Bernhard -##' Kuster, and Marcus Bantscheff. 2015. “A Scalable Approach for -##' Protein False Discovery Rate Estimation in Large Proteomic Data +##' +##' Savitski, Mikhail M., Mathias Wilhelm, Hannes Hahne, Bernhard +##' Kuster, and Marcus Bantscheff. 2015. “A Scalable Approach for +##' Protein False Discovery Rate Estimation in Large Proteomic Data ##' Sets.” Molecular & Cellular Proteomics: MCP 14 (9): 2394–2404. -##' +##' ##' @export ##' ##' @examples @@ -335,10 +380,10 @@ computeSCR <- function(object, ##' rowDataName = "qvalue_protein") ##' ## Check results ##' rowData(scp1)[[1]][, c("dart_PEP", "qvalue_protein")] -##' -pep2qvalue <- function(object, - i, - groupBy, +##' +pep2qvalue <- function(object, + i, + groupBy, PEP, rowDataName = "qvalue") { if (!inherits(object, "QFeatures")) @@ -346,26 +391,26 @@ pep2qvalue <- function(object, if (is.numeric(i)) i <- names(object)[i] ## Check arguments: rowDataName is valid if (rowDataName %in% unlist(rowDataNames(object)[i])) - stop("The rowData name '", rowDataName, "' already exists. ", + stop("The rowData name '", rowDataName, "' already exists. ", "Use another name or remove that column before running ", "this function.") - + ## Get the rowData from all assays df <- rbindRowData(object, i) - + ## Check groupNy and PEP vars <- PEP if (!missing(groupBy)) vars <- c(vars, groupBy) - if (any(!vars %in% colnames(df))) + if (any(!vars %in% colnames(df))) stop("Variable(s) not found in the 'rowData'. Make sure 'PEP'", "and 'groupBy' are present in all selected assays.") - + ## Check PEP is a probability pepRange <- range(df[, PEP], na.rm = TRUE) if (max(pepRange) > 1 | min(pepRange < 0)) stop("'", PEP, "' is not a probability in (0, 1)") - - ## Compute the q-values + + ## Compute the q-values if (missing(groupBy)) { df$qval <- .pep2qvalue(df[, PEP]) } else { ## Apply grouping if supplied @@ -374,74 +419,74 @@ pep2qvalue <- function(object, qval <- .pep2qvalue(p) df$qval <- unname(qval[df[[groupBy]]]) } - + ## Insert the q-value inside every assay rowData pepID <- paste0(df$assay, df$rowname) for (ii in i) { rdID <- paste0(ii, rownames(object[[ii]])) - rowData(object@ExperimentList@listData[[ii]])[, rowDataName] <- + rowData(object@ExperimentList@listData[[ii]])[, rowDataName] <- df[match(rdID, pepID), ]$qval } return(object) } ##' Compute the median coefficient of variation (CV) per cell -##' -##' The function computes for each cell the median CV and stores them +##' +##' The function computes for each cell the median CV and stores them ##' accordingly in the `colData` of the `QFeatures` object. The CVs in -##' each cell are computed from a group of features. The grouping is -##' defined by a variable in the `rowData`. The function can be -##' applied to one or more assays, as long as the samples (column -##' names) are not duplicated. Also, the user can supply a minimal +##' each cell are computed from a group of features. The grouping is +##' defined by a variable in the `rowData`. The function can be +##' applied to one or more assays, as long as the samples (column +##' names) are not duplicated. Also, the user can supply a minimal ##' number of observations required to compute a CV to avoid that CVs -##' computed on too few observations influence the distribution within -##' a cell. The quantification matrix can be optionally normalized +##' computed on too few observations influence the distribution within +##' a cell. The quantification matrix can be optionally normalized ##' before computing the CVs. Multiple normalizations are possible. -##' -##' A new column is added to the `colData` of the object. The samples -##' (columns) that are not present in the selection `i` will get +##' +##' A new column is added to the `colData` of the object. The samples +##' (columns) that are not present in the selection `i` will get ##' assigned an NA. -##' +##' ##' @param object A `QFeatures` object ##' -##' @param i A `numeric()` or `character()` vector indicating from +##' @param i A `numeric()` or `character()` vector indicating from ##' which assays the `rowData` should be taken. -##' -##' @param groupBy A `character(1)` indicating the variable name in +##' +##' @param groupBy A `character(1)` indicating the variable name in ##' the `rowData` that contains the feature grouping. -##' -##' @param nobs An `integer(1)` indicating how many observations +##' +##' @param nobs An `integer(1)` indicating how many observations ##' (features) should at least be considered for computing the CV. -##' Since no CV can be computed for less than 2 observations, -##' `nobs` should at least be 2. -##' +##' Since no CV can be computed for less than 2 observations, +##' `nobs` should at least be 2. +##' ##' @param na.rm A `logical(1)` indicating whether missing data should ##' be removed before computation. -##' -##' @param colDataName A `character(1)` giving the name of the new -##' variable in the `colData` where the computed CVs will be +##' +##' @param colDataName A `character(1)` giving the name of the new +##' variable in the `colData` where the computed CVs will be ##' stored. The name cannot already exist in the `colData`. -##' -##' @param norm A `character()` of normalization methods that will be -##' sequentially applied to each feature (row) in each assay. -##' Available methods and additional -##' information about normalization can be found in +##' +##' @param norm A `character()` of normalization methods that will be +##' sequentially applied to each feature (row) in each assay. +##' Available methods and additional +##' information about normalization can be found in ##' [MsCoreUtils::normalizeMethods]. You can also specify -##' `norm = "SCoPE2"` to reproduce the normalization performed -##' before computing the CVs as suggested by Specht et al. +##' `norm = "SCoPE2"` to reproduce the normalization performed +##' before computing the CVs as suggested by Specht et al. ##' `norm = "none"` will not normalize the data (default) -##' -##' @param ... Additional arguments that are passed to the -##' normalization method. -##' -##' @return A `QFeatures` object. -##' +##' +##' @param ... Additional arguments that are passed to the +##' normalization method. +##' +##' @return A `QFeatures` object. +##' ##' @references Specht, Harrison, Edward Emmott, Aleksandra A. Petelski, -##' R. Gray Huffman, David H. Perlman, Marco Serra, Peter Kharchenko, +##' R. Gray Huffman, David H. Perlman, Marco Serra, Peter Kharchenko, ##' Antonius Koller, and Nikolai Slavov. 2021. “Single-Cell Proteomic -##' and Transcriptomic Analysis of Macrophage Heterogeneity Using +##' and Transcriptomic Analysis of Macrophage Heterogeneity Using ##' SCoPE2.” Genome Biology 22 (1): 50. -##' +##' ##' @export ##' ##' @importFrom matrixStats colMedians @@ -449,7 +494,7 @@ pep2qvalue <- function(object, ##' @examples ##' data("scp1") ##' scp1 <- filterFeatures(scp1, ~ !is.na(Proteins)) -##' scp1 <- medianCVperCell(scp1, +##' scp1 <- medianCVperCell(scp1, ##' i = 1:3, ##' groupBy = "Proteins", ##' nobs = 5, @@ -458,7 +503,7 @@ pep2qvalue <- function(object, ##' norm = "div.median") ##' ## Check results ##' hist(scp1$MedianCV) -##' +##' ##' @rdname medianCVperCell medianCVperCell <- function(object, i, groupBy, nobs = 5, na.rm = TRUE, colDataName = "MedianCV", norm = "none", ...) { @@ -469,13 +514,13 @@ medianCVperCell <- function(object, i, groupBy, nobs = 5, na.rm = TRUE, if (is.numeric(i)) i <- names(object)[i] ## Check arguments: colDataName is valid if (colDataName %in% colnames(colData(object))) - stop("The colData name '", colDataName, "' already exists. ", + stop("The colData name '", colDataName, "' already exists. ", "Use another name or remove that column before running ", "this function.") ## Check arguments: no redundant columns coln <- unlist(colnames(object)[i]) - if (any(duplicated(coln))) - stop("Duplicated samples were found in assay(s) 'i'. This would ", + if (any(duplicated(coln))) + stop("Duplicated samples were found in assay(s) 'i'. This would ", "lead to inconsistencies in the 'colData'.") ## Initiate the vectors with cell median CVs medCVs <- rep(NA, length(coln)) @@ -499,53 +544,53 @@ medianCVperCell <- function(object, i, groupBy, nobs = 5, na.rm = TRUE, return(object) } -##---- Report missing values ---- +##---- Report missing values ---- ##' Four metrics to report missing values ##' ##' The function computes four metrics to report missing values in -##' single-cell proteomics. +##' single-cell proteomics. ##' ##' @param object An object of class [QFeatures]. -##' @param i The index of the assay in `object`. The assay must +##' @param i The index of the assay in `object`. The assay must ##' contain an identification matrix, that is a matrix where an ##' entry is `TRUE` if the value is observed and `FALSE` is the ##' value is missing (see examples). `i` may be numeric, character ##' or logical, but it must select only one assay. -##' @param by A vector of length equal to the number of columns in +##' @param by A vector of length equal to the number of columns in ##' assay `i` that defines groups for which the metrics should be ##' computed separately. If missing, the metrics are computed for ##' the complete assay. ##' -##' @return A `data.frame` with groups as rows and 5 columns: -##' +##' @return A `data.frame` with groups as rows and 5 columns: +##' ##' - `LocalSensitivityMean`: the average number of features per cell. ##' - `LocalSensitivitySd`: the standard deviation of the local ##' sensitivity. -##' - `TotalSensitivity`: the total number of features found in the -##' dataset. +##' - `TotalSensitivity`: the total number of features found in the +##' dataset. ##' - `Completeness`: the proportion of values that are not missing in ##' the data. ##' - `NumberCells`: the number of cells in the dataset. -##' +##' ##' @export ##' ##' @examples -##' +##' ##' data("scp1") -##' +##' ##' ## Define the identification matrix ##' peps <- scp1[["peptides"]] ##' assay(peps) <- !is.na(assay(peps)) ##' scp1 <- addAssay(scp1, peps, "id") -##' -##' ## Report metrics +##' +##' ## Report metrics ##' reportMissingValues(scp1, "id") ##' ## Report metrics by sample type ##' reportMissingValues(scp1, "id", scp1$SampleType) -##' +##' ##' data -##' +##' reportMissingValues <- function(object, i, by = NULL) { x <- .getDetectionMatrix(object, i) if (is.null(by)) by <- rep("all", ncol(x)) @@ -559,23 +604,23 @@ reportMissingValues <- function(object, i, by = NULL) { ##' Compute the pairwise Jaccard index ##' ##' The function computes the Jaccard index between all pairs of cells. -##' +##' ##' @param object An object of class [QFeatures]. -##' @param i The index of the assay in `object`. The assay must +##' @param i The index of the assay in `object`. The assay must ##' contain an identification matrix, that is a matrix where an ##' entry is `TRUE` if the value is observed and `FALSE` is the ##' value is missing (see examples). -##' @param by A vector of length equal to the number of columns in +##' @param by A vector of length equal to the number of columns in ##' assay `i` that defines groups for which the Jaccard index ##' should be computed separately. If missing, the Jaccard indices ##' are computed for all airs of cells in the dataset. ##' ##' @return A `data.frame` with as many rows as pairs of cells -##' and the following column(s): +##' and the following column(s): ##' ##' - `jaccard`: the computed Jaccard index ##' - `by`: if `by` is not `NULL`, the group of the pair of cells -##' for which the Jaccard index is computed. +##' for which the Jaccard index is computed. ##' ##' @export ##' @@ -587,7 +632,7 @@ reportMissingValues <- function(object, i, by = NULL) { ##' peps <- scp1[["peptides"]] ##' assay(peps) <- ifelse(is.na(assay(peps)), FALSE, TRUE) ##' scp1 <- addAssay(scp1, peps, "id") -##' +##' ##' ## Compute Jaccard indices ##' jaccardIndex(scp1, "id") ##' ## Compute Jaccard indices by sample type @@ -610,46 +655,46 @@ jaccardIndex <- function(object, i, by = NULL) { ##' @description ##' ##' The cumulative sensitivity curve is used to evaluate if the sample -##' size is sufficient to accurately estimate the total sensitivity. +##' size is sufficient to accurately estimate the total sensitivity. ##' If it is not the case, an asymptotic regression model may provide ##' a prediction of the total sensitivity if more samples would have ##' been acquired. ##' ##' @param object An object of class [QFeatures]. -##' @param i The index of the assay in `object`. The assay must +##' @param i The index of the assay in `object`. The assay must ##' contain an identification matrix, that is a matrix where an ##' entry is `TRUE` if the value is observed and `FALSE` is the ##' value is missing (see examples). -##' @param by A vector of length equal to the number of columns in +##' @param by A vector of length equal to the number of columns in ##' assay `i` that defines groups for a cumulative sensitivity ##' curve will be computed separately. If missing, the sensitivity ##' curve is computed for the completd dataset. -##' @param batch A vector of length equal to the number of columns in +##' @param batch A vector of length equal to the number of columns in ##' assay `i` that defines the cell batches. All cells in a batch ##' will be aggregated to a single sample. ##' @param nsteps The number of equally spaced sample sizes to compute ##' the sensitivity. -##' @param niters The number of iteration to compute +##' @param niters The number of iteration to compute ##' ##' @return A `data.frame` with groups as many rows as pairs of cells -##' and the following column(s): +##' and the following column(s): ##' ##' - `jaccard`: the computed Jaccard index ##' - `by`: if `by` is not `NULL`, the group of the pair of cells -##' for which the Jaccard index is computed. +##' for which the Jaccard index is computed. ##' ##' @details -##' -##' As more samples are added to a dataset, the total number of +##' +##' As more samples are added to a dataset, the total number of ##' distinct features increases. When sufficient number of samples are ##' acquired, all peptides that are identifiable by the technology and -##' increasing the sample size no longer increases the set of +##' increasing the sample size no longer increases the set of ##' identified features. The cumulative sensitivity curve depicts the ##' relationship between sensitivity (number of distinct peptides in ##' the data) and the sample size. More precisely, the curve is built ##' by sampling cells in the data and count the number of distinct ##' features found across the sampled cells. The sampling is repeated -##' multiple times to account for the stochasticity of the approach. +##' multiple times to account for the stochasticity of the approach. ##' Datasets that have a sample size sufficiently large should have a ##' cumulative sensitivity curve with a plateau. ##' @@ -659,22 +704,22 @@ jaccardIndex <- function(object, i, by = NULL) { ##' `by` argument. ##' ##' For multiplexed experiments, several cells are acquired in a run. -##' In that case, when a features is identified in a cell, it is +##' In that case, when a features is identified in a cell, it is ##' frequently also identified in all other cells of that run, and ##' this will distort the cumulative sensitivity curve. Therefore, the ##' function allows to compute the cumulative sensitivity curve at the ##' batches level rather than at the cell level. This is possible when ##' providing the `batch` argument. -##' -##' Once the cumulative sensitivity curve is computed, the returned -##' data can be visualized to explore the relationship between the +##' +##' Once the cumulative sensitivity curve is computed, the returned +##' data can be visualized to explore the relationship between the ##' sensitivity and the sample size. If enough samples are acquired, ##' the curve should plateau at high numbers of samples. If it is not ##' the case, the total sensitivity can be predicted using an ##' asymptotic regression curve. To predict the total sensitivity, the -##' model is extrapolated to infinite sample size. Therefore, the +##' model is extrapolated to infinite sample size. Therefore, the ##' accuracy of the extrapolation will highly depend on the available -##' data. The closer the curve is to the plateau, the more accurate +##' data. The closer the curve is to the plateau, the more accurate ##' the prediction. ##' ##' @export @@ -703,7 +748,7 @@ jaccardIndex <- function(object, i, by = NULL) { ##' batch = sim$batch ##' ) ##' predCSC <- predictSensitivity(csc, nSample = 1:50) -##' +##' ##' library(ggplot2) ##' ggplot(csc) + ##' aes(x = SampleSize, y = Sensitivity, colour = by) + @@ -716,15 +761,15 @@ jaccardIndex <- function(object, i, by = NULL) { ##' ##' @rdname cumulativeSensitivityCurve cumulativeSensitivityCurve <- function(object, i, by = NULL, - batch = NULL, nsteps = 30, + batch = NULL, nsteps = 30, niters = 10) { x <- .getDetectionMatrix(object, i) if (is.null(by)) by <- rep("all", ncol(x)) stopifnot(length(by) == ncol(x)) csc <- lapply(unique(by), function(byi) { out <- .computeCumulativeSensitivityCurve( - x[, by == byi, drop = FALSE], - batch[by == byi], + x[, by == byi, drop = FALSE], + batch[by == byi], niters, nsteps ) out$by <- byi diff --git a/tests/testthat/test-compute_metrics.R b/tests/testthat/test-compute_metrics.R index fbf77f6..2671085 100644 --- a/tests/testthat/test-compute_metrics.R +++ b/tests/testthat/test-compute_metrics.R @@ -75,20 +75,20 @@ test_that("featureCV", { test_that(".getDetectionMatrix", { data("scp1") - expect_error(.getDetectionMatrix(scp1, "peptide"), + expect_error(.getDetectionMatrix(scp1, "peptide"), regexp = "assay.*is/are not found.*peptide$") - expect_error(.getDetectionMatrix(scp1, c("peptides", "proteins")), + expect_error(.getDetectionMatrix(scp1, c("peptides", "proteins")), regexp = "You selected multiple assays") - expect_error(.getDetectionMatrix(scp1, 1:2), + expect_error(.getDetectionMatrix(scp1, 1:2), regexp = "You selected multiple assays") - expect_error(.getDetectionMatrix(scp1, rep(TRUE, length(scp1))), + expect_error(.getDetectionMatrix(scp1, rep(TRUE, length(scp1))), regexp = "You selected multiple assays") x <- .getDetectionMatrix(scp1, "peptides") expect_identical(dim(scp1[["peptides"]]), dim(x)) expect_identical(dimnames(scp1[["peptides"]]), dimnames(x)) expect_identical(class(x), c("matrix", "array")) expect_identical(mode(x), "logical") - + assay(scp1[["peptides"]])[is.na(assay(scp1[["peptides"]]))] <- 0 x2 <- .getDetectionMatrix(scp1, "peptides") expect_identical(x, x2) @@ -99,11 +99,11 @@ test_that(".computeMissingValueMetrics", { "LocalSensitivityMean", "LocalSensitivitySd", "TotalSensitivity", "Completeness", "NumberCells" ) - + x <- matrix() exp <- structure(c(rep(NA, 4), 1), names = expNames) expect_identical(.computeMissingValueMetrics(x), exp) - + set.seed(1) x <- matrix(TRUE, 10, 10) x[sample(1:100, 10)] <- FALSE @@ -171,13 +171,59 @@ test_that("computeSCR", { samplePattern = "Reference", carrierPattern = "Carrier|Blank"), regexp = "invalid names") ## Error: sample pattern not found - expect_error(computeSCR(scp1, i = 1:2, colvar = "SampleType", - samplePattern = "foo", carrierPattern = "Carrier"), - regexp = "No match.*samplePattern") + data(scp1) + expect_error( + computeSCR( + scp1, i = 1:3, colvar = "SampleType", + samplePattern = "foo", carrierPattern = "Carrier" + ), + regexp = "No match.*samplePattern.*foo.*for the following set.*190321S_LCA10_X_FP97AG, 190222S_LCA9_X_FP94BM, 190914S_LCB3_X_16plex_Set_21\n$" + ) + ## Error: sample pattern found in 1 set, but not others + data(scp1) + scp2 <- scp1 + scp2$SampleType[2:3] <- "foo" + expect_error( + computeSCR( + scp2, i = 1:3, colvar = "SampleType", + samplePattern = "foo", carrierPattern = "Carrier" + ), + regexp = "No match.*samplePattern.*foo.*for the following set.*190321S_LCA10_X_FP97AG, 190914S_LCB3_X_16plex_Set_21\n$" + ) ## Error: carrier pattern not found - expect_error(computeSCR(scp1, i = 1:2, colvar = "SampleType", - samplePattern = "Ref", carrierPattern = "foo"), - regexp = "No match.*carrierPattern") + data(scp1) + expect_error( + computeSCR( + scp1, i = 1:3, colvar = "SampleType", + samplePattern = "Ref", carrierPattern = "foo"), + regexp = "No match.*carrierPattern.*foo.*for the following set.*190321S_LCA10_X_FP97AG, 190222S_LCA9_X_FP94BM, 190914S_LCB3_X_16plex_Set_21\n$" + + ) + ## Error: carrier pattern found in 1 set, but not others + data(scp1) + scp2 <- scp1 + scp2$SampleType[1] <- "foo" + expect_error( + computeSCR( + scp2, i = 1:3, colvar = "SampleType", + samplePattern = "Ref", carrierPattern = "foo" + ), + regexp = "No match.*carrierPattern.*foo.*for the following set.*190321S_LCA10_X_FP97AG, 190914S_LCB3_X_16plex_Set_21\n$" + ) + ## Error: sample and carrier pattern found in 1 set, but not others + data(scp1) + scp2 <- scp1 + scp2$SampleType[1:3] <- "foo" + expect_error( + computeSCR( + scp2, i = 1:3, colvar = "SampleType", + samplePattern = "foo", carrierPattern = "foo" + ), + regexp = paste0( + "No match.*samplePattern.*foo.*for the following set.*190321S_LCA10_X_FP97AG, 190914S_LCB3_X_16plex_Set_21\n", + "No match.*carrierPattern.*foo.*for the following set.*190321S_LCA10_X_FP97AG, 190914S_LCB3_X_16plex_Set_21\n$" + ) + ) ## Error: the new rowData variable already exists expect_error(computeSCR(scp1, i = 1:2, colvar = "SampleType", samplePattern = "Reference", carrierPattern = "Carrier", @@ -185,6 +231,44 @@ test_that("computeSCR", { regexp = "already exists") }) +test_that(".checkMissingSamples", { + ## Pattern is found in all sets: no error + expect_null( + .checkMissingSamples( + sampleNotFound = character(), + carrierNotFound = character(), + samplePattern = "foo1", carrierPattern = "foo2" + ) + ) + ## Pattern not found for carrier + expect_error( + .checkMissingSamples( + sampleNotFound = character(), + carrierNotFound = c("bar1", "bar2"), + samplePattern = "foo1", carrierPattern = "foo2" + ), + regexp = "No match found with 'carrierPattern = \"foo2\"' for the following set\\(s\\):\nbar1, bar2" + ) + ## Pattern not found for sample + expect_error( + .checkMissingSamples( + sampleNotFound = c("bar1", "bar2"), + carrierNotFound = character(), + samplePattern = "foo1", carrierPattern = "foo2" + ), + regexp = "No match found with 'samplePattern = \"foo1\"' for the following set\\(s\\):\nbar1, bar2" + ) + ## Pattern not found for both + expect_error( + .checkMissingSamples( + sampleNotFound = c("bar1", "bar2"), + carrierNotFound = c("bar3", "bar4"), + samplePattern = "foo1", carrierPattern = "foo2" + ), + regexp = "No match found with 'samplePattern = \"foo1\"' for the following set\\(s\\):\nbar1, bar2\nNo match found with 'carrierPattern = \"foo2\"' for the following set\\(s\\):\nbar3, bar4\n" + ) +}) + test_that("pep2qvalue", { ## Correct use ## Compute q-values with scp function, then compare to the known result @@ -249,7 +333,7 @@ test_that("medianCVperCell", { ## Error: the assays contain duplicated samples expect_error(medianCVperCell(scp1, i = 1:5, groupBy = "Proteins"), regexp = "Duplicated samples") - + }) test_that("reportMissingValues", { @@ -258,9 +342,9 @@ test_that("reportMissingValues", { "LocalSensitivityMean", "LocalSensitivitySd", "TotalSensitivity", "Completeness", "NumberCells" ) - expect_error(reportMissingValues(scp1, "peptides", 1), + expect_error(reportMissingValues(scp1, "peptides", 1), regexp = "length.by.*ncol.*is not TRUE") - + test1 <- reportMissingValues(scp1, "peptides") expect_identical(class(test1), "data.frame") expect_identical(dimnames(test1), list("all", expNames)) @@ -280,7 +364,7 @@ test_that("jaccardIndex", { expn2 <- (expn2^2 - expn2) / 2 expect_identical(nrow(test2), as.integer(sum(expn2))) expect_identical( - test2$by, + test2$by, unlist(lapply(unique(scp1$SampleType), function(n) rep(n, expn2[[n]]))) ) })