diff --git a/NAMESPACE b/NAMESPACE index 22df1e7..b35770a 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(associateBarcodes) export(calcNearestStringDist) export(calculateFitnessScore) export(calculateRelativeFC) @@ -76,6 +77,7 @@ importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_text) importFrom(ggplot2,ggplot) importFrom(ggplot2,labs) +importFrom(ggplot2,scale_color_manual) importFrom(ggplot2,scale_fill_continuous) importFrom(ggplot2,scale_fill_discrete) importFrom(ggplot2,scale_x_continuous) diff --git a/R/associateBarcodes.R b/R/associateBarcodes.R new file mode 100644 index 0000000..e4e81d6 --- /dev/null +++ b/R/associateBarcodes.R @@ -0,0 +1,114 @@ +#' Associate barcode(s) to variants +#' +#' @param countTable A data frame containing barcode sequences, variant +#' sequences and corresponding read counts for each barcode-variant pair. +#' @param barcodeCol,variantCol,countCol Character scalars indicating the +#' names of the columns of \code{countTable} containing barcode +#' sequences, variant IDs and read counts, respectively. +#' @param minCount Numeric scalar. Pairs with fewer reads will be flagged +#' for exclusion. +#' @param minFracOfBarcodeCount Numeric scalar. Pairs where the read count +#' is lower than \code{minFracOfBarcodeCount} * total barcode count will +#' be flagged for exclusion. +#' +#' @export +#' @author Charlotte Soneson +#' +#' @return A list with two elements: \code{counts} (a data frame with counts) +#' and \code{plots} (a list of ggplot objects). +#' +#' @importFrom ggplot2 scale_color_manual +#' @importFrom dplyr %>% group_by mutate ungroup filter arrange desc +#' summarize +#' @importFrom rlang .data +#' @importFrom ggplot2 ggplot aes geom_histogram theme_bw labs +#' scale_color_manual scale_x_log10 scale_y_log10 geom_point +#' +associateBarcodes <- function(countTable, barcodeCol, variantCol, + countCol = "nbrReads", minCount = 10, + minFracOfBarcodeCount = 0.5) { + .assertScalar(x = barcodeCol, type = "character", + validValues = colnames(countTable)) + .assertVector(x = countTable[[barcodeCol]], type = "character") + .assertScalar(x = variantCol, type = "character", + validValues = colnames(countTable)) + .assertVector(x = countTable[[variantCol]], type = "character") + .assertScalar(x = countCol, type = "character", + validValues = colnames(countTable)) + .assertVector(x = countTable[[countCol]], type = "numeric") + .assertScalar(x = minCount, type = "numeric", rngIncl = c(0, Inf)) + .assertScalar(x = minFracOfBarcodeCount, type = "numeric", rngIncl = c(0, 1)) + + ## Initialize plot list + plots <- list() + + ## Add columns with total counts for barcode/variant, with and without + ## filtering + countTable <- countTable %>% + ## first aggregate all rows with the same barcode/variant pair, + ## in case that wasn't already done + dplyr::group_by(.data[[barcodeCol]], .data[[variantCol]]) %>% + dplyr::summarize("{countCol}" := sum(.data[[countCol]])) %>% + dplyr::ungroup() %>% + ## flag pairs with < minCount reads + dplyr::mutate(keep = (.data[[countCol]] >= minCount)) %>% + ## add total barcode counts + dplyr::group_by(.data[[barcodeCol]]) %>% + dplyr::mutate(totalBarcodeCountAll = sum(.data[[countCol]])) %>% + dplyr::mutate(totalBarcodeCountFilt = sum(.data[[countCol]][.data$keep])) %>% + dplyr::ungroup() %>% + ## add total variant counts + dplyr::group_by(.data[[variantCol]]) %>% + dplyr::mutate(totalVariantCountAll = sum(.data[[countCol]])) %>% + dplyr::mutate(totalVariantCountFilt = sum(.data[[countCol]][.data$keep])) %>% + dplyr::ungroup() + plots[["totalBcCountAll"]] <- + ggplot(countTable %>% dplyr::filter(!duplicated(.data[[barcodeCol]])), + aes(x = .data$totalBarcodeCountAll)) + + geom_histogram(bins = 100, fill = "lightgrey") + + theme_bw() + scale_x_log10() + + labs(title = "Total barcode count (all)", x = "Total barcode count") + plots[["totalBcCountFiltered"]] <- + ggplot(countTable %>% dplyr::filter(!duplicated(.data[[barcodeCol]])), + aes(x = .data$totalBarcodeCountFilt)) + + geom_histogram(bins = 100, fill = "lightgrey") + + theme_bw() + scale_x_log10() + + labs(title = "Total barcode count (filtered)", x = "Total barcode count") + + ## Calculate relative frequency for each pair as the ratio between the + ## count for the pair and the total count for the barcode or variant, + ## respectively + countTable <- countTable %>% + dplyr::mutate(fracOfTotalBarcodeCountFilt = + .data[[countCol]]/.data$totalBarcodeCountFilt, + fracOfTotalVariantCountFilt = + .data[[countCol]]/.data$totalVariantCountFilt) + + ## Order by decreasing frequency and keep only the top hit for each + ## barcode + countTable <- countTable %>% + dplyr::arrange(dplyr::desc(.data[[countCol]])) %>% + dplyr::mutate(keep = .data$keep & !duplicated(.data[[barcodeCol]])) + + ## Flag pairs where the count/total barcode count is too low + countTable <- countTable %>% + dplyr::mutate(keep = .data$keep & + (.data$fracOfTotalBarcodeCountFilt >= + minFracOfBarcodeCount)) + + ## Plot count for pair vs total count for barcode + plots[["pairVsBarcodeCount"]] <- + ggplot(countTable, aes(x = .data$totalBarcodeCountFilt, + y = .data[[countCol]], + color = .data$keep)) + + geom_point(size = 0.5) + theme_bw() + + scale_color_manual(values = c(`TRUE` = "black", `FALSE` = "red"), + name = "Keep") + + scale_x_log10() + scale_y_log10() + + labs(x = "Total barcode count", y = "Barcode/variant pair count") + + return(list(counts = countTable, plots = plots)) +} + + + diff --git a/R/linkMultipleVariants.R b/R/linkMultipleVariants.R index 9f303fb..2aeb503 100644 --- a/R/linkMultipleVariants.R +++ b/R/linkMultipleVariants.R @@ -62,6 +62,11 @@ #' @param ... Additional arguments providing arguments to \code{digestFastqs} #' for the separate runs (processing each variable sequence in turn). #' Each argument must be a named list of arguments to \code{digestFastqs}. +#' @param collapseToAA Either \code{TRUE}, \code{FALSE}, or a character vector +#' indicating for which of the separate runs the sequences should be +#' collapsed to the amino acid mutant name rather than the codon- or +#' nucleotide-level name. \code{TRUE} is equivalent to providing the +#' names of all lists provided in \code{...}. #' #' @author Charlotte Soneson #' @@ -75,13 +80,15 @@ #' \item convSeparate - a list of conversion tables from the respective #' separate runs. #' \item outCombined - the \code{digestFastqs} output for the combined run. +#' \item filtSeparate - a list of filtering tables for the separate runs. #' } #' #' @importFrom dplyr select rename group_by summarize across matches mutate #' @importFrom tidyr separate separate_rows #' @importFrom rlang .data #' -linkMultipleVariants <- function(combinedDigestParams = list(), ...) { +linkMultipleVariants <- function(combinedDigestParams = list(), ..., + collapseToAA = FALSE) { ## Process additional arguments paramsSeparate <- list(...) @@ -111,6 +118,24 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) { as.list(defaults[!(names(defaults) %in% names(combinedDigestParams))])) + ## Make sure collapseToAA is a vector containing the names of the + ## runs where sequences should be collapsed to AAs + if (length(collapseToAA) == 1) { + if (is.logical(collapseToAA)) { + if (collapseToAA) { + collapseToAA <- names(paramsSeparate) + } else { + collapseToAA <- character(0) + } + } else { + .assertVector(x = collapseToAA, type = "character", + validValues = names(paramsSeparate)) + } + } else { + .assertVector(x = collapseToAA, type = "character", + validValues = names(paramsSeparate)) + } + ## --------------------------------------------------------------------- ## ## Checks ## --------------------------------------------------------------------- ## @@ -227,7 +252,8 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) { ## Conversion tables convSeparate <- lapply(outSeparate, function(out) { out$summaryTable %>% - dplyr::select(.data$mutantName, .data$sequence) %>% + dplyr::select(.data$mutantName, .data$mutantNameAA, + .data$sequence) %>% tidyr::separate_rows(.data$sequence, sep = ",") }) @@ -235,9 +261,15 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) { ## Replace naive sequences with corrected ones ## --------------------------------------------------------------------- ## for (i in names(paramsSeparate)) { - countCombined[[i]] <- - convSeparate[[i]]$mutantName[match(countCombined[[i]], - convSeparate[[i]]$sequence)] + if (i %in% collapseToAA) { + countCombined[[i]] <- + convSeparate[[i]]$mutantNameAA[match(countCombined[[i]], + convSeparate[[i]]$sequence)] + } else { + countCombined[[i]] <- + convSeparate[[i]]$mutantName[match(countCombined[[i]], + convSeparate[[i]]$sequence)] + } } ## Filter out rows with NAs (the sequence was not retained) diff --git a/inst/extdata/multipleVariableRegions_truth.rds b/inst/extdata/multipleVariableRegions_truth.rds index fa1243c..a86cab1 100644 Binary files a/inst/extdata/multipleVariableRegions_truth.rds and b/inst/extdata/multipleVariableRegions_truth.rds differ diff --git a/inst/scripts/generateExampleDataForLinkMultipleVariants.R b/inst/scripts/generateExampleDataForLinkMultipleVariants.R index a38c9f5..c33b147 100644 --- a/inst/scripts/generateExampleDataForLinkMultipleVariants.R +++ b/inst/scripts/generateExampleDataForLinkMultipleVariants.R @@ -187,10 +187,53 @@ WTV3 <- c(V3 = V3) constFwd <- "ACGTGATTGATCCCGGTAAATACCGG" constRev <- "TAAATACCGGACCTGA" +## Get AA mutant name as well +V2varAA <- vapply(V2var, function(w) { + w <- strsplit(w, "\\.")[[1]] + if (w[3] != "WT") { + pos0 <- ceiling(as.numeric(w[2]) / 3) - 1 + wt <- strsplit(WTV2[w[1]], "")[[1]] + wt_codon <- paste(wt[(3 * pos0 + 1):(3 * pos0 + 3)], collapse = "") + wt_aa <- mutscan:::translateString(wt_codon) + wt[as.numeric(w[2])] <- w[3] + mut_codon <- paste(wt[(3 * pos0 + 1):(3 * pos0 + 3)], collapse = "") + mut_aa <- mutscan:::translateString(mut_codon) + if (mut_aa == wt_aa) { + paste(c(w[1], 0, "WT"), collapse = ".") + } else { + paste(c(w[1], pos0 + 1, mut_aa), collapse = ".") + } + } else { + paste(w, collapse = ".") + } +}, "") +V3varAA <- vapply(V3var, function(w) { + w <- strsplit(w, "\\.")[[1]] + if (w[3] != "WT") { + pos0 <- ceiling(as.numeric(w[2]) / 3) - 1 + wt <- strsplit(WTV3[w[1]], "")[[1]] + wt_codon <- paste(wt[(3 * pos0 + 1):(3 * pos0 + 3)], collapse = "") + wt_aa <- mutscan:::translateString(wt_codon) + wt[as.numeric(w[2])] <- w[3] + mut_codon <- paste(wt[(3 * pos0 + 1):(3 * pos0 + 3)], collapse = "") + mut_aa <- mutscan:::translateString(mut_codon) + if (mut_aa == wt_aa) { + paste(c(w[1], 0, "WT"), collapse = ".") + } else { + paste(c(w[1], pos0 + 1, mut_aa), collapse = ".") + } + } else { + paste(w, collapse = ".") + } +}, "") + + truth <- data.frame(read = paste0("R", seq_len(nReads)), trueBarcode = barcode, trueV2 = V2var, trueV3 = V3var, + trueV2aa = V2varAA, + trueV3aa = V3varAA, obsBarcode = v1, status = status) diff --git a/man/associateBarcodes.Rd b/man/associateBarcodes.Rd new file mode 100644 index 0000000..d198dc7 --- /dev/null +++ b/man/associateBarcodes.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/associateBarcodes.R +\name{associateBarcodes} +\alias{associateBarcodes} +\title{Associate barcode(s) to variants} +\usage{ +associateBarcodes( + countTable, + barcodeCol, + variantCol, + countCol = "nbrReads", + minCount = 10, + minFracOfBarcodeCount = 0.5 +) +} +\arguments{ +\item{countTable}{A data frame containing barcode sequences, variant +sequences and corresponding read counts for each barcode-variant pair.} + +\item{barcodeCol, variantCol, countCol}{Character scalars indicating the +names of the columns of \code{countTable} containing barcode +sequences, variant IDs and read counts, respectively.} + +\item{minCount}{Numeric scalar. Pairs with fewer reads will be flagged +for exclusion.} + +\item{minFracOfBarcodeCount}{Numeric scalar. Pairs where the read count +is lower than \code{minFracOfBarcodeCount} * total barcode count will +be flagged for exclusion.} +} +\value{ +A list with two elements: \code{counts} (a data frame with counts) +and \code{plots} (a list of ggplot objects). +} +\description{ +Associate barcode(s) to variants +} +\author{ +Charlotte Soneson +} diff --git a/man/linkMultipleVariants.Rd b/man/linkMultipleVariants.Rd index cb60207..b9a8122 100644 --- a/man/linkMultipleVariants.Rd +++ b/man/linkMultipleVariants.Rd @@ -4,7 +4,7 @@ \alias{linkMultipleVariants} \title{Process an experiment with multiple variable sequences} \usage{ -linkMultipleVariants(combinedDigestParams = list(), ...) +linkMultipleVariants(combinedDigestParams = list(), ..., collapseToAA = FALSE) } \arguments{ \item{combinedDigestParams}{A named list of arguments to @@ -13,6 +13,12 @@ linkMultipleVariants(combinedDigestParams = list(), ...) \item{...}{Additional arguments providing arguments to \code{digestFastqs} for the separate runs (processing each variable sequence in turn). Each argument must be a named list of arguments to \code{digestFastqs}.} + +\item{collapseToAA}{Either \code{TRUE}, \code{FALSE}, or a character vector +indicating for which of the separate runs the sequences should be +collapsed to the amino acid mutant name rather than the codon- or +nucleotide-level name. \code{TRUE} is equivalent to providing the +names of all lists provided in \code{...}.} } \value{ A list with the following elements: @@ -23,6 +29,7 @@ A list with the following elements: \item convSeparate - a list of conversion tables from the respective separate runs. \item outCombined - the \code{digestFastqs} output for the combined run. +\item filtSeparate - a list of filtering tables for the separate runs. } } \description{ diff --git a/tests/testthat/test-associateBarcodes.R b/tests/testthat/test-associateBarcodes.R new file mode 100644 index 0000000..e8c6140 --- /dev/null +++ b/tests/testthat/test-associateBarcodes.R @@ -0,0 +1,168 @@ +test_that("associateBarcodes works", { + ## Read truth (for WT sequences) + truth <- readRDS(system.file("extdata", "multipleVariableRegions_truth.rds", + package = "mutscan")) + verbose <- FALSE + + ## Don't filter by mutations in constant sequence + res <- linkMultipleVariants( + combinedDigestParams = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CVCVCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CVCV", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 20, + maxOverlap = 30, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + verbose = verbose), + barcode = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + elementsForward = "CVCSCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + avePhredMinForward = 20, variableCollapseMaxDist = 1, + variableCollapseMinReads = 1, variableCollapseMinRatio = 1, + verbose = verbose), + V2 = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CSCVCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CSCV", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 10, + maxOverlap = 20, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + wildTypeForward = truth$WTV2, + nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, + forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE, + verbose = verbose, useTreeWTmatch = TRUE), + V3 = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CSCSCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CVCS", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 5, + maxOverlap = 15, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + wildTypeForward = truth$WTV3, + nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, + forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE, + verbose = verbose, useTreeWTmatch = TRUE), + collapseToAA = FALSE + ) + + ## Fails with the wrong input + ## ------------------------------------------------------------------------ + expect_error(associateBarcodes(res$countAggregated, barcodeCol = 1, + variantCol = "V2", countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = 0.25), + "'barcodeCol' must be of class 'character'") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = c("barcode", "V2"), + variantCol = "V2", countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = 0.25), + "'barcodeCol' must have length 1") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "missing", + variantCol = "V2", countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = 0.25), + "All values in 'barcodeCol' must be one of") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "nbrReads", + variantCol = "V2", countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = 0.25), + "'[[countTablebarcodeCol' must be of class 'character'", fixed = TRUE) + + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = 1, countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = 0.25), + "'variantCol' must be of class 'character'") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = c("V2", "barcode"), + countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = 0.25), + "'variantCol' must have length 1") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "missing", countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = 0.25), + "All values in 'variantCol' must be one of") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "nbrReads", countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = 0.25), + "'[[countTablevariantCol' must be of class 'character'", fixed = TRUE) + + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", countCol = 1, minCount = 2, + minFracOfBarcodeCount = 0.25), + "'countCol' must be of class 'character'") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", countCol = c("barcode", "V2"), minCount = 2, + minFracOfBarcodeCount = 0.25), + "'countCol' must have length 1") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", countCol = "missing", minCount = 2, + minFracOfBarcodeCount = 0.25), + "All values in 'countCol' must be one of") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", countCol = "barcode", minCount = 2, + minFracOfBarcodeCount = 0.25), + "'[[countTablecountCol' must be of class 'numeric'", fixed = TRUE) + + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", countCol = "nbrReads", minCount = "2", + minFracOfBarcodeCount = 0.25), + "'minCount' must be of class 'numeric'") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", countCol = "nbrReads", minCount = c(1, 2), + minFracOfBarcodeCount = 0.25), + "'minCount' must have length 1") + + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = "0.25"), + "'minFracOfBarcodeCount' must be of class 'numeric'") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = c(0.1, 0.25)), + "'minFracOfBarcodeCount' must have length 1") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = 2.5), + "'minFracOfBarcodeCount' must be within [0,1]", fixed = TRUE) + + + ## Works with the correct input + ## ------------------------------------------------------------------------ + ## Run barcode association + out <- associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", minCount = 2, + minFracOfBarcodeCount = 0.25) + + ## Calculate expected values manually + correct <- truth$truth[!grepl("N|del", truth$truth$status), ] %>% + ## get frequency for each barcode/variant pair + dplyr::group_by(trueBarcode, trueV2) %>% + dplyr::tally() %>% + ## filter rare pairs + dplyr::filter(n >= 2) %>% + ## filter pairs with too low fraction of total bc counts + dplyr::group_by(trueBarcode) %>% + dplyr::mutate(relFrac = n/sum(n)) %>% + dplyr::filter(relFrac >= 0.25) %>% + ## keep top variant for each barcode %>% + dplyr::arrange(dplyr::desc(n)) %>% + dplyr::filter(!duplicated(trueBarcode)) + + expect_equal(sum(out$counts$keep), nrow(correct)) + expect_true(all(paste0(out$counts$barcode[out$counts$keep], + out$counts$V2[out$counts$keep]) %in% + paste0(correct$trueBarcode, correct$trueV2))) + tmp <- out$counts[match(paste0(correct$trueBarcode, correct$trueV2), + paste0(out$counts$barcode, out$counts$V2)), ] + expect_equal(tmp$nbrReads, correct$n) + expect_equal(tmp$fracOfTotalBarcodeCountFilt, correct$relFrac) +}) diff --git a/tests/testthat/test_linkMultipleVariants.R b/tests/testthat/test_linkMultipleVariants.R index 933db0b..f476375 100644 --- a/tests/testthat/test_linkMultipleVariants.R +++ b/tests/testthat/test_linkMultipleVariants.R @@ -16,6 +16,10 @@ test_that("linkMultipleVariants works", { expect_error(linkMultipleVariants(combinedDigestParams = list(fastqForward = 1), secondArg = list(unknown = 1)), "All values in 'namesparms' must be one of") + expect_error(linkMultipleVariants(combinedDigestParams = list(fastqForward = 1), + secondArg = list(fastqForward = 1), + collapseToAA = 1), + "'collapseToAA' must be of class 'character'") expect_error(linkMultipleVariants(combinedDigestParams = list(fastqForward = 1), secondArg = list(elementsForward = "CC")), @@ -86,7 +90,8 @@ test_that("linkMultipleVariants works", { wildTypeForward = truth$WTV3, nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE, - verbose = verbose, useTreeWTmatch = TRUE) + verbose = verbose, useTreeWTmatch = TRUE), + collapseToAA = FALSE ) ## Filter out only reads with N/deletions in the variable sequence @@ -124,6 +129,186 @@ test_that("linkMultipleVariants works", { expect_equal(res$filtSeparate$V3[, "nbrRetained"], sum(!grepl("del", truth$truth$status))) + ## ------------------------------------------------------------------------ + ## Don't filter by mutations in constant sequence, collapse by AA + res <- linkMultipleVariants( + combinedDigestParams = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CVCVCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CVCV", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 20, + maxOverlap = 30, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + verbose = verbose), + barcode = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + elementsForward = "CVCSCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + avePhredMinForward = 20, variableCollapseMaxDist = 1, + variableCollapseMinReads = 1, variableCollapseMinRatio = 1, + verbose = verbose), + V2 = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CSCVCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CSCV", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 10, + maxOverlap = 20, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + wildTypeForward = truth$WTV2, + nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, + forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE, + verbose = verbose, useTreeWTmatch = TRUE), + V3 = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CSCSCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CVCS", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 5, + maxOverlap = 15, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + wildTypeForward = truth$WTV3, + nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, + forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE, + verbose = verbose, useTreeWTmatch = TRUE), + collapseToAA = c("V2", "V3") + ) + + ## Filter out only reads with N/deletions in the variable sequence + keep <- !grepl("N|del", truth$truth$status) + + ## Truth + correct <- truth$truth[keep, ] %>% + dplyr::group_by(trueBarcode, trueV2aa, trueV3aa) %>% dplyr::tally() %>% + dplyr::arrange(dplyr::desc(n), trueBarcode, trueV2aa, trueV3aa) + + ## Obs + obs <- res$countAggregated %>% dplyr::arrange(desc(nbrReads), barcode, V2, V3) + + expect_equal(sum(obs$nbrReads), sum(keep)) + expect_equal(correct$trueBarcode, obs$barcode) + expect_equal(correct$trueV2aa, obs$V2) + expect_equal(correct$trueV3aa, obs$V3) + expect_equal(correct$n, obs$nbrReads) + expect_equal(res$outCombined$filterSummary[, "f4_nbrNoValidOverlap"], + sum(grepl("del", truth$truth$status))) ## del + expect_equal(res$outCombined$filterSummary[, "f6_nbrTooManyNinVar"], + sum(grepl("N", truth$truth$status))) ## N + expect_equal(res$outCombined$filterSummary[, "nbrRetained"], + sum(!grepl("N|del", truth$truth$status))) + expect_equal(res$filtSeparate$barcode[, "f6_nbrTooManyNinVar"], + sum(grepl("V1N", truth$truth$status))) + expect_equal(res$filtSeparate$barcode[, "nbrRetained"], + sum(!grepl("V1N", truth$truth$status))) + expect_equal(res$filtSeparate$V2[, "f4_nbrNoValidOverlap"], + sum(grepl("del", truth$truth$status))) + expect_equal(res$filtSeparate$V2[, "nbrRetained"], + sum(!grepl("del", truth$truth$status))) + expect_equal(res$filtSeparate$V3[, "f4_nbrNoValidOverlap"], + sum(grepl("del", truth$truth$status))) + expect_equal(res$filtSeparate$V3[, "nbrRetained"], + sum(!grepl("del", truth$truth$status))) + + ## ------------------------------------------------------------------------ + ## Don't filter by mutations in constant sequence, collapse by AA (only V2) + res <- linkMultipleVariants( + combinedDigestParams = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CVCVCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CVCV", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 20, + maxOverlap = 30, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + verbose = verbose), + barcode = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + elementsForward = "CVCSCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + avePhredMinForward = 20, variableCollapseMaxDist = 1, + variableCollapseMinReads = 1, variableCollapseMinRatio = 1, + verbose = verbose), + V2 = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CSCVCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CSCV", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 10, + maxOverlap = 20, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + wildTypeForward = truth$WTV2, + nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, + forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE, + verbose = verbose, useTreeWTmatch = TRUE), + V3 = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CSCSCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CVCS", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 5, + maxOverlap = 15, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + wildTypeForward = truth$WTV3, + nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, + forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE, + verbose = verbose, useTreeWTmatch = TRUE), + collapseToAA = "V2" + ) + + ## Filter out only reads with N/deletions in the variable sequence + keep <- !grepl("N|del", truth$truth$status) + + ## Truth + correct <- truth$truth[keep, ] %>% + dplyr::group_by(trueBarcode, trueV2aa, trueV3) %>% dplyr::tally() %>% + dplyr::arrange(dplyr::desc(n), trueBarcode, trueV2aa, trueV3) + + ## Obs + obs <- res$countAggregated %>% dplyr::arrange(desc(nbrReads), barcode, V2, V3) + + expect_equal(sum(obs$nbrReads), sum(keep)) + expect_equal(correct$trueBarcode, obs$barcode) + expect_equal(correct$trueV2aa, obs$V2) + expect_equal(correct$trueV3, obs$V3) + expect_equal(correct$n, obs$nbrReads) + expect_equal(res$outCombined$filterSummary[, "f4_nbrNoValidOverlap"], + sum(grepl("del", truth$truth$status))) ## del + expect_equal(res$outCombined$filterSummary[, "f6_nbrTooManyNinVar"], + sum(grepl("N", truth$truth$status))) ## N + expect_equal(res$outCombined$filterSummary[, "nbrRetained"], + sum(!grepl("N|del", truth$truth$status))) + expect_equal(res$filtSeparate$barcode[, "f6_nbrTooManyNinVar"], + sum(grepl("V1N", truth$truth$status))) + expect_equal(res$filtSeparate$barcode[, "nbrRetained"], + sum(!grepl("V1N", truth$truth$status))) + expect_equal(res$filtSeparate$V2[, "f4_nbrNoValidOverlap"], + sum(grepl("del", truth$truth$status))) + expect_equal(res$filtSeparate$V2[, "nbrRetained"], + sum(!grepl("del", truth$truth$status))) + expect_equal(res$filtSeparate$V3[, "f4_nbrNoValidOverlap"], + sum(grepl("del", truth$truth$status))) + expect_equal(res$filtSeparate$V3[, "nbrRetained"], + sum(!grepl("del", truth$truth$status))) + ## ------------------------------------------------------------------------ ## Filter out mutations in constant sequences res <- linkMultipleVariants(