Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(associateBarcodes)
export(calcNearestStringDist)
export(calculateFitnessScore)
export(calculateRelativeFC)
Expand Down Expand Up @@ -76,6 +77,7 @@ importFrom(ggplot2,geom_point)
importFrom(ggplot2,geom_text)
importFrom(ggplot2,ggplot)
importFrom(ggplot2,labs)
importFrom(ggplot2,scale_color_manual)
importFrom(ggplot2,scale_fill_continuous)
importFrom(ggplot2,scale_fill_discrete)
importFrom(ggplot2,scale_x_continuous)
Expand Down
114 changes: 114 additions & 0 deletions R/associateBarcodes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#' Associate barcode(s) to variants
#'
#' @param countTable A data frame containing barcode sequences, variant
#' sequences and corresponding read counts for each barcode-variant pair.
#' @param barcodeCol,variantCol,countCol Character scalars indicating the
#' names of the columns of \code{countTable} containing barcode
#' sequences, variant IDs and read counts, respectively.
#' @param minCount Numeric scalar. Pairs with fewer reads will be flagged
#' for exclusion.
#' @param minFracOfBarcodeCount Numeric scalar. Pairs where the read count
#' is lower than \code{minFracOfBarcodeCount} * total barcode count will
#' be flagged for exclusion.
#'
#' @export
#' @author Charlotte Soneson
#'
#' @return A list with two elements: \code{counts} (a data frame with counts)
#' and \code{plots} (a list of ggplot objects).
#'
#' @importFrom ggplot2 scale_color_manual
#' @importFrom dplyr %>% group_by mutate ungroup filter arrange desc
#' summarize
#' @importFrom rlang .data
#' @importFrom ggplot2 ggplot aes geom_histogram theme_bw labs
#' scale_color_manual scale_x_log10 scale_y_log10 geom_point
#'
associateBarcodes <- function(countTable, barcodeCol, variantCol,
countCol = "nbrReads", minCount = 10,
minFracOfBarcodeCount = 0.5) {
.assertScalar(x = barcodeCol, type = "character",
validValues = colnames(countTable))
.assertVector(x = countTable[[barcodeCol]], type = "character")
.assertScalar(x = variantCol, type = "character",
validValues = colnames(countTable))
.assertVector(x = countTable[[variantCol]], type = "character")
.assertScalar(x = countCol, type = "character",
validValues = colnames(countTable))
.assertVector(x = countTable[[countCol]], type = "numeric")
.assertScalar(x = minCount, type = "numeric", rngIncl = c(0, Inf))
.assertScalar(x = minFracOfBarcodeCount, type = "numeric", rngIncl = c(0, 1))

## Initialize plot list
plots <- list()

## Add columns with total counts for barcode/variant, with and without
## filtering
countTable <- countTable %>%
## first aggregate all rows with the same barcode/variant pair,
## in case that wasn't already done
dplyr::group_by(.data[[barcodeCol]], .data[[variantCol]]) %>%
dplyr::summarize("{countCol}" := sum(.data[[countCol]])) %>%
dplyr::ungroup() %>%
## flag pairs with < minCount reads
dplyr::mutate(keep = (.data[[countCol]] >= minCount)) %>%
## add total barcode counts
dplyr::group_by(.data[[barcodeCol]]) %>%
dplyr::mutate(totalBarcodeCountAll = sum(.data[[countCol]])) %>%
dplyr::mutate(totalBarcodeCountFilt = sum(.data[[countCol]][.data$keep])) %>%
dplyr::ungroup() %>%
## add total variant counts
dplyr::group_by(.data[[variantCol]]) %>%
dplyr::mutate(totalVariantCountAll = sum(.data[[countCol]])) %>%
dplyr::mutate(totalVariantCountFilt = sum(.data[[countCol]][.data$keep])) %>%
dplyr::ungroup()
plots[["totalBcCountAll"]] <-
ggplot(countTable %>% dplyr::filter(!duplicated(.data[[barcodeCol]])),
aes(x = .data$totalBarcodeCountAll)) +
geom_histogram(bins = 100, fill = "lightgrey") +
theme_bw() + scale_x_log10() +
labs(title = "Total barcode count (all)", x = "Total barcode count")
plots[["totalBcCountFiltered"]] <-
ggplot(countTable %>% dplyr::filter(!duplicated(.data[[barcodeCol]])),
aes(x = .data$totalBarcodeCountFilt)) +
geom_histogram(bins = 100, fill = "lightgrey") +
theme_bw() + scale_x_log10() +
labs(title = "Total barcode count (filtered)", x = "Total barcode count")

## Calculate relative frequency for each pair as the ratio between the
## count for the pair and the total count for the barcode or variant,
## respectively
countTable <- countTable %>%
dplyr::mutate(fracOfTotalBarcodeCountFilt =
.data[[countCol]]/.data$totalBarcodeCountFilt,
fracOfTotalVariantCountFilt =
.data[[countCol]]/.data$totalVariantCountFilt)

## Order by decreasing frequency and keep only the top hit for each
## barcode
countTable <- countTable %>%
dplyr::arrange(dplyr::desc(.data[[countCol]])) %>%
dplyr::mutate(keep = .data$keep & !duplicated(.data[[barcodeCol]]))

## Flag pairs where the count/total barcode count is too low
countTable <- countTable %>%
dplyr::mutate(keep = .data$keep &
(.data$fracOfTotalBarcodeCountFilt >=
minFracOfBarcodeCount))

## Plot count for pair vs total count for barcode
plots[["pairVsBarcodeCount"]] <-
ggplot(countTable, aes(x = .data$totalBarcodeCountFilt,
y = .data[[countCol]],
color = .data$keep)) +
geom_point(size = 0.5) + theme_bw() +
scale_color_manual(values = c(`TRUE` = "black", `FALSE` = "red"),
name = "Keep") +
scale_x_log10() + scale_y_log10() +
labs(x = "Total barcode count", y = "Barcode/variant pair count")

return(list(counts = countTable, plots = plots))
}



42 changes: 37 additions & 5 deletions R/linkMultipleVariants.R
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,11 @@
#' @param ... Additional arguments providing arguments to \code{digestFastqs}
#' for the separate runs (processing each variable sequence in turn).
#' Each argument must be a named list of arguments to \code{digestFastqs}.
#' @param collapseToAA Either \code{TRUE}, \code{FALSE}, or a character vector
#' indicating for which of the separate runs the sequences should be
#' collapsed to the amino acid mutant name rather than the codon- or
#' nucleotide-level name. \code{TRUE} is equivalent to providing the
#' names of all lists provided in \code{...}.
#'
#' @author Charlotte Soneson
#'
Expand All @@ -75,13 +80,15 @@
#' \item convSeparate - a list of conversion tables from the respective
#' separate runs.
#' \item outCombined - the \code{digestFastqs} output for the combined run.
#' \item filtSeparate - a list of filtering tables for the separate runs.
#' }
#'
#' @importFrom dplyr select rename group_by summarize across matches mutate
#' @importFrom tidyr separate separate_rows
#' @importFrom rlang .data
#'
linkMultipleVariants <- function(combinedDigestParams = list(), ...) {
linkMultipleVariants <- function(combinedDigestParams = list(), ...,
collapseToAA = FALSE) {

## Process additional arguments
paramsSeparate <- list(...)
Expand Down Expand Up @@ -111,6 +118,24 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) {
as.list(defaults[!(names(defaults) %in%
names(combinedDigestParams))]))

## Make sure collapseToAA is a vector containing the names of the
## runs where sequences should be collapsed to AAs
if (length(collapseToAA) == 1) {
if (is.logical(collapseToAA)) {
if (collapseToAA) {
collapseToAA <- names(paramsSeparate)
} else {
collapseToAA <- character(0)
}
} else {
.assertVector(x = collapseToAA, type = "character",
validValues = names(paramsSeparate))
}
} else {
.assertVector(x = collapseToAA, type = "character",
validValues = names(paramsSeparate))
}

## --------------------------------------------------------------------- ##
## Checks
## --------------------------------------------------------------------- ##
Expand Down Expand Up @@ -227,17 +252,24 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) {
## Conversion tables
convSeparate <- lapply(outSeparate, function(out) {
out$summaryTable %>%
dplyr::select(.data$mutantName, .data$sequence) %>%
dplyr::select(.data$mutantName, .data$mutantNameAA,
.data$sequence) %>%
tidyr::separate_rows(.data$sequence, sep = ",")
})

## --------------------------------------------------------------------- ##
## Replace naive sequences with corrected ones
## --------------------------------------------------------------------- ##
for (i in names(paramsSeparate)) {
countCombined[[i]] <-
convSeparate[[i]]$mutantName[match(countCombined[[i]],
convSeparate[[i]]$sequence)]
if (i %in% collapseToAA) {
countCombined[[i]] <-
convSeparate[[i]]$mutantNameAA[match(countCombined[[i]],
convSeparate[[i]]$sequence)]
} else {
countCombined[[i]] <-
convSeparate[[i]]$mutantName[match(countCombined[[i]],
convSeparate[[i]]$sequence)]
}
}

## Filter out rows with NAs (the sequence was not retained)
Expand Down
Binary file modified inst/extdata/multipleVariableRegions_truth.rds
Binary file not shown.
43 changes: 43 additions & 0 deletions inst/scripts/generateExampleDataForLinkMultipleVariants.R
Original file line number Diff line number Diff line change
Expand Up @@ -187,10 +187,53 @@ WTV3 <- c(V3 = V3)
constFwd <- "ACGTGATTGATCCCGGTAAATACCGG"
constRev <- "TAAATACCGGACCTGA"

## Get AA mutant name as well
V2varAA <- vapply(V2var, function(w) {
w <- strsplit(w, "\\.")[[1]]
if (w[3] != "WT") {
pos0 <- ceiling(as.numeric(w[2]) / 3) - 1
wt <- strsplit(WTV2[w[1]], "")[[1]]
wt_codon <- paste(wt[(3 * pos0 + 1):(3 * pos0 + 3)], collapse = "")
wt_aa <- mutscan:::translateString(wt_codon)
wt[as.numeric(w[2])] <- w[3]
mut_codon <- paste(wt[(3 * pos0 + 1):(3 * pos0 + 3)], collapse = "")
mut_aa <- mutscan:::translateString(mut_codon)
if (mut_aa == wt_aa) {
paste(c(w[1], 0, "WT"), collapse = ".")
} else {
paste(c(w[1], pos0 + 1, mut_aa), collapse = ".")
}
} else {
paste(w, collapse = ".")
}
}, "")
V3varAA <- vapply(V3var, function(w) {
w <- strsplit(w, "\\.")[[1]]
if (w[3] != "WT") {
pos0 <- ceiling(as.numeric(w[2]) / 3) - 1
wt <- strsplit(WTV3[w[1]], "")[[1]]
wt_codon <- paste(wt[(3 * pos0 + 1):(3 * pos0 + 3)], collapse = "")
wt_aa <- mutscan:::translateString(wt_codon)
wt[as.numeric(w[2])] <- w[3]
mut_codon <- paste(wt[(3 * pos0 + 1):(3 * pos0 + 3)], collapse = "")
mut_aa <- mutscan:::translateString(mut_codon)
if (mut_aa == wt_aa) {
paste(c(w[1], 0, "WT"), collapse = ".")
} else {
paste(c(w[1], pos0 + 1, mut_aa), collapse = ".")
}
} else {
paste(w, collapse = ".")
}
}, "")


truth <- data.frame(read = paste0("R", seq_len(nReads)),
trueBarcode = barcode,
trueV2 = V2var,
trueV3 = V3var,
trueV2aa = V2varAA,
trueV3aa = V3varAA,
obsBarcode = v1,
status = status)

Expand Down
40 changes: 40 additions & 0 deletions man/associateBarcodes.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 8 additions & 1 deletion man/linkMultipleVariants.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading