|
| 1 | +#' Build Edit Distance Network Using Symmetric Deletion Lookup |
| 2 | +#' |
| 3 | +#' Constructs a weighted similarity network from biological sequences using a |
| 4 | +#' symmetric deletion lookup strategy combined with a banded edit-distance |
| 5 | +#' computation. The returned igraph object contains vertices representing the |
| 6 | +#' input sequences and edges representing pairs of sequences whose edit distance |
| 7 | +#' is less than or equal to the specified threshold. The edge attribute |
| 8 | +#' \code{weight} stores the computed edit distance. |
| 9 | +#' |
| 10 | +#' This function supports both a character vector of sequences and a data frame. |
| 11 | +#' When provided a data frame, the user can specify the column containing sequences |
| 12 | +#' using the \code{sequence.column} parameter. Additionally, candidate pairs can be |
| 13 | +#' filtered by requiring matching \code{v.gene} and/or \code{j.gene} annotations |
| 14 | +#' (see \code{filter.v} and \code{filter.j}). If filtering is enabled, the corresponding |
| 15 | +#' gene annotation columns are required. |
| 16 | +#' |
| 17 | +#' @param input.data A character vector of AIR sequences, or a data frame containing |
| 18 | +#' sequence data. |
| 19 | +#' @param sequence.column A character string specifying the name of the column in |
| 20 | +#' \code{input.data} that contains the sequences. Default is \code{"sequence"}. |
| 21 | +#' This parameter is ignored when \code{input.data} is a character vector. |
| 22 | +#' @param threshold An integer specifying the maximum allowed edit distance. Only |
| 23 | +#' pairs of sequences with an edit distance less than or equal to this value |
| 24 | +#' will be connected. Default is \code{2}. |
| 25 | +#' @param filter.v Logical indicating whether to filter candidate pairs to only |
| 26 | +#' those that have matching \code{v.gene} family annotations. Default is |
| 27 | +#' \code{FALSE}. When \code{TRUE}, the input data frame must contain a column |
| 28 | +#' with V gene annotations, either named \code{v.gene} or determined by |
| 29 | +#' \code{.get.genes.updated}. |
| 30 | +#' @param filter.j Logical indicating whether to filter candidate pairs to only |
| 31 | +#' those that have matching \code{j.gene} family annotations. Default is |
| 32 | +#' \code{FALSE}. When \code{TRUE}, the input data frame must contain a column |
| 33 | +#' with J gene annotations, either named \code{j.gene} or determined by |
| 34 | +#' \code{.get.genes.updated}. |
| 35 | +#' @param technology The sequencing technology employed - \strong{'TenX'}, |
| 36 | +#' \strong{'Adaptive'}, or \strong{'AIRR'}. |
| 37 | +#' @param simplify.format If applicable, remove the allelic designation (\strong{TRUE}) or |
| 38 | +#' retain all information (\strong{FALSE}) |
| 39 | +#' @param simplify.families If applicable, remove the hyphenated designation |
| 40 | +#' (\strong{TRUE}) or retain all information (\strong{FALSE}) |
| 41 | +#' |
| 42 | +#' @return An igraph object representing the AIR similarity network. Vertices |
| 43 | +#' contain the original sequences (and gene annotations, if available), and each |
| 44 | +#' edge has a \code{weight} attribute corresponding to the computed edit distance. |
| 45 | +#' If no edges meet the threshold, an igraph object with only vertices is returned. |
| 46 | +#' |
| 47 | +#' @details |
| 48 | +#' The function first calls a C++ routine (via Rcpp) to perform a symmetric deletion |
| 49 | +#' lookup, generating candidate pairs of sequences that might be within the specified |
| 50 | +#' edit distance. It then uses a banded dynamic programming algorithm (also implemented |
| 51 | +#' in C++) to compute the exact edit distance for each candidate pair. When using a |
| 52 | +#' data frame input, the candidate pairs can be further filtered by requiring that |
| 53 | +#' sequences have matching \code{v.gene} and/or \code{j.gene} values. Note that gene |
| 54 | +#' filtering is only applied if the corresponding filtering flag is set to \code{TRUE}. |
| 55 | +#' |
| 56 | +#' @examples |
| 57 | +#' # Using a character vector of sequences: |
| 58 | +#' sequences <- c("CASSLGTDTQYF", "CASSPGTDTQYF", "CASSLGNDTQYF", "CASRLGNDTQYF") |
| 59 | +#' g <- buildNetwork(sequences, threshold = 2) |
| 60 | +#' plot(g) |
| 61 | +#' |
| 62 | +#' # Using a data frame with a custom sequence column: |
| 63 | +#' df <- data.frame( |
| 64 | +#' mySeqs = c("CASSLGTDTQYF", "CASSPGTDTQYF", "CASSLGNDTQYF", "CASRLGNDTQYF"), |
| 65 | +#' v = c("TRBV20", "TRBV20", "TRBV12", "TRBV20"), |
| 66 | +#' j = c("TRBJ2-7", "TRBJ2-7", "TRBJ2-1", "TRBJ2-7") |
| 67 | +#' ) |
| 68 | +#' g_df <- buildNetwork(df, |
| 69 | +#' threshold = 2, |
| 70 | +#' filter.v = TRUE, |
| 71 | +#' filter.j = TRUE, |
| 72 | +#' sequence.column = "mySeqs") |
| 73 | +#' plot(g_df) |
| 74 | +#' |
| 75 | +#' @importFrom igraph graph_from_data_frame make_empty_graph V E `V<-` `E<-` |
| 76 | +#' @importFrom Rcpp evalCpp |
| 77 | +#' @useDynLib immApex, .registration = TRUE |
| 78 | +#' @export |
| 79 | +buildNetwork <- function(input.data, |
| 80 | + sequence.column = "sequence", |
| 81 | + threshold = 2, |
| 82 | + filter.v = FALSE, |
| 83 | + filter.j = FALSE, |
| 84 | + technology = NULL, |
| 85 | + simplify.format = TRUE, |
| 86 | + simplify.families = TRUE) { |
| 87 | + # Handle input based on its type. |
| 88 | + if (is.data.frame(input.data)) { |
| 89 | + req_cols <- sequence.column |
| 90 | + |
| 91 | + # Determine the V gene header. |
| 92 | + if (filter.v) { |
| 93 | + v.genes.header <- .get.genes.updated(input.data, technology, "v") |
| 94 | + req_cols <- c(req_cols, v.genes.header) |
| 95 | + } else if ("v.gene" %in% colnames(input.data)) { |
| 96 | + v.genes.header <- "v.gene" |
| 97 | + } else { |
| 98 | + v.genes.header <- NULL |
| 99 | + } |
| 100 | + |
| 101 | + # Determine the J gene header. |
| 102 | + if (filter.j) { |
| 103 | + j.genes.header <- .get.genes.updated(input.data, technology, "j") |
| 104 | + req_cols <- c(req_cols, j.genes.header) |
| 105 | + } else if ("j.gene" %in% colnames(input.data)) { |
| 106 | + j.genes.header <- "j.gene" |
| 107 | + } else { |
| 108 | + j.genes.header <- NULL |
| 109 | + } |
| 110 | + |
| 111 | + # Check for required columns. |
| 112 | + if (!all(req_cols %in% colnames(input.data))) { |
| 113 | + stop(paste("Data frame must contain the following column(s):", |
| 114 | + paste(req_cols, collapse = ", "))) |
| 115 | + } |
| 116 | + |
| 117 | + sequences <- as.character(input.data[[sequence.column]]) |
| 118 | + if (!is.null(v.genes.header)) { |
| 119 | + v_genes <- as.character(input.data[[v.genes.header]]) |
| 120 | + } else { |
| 121 | + v_genes <- rep(NA, length(sequences)) |
| 122 | + } |
| 123 | + if (!is.null(j.genes.header)) { |
| 124 | + j_genes <- as.character(input.data[[j.genes.header]]) |
| 125 | + } else { |
| 126 | + j_genes <- rep(NA, length(sequences)) |
| 127 | + } |
| 128 | + |
| 129 | + } else if (is.character(input.data)) { |
| 130 | + sequences <- input.data |
| 131 | + v_genes <- rep(NA, length(sequences)) |
| 132 | + j_genes <- rep(NA, length(sequences)) |
| 133 | + } else { |
| 134 | + stop("Input must be either a character vector or a data frame with the required columns.") |
| 135 | + } |
| 136 | + |
| 137 | + n <- length(sequences) |
| 138 | + |
| 139 | + if(simplify.format) { |
| 140 | + v_genes <- str_split(v_genes, "[*]", simplify = TRUE)[,1] |
| 141 | + j_genes <- str_split(j_genes, "[*]", simplify = TRUE)[,1] |
| 142 | + } |
| 143 | + |
| 144 | + if(simplify.families) { |
| 145 | + v_genes <- str_split(v_genes, "[-]", simplify = TRUE)[,1] |
| 146 | + j_genes <- str_split(j_genes, "[-]", simplify = TRUE)[,1] |
| 147 | + } |
| 148 | + |
| 149 | + # Get candidate pairs using the symmetric deletion lookup (implemented in C++). |
| 150 | + candidate_pairs <- symmetric_deletion_lookup_cpp(sequences, threshold) |
| 151 | + |
| 152 | + # Post-filter candidates to verify edit distances and obtain edge weights. |
| 153 | + edge_df <- post_filter_candidates_seq(candidate_pairs, |
| 154 | + sequences, |
| 155 | + v_genes, |
| 156 | + j_genes, |
| 157 | + threshold, |
| 158 | + filter.v, |
| 159 | + filter.j) |
| 160 | + |
| 161 | + # Build a vertices data frame including all sequences. |
| 162 | + vertices_df <- data.frame(name = as.character(seq_len(n)), |
| 163 | + sequence = sequences, |
| 164 | + stringsAsFactors = FALSE) |
| 165 | + if (!all(is.na(v_genes))) { |
| 166 | + vertices_df$v.gene <- v_genes |
| 167 | + } |
| 168 | + if (!all(is.na(j_genes))) { |
| 169 | + vertices_df$j.gene <- j_genes |
| 170 | + } |
| 171 | + |
| 172 | + # Create the igraph object with weighted edges. |
| 173 | + if (nrow(edge_df) == 0) { |
| 174 | + g <- make_empty_graph(n) |
| 175 | + V(g)$sequence <- sequences |
| 176 | + if (!all(is.na(v_genes))) V(g)$v.gene <- v_genes |
| 177 | + if (!all(is.na(j_genes))) V(g)$j.gene <- j_genes |
| 178 | + } else { |
| 179 | + g <- graph_from_data_frame(d = as.data.frame(edge_df), directed = FALSE, vertices = vertices_df) |
| 180 | + E(g)$weight <- as.numeric(edge_df$weight) |
| 181 | + } |
| 182 | + |
| 183 | + return(g) |
| 184 | +} |
0 commit comments