BorchLab
diff --git a/‎.Rhistory‎
Lines changed: 63 additions & 0 deletions b/‎.Rhistory‎
Lines changed: 63 additions & 0 deletions
diff --git a/‎DESCRIPTION‎
Lines changed: 9 additions & 1 deletion b/‎DESCRIPTION‎
Lines changed: 9 additions & 1 deletion
diff --git a/‎NAMESPACE‎
Lines changed: 9 additions & 0 deletions b/‎NAMESPACE‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎R/RcppExports.R‎
Lines changed: 15 additions & 0 deletions b/‎R/RcppExports.R‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎R/buildNetwork.R‎
Lines changed: 184 additions & 0 deletions b/‎R/buildNetwork.R‎
Lines changed: 184 additions & 0 deletions
diff --git a/‎R/formatGenes.R‎
Lines changed: 2 additions & 7 deletions b/‎R/formatGenes.R‎
Lines changed: 2 additions & 7 deletions
diff --git a/‎R/utils.R‎
Lines changed: 22 additions & 0 deletions b/‎R/utils.R‎
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,63 @@
+setwd("~/Documents/GitHub/ImmApex")
+devtools::document()
+install.packages("RcppParallel")
+devtools::check()
+devtools::document()
+devtools::check()
+devtools::document()
+devtools::check()
+keras3_installed <- packageVersion("keras3") < "1.2.0"
+keras3_installed <- packageVersion("keras3") >= "1.2.0"
+# Check Python modules once at the start
+keras_installed <- reticulate::py_module_available("keras")
+# Check Python modules once at the start
+keras_installed <- reticulate::py_module_available("keras")
+numpy_installed <- reticulate::py_module_available("numpy")
+keras_installed && numpy_installed && keras3_installed
+devtools::document()
+devtools::check()
+devtools::check()
+devtools::document()
+devtools::check()
+roxygen2::roxygenise()
+devtools::check()
+devtools::document()
+devtools::document()
+devtools::check()
+devtools::document()
+devtools::check()
+// [[Rcpp::plugins(cpp11)]]
+devtools::document()
+devtools::check()
+devtools::check()
+devtools::document()
+devtools::check()
+technology <- NULL
+if (technology %in% c("TenX", "Adaptive")) {
+potential_col <- paste0(region, "_gene")
+if (!(potential_col %in% colnames(input.data))) {
+genes.updated <- paste0(region, "GeneName")
+} else {
+genes.updated <- potential_col
+}
+} else if (technology %in% c("AIRR")) {
+genes.updated <- paste0(region, "_call")
+} else {
+genes.updated <- region
+}
+if is.null(technology) technology <- NA
+is.null(technology)
+if (is.null(technology)) technology <- NA
+if (technology %in% c("TenX", "Adaptive")) {
+potential_col <- paste0(region, "_gene")
+if (!(potential_col %in% colnames(input.data))) {
+genes.updated <- paste0(region, "GeneName")
+} else {
+genes.updated <- potential_col
+}
+} else if (technology %in% c("AIRR")) {
+genes.updated <- paste0(region, "_call")
+} else {
+genes.updated <- region
+}
+devtools::check()
@@ -10,12 +10,15 @@ RoxygenNote: 7.3.2
 biocViews: Software, ImmunoOncology, SingleCell, Classification, Annotation, Sequencing, MotifAnnotation
 Depends: 
 	R (>= 4.3.0)
-Imports: hash,
+Imports: 
+        hash,
 	httr,
+        igraph,
 	keras3,
 	magrittr,
 	matrixStats,
 	methods,
+        Rcpp (>= 0.12.11),
 	reticulate,
 	rvest,
 	SingleCellExperiment,
@@ -26,14 +29,19 @@ Imports: hash,
         utils
 Suggests: 
 	BiocStyle,
+	ggraph,
 	ggplot2,
 	knitr,
+	graph,
 	markdown,
 	rmarkdown, 
 	scRepertoire,
 	spelling,
         testthat,
+	tidygraph,
 	viridis
+LinkingTo:
+        Rcpp
 VignetteBuilder: knitr
 Language: en-US
 URL: https://github.com/BorchLab/immApex/
 
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(adjacencyMatrix)
+export(buildNetwork)
 export(formatGenes)
 export(generateSequences)
 export(geometricEncoder)
@@ -15,11 +16,18 @@ export(propertyEncoder)
 export(sequenceDecoder)
 export(tokenizeSequences)
 export(variationalSequences)
+importFrom(Rcpp,evalCpp)
 importFrom(SingleCellExperiment,colData)
 importFrom(hash,hash)
 importFrom(httr,GET)
 importFrom(httr,content)
 importFrom(httr,user_agent)
+importFrom(igraph,E)
+importFrom(igraph,V)
+importFrom(igraph,`E<-`)
+importFrom(igraph,`V<-`)
+importFrom(igraph,graph_from_data_frame)
+importFrom(igraph,make_empty_graph)
 importFrom(keras3,callback_early_stopping)
 importFrom(keras3,compile)
 importFrom(keras3,fit)
@@ -61,3 +69,4 @@ importFrom(stringr,str_sort)
 importFrom(stringr,str_split)
 importFrom(tensorflow,tf)
 importFrom(utils,data)
+useDynLib(immApex, .registration = TRUE)
@@ -0,0 +1,15 @@
+# Generated by using Rcpp::compileAttributes() -> do not edit by hand
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+symmetric_deletion_lookup_cpp <- function(sequences, threshold) {
+    .Call(`_immApex_symmetric_deletion_lookup_cpp`, sequences, threshold)
+}
+
+edit_distance_threshold <- function(a, b, threshold) {
+    .Call(`_immApex_edit_distance_threshold`, a, b, threshold)
+}
+
+post_filter_candidates_seq <- function(candidatePairs, sequences, vGenes, jGenes, threshold, filterV, filterJ) {
+    .Call(`_immApex_post_filter_candidates_seq`, candidatePairs, sequences, vGenes, jGenes, threshold, filterV, filterJ)
+}
+
@@ -0,0 +1,184 @@
+#' Build Edit Distance Network Using Symmetric Deletion Lookup
+#'
+#' Constructs a weighted similarity network from biological sequences using a 
+#' symmetric deletion lookup strategy combined with a banded edit-distance 
+#' computation. The returned igraph object contains vertices representing the 
+#' input sequences and edges representing pairs of sequences whose edit distance 
+#' is less than or equal to the specified threshold. The edge attribute 
+#' \code{weight} stores the computed edit distance.
+#'
+#' This function supports both a character vector of sequences and a data frame.
+#' When provided a data frame, the user can specify the column containing sequences 
+#' using the \code{sequence.column} parameter. Additionally, candidate pairs can be 
+#' filtered by requiring matching V and/or J gene annotations 
+#' (see \code{filter.v} and \code{filter.j}). If filtering is enabled, the corresponding 
+#' gene annotation columns are required.
+#'
+#' @param input.data A character vector of AIR sequences, or a data frame containing 
+#'   sequence data.
+#' @param sequence.column A character string specifying the name of the column in 
+#'   \code{input.data} that contains the sequences. Default is \code{"sequence"}. 
+#'   This parameter is ignored when \code{input.data} is a character vector.
+#' @param threshold A single non-negative number specifying the maximum allowed 
+#'   edit distance. Only pairs of sequences with an edit distance less than or 
+#'   equal to this value will be connected. Default is \code{2}.
+#' @param filter.v Logical indicating whether to filter candidate pairs to only 
+#'   those that have matching V gene annotations. Default is \code{FALSE}. 
+#'   When \code{TRUE}, the V gene column name is resolved by 
+#'   \code{.get.genes.updated} from \code{technology} (\code{"v"} when 
+#'   \code{technology} is \code{NULL}) and must be present in the input data 
+#'   frame. When \code{FALSE}, a column named \code{v.gene} is carried along as 
+#'   a vertex attribute if it exists.
+#' @param filter.j Logical indicating whether to filter candidate pairs to only 
+#'   those that have matching J gene annotations. Default is \code{FALSE}. 
+#'   When \code{TRUE}, the J gene column name is resolved by 
+#'   \code{.get.genes.updated} from \code{technology} (\code{"j"} when 
+#'   \code{technology} is \code{NULL}) and must be present in the input data 
+#'   frame. When \code{FALSE}, a column named \code{j.gene} is carried along as 
+#'   a vertex attribute if it exists.
+#' @param technology The sequencing technology employed - \strong{'TenX'}, 
+#'   \strong{'Adaptive'}, or \strong{'AIRR'}. Only used to resolve the gene 
+#'   column names when \code{filter.v} or \code{filter.j} is \code{TRUE}. 
+#'   Default is \code{NULL}.
+#' @param simplify.format If applicable, remove the allelic designation, i.e. 
+#'   everything from the first \code{*} onwards (\strong{TRUE}), or retain all 
+#'   information (\strong{FALSE})
+#' @param simplify.families If applicable, remove the hyphenated designation, 
+#'   i.e. everything from the first \code{-} onwards (\strong{TRUE}), or retain 
+#'   all information (\strong{FALSE})
+#'
+#' @return An undirected igraph object representing the AIR similarity network. 
+#' Vertices contain the original sequences (and gene annotations, if available), 
+#' and each edge has a \code{weight} attribute corresponding to the computed edit 
+#' distance. If no edges meet the threshold, an undirected igraph object with 
+#' only vertices (carrying the same vertex attributes) is returned.
+#'
+#' @details
+#' The function first calls a C++ routine (via Rcpp) to perform a symmetric deletion 
+#' lookup, generating candidate pairs of sequences that might be within the specified 
+#' edit distance. It then uses a banded dynamic programming algorithm (also implemented 
+#' in C++) to compute the exact edit distance for each candidate pair. When using a 
+#' data frame input, the candidate pairs can be further filtered by requiring that 
+#' sequences have matching V and/or J gene values. Note that gene 
+#' filtering is only applied if the corresponding filtering flag is set to \code{TRUE}.
+#'
+#' @examples
+#' # Using a character vector of sequences:
+#' sequences <- c("CASSLGTDTQYF", "CASSPGTDTQYF", "CASSLGNDTQYF", "CASRLGNDTQYF")
+#' g <- buildNetwork(sequences, threshold = 2)
+#' plot(g)
+#'
+#' # Using a data frame with a custom sequence column:
+#' df <- data.frame(
+#'   mySeqs = c("CASSLGTDTQYF", "CASSPGTDTQYF", "CASSLGNDTQYF", "CASRLGNDTQYF"),
+#'   v = c("TRBV20", "TRBV20", "TRBV12", "TRBV20"),
+#'   j = c("TRBJ2-7", "TRBJ2-7", "TRBJ2-1", "TRBJ2-7")
+#' )
+#' g_df <- buildNetwork(df, 
+#'                      threshold = 2, 
+#'                      filter.v = TRUE, 
+#'                      filter.j = TRUE, 
+#'                      sequence.column = "mySeqs")
+#' plot(g_df)
+#'
+#' @importFrom igraph graph_from_data_frame make_empty_graph V E `V<-` `E<-`
+#' @importFrom Rcpp evalCpp
+#' @useDynLib immApex, .registration = TRUE
+#' @export
+buildNetwork <- function(input.data, 
+                         sequence.column = "sequence",
+                         threshold = 2, 
+                         filter.v = FALSE, 
+                         filter.j = FALSE, 
+                         technology = NULL, 
+                         simplify.format = TRUE, 
+                         simplify.families = TRUE) {
+  # Fail early with a readable message rather than inside the compiled code.
+  if (!is.numeric(threshold) || length(threshold) != 1L || 
+      is.na(threshold) || threshold < 0) {
+    stop("'threshold' must be a single non-negative number.")
+  }
+  
+  # Handle input based on its type.
+  if (is.data.frame(input.data)) {
+    req_cols <- sequence.column
+    
+    # Determine the V gene header. The column is only mandatory when filtering 
+    # on it; otherwise an existing "v.gene" column is used opportunistically.
+    if (filter.v) {
+      v.genes.header <- .get.genes.updated(input.data, technology, "v")
+      req_cols <- c(req_cols, v.genes.header)
+    } else if ("v.gene" %in% colnames(input.data)) {
+      v.genes.header <- "v.gene"
+    } else {
+      v.genes.header <- NULL
+    }
+    
+    # Determine the J gene header (same rules as for V).
+    if (filter.j) {
+      j.genes.header <- .get.genes.updated(input.data, technology, "j")
+      req_cols <- c(req_cols, j.genes.header)
+    } else if ("j.gene" %in% colnames(input.data)) {
+      j.genes.header <- "j.gene"
+    } else {
+      j.genes.header <- NULL
+    }
+    
+    # Check for required columns.
+    if (!all(req_cols %in% colnames(input.data))) {
+      stop(paste("Data frame must contain the following column(s):", 
+                 paste(req_cols, collapse = ", ")))
+    }
+    
+    sequences <- as.character(input.data[[sequence.column]])
+    # NA_character_ (not logical NA) keeps the placeholders character-typed 
+    # even when both simplify.* flags are FALSE.
+    if (!is.null(v.genes.header)) {
+      v_genes <- as.character(input.data[[v.genes.header]])
+    } else {
+      v_genes <- rep(NA_character_, length(sequences))
+    }
+    if (!is.null(j.genes.header)) {
+      j_genes <- as.character(input.data[[j.genes.header]])
+    } else {
+      j_genes <- rep(NA_character_, length(sequences))
+    }
+    
+  } else if (is.character(input.data)) {
+    sequences <- input.data
+    v_genes <- rep(NA_character_, length(sequences))
+    j_genes <- rep(NA_character_, length(sequences))
+  } else {
+    stop("Input must be either a character vector or a data frame with the required columns.")
+  }
+  
+  n <- length(sequences)
+  
+  # Keep only the text before the first delimiter. sub() leaves NA and 
+  # zero-length input untouched, so no special-casing is needed for sequences 
+  # without gene annotations.
+  if (simplify.format) {
+    v_genes <- sub("[*].*$", "", v_genes)
+    j_genes <- sub("[*].*$", "", j_genes)
+  }
+  
+  if (simplify.families) {
+    v_genes <- sub("[-].*$", "", v_genes)
+    j_genes <- sub("[-].*$", "", j_genes)
+  }
+  
+  # Get candidate pairs using the symmetric deletion lookup (implemented in C++).
+  candidate_pairs <- symmetric_deletion_lookup_cpp(sequences, threshold)
+  
+  # Post-filter candidates to verify edit distances and obtain edge weights.
+  edge_df <- post_filter_candidates_seq(candidate_pairs, 
+                                        sequences, 
+                                        v_genes, 
+                                        j_genes,
+                                        threshold, 
+                                        filter.v, 
+                                        filter.j)
+  
+  # Build a vertices data frame including all sequences. Gene columns are only 
+  # attached when at least one annotation is present.
+  has_v <- !all(is.na(v_genes))
+  has_j <- !all(is.na(j_genes))
+  vertices_df <- data.frame(name = as.character(seq_len(n)), 
+                            sequence = sequences,
+                            stringsAsFactors = FALSE)
+  if (has_v) {
+    vertices_df$v.gene <- v_genes
+  }
+  if (has_j) {
+    vertices_df$j.gene <- j_genes
+  }
+  
+  # Create the igraph object with weighted edges.
+  if (nrow(edge_df) == 0) {
+    # make_empty_graph() defaults to a directed graph; request an undirected 
+    # one and set the same vertex attributes as the branch below so both 
+    # outcomes have the same shape.
+    g <- make_empty_graph(n, directed = FALSE)
+    V(g)$name <- vertices_df$name
+    V(g)$sequence <- sequences
+    if (has_v) V(g)$v.gene <- v_genes
+    if (has_j) V(g)$j.gene <- j_genes
+  } else {
+    g <- graph_from_data_frame(d = as.data.frame(edge_df), directed = FALSE, vertices = vertices_df)
+    E(g)$weight <- as.numeric(edge_df$weight)
+  }
+  
+  return(g)
+}
@@ -38,10 +38,10 @@ formatGenes <- function(input.data,
   }
   if(!.is_seurat_or_se_object(input.data)) {
     if(technology %!in% c("TenX", "AIRR", "Adaptive")) {
-      stop("Please select a technology in the following category: 'TenX', 'AIRR', 'Adaptive', 'Omniscope'")
+      stop("Please select a technology in the following category: 'TenX', 'AIRR', 'Adaptive'")
     }
   }
-  
+  genes.updated <- .get.genes.updated(input.data, technology, region)
   if (.is_seurat_or_se_object(input.data)) {
     chain.1 <- getIR(input.data, 
                      chains = "TRA", 
@@ -50,17 +50,12 @@ formatGenes <- function(input.data,
                      chains = "TRB", 
                      sequence.type = "aa")[[1]]
     input.data <- rbind(chain.1, chain.2)
-    genes.updated <- region
   } else {
     input.data[input.data == ""] <- NA
     if(technology %in% c("TenX","Adaptive")) {
-      genes.updated <- paste0(region, "_gene")
       if(any(genes.updated %!in% colnames(input.data))) {
-        genes.updated <- paste0(region, "GeneName")
         input.data[,genes.updated][is.na(input.data[,genes.updated])] <- str_split(input.data[,"vGeneNameTies"][is.na(input.data[,genes.updated])], "[,]", simplify = TRUE)[,1]
       }
-    } else if (technology %in% c("AIRR")) {
-      genes.updated <- paste0(region, "_call")
     }
   }
 
 
@@ -91,3 +91,25 @@ array.dimnamer <- function(array) {
 .is_seurat_or_se_object <- function(obj) {
   .is_seurat_object(obj) || .is_se_object(obj)
 }
+
+# Resolve the name(s) of the gene annotation column(s) for `region`.
+#
+# input.data: a Seurat/SingleCellExperiment object or a data frame.
+# technology: "TenX", "Adaptive", "AIRR", or NULL/anything else.
+# region:     character vector of gene loci (e.g. "v", or c("v", "j")).
+#
+# Returns a character vector the same length as `region`:
+#   - single-cell objects and unrecognised/NULL technology: `region` unchanged
+#   - "TenX"/"Adaptive": "<region>_gene" when every such column exists in
+#     `input.data`, otherwise "<region>GeneName"
+#   - "AIRR": "<region>_call"
+.get.genes.updated <- function(input.data, technology, region) {
+  if (.is_seurat_or_se_object(input.data)) {
+    genes.updated <- region
+  } else {
+    # NULL would make the %in% tests below zero-length; NA falls through to
+    # the final else branch.
+    if (is.null(technology)) technology <- NA
+    if (technology %in% c("TenX", "Adaptive")) {
+      potential_col <- paste0(region, "_gene")
+      # all() collapses the membership test to a scalar so a vector `region`
+      # does not produce a length > 1 `if` condition (an error in R >= 4.2).
+      if (!all(potential_col %in% colnames(input.data))) {
+        genes.updated <- paste0(region, "GeneName")
+      } else {
+        genes.updated <- potential_col
+      }
+    } else if (technology %in% c("AIRR")) {
+      genes.updated <- paste0(region, "_call")
+    } else {
+      genes.updated <- region
+    }
+  }
+  return(genes.updated)
+}
+
Original file line number	Diff line number	Diff line change
`@@ -38,10 +38,10 @@ formatGenes <- function(input.data,`
`38`	`38`	`}`
`39`	`39`	`if(!.is_seurat_or_se_object(input.data)) {`
`40`	`40`	`if(technology %!in% c("TenX", "AIRR", "Adaptive")) {`
`41`		`- stop("Please select a technology in the following category: 'TenX', 'AIRR', 'Adaptive', 'Omniscope'")`
	`41`	`+ stop("Please select a technology in the following category: 'TenX', 'AIRR', 'Adaptive'")`
`42`	`42`	`}`
`43`	`43`	`}`
`44`		`-`
	`44`	`+ genes.updated <- .get.genes.updated(input.data, technology, region)`
`45`	`45`	`if (.is_seurat_or_se_object(input.data)) {`
`46`	`46`	`chain.1 <- getIR(input.data,`
`47`	`47`	`chains = "TRA",`
`@@ -50,17 +50,12 @@ formatGenes <- function(input.data,`
`50`	`50`	`chains = "TRB",`
`51`	`51`	`sequence.type = "aa")[[1]]`
`52`	`52`	`input.data <- rbind(chain.1, chain.2)`
`53`		`- genes.updated <- region`
`54`	`53`	`} else {`
`55`	`54`	`input.data[input.data == ""] <- NA`
`56`	`55`	`if(technology %in% c("TenX","Adaptive")) {`
`57`		`- genes.updated <- paste0(region, "_gene")`
`58`	`56`	`if(any(genes.updated %!in% colnames(input.data))) {`
`59`		`- genes.updated <- paste0(region, "GeneName")`
`60`	`57`	`input.data[,genes.updated][is.na(input.data[,genes.updated])] <- str_split(input.data[,"vGeneNameTies"][is.na(input.data[,genes.updated])], "[,]", simplify = TRUE)[,1]`
`61`	`58`	`}`
`62`		`- } else if (technology %in% c("AIRR")) {`
`63`		`- genes.updated <- paste0(region, "_call")`
`64`	`59`	`}`
`65`	`60`	`}`
`66`	`61`