Skip to content

Commit b8e0c77

Browse files
Merge pull request #8 from BorchLab/dev
adding buildNetwork
2 parents 905f2dd + 5394c18 commit b8e0c77

File tree

17 files changed

+750
-13
lines changed

17 files changed

+750
-13
lines changed

.Rhistory

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
setwd("~/Documents/GitHub/ImmApex")
2+
devtools::document()
3+
install.packages("RcppParallel")
4+
devtools::check()
5+
devtools::document()
6+
devtools::check()
7+
devtools::document()
8+
devtools::check()
9+
keras3_installed <- packageVersion("keras3") < "1.2.0"
10+
keras3_installed <- packageVersion("keras3") >= "1.2.0"
11+
# Check Python modules once at the start
12+
keras_installed <- reticulate::py_module_available("keras")
13+
# Check Python modules once at the start
14+
keras_installed <- reticulate::py_module_available("keras")
15+
numpy_installed <- reticulate::py_module_available("numpy")
16+
keras_installed && numpy_installed && keras3_installed
17+
devtools::document()
18+
devtools::check()
19+
devtools::check()
20+
devtools::document()
21+
devtools::check()
22+
roxygen2::roxygenise()
23+
devtools::check()
24+
devtools::document()
25+
devtools::document()
26+
devtools::check()
27+
devtools::document()
28+
devtools::check()
29+
// [[Rcpp::plugins(cpp11)]]
30+
devtools::document()
31+
devtools::check()
32+
devtools::check()
33+
devtools::document()
34+
devtools::check()
35+
technology <- NULL
36+
if (technology %in% c("TenX", "Adaptive")) {
37+
potential_col <- paste0(region, "_gene")
38+
if (!(potential_col %in% colnames(input.data))) {
39+
genes.updated <- paste0(region, "GeneName")
40+
} else {
41+
genes.updated <- potential_col
42+
}
43+
} else if (technology %in% c("AIRR")) {
44+
genes.updated <- paste0(region, "_call")
45+
} else {
46+
genes.updated <- region
47+
}
48+
if is.null(technology) technology <- NA
49+
is.null(technology)
50+
if (is.null(technology)) technology <- NA
51+
if (technology %in% c("TenX", "Adaptive")) {
52+
potential_col <- paste0(region, "_gene")
53+
if (!(potential_col %in% colnames(input.data))) {
54+
genes.updated <- paste0(region, "GeneName")
55+
} else {
56+
genes.updated <- potential_col
57+
}
58+
} else if (technology %in% c("AIRR")) {
59+
genes.updated <- paste0(region, "_call")
60+
} else {
61+
genes.updated <- region
62+
}
63+
devtools::check()

DESCRIPTION

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,15 @@ RoxygenNote: 7.3.2
1010
biocViews: Software, ImmunoOncology, SingleCell, Classification, Annotation, Sequencing, MotifAnnotation
1111
Depends:
1212
R (>= 4.3.0)
13-
Imports: hash,
13+
Imports:
14+
hash,
1415
httr,
16+
igraph,
1517
keras3,
1618
magrittr,
1719
matrixStats,
1820
methods,
21+
Rcpp (>= 0.12.11),
1922
reticulate,
2023
rvest,
2124
SingleCellExperiment,
@@ -26,14 +29,19 @@ Imports: hash,
2629
utils
2730
Suggests:
2831
BiocStyle,
32+
ggraph,
2933
ggplot2,
3034
knitr,
35+
graph,
3136
markdown,
3237
rmarkdown,
3338
scRepertoire,
3439
spelling,
3540
testthat,
41+
tidygraph,
3642
viridis
43+
LinkingTo:
44+
Rcpp
3745
VignetteBuilder: knitr
3846
Language: en-US
3947
URL: https://github.com/BorchLab/immApex/

NAMESPACE

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Generated by roxygen2: do not edit by hand
22

33
export(adjacencyMatrix)
4+
export(buildNetwork)
45
export(formatGenes)
56
export(generateSequences)
67
export(geometricEncoder)
@@ -15,11 +16,18 @@ export(propertyEncoder)
1516
export(sequenceDecoder)
1617
export(tokenizeSequences)
1718
export(variationalSequences)
19+
importFrom(Rcpp,evalCpp)
1820
importFrom(SingleCellExperiment,colData)
1921
importFrom(hash,hash)
2022
importFrom(httr,GET)
2123
importFrom(httr,content)
2224
importFrom(httr,user_agent)
25+
importFrom(igraph,E)
26+
importFrom(igraph,V)
27+
importFrom(igraph,`E<-`)
28+
importFrom(igraph,`V<-`)
29+
importFrom(igraph,graph_from_data_frame)
30+
importFrom(igraph,make_empty_graph)
2331
importFrom(keras3,callback_early_stopping)
2432
importFrom(keras3,compile)
2533
importFrom(keras3,fit)
@@ -61,3 +69,4 @@ importFrom(stringr,str_sort)
6169
importFrom(stringr,str_split)
6270
importFrom(tensorflow,tf)
6371
importFrom(utils,data)
72+
useDynLib(immApex, .registration = TRUE)

R/RcppExports.R

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
2+
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3+
4+
symmetric_deletion_lookup_cpp <- function(sequences, threshold) {
5+
.Call(`_immApex_symmetric_deletion_lookup_cpp`, sequences, threshold)
6+
}
7+
8+
edit_distance_threshold <- function(a, b, threshold) {
9+
.Call(`_immApex_edit_distance_threshold`, a, b, threshold)
10+
}
11+
12+
post_filter_candidates_seq <- function(candidatePairs, sequences, vGenes, jGenes, threshold, filterV, filterJ) {
13+
.Call(`_immApex_post_filter_candidates_seq`, candidatePairs, sequences, vGenes, jGenes, threshold, filterV, filterJ)
14+
}
15+

R/buildNetwork.R

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
#' Build Edit Distance Network Using Symmetric Deletion Lookup
2+
#'
3+
#' Constructs a weighted similarity network from biological sequences using a
4+
#' symmetric deletion lookup strategy combined with a banded edit-distance
5+
#' computation. The returned igraph object contains vertices representing the
6+
#' input sequences and edges representing pairs of sequences whose edit distance
7+
#' is less than or equal to the specified threshold. The edge attribute
8+
#' \code{weight} stores the computed edit distance.
9+
#'
10+
#' This function supports both a character vector of sequences and a data frame.
11+
#' When provided a data frame, the user can specify the column containing sequences
12+
#' using the \code{sequence.column} parameter. Additionally, candidate pairs can be
13+
#' filtered by requiring matching \code{v.gene} and/or \code{j.gene} annotations
14+
#' (see \code{filter.v} and \code{filter.j}). If filtering is enabled, the corresponding
15+
#' gene annotation columns are required.
16+
#'
17+
#' @param input.data A character vector of AIR sequences, or a data frame containing
18+
#' sequence data.
19+
#' @param sequence.column A character string specifying the name of the column in
20+
#' \code{input.data} that contains the sequences. Default is \code{"sequence"}.
21+
#' This parameter is ignored when \code{input.data} is a character vector.
22+
#' @param threshold An integer specifying the maximum allowed edit distance. Only
23+
#' pairs of sequences with an edit distance less than or equal to this value
24+
#' will be connected. Default is \code{2}.
25+
#' @param filter.v Logical indicating whether to filter candidate pairs to only
26+
#' those that have matching \code{v.gene} family annotations. Default is
27+
#' \code{FALSE}. When \code{TRUE}, the input data frame must contain a column
28+
#' with V gene annotations, either named \code{v.gene} or determined by
29+
#' \code{.get.genes.updated}.
30+
#' @param filter.j Logical indicating whether to filter candidate pairs to only
31+
#' those that have matching \code{j.gene} family annotations. Default is
32+
#' \code{FALSE}. When \code{TRUE}, the input data frame must contain a column
33+
#' with J gene annotations, either named \code{j.gene} or determined by
34+
#' \code{.get.genes.updated}.
35+
#' @param technology The sequencing technology employed - \strong{'TenX'},
36+
#' \strong{'Adaptive'}, or \strong{'AIRR'}.
37+
#' @param simplify.format If applicable, remove the allelic designation (\strong{TRUE}) or
38+
#' retain all information (\strong{FALSE})
39+
#' @param simplify.families If applicable, remove the hyphenated designation
40+
#' (\strong{TRUE}) or retain all information (\strong{FALSE})
41+
#'
42+
#' @return An igraph object representing the AIR similarity network. Vertices
43+
#' contain the original sequences (and gene annotations, if available), and each
44+
#' edge has a \code{weight} attribute corresponding to the computed edit distance.
45+
#' If no edges meet the threshold, an igraph object with only vertices is returned.
46+
#'
47+
#' @details
48+
#' The function first calls a C++ routine (via Rcpp) to perform a symmetric deletion
49+
#' lookup, generating candidate pairs of sequences that might be within the specified
50+
#' edit distance. It then uses a banded dynamic programming algorithm (also implemented
51+
#' in C++) to compute the exact edit distance for each candidate pair. When using a
52+
#' data frame input, the candidate pairs can be further filtered by requiring that
53+
#' sequences have matching \code{v.gene} and/or \code{j.gene} values. Note that gene
54+
#' filtering is only applied if the corresponding filtering flag is set to \code{TRUE}.
55+
#'
56+
#' @examples
57+
#' # Using a character vector of sequences:
58+
#' sequences <- c("CASSLGTDTQYF", "CASSPGTDTQYF", "CASSLGNDTQYF", "CASRLGNDTQYF")
59+
#' g <- buildNetwork(sequences, threshold = 2)
60+
#' plot(g)
61+
#'
62+
#' # Using a data frame with a custom sequence column:
63+
#' df <- data.frame(
64+
#' mySeqs = c("CASSLGTDTQYF", "CASSPGTDTQYF", "CASSLGNDTQYF", "CASRLGNDTQYF"),
65+
#' v = c("TRBV20", "TRBV20", "TRBV12", "TRBV20"),
66+
#' j = c("TRBJ2-7", "TRBJ2-7", "TRBJ2-1", "TRBJ2-7")
67+
#' )
68+
#' g_df <- buildNetwork(df,
69+
#' threshold = 2,
70+
#' filter.v = TRUE,
71+
#' filter.j = TRUE,
72+
#' sequence.column = "mySeqs")
73+
#' plot(g_df)
74+
#'
75+
#' @importFrom igraph graph_from_data_frame make_empty_graph V E `V<-` `E<-`
76+
#' @importFrom Rcpp evalCpp
77+
#' @useDynLib immApex, .registration = TRUE
78+
#' @export
79+
buildNetwork <- function(input.data,
80+
sequence.column = "sequence",
81+
threshold = 2,
82+
filter.v = FALSE,
83+
filter.j = FALSE,
84+
technology = NULL,
85+
simplify.format = TRUE,
86+
simplify.families = TRUE) {
87+
# Handle input based on its type.
88+
if (is.data.frame(input.data)) {
89+
req_cols <- sequence.column
90+
91+
# Determine the V gene header.
92+
if (filter.v) {
93+
v.genes.header <- .get.genes.updated(input.data, technology, "v")
94+
req_cols <- c(req_cols, v.genes.header)
95+
} else if ("v.gene" %in% colnames(input.data)) {
96+
v.genes.header <- "v.gene"
97+
} else {
98+
v.genes.header <- NULL
99+
}
100+
101+
# Determine the J gene header.
102+
if (filter.j) {
103+
j.genes.header <- .get.genes.updated(input.data, technology, "j")
104+
req_cols <- c(req_cols, j.genes.header)
105+
} else if ("j.gene" %in% colnames(input.data)) {
106+
j.genes.header <- "j.gene"
107+
} else {
108+
j.genes.header <- NULL
109+
}
110+
111+
# Check for required columns.
112+
if (!all(req_cols %in% colnames(input.data))) {
113+
stop(paste("Data frame must contain the following column(s):",
114+
paste(req_cols, collapse = ", ")))
115+
}
116+
117+
sequences <- as.character(input.data[[sequence.column]])
118+
if (!is.null(v.genes.header)) {
119+
v_genes <- as.character(input.data[[v.genes.header]])
120+
} else {
121+
v_genes <- rep(NA, length(sequences))
122+
}
123+
if (!is.null(j.genes.header)) {
124+
j_genes <- as.character(input.data[[j.genes.header]])
125+
} else {
126+
j_genes <- rep(NA, length(sequences))
127+
}
128+
129+
} else if (is.character(input.data)) {
130+
sequences <- input.data
131+
v_genes <- rep(NA, length(sequences))
132+
j_genes <- rep(NA, length(sequences))
133+
} else {
134+
stop("Input must be either a character vector or a data frame with the required columns.")
135+
}
136+
137+
n <- length(sequences)
138+
139+
if(simplify.format) {
140+
v_genes <- str_split(v_genes, "[*]", simplify = TRUE)[,1]
141+
j_genes <- str_split(j_genes, "[*]", simplify = TRUE)[,1]
142+
}
143+
144+
if(simplify.families) {
145+
v_genes <- str_split(v_genes, "[-]", simplify = TRUE)[,1]
146+
j_genes <- str_split(j_genes, "[-]", simplify = TRUE)[,1]
147+
}
148+
149+
# Get candidate pairs using the symmetric deletion lookup (implemented in C++).
150+
candidate_pairs <- symmetric_deletion_lookup_cpp(sequences, threshold)
151+
152+
# Post-filter candidates to verify edit distances and obtain edge weights.
153+
edge_df <- post_filter_candidates_seq(candidate_pairs,
154+
sequences,
155+
v_genes,
156+
j_genes,
157+
threshold,
158+
filter.v,
159+
filter.j)
160+
161+
# Build a vertices data frame including all sequences.
162+
vertices_df <- data.frame(name = as.character(seq_len(n)),
163+
sequence = sequences,
164+
stringsAsFactors = FALSE)
165+
if (!all(is.na(v_genes))) {
166+
vertices_df$v.gene <- v_genes
167+
}
168+
if (!all(is.na(j_genes))) {
169+
vertices_df$j.gene <- j_genes
170+
}
171+
172+
# Create the igraph object with weighted edges.
173+
if (nrow(edge_df) == 0) {
174+
g <- make_empty_graph(n)
175+
V(g)$sequence <- sequences
176+
if (!all(is.na(v_genes))) V(g)$v.gene <- v_genes
177+
if (!all(is.na(j_genes))) V(g)$j.gene <- j_genes
178+
} else {
179+
g <- graph_from_data_frame(d = as.data.frame(edge_df), directed = FALSE, vertices = vertices_df)
180+
E(g)$weight <- as.numeric(edge_df$weight)
181+
}
182+
183+
return(g)
184+
}

R/formatGenes.R

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,10 @@ formatGenes <- function(input.data,
3838
}
3939
if(!.is_seurat_or_se_object(input.data)) {
4040
if(technology %!in% c("TenX", "AIRR", "Adaptive")) {
41-
stop("Please select a technology in the following category: 'TenX', 'AIRR', 'Adaptive', 'Omniscope'")
41+
stop("Please select a technology in the following category: 'TenX', 'AIRR', 'Adaptive'")
4242
}
4343
}
44-
44+
genes.updated <- .get.genes.updated(input.data, technology, region)
4545
if (.is_seurat_or_se_object(input.data)) {
4646
chain.1 <- getIR(input.data,
4747
chains = "TRA",
@@ -50,17 +50,12 @@ formatGenes <- function(input.data,
5050
chains = "TRB",
5151
sequence.type = "aa")[[1]]
5252
input.data <- rbind(chain.1, chain.2)
53-
genes.updated <- region
5453
} else {
5554
input.data[input.data == ""] <- NA
5655
if(technology %in% c("TenX","Adaptive")) {
57-
genes.updated <- paste0(region, "_gene")
5856
if(any(genes.updated %!in% colnames(input.data))) {
59-
genes.updated <- paste0(region, "GeneName")
6057
input.data[,genes.updated][is.na(input.data[,genes.updated])] <- str_split(input.data[,"vGeneNameTies"][is.na(input.data[,genes.updated])], "[,]", simplify = TRUE)[,1]
6158
}
62-
} else if (technology %in% c("AIRR")) {
63-
genes.updated <- paste0(region, "_call")
6459
}
6560
}
6661

R/utils.R

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,3 +91,25 @@ array.dimnamer <- function(array) {
9191
.is_seurat_or_se_object <- function(obj) {
9292
.is_seurat_object(obj) || .is_se_object(obj)
9393
}
94+
95+
.get.genes.updated <- function(input.data, technology, region) {
96+
if (.is_seurat_or_se_object(input.data)) {
97+
genes.updated <- region
98+
} else {
99+
if (is.null(technology)) technology <- NA
100+
if (technology %in% c("TenX", "Adaptive")) {
101+
potential_col <- paste0(region, "_gene")
102+
if (!(potential_col %in% colnames(input.data))) {
103+
genes.updated <- paste0(region, "GeneName")
104+
} else {
105+
genes.updated <- potential_col
106+
}
107+
} else if (technology %in% c("AIRR")) {
108+
genes.updated <- paste0(region, "_call")
109+
} else {
110+
genes.updated <- region
111+
}
112+
}
113+
return(genes.updated)
114+
}
115+

0 commit comments

Comments
 (0)