Skip to content

Commit 1461474

Browse files
committed
fix aggregate networks
1 parent 935529b commit 1461474

File tree

6 files changed

+55
-30
lines changed

6 files changed

+55
-30
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: immApex
22
Title: Tools for Adaptive Immune Receptor Sequence-Based Machine and Deep Learning
3-
Version: 1.4.3
3+
Version: 1.4.4
44
Authors@R: c(
55
person(given = "Nick", family = "Borcherding", role = c("aut", "cre"), email = "ncborch@gmail.com"))
66
Description: A set of tools for machine and deep learning in R from amino acid and nucleotide sequences focusing on adaptive immune receptors. The package includes pre-processing of sequences, unifying gene nomenclature usage, encoding sequences, and combining models. This package will serve as the basis of future immune receptor sequence functions/packages/models compatible with the scRepertoire ecosystem.

NEWS.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
# immApex VERSION 1.4.4
2+
3+
## BUGS
4+
* Fixed a `buildNetwork()` issue that aggregated similarity for duplicated sequences.
5+
16
# immApex VERSION 1.4.3
27

38
## NEW FEATURES

R/buildNetwork.R

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@
1010
#' inside `input.data`. Ignored when `NULL` and `seq_col` is non-`NULL`.
1111
#' @param seq_col,v_col,j_col Column names to use when `input.data` is given.
1212
#' By default the function looks for common AIRR names (`junction_aa`,
13-
#' `cdr3`, `v_call`, `j_call`).
13+
#' `cdr3`, `sequence`, `seq`).
1414
#' @param threshold >= 1 for absolute distance **or** 0 < x < 1 for relative.
1515
#' When using normalized distances (`normalize != "none"`), this typically
16-
#' should be a value between 0 and 1 (e.g., 0.9 for 10 percent dissimilarity).
16+
#' should be a value between 0.0 and 1.0 (e.g., 0.1 for 10 percent dissimilarity).
1717
#' @param filter.v Logical; require identical V when `TRUE`.
1818
#' @param filter.j Logical; require identical J when `TRUE`.
1919
#' @param ids Optional character labels; recycled from row-names if missing.
@@ -96,7 +96,7 @@
9696
#' dist_type = "damerau",
9797
#' filter.v = TRUE)
9898
#'
99-
#' @return edge-list `data.frame` **or** sparse adjacency `dgCMatrix`
99+
#' @return edge-list `data.frame` **or** sparse adjacency `dgCMatrix` of distances
100100
#' @importFrom Matrix sparseMatrix
101101
#' @export
102102
buildNetwork <- function(input.data = NULL,
@@ -176,7 +176,7 @@ buildNetwork <- function(input.data = NULL,
176176
if (n > 0) {
177177
first_len <- nchar(seq_vec[1])
178178
if (any(nchar(seq_vec) != first_len)) {
179-
warning("Hamming distance requires equal-length sequences. ",
179+
message("Hamming distance requires equal-length sequences. ",
180180
"Pairs of unequal length will be assigned max distance.")
181181
}
182182
}
@@ -208,13 +208,30 @@ buildNetwork <- function(input.data = NULL,
208208
normalize = normalize
209209
)
210210

211+
# Remove Duplicate Edges
212+
if (nrow(edge_df) > 0) {
213+
edge_key <- paste(
214+
pmin(edge_df$from, edge_df$to),
215+
pmax(edge_df$from, edge_df$to),
216+
sep = "\t"
217+
)
218+
219+
# Check if there are duplicates
220+
if (anyDuplicated(edge_key)) {
221+
agg_idx <- !duplicated(edge_key)
222+
edge_df <- edge_df[agg_idx, , drop = FALSE]
223+
}
224+
}
225+
211226
if (output == "edges")
212227
return(edge_df)
213228

214229
## 6. Convert edge list to sparse adjacency
215230
if (!requireNamespace("Matrix", quietly = TRUE))
216231
stop("Matrix package required for sparse output.")
217232

233+
# =========================================================================
234+
218235
all_ids <- sort(unique(c(edge_df$from, edge_df$to)))
219236
idx_from <- match(edge_df$from, all_ids)
220237
idx_to <- match(edge_df$to, all_ids)

man/buildNetwork.Rd

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/fastEditEdges.cpp

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -288,8 +288,23 @@ DataFrame fast_edge_list(CharacterVector seqs,
288288
if (max_l == 0) continue;
289289

290290
int maxd;
291-
if (thresh >= 1.0) maxd = (int)thresh;
292-
else maxd = (int)((1.0 - thresh) * max_l);
291+
if (thresh >= 1.0) {
292+
// Absolute distance threshold
293+
maxd = (int)thresh;
294+
} else {
295+
// Normalized distance threshold - use same length as normalization
296+
double norm_len;
297+
if (normalize == "maxlen") {
298+
norm_len = (double)max_l;
299+
} else if (normalize == "length") {
300+
norm_len = (lens[i] + lens[k]) / 2.0;
301+
} else {
302+
// normalize == "none": interpret threshold as fraction of max_l
303+
norm_len = (double)max_l;
304+
}
305+
maxd = (int)(thresh * norm_len);
306+
}
307+
// =====================================================================
293308

294309
if (metric != "nw" && metric != "sw" && std::abs(lens[i] - lens[k]) > maxd) continue;
295310

@@ -310,6 +325,11 @@ DataFrame fast_edge_list(CharacterVector seqs,
310325
if (normalize == "maxlen") fd /= max_l;
311326
else if (normalize == "length") fd /= ((lens[i]+lens[k])/2.0);
312327

328+
if (thresh < 1.0 && normalize != "none" && fd > thresh) {
329+
continue; // Skip edges that exceed normalized threshold
330+
}
331+
// =====================================================================
332+
313333
loc_f.push_back(lbl[i]);
314334
loc_t.push_back(lbl[k]);
315335
loc_d_vec.push_back(fd);
@@ -326,4 +346,4 @@ DataFrame fast_edge_list(CharacterVector seqs,
326346
}
327347

328348
return DataFrame::create(_["from"] = out_from, _["to"] = out_to, _["dist"] = out_d, _["stringsAsFactors"] = false);
329-
}
349+
}

tests/testthat/test-buildNetwork.R

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ test_that("Backward compatibility - relative threshold", {
5555
input.data = data,
5656
seq_col = "junction_aa",
5757
threshold = 0.9,
58+
normalize = "length",
5859
filter.v = TRUE
5960
)
6061

@@ -466,22 +467,4 @@ test_that("Invalid substitution matrix rejected", {
466467
)
467468
})
468469

469-
test_that("Hamming requires equal length", {
470-
data <- data.frame(
471-
sequence_id = c("seq1", "seq2", "seq3"),
472-
junction_aa = c("AAAA", "BBBB", "CCCCC"), # Different lengths
473-
v_call = rep("IGHV1", 3),
474-
stringsAsFactors = FALSE
475-
)
476-
477-
expect_warning(
478-
edges <- buildNetwork(
479-
input.data = data,
480-
seq_col = "junction_aa",
481-
threshold = 2,
482-
dist_type = "hamming",
483-
filter.v = TRUE
484-
),
485-
"equal-length"
486-
)
487-
})
470+

0 commit comments

Comments
 (0)