Skip to content

Commit 48e9dea

Browse files
committed
Merge branch 'main' into add_benchmarking_method
2 parents fd688c4 + 0ca72ef commit 48e9dea

13 files changed

Lines changed: 1086 additions & 288 deletions

ms2query/data_processing/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from .chemistry_utils import compute_morgan_fingerprints, inchikey14_from_full
2-
from .fingerprint_computation import compute_fingerprints_from_smiles
2+
from .fingerprint_computation import compute_fingerprints_from_smiles, merge_fingerprints
33
from .merging_utils import cluster_block, get_merged_spectra
44
from .spectra_processing import compute_spectra_embeddings, normalize_spectrum_sum
55

@@ -11,5 +11,6 @@
1111
"compute_spectra_embeddings",
1212
"get_merged_spectra",
1313
"inchikey14_from_full",
14+
"merge_fingerprints",
1415
"normalize_spectrum_sum",
1516
]

ms2query/data_processing/fingerprint_computation.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
from typing import Optional, Sequence, Tuple
12
import numba
23
import numpy as np
34
from numba import typed, types
5+
from numpy.typing import NDArray
46
from rdkit import Chem
57
from tqdm import tqdm
68

@@ -255,6 +257,105 @@ def count_fingerprint_keys(fingerprints):
255257
return unique_keys[order], count_arr[order], first_arr[order]
256258

257259

260+
def merge_fingerprints(
    fingerprints: Sequence[Tuple[NDArray[np.integer], NDArray[np.floating]]],
    weights: Optional[NDArray[np.floating]] = None,
) -> Tuple[NDArray[np.integer], NDArray[np.floating]]:
    """
    Merge multiple sparse Morgan (count/TF-IDF) fingerprints into a single
    weighted-average fingerprint.

    Parameters
    ----------
    fingerprints :
        Sequence of (bits, values) pairs.
        - bits: 1D integer array of bit indices (non-zero entries)
        - values: 1D float array of TF-IDF (or other) weights,
          same length as `bits`.
    weights :
        Optional 1D array-like of length len(fingerprints) with one weight
        per fingerprint. Each fingerprint's values are scaled by its weight,
        then the merged fingerprint is normalized by the sum of all weights.
        If None, all fingerprints are weighted equally (weight = 1.0).

    Returns
    -------
    merged_bits, merged_values :
        - merged_bits: 1D integer array of unique bit indices
        - merged_values: 1D float32 array of weighted-average values per bit
          (sum over all weighted fingerprints, divided by sum(weights)).

    Raises
    ------
    ValueError
        If `weights` has the wrong length or a non-positive sum, or if any
        (bits, values) pair has mismatched lengths.
    """
    n_fps = len(fingerprints)
    if n_fps == 0:
        # Empty sparse fingerprint; float32 to match the non-empty path.
        return np.array([], dtype=np.int64), np.array([], dtype=np.float32)

    if weights is not None:
        w = np.asarray(weights, dtype=np.float64).ravel()
        if w.shape[0] != n_fps:
            raise ValueError(
                f"weights must have length {n_fps}, got {w.shape[0]}"
            )
        total_weight = float(w.sum())
        if total_weight <= 0.0:
            raise ValueError("Sum of weights must be positive.")
    else:
        # Equal weighting: skip per-fingerprint scaling entirely.
        w = None
        total_weight = float(n_fps)

    # Concatenate all indices and (weighted) values.
    bits_list = []
    vals_list = []
    for i, (bits, vals) in enumerate(fingerprints):
        bits = np.asarray(bits)
        vals = np.asarray(vals, dtype=np.float64)

        if bits.shape[0] != vals.shape[0]:
            raise ValueError(
                f"Fingerprint {i}: bits and values must have same length, "
                f"got {bits.shape[0]} and {vals.shape[0]}"
            )

        if w is not None:
            vals = vals * w[i]

        bits_list.append(bits)
        vals_list.append(vals)

    # Note: bits_list is never empty here (n_fps == 0 already returned),
    # so no extra emptiness check is needed before concatenating.
    all_bits = np.concatenate(bits_list)
    all_vals = np.concatenate(vals_list)

    if all_bits.size == 0:
        # Every fingerprint was empty; return a consistent empty result.
        return np.array([], dtype=np.int64), np.array([], dtype=np.float32)

    # Group by bit index and sum the weighted values per unique bit.
    unique_bits, inverse = np.unique(all_bits, return_inverse=True)
    summed_vals = np.bincount(inverse, weights=all_vals)

    # Weighted average: divide by the total weight once at the end.
    avg_vals = summed_vals / total_weight

    # Keep dtypes reasonably tight: bit dtype follows the inputs,
    # values are float32.
    merged_bits = unique_bits.astype(all_bits.dtype, copy=False)
    merged_vals = avg_vals.astype(np.float32, copy=False)

    return merged_bits, merged_vals
358+
258359
### ------------------------
259360
### Bit Scaling and Weighing
260361
### ------------------------

ms2query/database/ann_vector_index.py

Lines changed: 159 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -323,28 +323,63 @@ def _create_hnsw_index(
323323

324324
def query(
    self,
    vectors: np.ndarray,
    k: int = 10,
    ef: Optional[int] = None,
    num_threads: int = 0,
) -> List[Tuple[str, float]] | List[List[Tuple[str, float]]]:
    """
    Query the index for the k nearest neighbors.

    Parameters
    ----------
    vectors : np.ndarray
        Either a single vector of shape (dim,) or a batch of shape (N, dim).
    k : int
        Number of neighbors.
    ef : Optional[int]
        Optional per-query ef parameter for HNSW.
    num_threads : int
        Number of threads to use inside nmslib (0 = library default).

    Returns
    -------
    Union[List[Tuple[str, float]], List[List[Tuple[str, float]]]]
        For a single input vector, a list of (spec_id, similarity) tuples;
        for a batch, one such list per query.
    """
    if self._index is None:
        raise RuntimeError("Index not built or loaded.")

    arr = np.asarray(vectors, dtype=np.float32)

    if arr.ndim == 1:
        # Promote a single (dim,) vector to a one-row batch.
        if arr.size != self.dim:
            raise ValueError(f"Query must have dim={self.dim}")
        arr = arr.reshape(1, -1)
        is_single = True
    elif arr.ndim == 2:
        if arr.shape[1] != self.dim:
            raise ValueError(f"Expected shape (N, {self.dim}), got {arr.shape}")
        is_single = False
    else:
        raise ValueError("vectors must be 1D or 2D array.")

    if ef is not None:
        self._index.setQueryTimeParams({"ef": ef})

    results = self._index.knnQueryBatch(arr, k=k, num_threads=num_threads)

    per_query: List[List[Tuple[str, float]]] = []
    for neighbor_ids, distances in results:
        neighbor_ids = np.asarray(neighbor_ids, dtype=np.int64)
        distances = np.asarray(distances, dtype=np.float32)
        # nmslib reports cosine distance; convert to similarity.
        similarities = 1.0 - distances
        per_query.append(
            [
                (str(self._ids[nid]), float(sim))
                for nid, sim in zip(neighbor_ids, similarities)
            ]
        )

    return per_query[0] if is_single else per_query
348383

349384
def save_index(self, path_prefix: str) -> None:
350385
if self._index is None:
@@ -450,51 +485,141 @@ def build_index(
450485

451486
def query(
    self,
    query_fp: (
        Tuple[np.ndarray, np.ndarray]
        | sp.csr_matrix
        | Sequence[Tuple[np.ndarray, np.ndarray]]
    ),
    k: int = 10,
    *,
    ef: Optional[int] = None,
    re_rank: bool = True,
    candidate_multiplier: int = 5,
    num_threads: int = 0,
) -> List[Tuple[int, float]] | List[List[Tuple[int, float]]]:
    """
    Query for k nearest neighbors.

    Parameters
    ----------
    query_fp :
        - Single query:
          * (indices, values) tuple
          * single-row CSR of shape (1, dim)
        - Batched queries:
          * CSR of shape (N, dim)
          * Sequence of (indices, values) tuples
    k : int
        Number of results per query.
    ef : Optional[int]
        Optional per-query ef parameter for HNSW.
    re_rank : bool
        Use exact Tanimoto re-ranking.
    candidate_multiplier : int
        Fetch k * multiplier candidates for re-ranking.
    num_threads : int
        Number of threads to use inside nmslib (0 = library default).

    Returns
    -------
    Union[List[Tuple[int, float]], List[List[Tuple[int, float]]]]
        - For a single query, returns a list of (comp_id, similarity).
        - For multiple queries, returns a list (per query) of such lists.
    """
    if self._index is None:
        raise RuntimeError("Index not built or loaded.")

    # -------------------------
    # Normalize input to CSR
    # -------------------------
    single = False

    if isinstance(query_fp, sp.csr_matrix):
        Q = query_fp.astype(np.float32, copy=False)
        if Q.shape[1] != self.dim:
            raise ValueError(f"CSR query must have shape (N, {self.dim})")
        single = Q.shape[0] == 1

    elif isinstance(query_fp, tuple):
        # Single (indices, values) pair.
        Q = csr_row_from_tuple(query_fp, dim=self.dim)
        single = True

    else:
        # Assume sequence of (indices, values) tuples -> batched queries.
        Q = tuples_to_csr(query_fp, dim=self.dim)
        single = Q.shape[0] == 1

    if (Q.data < 0).any():
        raise ValueError("Query must be non-negative for Tanimoto.")

    # Handle completely empty queries quickly.
    row_nnz = Q.indptr[1:] - Q.indptr[:-1]
    if row_nnz.sum() == 0:
        if single:
            return []
        return [[] for _ in range(Q.shape[0])]

    if ef is not None:
        self._index.setQueryTimeParams({"ef": ef})

    # Over-fetch candidates so re-ranking has something to choose from.
    fetch = max(k, k * candidate_multiplier)

    # -------------------------
    # ANN search for all queries
    # -------------------------
    batch_results = self._index.knnQueryBatch(Q, k=fetch, num_threads=num_threads)

    # -------------------------
    # No re-ranking: cosine sims only
    # -------------------------
    if not re_rank or self._csr is None or self._l1 is None:
        all_out: List[List[Tuple[int, float]]] = []

        for qi, (idxs, dists) in enumerate(batch_results):
            # Defensive: an all-zero row has no meaningful neighbors.
            if row_nnz[qi] == 0:
                all_out.append([])
                continue

            idxs = np.asarray(idxs, dtype=np.int64)
            dists = np.asarray(dists, dtype=np.float32)

            sims = 1.0 - dists  # cosine distance -> similarity
            # Cast comp_id to a plain int so the declared return type
            # (List[Tuple[int, float]]) holds; numpy integers would leak
            # through otherwise.
            out = [
                (int(self._comp_ids[i]), float(s))
                for i, s in zip(idxs[:k], sims[:k])
            ]
            all_out.append(out)

        return all_out[0] if single else all_out

    # -------------------------
    # Exact Tanimoto re-ranking
    # -------------------------
    all_out: List[List[Tuple[int, float]]] = []

    for qi, (idxs, dists) in enumerate(batch_results):
        if row_nnz[qi] == 0:
            all_out.append([])
            continue

        idxs = np.asarray(idxs, dtype=np.int64)

        q_row = Q[qi]
        Y = self._csr[idxs]
        tan = tanimoto_l1_query_vs_block(
            q_row,
            Y,
            sum1=float(q_row.sum()),
            sumsY=self._l1[idxs],
        )

        order = np.argsort(-tan)[:k]
        # int(...) cast for the same reason as above.
        out = [
            (int(self._comp_ids[idxs[i]]), float(tan[i]))
            for i in order
        ]
        all_out.append(out)

    return all_out[0] if single else all_out
498623

499624
def _normalize_query(self, query_fp) -> sp.csr_matrix:
500625
"""Convert query to single-row CSR and validate."""
@@ -602,10 +727,11 @@ def save_index(self, path_prefix: str) -> None:
602727
if self._index is None:
603728
raise RuntimeError("Index not built.")
604729

605-
self._index.saveIndex(f"{path_prefix}.nmslib")
730+
# Also save data so that load_index(..., load_data=True) works
731+
self._index.saveIndex(f"{path_prefix}.nmslib", save_data=True)
606732
np.save(f"{path_prefix}.ids.npy", self._comp_ids)
607733

608-
meta = {**self._meta, "dim": self.dim, "space": self.space}
734+
meta = {**self._meta, "dim": int(self.dim), "space": str(self.space)}
609735
with open(f"{path_prefix}.meta.json", "w") as f:
610736
json.dump(meta, f)
611737

0 commit comments

Comments (0)