Further update to the batch querying method

florian-huber · florian-huber · commit b0b7748bd0c1 · 2025-12-08T22:55:09.000+01:00
diff --git a/ms2query/database/ann_vector_index.py b/ms2query/database/ann_vector_index.py
@@ -485,51 +485,141 @@ def build_index(
 
     def query(
         self,
-        query_fp: Tuple[np.ndarray, np.ndarray] | sp.csr_matrix,
+        query_fp: (
+            Tuple[np.ndarray, np.ndarray]
+            | sp.csr_matrix
+            | Sequence[Tuple[np.ndarray, np.ndarray]]
+        ),
         k: int = 10,
         *,
         ef: Optional[int] = None,
         re_rank: bool = True,
         candidate_multiplier: int = 5,
-    ) -> List[Tuple[int, float]]:
+        num_threads: int = 0,
+    ) -> List[Tuple[int, float]] | List[List[Tuple[int, float]]]:
         """
         Query for k nearest neighbors.
 
         Parameters
         ----------
-        query_fp : (indices, values) tuple or single-row CSR
-        k : Number of results
-        re_rank : Use exact Tanimoto re-ranking
-        candidate_multiplier : Fetch k * multiplier candidates for re-ranking
+        query_fp :
+            - Single query (flat result list):
+                * (indices, values) tuple; a tuple is always read as one query
+                * single-row CSR of shape (1, dim)
+            - Batched queries (one result list per query):
+                * CSR of shape (N, dim) with N != 1
+                * non-tuple Sequence of (indices, values) tuples, any length
+        k : int
+            Number of results per query.
+        ef : int, optional
+            If given, passed to the index via ``setQueryTimeParams``.
+        re_rank : bool
+            Use exact Tanimoto re-ranking.
+        candidate_multiplier : int
+            Fetch k * multiplier candidates for re-ranking.
+        num_threads : int
+            Number of threads to use inside nmslib (0 = library default).
 
-        Returns list of (comp_id, similarity) tuples.
+        Returns
+        -------
+        List[Tuple[int, float]] or List[List[Tuple[int, float]]]
+            - For a single query, a list of (comp_id, similarity).
+            - For batched queries, one such list per query; rows without
+              non-zero entries yield an empty list.
+
+        Raises
+        ------
+        RuntimeError
+            If the index has not been built or loaded.
+        ValueError
+            If a CSR query has the wrong number of columns or any query
+            value is negative.
+
+        Notes
+        -----
+        Without re-ranking (or when ``_csr`` / ``_l1`` is unavailable) the
+        returned similarity is ``1 - distance`` as reported by the ANN index.
         """
         if self._index is None:
             raise RuntimeError("Index not built or loaded.")
 
-        q = self._normalize_query(query_fp)
-        if q.nnz == 0:
-            return []
+        # Normalize every accepted input form to one CSR matrix.
+        if isinstance(query_fp, sp.csr_matrix):
+            Q = query_fp.astype(np.float32, copy=False)
+            if Q.shape[1] != self.dim:
+                raise ValueError(f"CSR query must have shape (N, {self.dim})")
+            single = Q.shape[0] == 1
+        elif isinstance(query_fp, tuple):
+            # A tuple is always treated as one (indices, values) query.
+            Q = csr_row_from_tuple(query_fp, dim=self.dim)
+            single = True
+        else:
+            # A sequence is always a batch, even with one element, so the
+            # result shape does not depend on the batch length.
+            Q = tuples_to_csr(query_fp, dim=self.dim)
+            single = False
+
+        if (Q.data < 0).any():
+            raise ValueError("Query must be non-negative for Tanimoto.")
+
+        # Short-circuit when no query row has any non-zero entry.
+        row_nnz = Q.indptr[1:] - Q.indptr[:-1]
+        if row_nnz.sum() == 0:
+            if single:
+                return []
+            return [[] for _ in range(Q.shape[0])]
 
         if ef is not None:
             self._index.setQueryTimeParams({"ef": ef})
 
         fetch = max(k, k * candidate_multiplier)
-        idxs, dists = self._index.knnQueryBatch(q, k=fetch)[0]
-        idxs = np.asarray(idxs, dtype=np.int64)
-        dists = np.asarray(dists, dtype=np.float32)
 
-        # Without re-ranking, return cosine similarities
+        # ANN search for all queries in a single knnQueryBatch call.
+        batch_results = self._index.knnQueryBatch(Q, k=fetch, num_threads=num_threads)
+        all_out: List[List[Tuple[int, float]]] = []
+
+        # No re-ranking requested (or not possible): cosine sims only.
         if not re_rank or self._csr is None or self._l1 is None:
-            sims = 1.0 - dists
-            return [(self._comp_ids[i], float(s)) for i, s in zip(idxs[:k], sims[:k])]
+            for qi, (idxs, dists) in enumerate(batch_results):
+                # Rows without non-zero entries get an empty result.
+                if row_nnz[qi] == 0:
+                    all_out.append([])
+                    continue
+
+                idxs = np.asarray(idxs, dtype=np.int64)
+                dists = np.asarray(dists, dtype=np.float32)
+                sims = 1.0 - dists
+                all_out.append(
+                    [(self._comp_ids[i], float(s)) for i, s in zip(idxs[:k], sims[:k])]
+                )
+
+            return all_out[0] if single else all_out
 
-        # Re-rank with exact Tanimoto
-        Y = self._csr[idxs]
-        tan = tanimoto_l1_query_vs_block(q, Y, sum1=float(q.sum()), sumsY=self._l1[idxs])
-        order = np.argsort(-tan)[:k]
+        # Exact Tanimoto re-ranking of the ANN candidates.
+        for qi, (idxs, _) in enumerate(batch_results):
+            # Rows without non-zero entries get an empty result.
+            if row_nnz[qi] == 0:
+                all_out.append([])
+                continue
 
-        return [(self._comp_ids[idxs[i]], float(tan[i])) for i in order]
+            idxs = np.asarray(idxs, dtype=np.int64)
+            q_row = Q[qi]
+            Y = self._csr[idxs]
+            # Re-score the candidates with tanimoto_l1_query_vs_block.
+            tan = tanimoto_l1_query_vs_block(
+                q_row,
+                Y,
+                sum1=float(q_row.sum()),
+                sumsY=self._l1[idxs],
+            )
+
+            # Keep the k candidates with the highest re-computed score.
+            order = np.argsort(-tan)[:k]
+            all_out.append(
+                [(self._comp_ids[idxs[i]], float(tan[i])) for i in order]
+            )
+
+        return all_out[0] if single else all_out
 
     def _normalize_query(self, query_fp) -> sp.csr_matrix:
         """Convert query to single-row CSR and validate."""