Remove redundant normalization calls/functions

florian-huber · florian-huber · commit dec423d77ba4 · 2025-11-13T16:17:25.000+01:00
diff --git a/ms2query/database/ann_vector_index.py b/ms2query/database/ann_vector_index.py
@@ -115,7 +115,6 @@ def __init__(self, dim: int = 500):
         self._index = None
         self._comp_ids: Optional[np.ndarray] = None
 
-    # ---------- direct build from arrays ----------
     def build_index(
         self,
         vectors: np.ndarray,
@@ -124,15 +123,25 @@ def build_index(
         M: int = 16,
         ef_construction: int = 200,
         post_init_ef: int = 200,
-        assume_normalized: bool = True,
     ) -> None:
+        """Build index from dense vectors and spec_ids.
+        
+        Parameters
+        ----------
+        vectors : np.ndarray
+            2D array of shape (N, dim) with float32 vectors.
+        spec_ids : Iterable[str]
+            Iterable of spec_id strings of length N.
+        M : int
+            HNSW M parameter (connectivity)
+        ef_construction : int
+            HNSW efConstruction parameter
+        post_init_ef : int
+            HNSW query-time ef parameter
+        """
         X = np.asarray(vectors, dtype=np.float32)
         if X.ndim != 2 or X.shape[1] != self.dim:
             raise ValueError(f"Expected vectors shape (N, {self.dim}), got {X.shape}")
-        if not assume_normalized:
-            n = np.linalg.norm(X, axis=1, keepdims=True)
-            n = np.maximum(n, 1e-12)
-            X = X / n
         ids = np.asarray(list(spec_ids), dtype=object)
         if ids.shape[0] != X.shape[0]:
             raise ValueError("spec_ids length must match number of vectors.")
@@ -146,7 +155,6 @@ def build_index(
         self._ids = ids
         self._meta = {
             "type": "ANNMS2DeepIndex",
-            "assume_normalized": bool(assume_normalized),
             "M": M,
             "ef_construction": ef_construction,
             "post_init_ef": post_init_ef,
@@ -168,7 +176,6 @@ def build_index_from_sqlite(
         M: int = 16,
         ef_construction: int = 200,
         post_init_ef: int = 200,
-        l2_normalize: bool = True,
     ) -> int:
         """
         Streams embeddings from SQLite and constructs an HNSW index in-place.
@@ -191,8 +198,6 @@ def build_index_from_sqlite(
             HNSW efConstruction parameter
         post_init_ef : int
             HNSW query-time ef parameter
-        l2_normalize : bool
-            Whether to L2-normalize vectors before indexing
 
         Returns
         -------
@@ -244,12 +249,6 @@ def build_index_from_sqlite(
             if total == 0:
                 raise ValueError(f"No embeddings loaded from {embeddings_table}.")
 
-        # Optional L2 normalization (in-place, cache-friendly)
-        if l2_normalize:
-            norms = np.linalg.norm(X, axis=1, keepdims=True)
-            np.maximum(norms, 1e-12, out=norms)
-            X /= norms
-
         # Build the HNSW index with a SINGLE batch add
         index = nmslib.init(method='hnsw', space='cosinesimil', data_type=nmslib.DataType.DENSE_VECTOR)
         index.addDataPointBatch(X)
@@ -266,7 +265,6 @@ def build_index_from_sqlite(
             "M": M,
             "ef_construction": ef_construction,
             "post_init_ef": post_init_ef,
-            "l2_normalize": bool(l2_normalize),
         }
         return int(total)
 
@@ -302,22 +300,24 @@ def query(
             vector: np.ndarray,
             k: int = 10,
             ef: Optional[int] = None,
-            assume_normalized: Optional[bool] = None
             ) -> List[Tuple[str, float]]:
         """Query the index with a single vector.
+    
+        Parameters
+        ----------
+        vector : np.ndarray
+            1D array of shape (dim,) with float32 vector.
+        k : int
+            Number of nearest neighbors to return.
+        ef : Optional[int]
+            nmslib ef parameter (higher = better recall / slower).
         """
         if self._index is None:
             raise RuntimeError("Index not built or loaded.")
         v = np.asarray(vector, dtype=np.float32).reshape(1, -1)
         if v.shape[1] != self.dim:
             raise ValueError(f"Query vector must have dim={self.dim}")
 
-        norm_flag = self._meta.get("assume_normalized", True) if assume_normalized is None else assume_normalized
-        if not norm_flag:
-            n = np.linalg.norm(v, axis=1, keepdims=True)
-            n = np.maximum(n, 1e-12)
-            v = v / n
-
         if ef is not None:
             self._index.setQueryTimeParams({'ef': int(ef)})
 
diff --git a/ms2query/library_io.py b/ms2query/library_io.py
@@ -192,7 +192,6 @@ def create_new_library(
             M=params["M"],
             ef_construction=params["ef_construction"],
             post_init_ef=params["post_init_ef"],
-            l2_normalize=True,
         )
         _print_progress(f"Indexed {n_vecs} embedding vectors.")
         emb_prefix = str(out_dir / _EMB_INDEX_BASENAME)
diff --git a/ms2query/ms2query_library.py b/ms2query/ms2query_library.py
@@ -67,6 +67,8 @@ def process_spectra(self, spectra: list[Spectrum]) -> List[Spectrum]:
     def compute_embeddings(self, spectra: list[Spectrum]) -> np.ndarray:
         """
         Compute MS2DeepScore embeddings for arbitrary query spectra.
+
+        Spectra will be preprocessed via self.process_spectra(...) first.
         """
         if not spectra:
             return np.empty((0, 0), dtype=np.float32)
@@ -88,7 +90,6 @@ def query_embedding_index(
         *,
         k: int = 10,
         ef: Optional[int] = None,
-        assume_normalized: bool = True,
         return_dataframe: bool = True,
     ) -> Union[List[List[Dict[str, Any]]], "pd.DataFrame"]:
         """
@@ -106,8 +107,6 @@ def query_embedding_index(
             Top-k to return.
         ef : Optional[int]
             nmslib ef (higher = better recall / slower).
-        assume_normalized : bool
-            If False, will L2-normalize vectors again before query (normally keep True).
         return_dataframe : bool
             If True, returns a tidy DataFrame with columns:
               ['query_ix','rank','spec_id','score']
@@ -126,7 +125,7 @@ def query_embedding_index(
         for qi in range(embeddings.shape[0]):
             # TODO: make faster by querying batch-wise
             # EmbeddingIndex.query returns list[(spec_id, similarity)]
-            hits = self.embedding_index.query(embeddings[qi], k=k, ef=ef, assume_normalized=assume_normalized)
+            hits = self.embedding_index.query(embeddings[qi], k=k, ef=ef)
             # convert to standard structure
             one = []
             for rk, (spec_id, score) in enumerate(hits, start=1):
@@ -143,6 +142,15 @@ def query_embedding_index(
         df = pd.DataFrame(rows, columns=["query_ix", "rank", "spec_id", "score"])
         return df
 
+    def query_compounds_by_spectra(
+        self,
+        spectra: Union[Spectrum, Sequence[Spectrum]],
+        *,
+        k: int = 10,
+        ef: Optional[int] = None,
+        return_dataframe: bool = True,
+        ):
+        pass
     # ----------------------------- helpers / optional glue -----------------------------
 
     def set_embedding_index(self, index: EmbeddingIndex) -> None:
@@ -170,7 +178,7 @@ def query_by_spec_ids(
 
         results_all: List[List[Dict[str, Any]]] = []
         for qi in range(X.shape[0]):
-            hits = self.embedding_index.query(X[qi], k=k, ef=ef, assume_normalized=True)
+            hits = self.embedding_index.query(X[qi], k=k, ef=ef)
             one = [{"rank": rk + 1, "spec_id": sid, "score": float(score)} for rk, (sid, score) in enumerate(hits)]
             results_all.append(one)
 
diff --git a/tests/test_ann_vector_index.py b/tests/test_ann_vector_index.py
@@ -13,27 +13,30 @@ def _mk_unit_vecs(*rows):
     n = np.maximum(n, 1e-12)
     return X / n
 
+
 def test_build_index_and_query_dense():
     X = _mk_unit_vecs([1,0,0], [0,1,0], [0,0,1])
     ids = ["a","b","c"]
     idx = EmbeddingIndex(dim=3)
-    idx.build_index(X, ids, assume_normalized=True)
+    idx.build_index(X, ids)
     # Query close to [1,0,0]
     q = _mk_unit_vecs([0.9, 0.1, 0.0])[0]
     res = idx.query(q, k=2)
     assert [r[0] for r in res] == ["a", "b"]
     assert res[0][1] > res[1][1]  # similarity desc
 
+
 def test_build_index_normalizes_when_requested():
     X = np.array([[2.0,0,0],[0,2.0,0]], dtype=np.float32)
     ids = ["x","y"]
     idx = EmbeddingIndex(dim=3)
-    idx.build_index(X, ids, assume_normalized=False)
+    idx.build_index(X, ids)
     q = np.array([1.0,0,0], dtype=np.float32)
-    out = idx.query(q, k=1, assume_normalized=False)
+    out = idx.query(q, k=1)
     assert out[0][0] == "x"
     assert 0.99 <= out[0][1] <= 1.0
 
+
 def test_query_errors_and_dim_check():
     idx = EmbeddingIndex(dim=3)
     with pytest.raises(RuntimeError):
@@ -42,6 +45,7 @@ def test_query_errors_and_dim_check():
     with pytest.raises(ValueError, match="dim=3"):
         idx.query(np.zeros(4, np.float32))
 
+
 def test_save_and_load_roundtrip_dense(tmp_path):
     X = _mk_unit_vecs([1,0,0],[0,1,0],[0,0,1])
     ids = ["a","b","c"]
@@ -62,6 +66,7 @@ def test_save_and_load_roundtrip_dense(tmp_path):
         meta = json.load(f)
     assert meta["space"] == "cosinesimil"
 
+
 @pytest.mark.parametrize("batch_rows", [1, 2, 3])
 def test_build_index_from_sqlite_streams_and_orders(batch_rows):
     conn = sqlite3.connect(":memory:")
@@ -75,12 +80,13 @@ def test_build_index_from_sqlite_streams_and_orders(batch_rows):
         ],
     )
     idx = EmbeddingIndex(dim=3)
-    n = idx.build_index_from_sqlite(conn, embeddings_table="embeddings", batch_rows=batch_rows, l2_normalize=True)
+    n = idx.build_index_from_sqlite(conn, embeddings_table="embeddings", batch_rows=batch_rows)
     assert n == 3
     # Should be ordered by spec_id ascending ("id_1","id_2")
     out = idx.query(np.array([1.0, 0.0, 0.0], np.float32), k=2)
     assert [o[0] for o in out] == ["id_1", "id_2"]
 
+
 def test_build_index_from_sqlite_errors():
     conn = sqlite3.connect(":memory:")
     conn.execute("CREATE TABLE embeddings(spec_id TEXT, vec BLOB, d INTEGER)")
@@ -124,13 +130,15 @@ def test_tuples_to_csr_basic():
     r2 = csr[2].toarray().ravel()
     np.testing.assert_allclose(r2, [1.0, 0, 0, 0, 1.0])
 
+
 def test_tuples_to_csr_errors_when_index_out_of_bounds():
     tuples = [
         (np.array([0, 6], dtype=np.int32), np.array([1.0, 2.0], dtype=np.float32)),
     ]
     with pytest.raises(ValueError, match=">= dim"):
         tuples_to_csr(tuples, dim=5)
 
+
 def test_csr_row_from_tuple_coalesces_and_validates():
     idxs = np.array([2, 2, 0], dtype=np.int32)
     vals = np.array([1.0, 2.0, 3.0], dtype=np.float32)
@@ -142,6 +150,7 @@ def test_csr_row_from_tuple_coalesces_and_validates():
     with pytest.raises(ValueError, match="Query index"):
         csr_row_from_tuple((np.array([5]), np.array([1.0], np.float32)), dim=5)
 
+
 def test_l1_norms_csr():
     X = sp.csr_matrix(
         np.array([[1.0, 2.0, 0.0], [0.0, 0.5, 0.5], [3.0, 0.0, 1.0]], dtype=np.float32)

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -192,7 +192,6 @@ def create_new_library(` |
| `192` | `192` | `M=params["M"],` |
| `193` | `193` | `ef_construction=params["ef_construction"],` |
| `194` | `194` | `post_init_ef=params["post_init_ef"],` |
| `195` |  | `- l2_normalize=True,` |
| `196` | `195` | `)` |
| `197` | `196` | `_print_progress(f"Indexed {n_vecs} embedding vectors.")` |
| `198` | `197` | `emb_prefix = str(out_dir / _EMB_INDEX_BASENAME)` |