44from typing import Any , Dict , Iterable , List , Optional , Tuple
55import numpy as np
66import pandas as pd
7- from rdkit . Chem import rdFingerprintGenerator
8- from ms2query . data_processing import compute_fingerprints_from_smiles
7+ from ms2query . data_processing import compute_fingerprints_from_smiles , inchikey14_from_full , compute_morgan_fingerprint
8+
99
1010
1111# =========================
1212# Utilities & placeholders
1313# =========================
1414
def inchikey14_from_full(inchikey: str) -> Optional[str]:
    """Return the 14-character first block (inchikey14) of an InChIKey.

    The input is stringified, stripped of surrounding whitespace and
    upper-cased. Everything before the first hyphen (or the whole string
    when there is no hyphen) is treated as the first block.

    Parameters
    ----------
    inchikey : str
        Full InChIKey, with or without hyphens. Falsy values (``None``,
        ``""``) are accepted and yield ``None``.

    Returns
    -------
    Optional[str]
        The first 14 characters of the first block, or ``None`` when the
        input is empty or the first block is shorter than 14 characters.
    """
    if not inchikey:
        return None
    first_block = str(inchikey).strip().upper().partition("-")[0]
    # Apply the same length rule to hyphenated and non-hyphenated keys so a
    # malformed key such as "ABC-DEF" or "-XYZ" never produces a truncated
    # or empty pseudo-key.
    return first_block[:14] if len(first_block) >= 14 else None
23-
2415def encode_sparse_fp (bits : Optional [np .ndarray ], counts : Optional [np .ndarray ]) -> tuple [bytes , bytes ]:
2516 """Store bits as uint32 indices, counts as int32
2617
@@ -68,56 +59,6 @@ def decode_fp_blob(blob: bytes) -> np.ndarray:
6859 return np .zeros (0 , dtype = np .uint8 )
6960 return np .frombuffer (blob , dtype = np .uint8 ).copy ()
7061
def compute_fingerprints(
    smiles: Optional[List[str]] = None,
    inchis: Optional[List[str]] = None,
    sparse: bool = True,
    count: bool = True,
    radius: int = 9,
    progress_bar: bool = True,
) -> np.ndarray:
    """
    Compute Morgan fingerprints for a batch of molecules given as SMILES or InChI.

    Parameters
    ----------
    smiles : list of str or None
        SMILES strings to compute the fingerprints from. Takes precedence
        over `inchis` when both are given.
    inchis : list of str or None
        InChI strings; used only if `smiles` is empty/None. Each one is
        converted to SMILES with RDKit first. Entries that cannot be
        converted become ``None`` in the SMILES list handed on.
    sparse : bool
        Forwarded to `compute_fingerprints_from_smiles`. If True, compute
        sparse fingerprint (indices/counts); else dense bit vector.
    count : bool
        Forwarded to `compute_fingerprints_from_smiles`. If True, compute
        count-based fingerprint; else binary fingerprint.
    radius : int
        Radius for the Morgan fingerprint generator. Default 9.
    progress_bar : bool
        Whether to show a progress bar during computation. Default True.

    Returns
    -------
    Whatever `compute_fingerprints_from_smiles` returns for the batch.
    NOTE(review): the `np.ndarray` annotation is kept from the original;
    confirm it against the helper's actual return type for sparse output.

    Raises
    ------
    ValueError
        If neither `smiles` nor `inchis` contains any entries.
    """
    # Validate before doing any work so empty input fails fast, without
    # first constructing a fingerprint generator.
    if not smiles and not inchis:
        raise ValueError("Either smiles or inchis must be provided.")

    if not smiles:
        # Import once for the whole batch rather than once per molecule.
        from rdkit import Chem

        smiles = []
        for inchi in inchis:
            try:
                mol = Chem.MolFromInchi(inchi)
                smi = Chem.MolToSmiles(mol) if mol is not None else None
            except Exception as e:
                # Best effort: one bad InChI must not abort the whole batch.
                print(f"Error converting InChI to SMILES for {inchi} : {e} ")
                smi = None
            smiles.append(smi)

    fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=4096)
    return compute_fingerprints_from_smiles(
        smiles,
        fpgen,
        count=count,
        sparse=sparse,
        progress_bar=progress_bar,
    )
120-
12162
12263# ==================================================
12364# Compound database (compounds table) in SQLite
@@ -396,7 +337,7 @@ def compute_fingerprints_missing(
396337 """
397338 Compute fingerprints for all compounds that have SMILES (pass A) or, if no SMILES,
398339 have InChI (pass B), and where fingerprints are missing.
399- Uses the project-level `compute_fingerprints ` function that returns a
340+ Uses the project-level `compute_morgan_fingerprint ` function that returns a
400341 List[Optional[Tuple[np.ndarray,np.ndarray]]].
401342
402343 Returns stats: {"updated": int, "attempted": int, "skipped": int}
@@ -468,8 +409,8 @@ def _update_rows(comp_ids: List[str], results: List[Optional[Tuple[np.ndarray, n
468409 comp_ids = [r [0 ] for r in rows ]
469410 reps = [r [1 ] for r in rows ] # list[str] of smiles or inchi
470411
471- # call compute_fingerprints ONCE for the whole batch
472- results = compute_fingerprints (
412+ # call compute_morgan_fingerprint ONCE for the whole batch
413+ results = compute_morgan_fingerprint (
473414 smiles = reps if which == "smiles" else None ,
474415 inchis = reps if which == "inchi" else None ,
475416 sparse = sparse ,
@@ -494,7 +435,6 @@ def _update_rows(comp_ids: List[str], results: List[Optional[Tuple[np.ndarray, n
494435 return stats
495436
496437
497-
498438# ==================================================
499439# Mapping: spectrum <-> compound (spec_to_comp)
500440# ==================================================
0 commit comments