Skip to content

Commit 1d26126

Browse files
committed
moving things
1 parent 222cc05 commit 1d26126

File tree

3 files changed

+72
-65
lines changed

3 files changed

+72
-65
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
from .chemistry_utils import compute_morgan_fingerprint, inchikey14_from_full
12
from .fingerprint_computation import compute_fingerprints_from_smiles
23
from .merging_utils import cluster_block, get_merged_spectra, normalize_spectrum_sum
34

45

56
__all__ = [
67
"cluster_block",
8+
"compute_morgan_fingerprint",
79
"compute_fingerprints_from_smiles",
810
"get_merged_spectra",
11+
"inchikey14_from_full",
912
"normalize_spectrum_sum",
1013
]
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
from typing import List, Optional

import numpy as np
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator

from .fingerprint_computation import compute_fingerprints_from_smiles
4+
5+
6+
def inchikey14_from_full(inchikey: str) -> Optional[str]:
    """Return the 14-character first block (inchikey14) of an InChIKey.

    The input is converted with ``str()``, stripped of surrounding whitespace
    and upper-cased before the first hyphen-delimited block is extracted.

    Parameters
    ----------
    inchikey : str
        Full InChIKey (e.g. ``"BSYNRYMUTXBXSQ-UHFFFAOYSA-N"``) or a key
        without hyphens.

    Returns
    -------
    str or None
        The first 14 characters of the first block, or ``None`` when the
        input is empty/falsy or the first block has fewer than 14 characters.
    """
    if not inchikey:
        return None

    normalized = str(inchikey).strip().upper()
    # `partition` yields the whole string when there is no hyphen, so keys
    # with and without hyphens go through the same length check below.
    first_block = normalized.partition("-")[0]

    # A malformed key such as "ABC-DEF" or "-XYZ" must not produce a
    # truncated or empty prefix; treat it like any other too-short key.
    if len(first_block) < 14:
        return None
    return first_block[:14]
14+
15+
16+
17+
def compute_morgan_fingerprints(
    smiles: Optional[List[str]] = None,
    inchis: Optional[List[str]] = None,
    sparse: bool = True,
    count: bool = True,
    radius: int = 9,
    progress_bar: bool = True,
) -> np.ndarray:
    """
    Compute Morgan fingerprints for a batch of SMILES or InChI strings.

    Parameters
    ----------
    smiles : list of str or None
        SMILES strings to compute the fingerprints from. Takes precedence
        over `inchis` when both are given.
    inchis : list of str or None
        InChI strings to compute the fingerprints from (used only if `smiles`
        is empty or None). Each InChI is converted to SMILES first; entries
        that cannot be converted are passed on as None.
    sparse : bool
        If True, compute sparse fingerprint (indices/counts); else dense bit vector.
    count : bool
        If True, compute count-based fingerprint; else binary fingerprint.
    radius : int
        Radius for Morgan fingerprint. Default 9.
    progress_bar : bool
        Whether to show a progress bar during computation. Default True.

    Returns
    -------
    The result of `compute_fingerprints_from_smiles` for the given flags.
    NOTE(review): the annotation says np.ndarray, but the caller in
    compound_database documents a list of optional (bits, counts) tuples
    for the sparse case -- confirm and tighten the annotation.

    Raises
    ------
    ValueError
        If neither `smiles` nor `inchis` contains any entries.
    """
    # Validate first so an invalid call fails before any RDKit work is done.
    if not smiles and not inchis:
        raise ValueError("Either smiles or inchis must be provided.")

    if not smiles:
        # Only InChIs were supplied: convert them to SMILES one by one.
        smiles = []
        for inchi in inchis:
            try:
                mol = Chem.MolFromInchi(inchi)
                smi = Chem.MolToSmiles(mol) if mol is not None else None
            except Exception as e:
                # Best effort: keep the batch aligned with the input by
                # recording a None placeholder for the failed entry.
                print(f"Error converting InChI to SMILES for {inchi}: {e}")
                smi = None
            smiles.append(smi)

    fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=4096)
    return compute_fingerprints_from_smiles(
        smiles,
        fpgen,
        count=count,
        sparse=sparse,
        progress_bar=progress_bar,
    )


# The package `__init__` and `compound_database` import and call the singular
# name, so expose it as an alias of the function defined above.
compute_morgan_fingerprint = compute_morgan_fingerprints

ms2query/database/compound_database.py

Lines changed: 5 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,14 @@
44
from typing import Any, Dict, Iterable, List, Optional, Tuple
55
import numpy as np
66
import pandas as pd
7-
from rdkit.Chem import rdFingerprintGenerator
8-
from ms2query.data_processing import compute_fingerprints_from_smiles
7+
from ms2query.data_processing import compute_fingerprints_from_smiles, inchikey14_from_full, compute_morgan_fingerprint
8+
99

1010

1111
# =========================
1212
# Utilities & placeholders
1313
# =========================
1414

15-
def inchikey14_from_full(inchikey: str) -> Optional[str]:
16-
"""Return the first 14 characters (inchikey14). Robust to hyphens/malformed keys."""
17-
if not inchikey:
18-
return None
19-
s = str(inchikey).strip().upper()
20-
if "-" in s:
21-
return s.split("-", 1)[0][:14]
22-
return s[:14] if len(s) >= 14 else None
23-
2415
def encode_sparse_fp(bits: Optional[np.ndarray], counts: Optional[np.ndarray]) -> tuple[bytes, bytes]:
2516
"""Store bits as uint32 indices, counts as int32
2617
@@ -68,56 +59,6 @@ def decode_fp_blob(blob: bytes) -> np.ndarray:
6859
return np.zeros(0, dtype=np.uint8)
6960
return np.frombuffer(blob, dtype=np.uint8).copy()
7061

71-
def compute_fingerprints(
72-
smiles: Optional[str] = None,
73-
inchis: Optional[str] = None,
74-
sparse: bool = True,
75-
count: bool = True,
76-
radius: int = 9,
77-
progress_bar: bool = True,
78-
) -> np.ndarray:
79-
"""
80-
Compute a molecular fingerprint from SMILES or InChI.
81-
82-
Parameters
83-
----------
84-
smiles : str or None
85-
SMILES string to compute the fingerprint from.
86-
inchis : str or None
87-
InChI strings to compute the fingerprint from (used if smiles is None).
88-
sparse : bool
89-
If True, compute sparse fingerprint (indices/counts); else dense bit vector.
90-
count : bool
91-
If True, compute count-based fingerprint; else binary fingerprint.
92-
radius : int
93-
Radius for Morgan fingerprint. Default 9.
94-
progress_bar : bool
95-
Whether to show a progress bar during computation. Default True.
96-
"""
97-
fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=4096)
98-
99-
if inchis and not smiles:
100-
# convert inchis to smiles
101-
smiles = []
102-
for inchi in inchis:
103-
try:
104-
from rdkit import Chem
105-
mol = Chem.MolFromInchi(inchi)
106-
smi = Chem.MolToSmiles(mol) if mol is not None else None
107-
smiles.append(smi)
108-
except Exception as e:
109-
print(f"Error converting InChI to SMILES for {inchi}: {e}")
110-
smiles.append(None)
111-
elif not smiles and not inchis:
112-
raise ValueError("Either smiles or inchis must be provided.")
113-
return compute_fingerprints_from_smiles(
114-
smiles,
115-
fpgen,
116-
count=count,
117-
sparse=sparse,
118-
progress_bar=progress_bar,
119-
)
120-
12162

12263
# ==================================================
12364
# Compound database (compounds table) in SQLite
@@ -396,7 +337,7 @@ def compute_fingerprints_missing(
396337
"""
397338
Compute fingerprints for all compounds that have SMILES (pass A) or, if no SMILES,
398339
have InChI (pass B), and where fingerprints are missing.
399-
Uses the project-level `compute_fingerprints` function that returns a
340+
Uses the project-level `compute_morgan_fingerprint` function that returns a
400341
List[Optional[Tuple[np.ndarray,np.ndarray]]].
401342
402343
Returns stats: {"updated": int, "attempted": int, "skipped": int}
@@ -468,8 +409,8 @@ def _update_rows(comp_ids: List[str], results: List[Optional[Tuple[np.ndarray, n
468409
comp_ids = [r[0] for r in rows]
469410
reps = [r[1] for r in rows] # list[str] of smiles or inchi
470411

471-
# call compute_fingerprints ONCE for the whole batch
472-
results = compute_fingerprints(
412+
# call compute_morgan_fingerprint ONCE for the whole batch
413+
results = compute_morgan_fingerprint(
473414
smiles=reps if which == "smiles" else None,
474415
inchis=reps if which == "inchi" else None,
475416
sparse=sparse,
@@ -494,7 +435,6 @@ def _update_rows(comp_ids: List[str], results: List[Optional[Tuple[np.ndarray, n
494435
return stats
495436

496437

497-
498438
# ==================================================
499439
# Mapping: spectrum <-> compound (spec_to_comp)
500440
# ==================================================

0 commit comments

Comments
 (0)