From 8c43dd7aa5fed0517fbf22c22f7e88c8c5a1df74 Mon Sep 17 00:00:00 2001 From: AStaniszewski Date: Sat, 7 Mar 2026 16:58:30 +0100 Subject: [PATCH 1/4] Add support for InChI strings inside the mol validator --- skfp/utils/validators.py | 17 ++++++++++++----- tests/utils/validators.py | 9 +++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/skfp/utils/validators.py b/skfp/utils/validators.py index af53444c..daec8f93 100644 --- a/skfp/utils/validators.py +++ b/skfp/utils/validators.py @@ -2,24 +2,31 @@ from collections.abc import Callable, Sequence from typing import Any -from rdkit.Chem import Mol, MolFromSmiles, MolToSmiles +from rdkit.Chem import Mol, MolFromInchi, MolFromSmiles, MolToSmiles from rdkit.Chem.PropertyMol import PropertyMol def ensure_mols(X: Sequence[Any]) -> list[Mol]: """ Ensure that all input sequence elements are RDKit ``Mol`` objects. Requires - all input elements to be of the same type: string (SMILES strings) or ``Mol``. - In the case of SMILES strings, they are converted to RDKit ``Mol`` objects with + all input elements to be of the same type: string (SMILES or InChI strings) or ``Mol``. + In the case of SMILES or InChI strings, they are converted to RDKit ``Mol`` objects with default settings. """ if not all(isinstance(x, (Mol, PropertyMol, str)) for x in X): types = {type(x) for x in X} raise TypeError( - f"Passed values must be RDKit Mol objects or SMILES strings, got types: {types}" + f"Passed values must be RDKit Mol objects, SMILES or InChI strings, got types: {types}" ) - mols = [MolFromSmiles(x) if isinstance(x, str) else x for x in X] + mols = [ + MolFromInchi(x) + if isinstance(x, str) and x.startswith("InChI=") + else MolFromSmiles(x) + if isinstance(x, str) + else x + for x in X + ] if any(x is None for x in mols): idx = mols.index(None) diff --git a/tests/utils/validators.py b/tests/utils/validators.py index 983131d4..0df5197f 100644 --- a/tests/utils/validators.py +++ b/tests/utils/validators.py @@ -29,6 +29,15 @@ def test_ensure_mols_wrong_smiles(): assert "at index 1 as molecule" in str(exc_info) +def test_ensure_mols_wrong_inchi(): + inchi_list = ["InChI=1S/H2O/h1H2", "InChI=1S/invalid"] + with pytest.raises(TypeError) as exc_info: + ensure_mols(inchi_list) + + assert "Could not parse" in str(exc_info) + assert "at index 1 as molecule" in str(exc_info) + + def test_ensure_mols_in_fingerprint(): smiles_list = ["O", "O=N([O-])C1=C(CN=C1NCCSCc2ncccc2)Cc3ccccc3"] fp = AtomPairFingerprint() From d3e4f5047a99ac1d0537926bf1661878f87ca440 Mon Sep 17 00:00:00 2001 From: AStaniszewski Date: Sat, 7 Mar 2026 17:00:42 +0100 Subject: [PATCH 2/4] Adjust formatting --- skfp/utils/validators.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/skfp/utils/validators.py b/skfp/utils/validators.py index daec8f93..5fe7f359 100644 --- a/skfp/utils/validators.py +++ b/skfp/utils/validators.py @@ -19,14 +19,12 @@ def ensure_mols(X: Sequence[Any]) -> list[Mol]: f"Passed values must be RDKit Mol objects, SMILES or InChI strings, got types: {types}" ) - mols = [ - MolFromInchi(x) - if isinstance(x, str) and x.startswith("InChI=") - else MolFromSmiles(x) - if isinstance(x, str) - else x - for x in X - ] + mols = [] + for x in X: + if isinstance(x, str): + mols.append(MolFromInchi(x) if x.startswith("InChI=") else MolFromSmiles(x)) + else: + mols.append(x) if any(x is None for x in mols): idx = mols.index(None) From 119e0ff8235e227c1e08bbf8859ccccc9fb3e3c7 Mon Sep 17 00:00:00 2001 From: AStaniszewski Date: Sat, 7 Mar 2026 17:03:38 +0100 Subject: [PATCH 3/4] Remove redundant whitespace in the docstring --- skfp/utils/validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfp/utils/validators.py b/skfp/utils/validators.py index 5fe7f359..dd0d6c0e 100644 --- a/skfp/utils/validators.py +++ b/skfp/utils/validators.py @@ -9,7 +9,7 @@ def ensure_mols(X: Sequence[Any]) -> list[Mol]: """ Ensure that all input sequence elements are RDKit ``Mol`` objects. Requires - all input elements to be of the same type: string (SMILES or InChI strings) or ``Mol``. + all input elements to be of the same type: string (SMILES or InChI strings) or ``Mol``. In the case of SMILES or InChI strings, they are converted to RDKit ``Mol`` objects with default settings. """ From 76698c373c19a202bfc0326a56f09ab366e80474 Mon Sep 17 00:00:00 2001 From: AStaniszewski Date: Sun, 8 Mar 2026 09:24:44 +0100 Subject: [PATCH 4/4] Check only the first element of the input sequence. Add test case for valid InChI input --- skfp/utils/validators.py | 11 +++++------ tests/utils/validators.py | 14 +++++++++++++- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/skfp/utils/validators.py b/skfp/utils/validators.py index dd0d6c0e..d01f77d3 100644 --- a/skfp/utils/validators.py +++ b/skfp/utils/validators.py @@ -19,12 +19,11 @@ def ensure_mols(X: Sequence[Any]) -> list[Mol]: f"Passed values must be RDKit Mol objects, SMILES or InChI strings, got types: {types}" ) - mols = [] - for x in X: - if isinstance(x, str): - mols.append(MolFromInchi(x) if x.startswith("InChI=") else MolFromSmiles(x)) - else: - mols.append(x) + if isinstance(X[0], str): + parser = MolFromInchi if X[0].startswith("InChI=") else MolFromSmiles + mols = [parser(x) for x in X] + else: + mols = list(X) if any(x is None for x in mols): idx = mols.index(None) diff --git a/tests/utils/validators.py b/tests/utils/validators.py index 0df5197f..791238df 100644 --- a/tests/utils/validators.py +++ b/tests/utils/validators.py @@ -29,7 +29,19 @@ def test_ensure_mols_wrong_smiles(): assert "at index 1 as molecule" in str(exc_info) -def test_ensure_mols_wrong_inchi(): +def test_ensure_mols_valid_inchi(): + inchi_list = ["InChI=1S/H2O/h1H2", "InChI=1S/CH4/h1H4"] + mols = ensure_mols(inchi_list) + assert all(m is not None for m in mols) + assert len(mols) == 2 + from rdkit.Chem import MolToSmiles + + smiles = [MolToSmiles(m) for m in mols] + assert "O" in smiles + assert "C" in smiles + + +def test_ensure_mols_invalid_inchi(): inchi_list = ["InChI=1S/H2O/h1H2", "InChI=1S/invalid"] with pytest.raises(TypeError) as exc_info: ensure_mols(inchi_list)