From 2f7e86799db7fbaa6dff99a51550dde522532daf Mon Sep 17 00:00:00 2001 From: Mateusz Praski Date: Wed, 15 Oct 2025 08:48:42 +0200 Subject: [PATCH 1/6] Added implementation of first MoleculeACE dataset --- .gitignore | 3 + docs/modules/datasets.rst | 1 + docs/modules/datasets/moleculeace.rst | 26 ++++ skfp/datasets/moleculeace/__init__.py | 5 + skfp/datasets/moleculeace/benchmark.py | 185 +++++++++++++++++++++++ skfp/datasets/moleculeace/moleculeace.py | 93 ++++++++++++ tests/datasets/moleculeace.py | 31 ++++ 7 files changed, 344 insertions(+) create mode 100644 docs/modules/datasets/moleculeace.rst create mode 100644 skfp/datasets/moleculeace/__init__.py create mode 100644 skfp/datasets/moleculeace/benchmark.py create mode 100644 skfp/datasets/moleculeace/moleculeace.py create mode 100644 tests/datasets/moleculeace.py diff --git a/.gitignore b/.gitignore index 06770bb4..fb227d1d 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,6 @@ tests/preprocessing/input_output/data/mol_out.sdf .pip_audit_cache .python-version + +.DS_Store + diff --git a/docs/modules/datasets.rst b/docs/modules/datasets.rst index 13ab555e..ec4e56d7 100644 --- a/docs/modules/datasets.rst +++ b/docs/modules/datasets.rst @@ -10,3 +10,4 @@ Functions for loading benchmark molecular datasets. datasets/lrgb datasets/moleculenet datasets/tdc + datasets/moleculeace diff --git a/docs/modules/datasets/moleculeace.rst b/docs/modules/datasets/moleculeace.rst new file mode 100644 index 00000000..a07d9d4d --- /dev/null +++ b/docs/modules/datasets/moleculeace.rst @@ -0,0 +1,26 @@ +=========== +MoleculeACE +=========== + +.. automodule:: skfp.datasets.moleculeace + :exclude-members: load_moleculeace_benchmark + +========================================================= + +.. py:currentmodule:: skfp.datasets.moleculeace + +MoleculeACE benchmark + +.. autosummary:: + :nosignatures: + :toctree: generated/ + + load_moleculeace_benchmark + +Dataset loaders + +.. 
autosummary:: + :nosignatures: + :toctree: generated/ + + load_chembl204_ki diff --git a/skfp/datasets/moleculeace/__init__.py b/skfp/datasets/moleculeace/__init__.py new file mode 100644 index 00000000..2662e68e --- /dev/null +++ b/skfp/datasets/moleculeace/__init__.py @@ -0,0 +1,5 @@ +from .benchmark import ( + load_moleculeace_benchmark, + load_moleculeace_dataset, +) +from .moleculeace import load_chembl204_ki diff --git a/skfp/datasets/moleculeace/benchmark.py b/skfp/datasets/moleculeace/benchmark.py new file mode 100644 index 00000000..888d16bf --- /dev/null +++ b/skfp/datasets/moleculeace/benchmark.py @@ -0,0 +1,185 @@ +import os +from collections.abc import Iterator + +import numpy as np +import pandas as pd +from sklearn.utils._param_validation import StrOptions, validate_params + +from .moleculeace import ( + load_chembl204_ki, +) + +MOLECULEACE_DATASET_NAMES = [ + "chembl204_ki", +] + +MOLECULEACE_DATASET_NAME_TO_LOADER_FUNC = { + "chembl204_ki": load_chembl204_ki, +} + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frames": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_moleculeace_benchmark( + subset: str | list[str] | None = None, + data_dir: str | os.PathLike | None = None, + as_frames: bool = False, + verbose: bool = False, +) -> Iterator[tuple[str, pd.DataFrame]] | Iterator[tuple[str, list[str], np.ndarray]]: + """ + Load the MoleculeACE benchmark datasets. + + MoleculeACE [1]_ datasets are varied inhibition and effective concentration targets from ChEMBL [2]_. + Activity cliff split is recommended for all of them. + + For more details, see loading functions for particular datasets. Allowed individual + dataset names are listed below. Dataset names are also returned (case-sensitive). + + - "chembl204_ki" + + Parameters + ---------- + subset : None or list of strings + If ``None``, returns all datasets. List of strings loads only datasets with given names. 
+ + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frames : bool, default=False + If True, returns the raw DataFrame for each dataset. Otherwise, returns SMILES + as a list of strings, and labels as a NumPy array for each dataset. + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : generator of tuples (str, pd.DataFrame) or (str, list[str], np.ndarray) + Loads and returns datasets with a generator, each preceded by its name. Types depend on the + ``as_frames`` parameter, either: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022. + `_ + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods,” + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023. 
+ `_ + """ + dataset_names = _subset_to_dataset_names(subset) + + dataset_functions = [ + MOLECULEACE_DATASET_NAME_TO_LOADER_FUNC[name] for name in dataset_names + ] + + if as_frames: + datasets = ( + (dataset_name, load_function(data_dir, as_frame=True, verbose=verbose)) + for dataset_name, load_function in zip( + dataset_names, dataset_functions, strict=False + ) + ) + else: + datasets = ( + (dataset_name, *load_function(data_dir, as_frame=False, verbose=verbose)) + for dataset_name, load_function in zip( + dataset_names, dataset_functions, strict=False + ) + ) + return datasets + + +@validate_params( + { + "dataset_name": [StrOptions(set(MOLECULEACE_DATASET_NAMES))], + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_moleculeace_dataset( + dataset_name: str, + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str], np.ndarray]: + """ + Load MoleculeACE dataset by name. + + Loads a given dataset from MoleculeACE [1]_ benchmark by its name. This is a proxy + for easier benchmarking, that avoids looking for individual functions. + + Dataset names here are the same as returned by `load_moleculeace_benchmark` function, + and are case-sensitive. + + Parameters + ---------- + dataset_name : str + Name of the dataset to load. + + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns "SMILES" and labels + (dataset-dependent). Otherwise, returns SMILES as list of strings, and + labels as a NumPy array (shape and type are dataset-dependent). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. 
+ + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns depending on the dataset + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022. + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_moleculeace_dataset + >>> dataset = load_moleculeace_dataset("chembl204_ki") + >>> dataset # doctest: +SKIP + (['CC(=N)N1CCC(Oc2ccc3nc(CCC(=O)O)n(Cc4ccc5ccc(C(=N)N)cc5c4)c3c2)CC1', ..., 'CCC(=O)N1CCC[C@H]1C(=O)NCc1ccc(C(=N)N)cc1'], array([-3.427, ..., -4.146])) + """ + loader_func = MOLECULEACE_DATASET_NAME_TO_LOADER_FUNC[dataset_name] + return loader_func(data_dir, as_frame, verbose) + + +def _subset_to_dataset_names(subset: str | list[str] | None) -> list[str]: + if subset is None: + dataset_names = MOLECULEACE_DATASET_NAMES + elif isinstance(subset, (list, set, tuple)): + for name in subset: + if name not in MOLECULEACE_DATASET_NAMES: + raise ValueError( + f"Dataset name '{name}' not recognized among MoleculeACE datasets" + ) + dataset_names = subset + else: + raise ValueError( + f'Value "{subset}" for subset not recognized, must be a list of strings ' + f"with dataset names from MoleculeACE to load" + ) + return dataset_names diff --git a/skfp/datasets/moleculeace/moleculeace.py b/skfp/datasets/moleculeace/moleculeace.py new file mode 100644 index 00000000..7f6e7064 --- /dev/null +++ b/skfp/datasets/moleculeace/moleculeace.py @@ -0,0 +1,93 @@ +import os + +import numpy as np +import pandas as pd +from sklearn.utils._param_validation import validate_params + +from skfp.datasets.utils import fetch_dataset, get_mol_strings_and_labels + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + 
prefer_skip_nested_validation=True, +) +def load_chembl204_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + """ + Load the ChEMBL204 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Prothrombin target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 2754 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov. 
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl204_ki + >>> dataset = load_chembl204_ki() + >>> dataset # doctest: +SKIP + (['CC(=N)N1CCC(Oc2ccc3nc(CCC(=O)O)n(Cc4ccc5ccc(C(=N)N)cc5c4)c3c2)CC1', ..., 'CCC(=O)N1CCC[C@H]1C(=O)NCc1ccc(C(=N)N)cc1'], \ + array([-3.427, ..., -4.146])) + + >>> dataset = load_chembl204_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CC(=N)N1CCC(Oc2ccc3nc(CCC(=O)O)n(Cc4ccc5ccc(C(=N)N)cc5c4)c3c2)CC1 -3.426511 + 1 CC(=N)N1CCC(Oc2ccc3c(c2)nc(C(C)C)n3Cc2ccc3ccc(C(=N)N)cc3c2)CC1 -2.939519 + 2 CCC(C)c1nc2cc(OC3CCN(C(C)=N)CC3)ccc2n1Cc1ccc2ccc(C(=N)N)cc2c1 -3.361728 + 3 COC(=O)C(C)CN(c1ccc2c(c1)nc(C)n2Cc1ccc2ccc(C(=N)N)cc2c1)C1CCN(C(C)=N)CC1 -3.698970 + 4 CCCCc1nc2cc(OC3CCN(C(C)=N)CC3)ccc2n1Cc1ccc2ccc(C(=N)N)cc2c1 -3.301030 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl204_ki", + filename="chembl204_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) diff --git a/tests/datasets/moleculeace.py b/tests/datasets/moleculeace.py new file mode 100644 index 00000000..9e482ad8 --- /dev/null +++ b/tests/datasets/moleculeace.py @@ -0,0 +1,31 @@ +import pytest + +from skfp.datasets.moleculeace import ( + load_chembl204_ki, + load_moleculeace_dataset, +) +from tests.datasets.test_utils import run_basic_dataset_checks + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +@pytest.mark.parametrize( + "dataset_name, load_func, expected_length, num_tasks, task_type", + [ + ("chembl204_ki", load_chembl204_ki, 2754, 1, "regression"), + ], +) +def test_load_dataset(dataset_name, load_func, expected_length, num_tasks, task_type): + smiles_list, y = load_func() + df = load_moleculeace_dataset(dataset_name, as_frame=True) + run_basic_dataset_checks( + smiles_list, + y, + df, + expected_length=expected_length, + num_tasks=num_tasks, + task_type=task_type, + 
) From acf7c4f9b1a04ef63e837cbe7ea0bb946a29bb33 Mon Sep 17 00:00:00 2001 From: Mateusz Praski Date: Wed, 15 Oct 2025 12:26:55 +0200 Subject: [PATCH 2/6] Added individual MoleculeACE datasets --- skfp/datasets/moleculeace/__init__.py | 33 +- skfp/datasets/moleculeace/benchmark.py | 87 + skfp/datasets/moleculeace/moleculeace.py | 2498 +++++++++++++++++++++- tests/datasets/moleculeace.py | 58 + 4 files changed, 2673 insertions(+), 3 deletions(-) diff --git a/skfp/datasets/moleculeace/__init__.py b/skfp/datasets/moleculeace/__init__.py index 2662e68e..faa73c91 100644 --- a/skfp/datasets/moleculeace/__init__.py +++ b/skfp/datasets/moleculeace/__init__.py @@ -2,4 +2,35 @@ load_moleculeace_benchmark, load_moleculeace_dataset, ) -from .moleculeace import load_chembl204_ki +from .moleculeace import ( + load_chembl204_ki, + load_chembl214_ki, + load_chembl218_ec50, + load_chembl219_ki, + load_chembl228_ki, + load_chembl231_ki, + load_chembl233_ki, + load_chembl234_ki, + load_chembl235_ec50, + load_chembl236_ki, + load_chembl237_ec50, + load_chembl237_ki, + load_chembl238_ki, + load_chembl239_ec50, + load_chembl244_ki, + load_chembl262_ki, + load_chembl264_ki, + load_chembl287_ki, + load_chembl1862_ki, + load_chembl1871_ki, + load_chembl2034_ki, + load_chembl2047_ec50, + load_chembl2147_ki, + load_chembl2835_ki, + load_chembl2971_ki, + load_chembl3979_ec50, + load_chembl4005_ki, + load_chembl4203_ki, + load_chembl4616_ec50, + load_chembl4792_ki, +) diff --git a/skfp/datasets/moleculeace/benchmark.py b/skfp/datasets/moleculeace/benchmark.py index 888d16bf..5eca947c 100644 --- a/skfp/datasets/moleculeace/benchmark.py +++ b/skfp/datasets/moleculeace/benchmark.py @@ -7,14 +7,101 @@ from .moleculeace import ( load_chembl204_ki, + load_chembl214_ki, + load_chembl218_ec50, + load_chembl219_ki, + load_chembl228_ki, + load_chembl231_ki, + load_chembl233_ki, + load_chembl234_ki, + load_chembl235_ec50, + load_chembl236_ki, + load_chembl237_ec50, + load_chembl237_ki, + load_chembl238_ki, 
+ load_chembl239_ec50, + load_chembl244_ki, + load_chembl262_ki, + load_chembl264_ki, + load_chembl287_ki, + load_chembl1862_ki, + load_chembl1871_ki, + load_chembl2034_ki, + load_chembl2047_ec50, + load_chembl2147_ki, + load_chembl2835_ki, + load_chembl2971_ki, + load_chembl3979_ec50, + load_chembl4005_ki, + load_chembl4203_ki, + load_chembl4616_ec50, + load_chembl4792_ki, ) MOLECULEACE_DATASET_NAMES = [ "chembl204_ki", + "chembl214_ki", + "chembl218_ec50", + "chembl219_ki", + "chembl228_ki", + "chembl231_ki", + "chembl233_ki", + "chembl234_ki", + "chembl235_ec50", + "chembl236_ki", + "chembl237_ec50", + "chembl237_ki", + "chembl238_ki", + "chembl239_ec50", + "chembl244_ki", + "chembl262_ki", + "chembl264_ki", + "chembl287_ki", + "chembl1862_ki", + "chembl1871_ki", + "chembl2034_ki", + "chembl2047_ec50", + "chembl2147_ki", + "chembl2835_ki", + "chembl2971_ki", + "chembl3979_ec50", + "chembl4005_ki", + "chembl4203_ki", + "chembl4616_ec50", + "chembl4792_ki", ] MOLECULEACE_DATASET_NAME_TO_LOADER_FUNC = { "chembl204_ki": load_chembl204_ki, + "chembl214_ki": load_chembl214_ki, + "chembl218_ec50": load_chembl218_ec50, + "chembl219_ki": load_chembl219_ki, + "chembl228_ki": load_chembl228_ki, + "chembl231_ki": load_chembl231_ki, + "chembl233_ki": load_chembl233_ki, + "chembl234_ki": load_chembl234_ki, + "chembl235_ec50": load_chembl235_ec50, + "chembl236_ki": load_chembl236_ki, + "chembl237_ec50": load_chembl237_ec50, + "chembl237_ki": load_chembl237_ki, + "chembl238_ki": load_chembl238_ki, + "chembl239_ec50": load_chembl239_ec50, + "chembl244_ki": load_chembl244_ki, + "chembl262_ki": load_chembl262_ki, + "chembl264_ki": load_chembl264_ki, + "chembl287_ki": load_chembl287_ki, + "chembl1862_ki": load_chembl1862_ki, + "chembl1871_ki": load_chembl1871_ki, + "chembl2034_ki": load_chembl2034_ki, + "chembl2047_ec50": load_chembl2047_ec50, + "chembl2147_ki": load_chembl2147_ki, + "chembl2835_ki": load_chembl2835_ki, + "chembl2971_ki": load_chembl2971_ki, + "chembl3979_ec50": 
load_chembl3979_ec50, + "chembl4005_ki": load_chembl4005_ki, + "chembl4203_ki": load_chembl4203_ki, + "chembl4616_ec50": load_chembl4616_ec50, + "chembl4792_ki": load_chembl4792_ki, } diff --git a/skfp/datasets/moleculeace/moleculeace.py b/skfp/datasets/moleculeace/moleculeace.py index 7f6e7064..0f171b72 100644 --- a/skfp/datasets/moleculeace/moleculeace.py +++ b/skfp/datasets/moleculeace/moleculeace.py @@ -20,7 +20,7 @@ def load_chembl204_ki( as_frame: bool = False, verbose: bool = False, ) -> pd.DataFrame | tuple[list[str]] | np.ndarray: - """ + r""" Load the ChEMBL204 Ki dataset. The task is to predict the inhibitor constant (Ki) of molecules against the Prothrombin target [1]_ [2]_. @@ -77,7 +77,7 @@ def load_chembl204_ki( >>> dataset = load_chembl204_ki(as_frame=True) >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE - SMILES Ki + SMILES Ki 0 CC(=N)N1CCC(Oc2ccc3nc(CCC(=O)O)n(Cc4ccc5ccc(C(=N)N)cc5c4)c3c2)CC1 -3.426511 1 CC(=N)N1CCC(Oc2ccc3c(c2)nc(C(C)C)n3Cc2ccc3ccc(C(=N)N)cc3c2)CC1 -2.939519 2 CCC(C)c1nc2cc(OC3CCN(C(C)=N)CC3)ccc2n1Cc1ccc2ccc(C(=N)N)cc2c1 -3.361728 @@ -91,3 +91,2497 @@ def load_chembl204_ki( verbose=verbose, ) return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl214_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL214 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the 5-hydroxytryptamine receptor 1a target [1]_ [2]_. 
+ + + ================== ============== + Tasks 1 + Task type regression + Total samples 3317 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov. 
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl214_ki + >>> dataset = load_chembl214_ki() + >>> dataset # doctest: +SKIP + (['COc1ccc(NC(=O)c2ccc(-c3ccc(-c4noc(C)n4)cc3C)cc2)cc1N1CCN(C)CC1, ..., 'O=S(=O)(NCCCCCCN1CCN(c2nsc3ccccc23)CC1)c1ccc2ccccc2c1'], \ + array([-1.869, ..., -1.863])) + + >>> dataset = load_chembl214_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 COc1ccc(NC(=O)c2ccc(-c3ccc(-c4noc(C)n4)cc3C)cc2)cc1N1CCN(C)CC1 -1.869232 + 1 Nc1cccc(-c2ccc(CCN3CCN(c4cccc5cccnc45)CC3)cc2)n1 -0.477121 + 2 COc1ccc(NS(=O)(=O)c2ccc(Br)cc2)cc1N1CCN(C)CC1 -2.400002 + 3 COc1ccc(NS(=O)(=O)c2sc3ccc(Cl)cc3c2C)cc1N1CCN(C)CC1 -2.700002 + 4 CN1CCc2cccc3c2[C@H]1Cc1cccc(-c2ccccc2)c1-3 -0.255273 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl214_ki", + filename="chembl214_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl218_ec50( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL218 EC50 dataset. + + The task is to predict the half maximal effective concentration (EC50) of molecules against the Cannabinoid receptor 1 target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 1031 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". 
Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl218_ec50 + >>> dataset = load_chembl218_ec50() + >>> dataset # doctest: +SKIP + (['Cn1c(C(=O)NN2CCCCC2)nc(-c2ccc(Cl)cc2)c1-c1ccc(Cl)cc1, ..., 'CCCCCc1cccc(OCCCCCCCCCCC(=O)NC2CC2)c1'], \ + array([-2.0, ..., -1.491])) + + >>> dataset = load_chembl218_ec50(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES EC50 + 0 Cn1c(C(=O)NN2CCCCC2)nc(-c2ccc(Cl)cc2)c1-c1ccc(Cl)cc1 -2.000000 + 1 Cn1c(C(=O)NC2CCCCC2)nc(-c2ccc(Cl)cc2)c1-c1ccc(Cl)cc1 -2.698970 + 2 Cn1c(C(=O)NN2CCCCC2)nc(-c2ccc(Cl)cc2Cl)c1-c1ccc(Cl)cc1 -0.698970 + 3 Cn1c(C(=O)NC2CCCCC2)nc(-c2ccc(Cl)cc2Cl)c1-c1ccc(Cl)cc1 -1.255273 + 4 N#Cc1cc(-c2ccc(Cl)cc2)c(-c2ccc(Cl)cc2Cl)nc1OCc1ccc(F)c(F)c1 -0.903090 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl218_ec50", + filename="chembl218_ec50.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], 
+ }, + prefer_skip_nested_validation=True, +) +def load_chembl219_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL219 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the D(4) dopamine receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 1865 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov. 
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl219_ki + >>> dataset = load_chembl219_ki() + >>> dataset # doctest: +SKIP + (['COc1ccccc1N1CCN(Cc2ccn(-c3ccccc3)c2)CC1, ..., 'CNc1cc(OC)c(C(=O)N[C@@H]2CCN(Cc3ccccc3)[C@@H]2C)cc1Cl'], \ + array([-0.1139, ..., 0.0655])) + + >>> dataset = load_chembl219_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 COc1ccccc1N1CCN(Cc2ccn(-c3ccccc3)c2)CC1 -0.113943 + 1 c1ccc(N2CCN(Cc3ccn(-c4ccccc4)c3)CC2)cc1 -0.602060 + 2 CC1Cc2cccc3c2N1C(=O)C(N1CCN(Cc2ccc(Cl)cc2)CC1)CC3 -0.954243 + 3 CC1(C)Cc2cccc3c2N1C(=O)C(N1CCN(Cc2ccc(Cl)cc2)CC1)CC3 -1.278754 + 4 Cc1ccc(CN2CCN(C3CCc4cccc5c4N(CC5)C3=O)CC2)cc1 -0.602060 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl219_ki", + filename="chembl219_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl228_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL228 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Sodium-dependent serotonin transporter target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 1704 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". 
Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl228_ki + >>> dataset = load_chembl228_ki() + >>> dataset # doctest: +SKIP + (['CN(C)Cc1ccccc1Sc1ccc(C#N)cc1N, ..., 'CCCN(CC[C@]1(O)C[C@H](NC(=O)c2ccc3ccccc3c2)C1)[C@H]1CCc2nc(N)sc2C1'], \ + array([-0.04139, ..., -1.505])) + + >>> dataset = load_chembl228_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CN(C)Cc1ccccc1Sc1ccc(C#N)cc1N -0.041393 + 1 CN(C)Cc1ccccc1Sc1ccc(C(F)(F)F)cc1N 0.481486 + 2 COc1ccc(Sc2ccccc2CN(C)C)c(N)c1 -0.276462 + 3 CN(C)Cc1ccccc1Sc1ccc(Cl)cc1N 0.568636 + 4 Fc1ccc([C@@H]2CCNC[C@H]2COc2ccc3c(c2)OCO3)cc1 0.661986 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl228_ki", + filename="chembl228_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl231_ki( + data_dir: str | os.PathLike | None = 
None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL231 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Histamine h1 receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 973 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov. 
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl231_ki + >>> dataset = load_chembl231_ki() + >>> dataset # doctest: +SKIP + (['CN1CCN(C2=Nc3ccccc3Nc3sc(CO)cc32)CC1', ..., 'O=C(O)c1cc(-c2ccc(C3CCNCC3)cc2)cc(-n2cc(-c3ccc(Cl)s3)nn2)c1'], \ + array([-0.7782, ..., -2.23])) + + >>> dataset = load_chembl231_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CN1CCN(C2=Nc3ccccc3Nc3sc(CO)cc32)CC1 -0.778151 + 1 Cc1cc2c(s1)Nc1ccccc1N=C2N1CCN(C)CC1 -0.622900 + 2 Cc1cc2c(s1)Nc1ccccc1N=C2N1CCNCC1 -1.342423 + 3 Cc1cc2c(s1)Nc1ccccc1N=C2N1CC[N+](C)([O-])CC1 -1.939519 + 4 CC(=O)c1ccc(OCCCN2CC[C@H](NC(=O)[C@@H](N)CO)C2)cc1 -4.633468 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl231_ki", + filename="chembl231_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl233_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str], np.ndarray]: + r""" + Load the ChEMBL233 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Mu-type opioid receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 3142 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label".
Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl233_ki + >>> dataset = load_chembl233_ki() + >>> dataset # doctest: +SKIP + (['CC(c1ccccc1)N1CC[C@H]1[C@@H](N)c1cccc(Cl)c1', ..., 'CCO[C@@]12Cc3cc(-c4ccccc4)cnc3[C@@H]3Oc4c(O)ccc5c4[C@@]31CCN(CC1CC1)[C@@H]2C5'], \ + array([-4.026, ..., -2.698])) + + >>> dataset = load_chembl233_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CC(c1ccccc1)N1CC[C@H]1[C@@H](N)c1cccc(Cl)c1 -4.026125 + 1 Cc1ccc(C(c2ccc(C)cc2)N2CC[C@H]2[C@H](N)c2cccc(Cl)c2)cc1 -2.903633 + 2 COc1ccc([C@H](N)[C@@H]2CCN2C(c2ccccc2)c2ccccc2)cc1 -2.937016 + 3 N[C@H](c1cccc(Cl)c1)[C@@H]1CCN1C(c1ccc(F)cc1)c1ccc(F)cc1 -3.337659 + 4 N[C@H](c1cccc(Cl)c1)[C@@H]1CCN1C(c1cccc(Cl)c1)c1cccc(Cl)c1 -3.854852 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl233_ki", + filename="chembl233_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose":
["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl234_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str], np.ndarray]: + r""" + Load the ChEMBL234 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the D(3) dopamine receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 3657 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov.
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl234_ki + >>> dataset = load_chembl234_ki() + >>> dataset # doctest: +SKIP + (['CN1C2CCC1CC(OC(c1ccc(F)cc1)c1ccc(F)cc1)C2', ..., 'CNc1cc(OC)c(C(=O)N[C@@H]2CCN(Cc3ccccc3)[C@@H]2C)cc1Cl'], \ + array([-2.161, ..., -0.07188])) + + >>> dataset = load_chembl234_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CN1C2CCC1CC(OC(c1ccc(F)cc1)c1ccc(F)cc1)C2 -2.161368 + 1 O=C(NCCCN1CCN(c2cccc(Cl)c2Cl)CC1)c1cccc2c1-c1ccccc1C2=O -1.556303 + 2 c1ccc(N2CCN(CCCn3c4ccccc4c4ccccc43)CC2)cc1 -3.383815 + 3 Oc1nc2c(N3CCN(Cc4ccccc4)CC3)cccc2[nH]1 -1.752048 + 4 O=C(NCCCN1CCN(c2ccccc2)CC1)c1cccc2c1-c1ccccc1C2=O -2.633468 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl234_ki", + filename="chembl234_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl235_ec50( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str], np.ndarray]: + r""" + Load the ChEMBL235 EC50 dataset. + + The task is to predict the half maximal effective concentration (EC50) of molecules against the Peroxisome proliferator-activated receptor gamma target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 2349 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label".
Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl235_ec50 + >>> dataset = load_chembl235_ec50() + >>> dataset # doctest: +SKIP + (['CC(/C=C/C(F)=C(/C)c1cc(C(C)(C)C)cc(C(C)(C)C)c1OCC(F)(F)F)=C\C(=O)O', ..., 'O=C(O)Cc1cc(Br)c(Oc2cc(I)c(O)c(I)c2)c(I)c1'], \ + array([-1.324, ..., -2.477])) + + >>> dataset = load_chembl235_ec50(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES EC50 + 0 CC(/C=C/C(F)=C(/C)c1cc(C(C)(C)C)cc(C(C)(C)C)c1OCC(F)(F)F)=C\C(=O)O -1.324282 + 1 CCCOc1c(/C(C)=C\C=C\C(C)=C\C(=O)O)cc(C(C)C)cc1C(F)(F)C(F)(F)F -1.343409 + 2 C/C(=C/C=C/C(C)=C/C(=O)O)c1cc(-c2cccs2)cc(C(C)C)c1OCC(F)F -0.993436 + 3 CCC(Cc1ccc(OC)c(C(=O)NCc2ccc(OCCc3ccccc3)cc2)c1)C(=O)O -3.477121 + 4 CCCCC(Cc1ccc(OC)c(C(=O)NCc2ccc(C(F)(F)F)cc2)c1)C(=O)O -3.397940 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl235_ec50", + filename="chembl235_ec50.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], +
"as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl236_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str], np.ndarray]: + r""" + Load the ChEMBL236 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Delta-type opioid receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 2598 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov.
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl236_ki + >>> dataset = load_chembl236_ki() + >>> dataset # doctest: +SKIP + (['CC(c1ccccc1)N1CC[C@H]1[C@@H](N)c1cccc(Cl)c1', ..., 'CCO[C@@]12Cc3cc(-c4ccccc4)cnc3[C@@H]3Oc4c(O)ccc5c4[C@@]31CCN(CC1CC1)[C@@H]2C5'], \ + array([-4.592, ..., -0.8739])) + + >>> dataset = load_chembl236_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CC(c1ccccc1)N1CC[C@H]1[C@@H](N)c1cccc(Cl)c1 -4.592399 + 2 Cc1ccc(C(c2ccc(C)cc2)N2CC[C@H]2[C@H](N)c2cccc(Cl)c2)cc1 -3.699924 + 4 COc1ccc([C@H](N)[C@@H]2CCN2C(c2ccccc2)c2ccccc2)cc1 -3.465234 + 5 N[C@H](c1cccc(Cl)c1)[C@@H]1CCN1C(c1ccc(F)cc1)c1ccc(F)cc1 -3.870989 + 6 N[C@H](c1cccc(Cl)c1)[C@@H]1CCN1C(c1cccc(Cl)c1)c1cccc(Cl)c1 -3.432809 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl236_ki", + filename="chembl236_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl237_ec50( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str], np.ndarray]: + r""" + Load the ChEMBL237 EC50 dataset. + + The task is to predict the half maximal effective concentration (EC50) of molecules against the Kappa-type opioid receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 955 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`.
+ + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov.
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl237_ec50 + >>> dataset = load_chembl237_ec50() + >>> dataset # doctest: +SKIP + (['C=CCN1CC[C@]23c4c5ccc(O)c4O[C@H]2C(=O)CC[C@@]3(O)[C@H]1C5', ..., 'Oc1ccc2c3c1O[C@H]1c4ncc(-c5ccccc5)cc4C[C@@]4(OCCCC5CCCCC5)[C@@H](C2)N(CC2CC2)CC[C@]314'], \ + array([-0.9191, ..., -1.538])) + + >>> dataset = load_chembl237_ec50(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES EC50 + 1 C=CCN1CC[C@]23c4c5ccc(O)c4O[C@H]2C(=O)CC[C@@]3(O)[C@H]1C5 -0.919078 + 2 CN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@]24CC[C@@]3(C[C@H]2C(C)(C)C(C)(C)O4)C1C5 -1.320146 + 3 CO[C@@]12CCC3(C[C@H]1[C@@](C)(O)C(C)(C)C)[C@H]1Cc4ccc(O)c5c4C3(CCN1C)[C@H]2O5 -0.380211 + 4 Nc1nc2cc3c(cc2s1)C[C@@H]1[C@@H]2CCCC[C@]32CCN1CC1CC1 -0.380211 + 5 CN1CCC23c4c5ccc(O)c4OC2c2nc(N)ncc2CC3(O)C1C5 -3.031408 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl237_ec50", + filename="chembl237_ec50.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl237_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str], np.ndarray]: + r""" + Load the ChEMBL237 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Kappa-type opioid receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 2603 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`.
+ + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov.
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl237_ki + >>> dataset = load_chembl237_ki() + >>> dataset # doctest: +SKIP + (['CC(c1ccccc1)N1CC[C@H]1[C@@H](N)c1cccc(Cl)c1', ..., 'CCO[C@@]12Cc3cc(-c4ccccc4)cnc3[C@@H]3Oc4c(O)ccc5c4[C@@]31CCN(CC1CC1)[C@@H]2C5'], \ + array([-3.613, ..., -2.401])) + + >>> dataset = load_chembl237_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CC(c1ccccc1)N1CC[C@H]1[C@@H](N)c1cccc(Cl)c1 -3.612678 + 1 Cc1ccc(C(c2ccc(C)cc2)N2CC[C@H]2[C@H](N)c2cccc(Cl)c2)cc1 -3.265054 + 2 COc1ccc([C@H](N)[C@@H]2CCN2C(c2ccccc2)c2ccccc2)cc1 -3.127429 + 3 N[C@H](c1cccc(Cl)c1)[C@@H]1CCN1C(c1ccc(F)cc1)c1ccc(F)cc1 -3.350248 + 4 N[C@H](c1cccc(Cl)c1)[C@@H]1CCN1C(c1cccc(Cl)c1)c1cccc(Cl)c1 -3.780821 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl237_ki", + filename="chembl237_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl238_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str], np.ndarray]: + r""" + Load the ChEMBL238 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Sodium-dependent dopamine transporter target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 1052 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`.
+ + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov.
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl238_ki + >>> dataset = load_chembl238_ki() + >>> dataset # doctest: +SKIP + (['CN1CCC(O)(c2ccc(Cl)c(Cl)c2)C([C@@H](O)c2ccc(Cl)c(Cl)c2)C1', ..., 'C[C@H]1CN(CC[S+](O)C(c2ccc(F)cc2)c2ccc(F)cc2)C[C@@H](C)N1CC(O)Cc1ccccc1'], \ + array([-3.617, ..., -0.873])) + + >>> dataset = load_chembl238_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CN1CCC(O)(c2ccc(Cl)c(Cl)c2)C([C@@H](O)c2ccc(Cl)c(Cl)c2)C1 -3.617000 + 1 CN1CCC(O)(c2ccc(Cl)c(Cl)c2)C(C(=O)c2ccc(Cl)c(Cl)c2)C1 -1.037426 + 2 Cc1ccc(C2OC(=O)OC3(c4ccc(C)cc4)CCN(C)CC23)cc1 -3.913284 + 3 Cc1ccc([C@H](O)C2CN(C)CCC2(O)c2ccc(C)cc2)cc1 -4.027350 + 4 CN1CCC(O)(c2ccc(F)cc2)C(C(=O)c2ccc(F)cc2)C1 -3.755875 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl238_ki", + filename="chembl238_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl239_ec50( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str], np.ndarray]: + r""" + Load the ChEMBL239 EC50 dataset. + + The task is to predict the half maximal effective concentration (EC50) of molecules against the Peroxisome proliferator-activated receptor alpha target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 1721 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`.
+ + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov.
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl239_ec50 + >>> dataset = load_chembl239_ec50() + >>> dataset # doctest: +SKIP + (['CCC(Cc1ccc(OC)c(C(=O)NCc2ccc(OCCc3ccccc3)cc2)c1)C(=O)O', ..., 'CC(C)(Oc1ccc(CCOc2ccc(/N=N/c3ccc(Cl)cc3)cc2)cc1)C(=O)O'], \ + array([-3.431, ..., -2.58])) + + >>> dataset = load_chembl239_ec50(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES EC50 + 0 CCC(Cc1ccc(OC)c(C(=O)NCc2ccc(OCCc3ccccc3)cc2)c1)C(=O)O -3.431364 + 1 CC[C@@H](Cc1ccc(OC)c(C(=O)NCc2ccc(Oc3ccc(F)cc3)cc2)c1)C(=O)O -0.964024 + 2 CCCCC(Cc1ccc(OC)c(C(=O)NCc2ccc(C(F)(F)F)cc2)c1)C(=O)O -3.000000 + 3 CCC(Cc1ccc(OC)c(C(=O)NCCc2ccc(C(F)(F)F)cc2)c1)C(=O)O -2.869232 + 4 CCC(Cc1ccc(OC)c(C(=O)NCc2ccc(OC(F)(F)F)cc2)c1)C(=O)O -1.633468 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl239_ec50", + filename="chembl239_ec50.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl244_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str], np.ndarray]: + r""" + Load the ChEMBL244 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Coagulation factor x target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 3097 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label".
Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl244_ki + >>> dataset = load_chembl244_ki() + >>> dataset # doctest: +SKIP + (['CC(=N)N1CCC(Oc2ccc3nc(CCC(=O)O)n(Cc4ccc5ccc(C(=N)N)cc5c4)c3c2)CC1', ..., 'CC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CO)C(=O)N[C@H](C=O)CCCNC(=N)N'], \ + array([-0.1139, ..., -2.556])) + + >>> dataset = load_chembl244_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CC(=N)N1CCC(Oc2ccc3nc(CCC(=O)O)n(Cc4ccc5ccc(C(=N)N)cc5c4)c3c2)CC1 -0.113943 + 1 CC(=N)N1CCC(Oc2ccc3c(c2)nc(C(C)C)n3Cc2ccc3ccc(C(=N)N)cc3c2)CC1 -0.301030 + 2 CCC(C)c1nc2cc(OC3CCN(C(C)=N)CC3)ccc2n1Cc1ccc2ccc(C(=N)N)cc2c1 -0.518514 + 3 CC1CCN(C(=O)[C@H](Cc2cccc(C(=N)N)c2)NS(=O)(=O)c2c(C(C)C)cc(C(C)C)cc2C(C)C)CC1 -3.301000 + 4 COC(=O)[C@H]1Cc2ccccc2CN1C(=O)[C@H](Cc1cccc(C(=N)N)c1)NS(=O)(=O)c1ccc2ccccc2c1 -4.431000 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl244_ki", + filename="chembl244_ki.csv", + verbose=verbose, + ) + return df if as_frame else
get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl262_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str], np.ndarray]: + r""" + Load the ChEMBL262 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Glycogen synthase kinase-3 beta target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 856 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov.
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl262_ki + >>> dataset = load_chembl262_ki() + >>> dataset # doctest: +SKIP + (['Cc1nc(N)sc1-c1ccnc(Nc2cccc([N+](=O)[O-])c2)n1', ..., 'CC(C)(C#N)c1cccc(-c2ccnc3[nH]ccc23)n1'], \ + array([-1.301, ..., -2.322])) + + >>> dataset = load_chembl262_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 Cc1nc(N)sc1-c1ccnc(Nc2cccc([N+](=O)[O-])c2)n1 -1.30103 + 1 Cc1ccc2c(-c3ccnc(Nc4cccc(C(F)(F)F)c4)n3)c(-c3ccc(F)cc3)nn2n1 -1.30103 + 2 Cc1ccc2c(-c3ccnc(Nc4ccc(F)c(F)c4)n3)c(-c3ccc(F)cc3)nn2n1 -1.00000 + 3 Cc1ccc2c(-c3ccnc(Nc4ccc5c(c4)OCCO5)n3)c(-c3ccc(F)cc3)nn2n1 -1.00000 + 4 Cc1ccc2c(-c3ccnc(Nc4ccc(Cl)c(C(F)(F)F)c4)n3)c(-c3ccc(F)cc3)nn2n1 -1.69897 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl262_ki", + filename="chembl262_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl264_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str], np.ndarray]: + r""" + Load the ChEMBL264 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Histamine h3 receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 2862 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label".
Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl264_ki + >>> dataset = load_chembl264_ki() + >>> dataset # doctest: +SKIP + (['CC(=O)c1ccc(OCCCc2c[nH]cn2)cc1', ..., 'CC(C)(C)c1ccc(OCCCCCCN2CCCCCC2)cc1'], \ + array([-1.94, ..., -2.919])) + + >>> dataset = load_chembl264_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CC(=O)c1ccc(OCCCc2c[nH]cn2)cc1 -1.939519 + 1 c1ccc(COCCCc2c[nH]cn2)cc1 -0.415974 + 2 CC(=O)c1ccc(SCCc2c[nH]cn2)cc1 -0.041393 + 3 c1ccc(OCCCc2c[nH]cn2)cc1 -1.431364 + 4 CC(=O)c1ccc(SCCCc2c[nH]cn2)cc1 -1.255273 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl264_ki", + filename="chembl264_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl287_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +)
-> pd.DataFrame | tuple[list[str], np.ndarray]: + r""" + Load the ChEMBL287 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Sigma non-opioid intracellular receptor 1 target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 1328 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov.
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl287_ki + >>> dataset = load_chembl287_ki() + >>> dataset # doctest: +SKIP + (['O=S1(=O)c2ccccc2CCC12CCN(Cc1ccccc1)CC2', ..., 'Cc1[nH]c2cc(C(F)(F)F)ccc2c(=O)c1CN(C)Cc1ccccc1'], \ + array([-1.301, ..., -1.949])) + + >>> dataset = load_chembl287_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 O=S1(=O)c2ccccc2CCC12CCN(Cc1ccccc1)CC2 -1.301030 + 1 COc1ccc(N2C[C@H](CN3CCC(O)(c4ccsc4)CC3)OC2=O)cc1 -1.531479 + 2 COc1ccc(N2C[C@H](CN3CCC(O)(c4ccc5c(c4)OCO5)CC3)OC2=O)cc1 -1.278754 + 3 CNC(=O)CC1Cc2ccccc2C2(CCN(Cc3ccccc3)CC2)O1 -2.230449 + 4 OCC1OC2(CCN(Cc3ccccc3)CC2)c2ccccc21 -0.752816 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl287_ki", + filename="chembl287_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl1862_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL1862 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Tyrosine-protein kinase abl1 target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 794 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "Ki".
Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "Ki" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl1862_ki + >>> dataset = load_chembl1862_ki() + >>> dataset # doctest: +SKIP + (['Nc1[nH]cnc2nnc(-c3ccc(Cl)cc3)c1-2', ..., 'CCCCNc1ncnc2c1cnn2CC(Cl)c1ccccc1'], \ + array([-2.699, ..., -3.3])) + + >>> dataset = load_chembl1862_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 Nc1[nH]cnc2nnc(-c3ccc(Cl)cc3)c1-2 -2.69897 + 1 Cc1ccc(N2NC(=O)/C(=C/c3ccc(-c4ccc(C)c(Cl)c4)o3)C2=O)cc1C -3.69897 + 2 O=C1NN(c2ccc(Cl)c(Cl)c2)C(=O)/C1=C\c1cccc(OCc2ccccc2)c1 -3.00000 + 3 O=C1NN(c2ccc(I)cc2)C(=O)/C1=C\c1cc2c(cc1Br)OCO2 -3.39794 + 4 O=C1NN(c2ccc(I)cc2)C(=O)/C1=C\c1ccc(N2CCOCC2)cc1 -4.30103 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl1862_ki", + filename="chembl1862_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def
load_chembl1871_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL1871 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Androgen receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 659 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "Ki". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "Ki" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov.
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl1871_ki + >>> dataset = load_chembl1871_ki() + >>> dataset # doctest: +SKIP + (['CC1=CC(C)(C)Nc2ccc3c(c21)/C(=C/c1ccsc1)Oc1ccc(F)cc1-3', ..., 'CN(C[C@](C)(O)C(=O)Nc1ccc(C#N)c(C(F)(F)F)c1)c1ccc(C#N)c(-c2ccccc2)c1'], \ + array([-2.825, ..., -1.892])) + + >>> dataset = load_chembl1871_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CC1=CC(C)(C)Nc2ccc3c(c21)/C(=C/c1ccsc1)Oc1ccc(F)cc1-3 -2.825426 + 1 CCc1ccccc1/C=C1\Oc2ccc(F)cc2-c2ccc3c(c21)C(C)=CC(C)(C)N3 -3.201124 + 2 CC1=CC(C)(C)Nc2ccc3c(c21)/C(=C/c1ccccc1N(C)C)Oc1ccc(F)cc1-3 -2.913284 + 3 CC1=CC(C)(C)Nc2ccc3c(c21)/C(=C/c1ccccc1)Oc1c(F)cccc1-3 -3.163161 + 4 CC(=O)O[C@]1(C(C)=O)CC[C@H]2[C@@H]3C[C@H](C)C4=CC(=O)CC[C@]4(C)[C@H]3CC[C@@]21C -0.462398 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl1871_ki", + filename="chembl1871_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl2034_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL2034 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Glucocorticoid receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 750 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`.
+ + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "Ki". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "Ki" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov.
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl2034_ki + >>> dataset = load_chembl2034_ki() + >>> dataset # doctest: +SKIP + (['CC1=CC(C)(C)Nc2ccc3c(c21)/C(=C/c1ccsc1)Oc1ccc(F)cc1-3', ..., 'NS(=O)(=O)C[C@H]1COc2cc(F)ccc2N1C(=O)c1ccc2c(c1)NCCO2'], \ + array([-1.924, ..., -3.1])) + + >>> dataset = load_chembl2034_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CC1=CC(C)(C)Nc2ccc3c(c21)/C(=C/c1ccsc1)Oc1ccc(F)cc1-3 -1.924279 + 1 CCc1ccccc1/C=C1\Oc2ccc(F)cc2-c2ccc3c(c21)C(C)=CC(C)(C)N3 -2.431364 + 2 CC1=CC(C)(C)Nc2ccc3c(c21)/C(=C/c1ccccc1N(C)C)Oc1ccc(F)cc1-3 -2.692847 + 3 CC1=CC(C)(C)Nc2ccc3c(c21)/C(=C/c1ccccc1)Oc1c(F)cccc1-3 -2.506505 + 4 CC(=O)O[C@]1(C(C)=O)CC[C@H]2[C@@H]3C[C@H](C)C4=CC(=O)CC[C@]4(C)[C@H]3CC[C@@]21C -1.120574 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl2034_ki", + filename="chembl2034_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl2047_ec50( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL2047 EC50 dataset. + + The task is to predict the half maximal effective concentration (EC50) of molecules against the Bile acid receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 631 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`.
+ + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "EC50". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "EC50" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov.
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl2047_ec50 + >>> dataset = load_chembl2047_ec50() + >>> dataset # doctest: +SKIP + (['C[C@H](CCC(=O)NCC(=O)O)[C@H]1CC[C@H]2[C@H]3[C@H](CC[C@@]21C)[C@@]1(C)CC[C@@H](O)C[C@H]1C[C@H]3O', ..., 'CC(C)c1onc(-c2c(Cl)cccc2Cl)c1COc1ccc(CNc2ccc(CC(=O)O)cc2)c(Cl)c1'], \ + array([-3.477, ..., -2.973])) + + >>> dataset = load_chembl2047_ec50(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES EC50 + 0 C[C@H](CCC(=O)NCC(=O)O)[C@H]1CC[C@H]2[C@H]3[C@H](CC[C@@]21C)[C@@]1(C)CC[C@@H](O)C[C@H]1C[C@H]3O -3.477121 + 1 C[C@H](CCC(=O)O)C1CC[C@H]2[C@H]3[C@H](CC[C@]12C)[C@@]1(C)CC[C@@H](O)CC1[C@@H](C)[C@H]3O -2.875061 + 3 CCC[C@@H]1C2C[C@H](O)CC[C@]2(C)[C@H]2CC[C@]3(C)C([C@H](C)CCC(=O)O)CC[C@H]3[C@@H]2[C@@H]1O -3.045323 + 4 CC(C)c1onc(-c2c(Cl)cccc2Br)c1COc1ccc(/C=C/c2cccc(C(=O)O)c2)c(Cl)c1 -1.079181 + 5 Cc1cc(OCc2c(-c3c(Cl)cccc3Cl)noc2C(C)C)ccc1/C=C/c1cccc(C(=O)O)c1 -1.672098 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl2047_ec50", + filename="chembl2047_ec50.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl2147_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL2147 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Serine/threonine-protein kinase pim-1 target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 1456 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory.
If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "Ki". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "Ki" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov.
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl2147_ki + >>> dataset = load_chembl2147_ki() + >>> dataset # doctest: +SKIP + (['FC(F)(F)c1cccc(-c2nnc3ccc(NC4CCCCC4)cn23)c1', ..., 'NC(=O)c1cc(Cl)c2c(Cl)c(C#CC3CNCCO3)n([C@H]3CCCNC3)c2n1'], \ + array([-1.041, ..., 0.04576])) + + >>> dataset = load_chembl2147_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 FC(F)(F)c1cccc(-c2nnc3ccc(NC4CCCCC4)cn23)c1 -1.041393 + 1 Cc1ccc2[nH]c(=O)c(CC(=O)O)c(-c3ccccc3)c2c1 -3.653213 + 2 O=C(O)c1cccc(Nc2nc(-c3ccc(O)cc3O)cs2)c1 -3.531479 + 3 O=C(O)c1cccc2c(-c3ccccc3)c(-c3ccccc3)[nH]c12 -2.740363 + 4 CCc1ccc(C2C(C(C)=O)=C(O)C(=O)N2CCc2c[nH]c3ccccc23)cc1 -3.322219 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl2147_ki", + filename="chembl2147_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl2835_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL2835 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Tyrosine-protein kinase jak1 target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 615 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "Ki".
Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "Ki" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl2835_ki + >>> dataset = load_chembl2835_ki() + >>> dataset # doctest: +SKIP + (['C[C@@H]1CCN(C(=O)CC#N)C[C@@H]1N(C)c1ncnc2[nH]ccc12', ..., 'Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(=O)NC(C)(C)C)c1'], \ + array([0.1549, ..., -2.021])) + + >>> dataset = load_chembl2835_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 C[C@@H]1CCN(C(=O)CC#N)C[C@@H]1N(C)c1ncnc2[nH]ccc12 0.154902 + 1 C[C@@H]1CCN(C(=O)CC#N)C[C@@H]1n1cnc2cnc3[nH]ccc3c21 0.301030 + 2 C[C@@H]1CCN(Cc2ccccc2)C[C@@H]1N(C)c1ncnc2[nH]ccc12 -2.785330 + 3 C[C@@H]1CCN(Cc2ccccc2)C[C@@H]1n1cnc2cnc3[nH]ccc3c21 -1.079181 + 4 N#CCC(=O)N1CCC[C@@H](n2cnc3cnc4[nH]ccc4c32)C1 0.397940 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl2835_ki", + filename="chembl2835_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + },
prefer_skip_nested_validation=True, +) +def load_chembl2971_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL2971 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Tyrosine-protein kinase jak2 target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 976 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "Ki". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "Ki" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov.
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl2971_ki + >>> dataset = load_chembl2971_ki() + >>> dataset # doctest: +SKIP + (['NC(=O)Nc1sc(-c2ccc(F)cc2)cc1C(N)=O', ..., 'Cc1cc(Nc2nc(N[C@@H](C)c3ccc(F)cc3)c(C#N)cc2F)n[nH]1'], \ + array([-0.699, ..., 0.3468])) + + >>> dataset = load_chembl2971_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 NC(=O)Nc1sc(-c2ccc(F)cc2)cc1C(N)=O -0.698970 + 1 O[C@H]1CC[C@H](Nc2ccc3nnc(-c4cccc(C(F)(F)F)c4)n3n2)CC1 -3.380211 + 2 c1ccc(-c2ncnc3[nH]ccc23)cc1 -2.683947 + 3 Clc1cnc2[nH]cc(-c3ccccc3)c2c1 -2.414973 + 4 CCC1Nc2ccccc2-c2ccnc3[nH]cc1c23 -3.230449 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl2971_ki", + filename="chembl2971_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl3979_ec50( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL3979 EC50 dataset. + + The task is to predict the half maximal effective concentration (EC50) of molecules against the Peroxisome proliferator-activated receptor delta target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 1125 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "EC50".
Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "EC50" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl3979_ec50 + >>> dataset = load_chembl3979_ec50() + >>> dataset # doctest: +SKIP + (['CCC(Cc1ccc(OC)c(C(=O)NCCc2ccc(C(F)(F)F)cc2)c1)C(=O)O', ..., 'CC(C)c1onc(-c2c(Cl)cccc2Cl)c1COc1ccc(CNc2ccc(CC(=O)O)cc2)c(Cl)c1'], \ + array([-3.176, ..., -3.176])) + + >>> dataset = load_chembl3979_ec50(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES EC50 + 0 CCC(Cc1ccc(OC)c(C(=O)NCCc2ccc(C(F)(F)F)cc2)c1)C(=O)O -3.176091 + 1 CCC(Cc1ccc(OC)c(C(=O)NCc2ccc(OC(F)(F)F)cc2)c1)C(=O)O -2.954243 + 2 CCC(Cc1ccc(OC)c(CCCc2ccc(C(F)(F)F)cc2)c1)C(=O)O -2.806180 + 3 CCSC(Cc1ccc(OC)c(C(=O)NCc2ccc(C(F)(F)F)cc2)c1)C(=O)O -3.477121 + 4 CCOC(Cc1ccc(OC)c(C(=O)NCc2ccc(C(F)(F)F)cc2)c1)C(=O)O -3.477121 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl3979_ec50", + filename="chembl3979_ec50.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], +
"verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl4005_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL4005 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Phosphatidylinositol 4,5-bisphosphate 3-kinase catalytic subunit alpha isoform target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 960 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "Ki". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "Ki" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov.
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl4005_ki + >>> dataset = load_chembl4005_ki() + >>> dataset # doctest: +SKIP + (['COC[C@H]1OC(=O)c2coc3c2[C@@]1(C)C1=C(C3=O)[C@@H]2CCC(=O)[C@@]2(C)C[C@H]1OC(C)=O', ..., 'CC(C)n1nc(-c2ccc3oc(N)nc3c2)c2c(N)ncnc21'], \ + array([-2.079, ..., -1.447])) + + >>> dataset = load_chembl4005_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 COC[C@H]1OC(=O)c2coc3c2[C@@]1(C)C1=C(C3=O)[C@@H]2CCC(=O)[C@@]2(C)C[C@H]1OC(C)=O -2.079181 + 1 O=c1cc(N2CCOCC2)oc2c(-c3ccccc3)cccc12 -3.778151 + 2 CS(=O)(=O)N1CCN(Cc2cc3nc(-c4cccc5[nH]ncc45)nc(N4CCOCC4)c3s2)CC1 -0.806180 + 3 COc1ccc(NC(=O)c2c(C)ccc3c(N)nc(C)nc23)cn1 -2.000000 + 4 COc1ccc(NC(=O)c2cc(C)cc3c(N)nc(C)nc23)cn1 0.301030 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl4005_ki", + filename="chembl4005_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl4203_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL4203 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Dual specificity protein kinase clk4 target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 731 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`.
+ + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "Ki". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "Ki" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov.
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl4203_ki + >>> dataset = load_chembl4203_ki() + >>> dataset # doctest: +SKIP + (['O=c1[nH]cnc2c1sc1c(Cl)ccc(Cl)c12', ..., 'O=C(c1cccc(-c2cnc3[nH]ccc3c2)c1)N1CCOCC1'], \ + array([-1.977, ..., -3.8])) + + >>> dataset = load_chembl4203_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 O=c1[nH]cnc2c1sc1c(Cl)ccc(Cl)c12 -1.976808 + 1 Nc1ncnc2onc(-c3ccc(NC(=O)Nc4cccc(C(F)(F)F)c4)cc3)c12 -2.400002 + 2 O=c1[nH]cnc2c(-c3ccccc3)c(C(F)(F)F)sc12 -3.299999 + 3 O=C1Nc2ccccc2Nc2cc(-c3ccncc3F)ccc21 -1.400020 + 4 Cc1cc(N2CCOCC2)cc2[nH]c(-c3c(NCC(O)c4cccc(Cl)c4)cc[nH]c3=O)nc12 -1.700011 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl4203_ki", + filename="chembl4203_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl4616_ec50( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL4616 EC50 dataset. + + The task is to predict the half maximal effective concentration (EC50) of molecules against the Growth hormone secretagogue receptor type 1 target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 682 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "EC50".
Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "EC50" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl4616_ec50 + >>> dataset = load_chembl4616_ec50() + >>> dataset # doctest: +SKIP + (['CCCCCCCC(=O)OC[C@H](NC(=O)CN)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccccc1)C(=O)O', ..., 'CC(=O)N1CCC[C@H](NC(=O)[C@H]2CN(S(=O)(=O)c3ccccc3)C[C@@H]2NC(=O)c2cc(-c3ccccc3Cl)on2)C1'], \ + array([-1.857, ..., -2.111])) + + >>> dataset = load_chembl4616_ec50(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES EC50 + 0 CCCCCCCC(=O)OC[C@H](NC(=O)CN)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccccc1)C(=O)O -1.857332 + 2 CC(C)(N)C(=O)N[C@H](COCc1ccccc1)C(=O)N1CCC2(CC1)CN(S(C)(=O)=O)c1ccccc12 0.072578 + 3 NC(=O)CN(CCc1ccccc1)C(=O)[C@@H](Cc1ccc2ccccc2c1)NC(=O)[C@@H](Cc1ccc2ccccc2c1)NC(=O)C1CCNCC1 0.468521 + 4 CC(C)N(CCNC(=O)C1c2ccc(Oc3cccc(F)c3)cc2CCN1C(=O)OC(C)(C)C)C(C)C -0.633468 + 5 CC(C)N(CCNC(=O)C1c2ccc(Oc3ccc(Cl)cc3)cc2CCN1C(=O)OC(C)(C)C)C(C)C 0.136677 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl4616_ec50", + filename="chembl4616_ec50.csv", + verbose=verbose, + ) + return df
if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl4792_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL4792 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Orexin receptor type 2 target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 1471 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D float + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + Nucleic Acids Research, vol. 52, no. D1, Nov.
2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculeace import load_chembl4792_ki + >>> dataset = load_chembl4792_ki() + >>> dataset # doctest: +SKIP + (['CC1(C)OC[C@H](NC(=O)Nc2ccc(Br)cc2Cl)[C@H](c2ccccc2)O1', ..., 'CC(/C=C/c1ccccc1)=N/Nc1nc(Nc2ccccc2)nc(-n2nc(C)cc2C)n1'], \ + array([-0.8, ..., -4.25])) + + >>> dataset = load_chembl4792_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CC1(C)OC[C@H](NC(=O)Nc2ccc(Br)cc2Cl)[C@H](c2ccccc2)O1 -0.800029 + 1 Cc1cc(Br)ccc1NC(=O)N[C@H]1COC(C)(C)O[C@H]1c1ccccc1 -1.599992 + 2 Cc1ccc(Cl)c(NC(=O)N[C@H]2COC(C)(C)O[C@H]2c2ccccc2)c1 -1.800029 + 3 Cc1ccc(NC(=O)N[C@H]2COC(C)(C)O[C@H]2c2ccccc2)c(C)c1 -2.099991 + 4 CC1(C)OC[C@H](NC(=O)Nc2cc(Cl)ccc2Cl)[C@H](c2ccccc2)O1 -1.800029 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl4792_ki", + filename="chembl4792_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) diff --git a/tests/datasets/moleculeace.py b/tests/datasets/moleculeace.py index 9e482ad8..ec206e77 100644 --- a/tests/datasets/moleculeace.py +++ b/tests/datasets/moleculeace.py @@ -2,6 +2,35 @@ from skfp.datasets.moleculeace import ( load_chembl204_ki, + load_chembl214_ki, + load_chembl218_ec50, + load_chembl219_ki, + load_chembl228_ki, + load_chembl231_ki, + load_chembl233_ki, + load_chembl234_ki, + load_chembl235_ec50, + load_chembl236_ki, + load_chembl237_ec50, + load_chembl237_ki, + load_chembl238_ki, + load_chembl239_ec50, + load_chembl244_ki, + load_chembl262_ki, + load_chembl264_ki, + load_chembl287_ki, + load_chembl1862_ki, + load_chembl1871_ki, + load_chembl2034_ki, + load_chembl2047_ec50, + load_chembl2147_ki, + load_chembl2835_ki, + load_chembl2971_ki, + load_chembl3979_ec50, + load_chembl4005_ki, + load_chembl4203_ki, + load_chembl4616_ec50, + load_chembl4792_ki, load_moleculeace_dataset, ) from tests.datasets.test_utils import run_basic_dataset_checks @@ -16,6 +45,35 @@ "dataset_name, load_func,
expected_length, num_tasks, task_type", [ ("chembl204_ki", load_chembl204_ki, 2754, 1, "regression"), + ("chembl214_ki", load_chembl214_ki, 3317, 1, "regression"), + ("chembl218_ec50", load_chembl218_ec50, 1031, 1, "regression"), + ("chembl219_ki", load_chembl219_ki, 1865, 1, "regression"), + ("chembl228_ki", load_chembl228_ki, 1704, 1, "regression"), + ("chembl231_ki", load_chembl231_ki, 973, 1, "regression"), + ("chembl233_ki", load_chembl233_ki, 3142, 1, "regression"), + ("chembl234_ki", load_chembl234_ki, 3657, 1, "regression"), + ("chembl235_ec50", load_chembl235_ec50, 2349, 1, "regression"), + ("chembl236_ki", load_chembl236_ki, 2598, 1, "regression"), + ("chembl237_ec50", load_chembl237_ec50, 955, 1, "regression"), + ("chembl237_ki", load_chembl237_ki, 2603, 1, "regression"), + ("chembl238_ki", load_chembl238_ki, 1052, 1, "regression"), + ("chembl239_ec50", load_chembl239_ec50, 1721, 1, "regression"), + ("chembl244_ki", load_chembl244_ki, 3097, 1, "regression"), + ("chembl262_ki", load_chembl262_ki, 856, 1, "regression"), + ("chembl264_ki", load_chembl264_ki, 2862, 1, "regression"), + ("chembl287_ki", load_chembl287_ki, 1328, 1, "regression"), + ("chembl1862_ki", load_chembl1862_ki, 794, 1, "regression"), + ("chembl1871_ki", load_chembl1871_ki, 659, 1, "regression"), + ("chembl2034_ki", load_chembl2034_ki, 750, 1, "regression"), + ("chembl2047_ec50", load_chembl2047_ec50, 631, 1, "regression"), + ("chembl2147_ki", load_chembl2147_ki, 1456, 1, "regression"), + ("chembl2835_ki", load_chembl2835_ki, 615, 1, "regression"), + ("chembl2971_ki", load_chembl2971_ki, 976, 1, "regression"), + ("chembl3979_ec50", load_chembl3979_ec50, 1125, 1, "regression"), + ("chembl4005_ki", load_chembl4005_ki, 960, 1, "regression"), + ("chembl4203_ki", load_chembl4203_ki, 731, 1, "regression"), + ("chembl4616_ec50", load_chembl4616_ec50, 682, 1, "regression"), + ("chembl4792_ki", load_chembl4792_ki, 1471, 1, "regression"), ], ) def test_load_dataset(dataset_name, load_func, 
expected_length, num_tasks, task_type): From bd359db124b4def8e2e27a233339d4f4bfa4f731 Mon Sep 17 00:00:00 2001 From: Mateusz Praski Date: Wed, 15 Oct 2025 17:31:50 +0200 Subject: [PATCH 3/6] Added split function and unit tests --- .gitignore | 2 +- skfp/datasets/moleculeace/__init__.py | 1 + skfp/datasets/moleculeace/benchmark.py | 122 ++++++++++++++++- tests/datasets/moleculeace.py | 173 +++++++++++++++++++++++++ 4 files changed, 293 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index fb227d1d..f6995baa 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,4 @@ tests/preprocessing/input_output/data/mol_out.sdf .python-version .DS_Store - +.vscode diff --git a/skfp/datasets/moleculeace/__init__.py b/skfp/datasets/moleculeace/__init__.py index faa73c91..4361e47d 100644 --- a/skfp/datasets/moleculeace/__init__.py +++ b/skfp/datasets/moleculeace/__init__.py @@ -1,6 +1,7 @@ from .benchmark import ( load_moleculeace_benchmark, load_moleculeace_dataset, + load_moleculeace_splits, ) from .moleculeace import ( load_chembl204_ki, diff --git a/skfp/datasets/moleculeace/benchmark.py b/skfp/datasets/moleculeace/benchmark.py index 5eca947c..66bddecf 100644 --- a/skfp/datasets/moleculeace/benchmark.py +++ b/skfp/datasets/moleculeace/benchmark.py @@ -5,6 +5,8 @@ import pandas as pd from sklearn.utils._param_validation import StrOptions, validate_params +from skfp.datasets.utils import fetch_splits + from .moleculeace import ( load_chembl204_ki, load_chembl214_ki, @@ -107,6 +109,7 @@ @validate_params( { + "subset": [None, list], "data_dir": [None, str, os.PathLike], "as_frame": ["boolean"], "verbose": ["boolean"], @@ -114,7 +117,7 @@ prefer_skip_nested_validation=True, ) def load_moleculeace_benchmark( - subset: str | list[str] | None = None, + subset: list[str] | None = None, data_dir: str | os.PathLike | None = None, as_frames: bool = False, verbose: bool = False, @@ -128,7 +131,36 @@ def load_moleculeace_benchmark( For more details, see loading functions 
for particular datasets. Allowed individual dataset names are listed below. Dataset names are also returned (case-sensitive). - - "chembl204_ki" + - chembl204_ki + - chembl214_ki + - chembl218_ec50 + - chembl219_ki + - chembl228_ki + - chembl231_ki + - chembl233_ki + - chembl234_ki + - chembl235_ec50 + - chembl236_ki + - chembl237_ec50 + - chembl237_ki + - chembl238_ki + - chembl239_ec50 + - chembl244_ki + - chembl262_ki + - chembl264_ki + - chembl287_ki + - chembl1862_ki + - chembl1871_ki + - chembl2034_ki + - chembl2047_ec50 + - chembl2147_ki + - chembl2835_ki + - chembl2971_ki + - chembl3979_ec50 + - chembl4005_ki + - chembl4203_ki + - chembl4616_ec50 + - chembl4792_ki Parameters ---------- @@ -214,7 +246,7 @@ def load_moleculeace_dataset( Parameters ---------- - dataset_name : {} + dataset_name : str Name of the dataset to load. data_dir : {None, str, path-like}, default=None @@ -254,7 +286,89 @@ def load_moleculeace_dataset( return loader_func(data_dir, as_frame, verbose) -def _subset_to_dataset_names(subset: str | list[str] | None) -> list[str]: +@validate_params( + { + "dataset_name": [StrOptions(set(MOLECULEACE_DATASET_NAMES))], + "split_type": [StrOptions({"random", "activity_cliff"})], + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_moleculeace_splits( + dataset_name: str, + split_type: str = "activity_cliff", + data_dir: str | os.PathLike | None = None, + as_dict: bool = False, + verbose: bool = False, +) -> tuple[list[int], list[int]] | dict[str, list[int]]: + """ + Load pre-generated dataset splits for the MoleculeACE benchmark. + + MoleculeACE [1]_ provides two split types for stratified random + train/validation/test partitions with respect to activity cliffs: + - random + - activity_cliff + + Random splits use an 80/20 train/test split. Activity_cliff splits additionally + restrict the test set to molecules that are part of activity-cliff pairs. 
+ Activity_cliff splits are recommended in the literature. + + Dataset names are the same as those returned by `load_moleculeace_benchmark` + and are case-sensitive. + + Parameters + ---------- + dataset_name : str + Name of the dataset to load splits for. + + split_type : {"random", "activity_cliff"}, default="activity_cliff" + Type of the split to load. + + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_dict : bool, default=False + If True, returns the splits as dictionary with keys "train" and "test", + and index lists as values. Otherwise, returns two lists with splits indexes. + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : tuple(list[int], list[int]) or dict + Depending on the `as_dict` argument, one of: + - two lists of integer indexes (train, test) + - dictionary with "train" and "test" keys, and values as lists with + splits indexes + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022.
+ `_ + """ + splits_suffix = {"random": "splits.json", "activity_cliff": "splits_activity.json"}[ + split_type + ] + + splits = fetch_splits( + data_dir, + dataset_name=f"MoleculeACE_{dataset_name}", + filename=f"{dataset_name}_{splits_suffix}", + verbose=verbose, + ) + if as_dict: + return splits + else: + return splits["train"], splits["test"] + + +def _subset_to_dataset_names(subset: list[str] | None) -> list[str]: if subset is None: dataset_names = MOLECULEACE_DATASET_NAMES elif isinstance(subset, (list, set, tuple)): diff --git a/tests/datasets/moleculeace.py b/tests/datasets/moleculeace.py index ec206e77..9b85566b 100644 --- a/tests/datasets/moleculeace.py +++ b/tests/datasets/moleculeace.py @@ -1,4 +1,5 @@ import pytest +from sklearn.utils._param_validation import InvalidParameterError from skfp.datasets.moleculeace import ( load_chembl204_ki, @@ -31,11 +32,183 @@ load_chembl4203_ki, load_chembl4616_ec50, load_chembl4792_ki, + load_moleculeace_benchmark, load_moleculeace_dataset, + load_moleculeace_splits, ) +from skfp.datasets.moleculeace.benchmark import MOLECULEACE_DATASET_NAMES from tests.datasets.test_utils import run_basic_dataset_checks +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +def test_load_moleculeace_benchmark(): + benchmark_full = load_moleculeace_benchmark(as_frames=True) + benchmark_names = [name for name, df in benchmark_full] + assert benchmark_names == MOLECULEACE_DATASET_NAMES + + benchmark_full_tuples = load_moleculeace_benchmark(as_frames=False) + benchmark_names = [name for name, smiles, y in benchmark_full_tuples] + assert benchmark_names == MOLECULEACE_DATASET_NAMES + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +def test_load_moleculeace_benchmark_subset(): + dataset_names = ["chembl4005_ki", "chembl204_ki", "chembl235_ec50"] + benchmark = 
load_moleculeace_benchmark(subset=dataset_names, as_frames=True) + benchmark_names = [name for name, df in benchmark] + assert benchmark_names == dataset_names + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +def test_load_moleculeace_benchmark_wrong_subset(): + dataset_names = ["chembl4005_ki", "Nonexistent"] + with pytest.raises(ValueError) as exc_info: + load_moleculeace_benchmark(subset=dataset_names, as_frames=True) + + assert "Dataset name 'Nonexistent' not recognized" in str(exc_info) + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +@pytest.mark.parametrize("split_type", ["random", "activity_cliff"]) +@pytest.mark.parametrize("dataset_name", MOLECULEACE_DATASET_NAMES) +def test_load_moleculeace_splits(dataset_name, split_type): + train, test = load_moleculeace_splits(dataset_name, split_type) + assert isinstance(train, list) + assert len(train) > 0 + assert all(isinstance(idx, int) for idx in train) + + assert isinstance(test, list) + assert len(test) > 0 + assert all(isinstance(idx, int) for idx in test) + + assert len(train) > len(test) + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +@pytest.mark.parametrize("split_type", ["random", "activity_cliff"]) +@pytest.mark.parametrize("dataset_name", MOLECULEACE_DATASET_NAMES) +def test_load_moleculeace_splits_as_dict(dataset_name, split_type): + train, test = load_moleculeace_splits(dataset_name, split_type) + split_idxs = load_moleculeace_splits(dataset_name, split_type, as_dict=True) + assert isinstance(split_idxs, dict) + assert set(split_idxs.keys()) == {"train", "test"} + assert split_idxs["train"] == train + assert split_idxs["test"] == test + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) 
+@pytest.mark.parametrize( + "dataset_name, dataset_length", + [ + ("chembl204_ki", 2754), + ("chembl214_ki", 3317), + ("chembl218_ec50", 1031), + ("chembl219_ki", 1865), + ("chembl228_ki", 1704), + ("chembl231_ki", 973), + ("chembl233_ki", 3142), + ("chembl234_ki", 3657), + ("chembl235_ec50", 2349), + ("chembl236_ki", 2598), + ("chembl237_ec50", 955), + ("chembl237_ki", 2603), + ("chembl238_ki", 1052), + ("chembl239_ec50", 1721), + ("chembl244_ki", 3097), + ("chembl262_ki", 856), + ("chembl264_ki", 2862), + ("chembl287_ki", 1328), + ("chembl1862_ki", 794), + ("chembl1871_ki", 659), + ("chembl2034_ki", 750), + ("chembl2047_ec50", 631), + ("chembl2147_ki", 1456), + ("chembl2835_ki", 615), + ("chembl2971_ki", 976), + ("chembl3979_ec50", 1125), + ("chembl4005_ki", 960), + ("chembl4203_ki", 731), + ("chembl4616_ec50", 682), + ("chembl4792_ki", 1471), + ], +) +def test_load_moleculeace_splits_lengths(dataset_name, dataset_length): + train, test = load_moleculeace_splits(dataset_name, split_type="random") + loaded_length = len(train) + len(test) + assert loaded_length == dataset_length + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +@pytest.mark.parametrize("dataset_name", MOLECULEACE_DATASET_NAMES) +def test_load_moleculeace_splits_activity_cliffs(dataset_name): + random_train, random_test = load_moleculeace_splits( + dataset_name, split_type="random" + ) + activity_train, activity_test = load_moleculeace_splits( + dataset_name, split_type="activity_cliff" + ) + + assert set(random_train) == set(activity_train) + assert set(random_test) > set(activity_test) + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +@pytest.mark.parametrize("split_type", ["random", "activity_cliff"]) +def test_load_moleculeace_splits_nonexistent_dataset(split_type): + with pytest.raises(InvalidParameterError) as error: + 
load_moleculeace_splits("nonexistent", split_type) + + assert str(error.value).startswith( + "The 'dataset_name' parameter of load_moleculeace_splits must be a str among" + ) + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +def test_load_moleculeace_splits_nonexistent_splits(): + with pytest.raises(InvalidParameterError) as error: + load_moleculeace_splits("chembl204_ki", "nonexistent") + + assert str(error.value).startswith( + "The 'split_type' parameter of load_moleculeace_splits must be a str among" + ) + + @pytest.mark.flaky( reruns=100, reruns_delay=5, From bba7cbf5be0ea57a9ab0882e84494e78cc76f573 Mon Sep 17 00:00:00 2001 From: Mateusz Praski Date: Wed, 15 Oct 2025 17:43:49 +0200 Subject: [PATCH 4/6] Add missing docs --- docs/modules/datasets/moleculeace.rst | 31 +++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/docs/modules/datasets/moleculeace.rst b/docs/modules/datasets/moleculeace.rst index a07d9d4d..38d4cf08 100644 --- a/docs/modules/datasets/moleculeace.rst +++ b/docs/modules/datasets/moleculeace.rst @@ -16,6 +16,8 @@ MoleculeACE benchmark :toctree: generated/ load_moleculeace_benchmark + load_moleculeace_dataset + load_moleculeace_splits Dataset loaders @@ -24,3 +26,32 @@ Dataset loaders :toctree: generated/ load_chembl204_ki + load_chembl214_ki + load_chembl218_ec50 + load_chembl219_ki + load_chembl228_ki + load_chembl231_ki + load_chembl233_ki + load_chembl234_ki + load_chembl235_ec50 + load_chembl236_ki + load_chembl237_ec50 + load_chembl237_ki + load_chembl238_ki + load_chembl239_ec50 + load_chembl244_ki + load_chembl262_ki + load_chembl264_ki + load_chembl287_ki + load_chembl1862_ki + load_chembl1871_ki + load_chembl2034_ki + load_chembl2047_ec50 + load_chembl2147_ki + load_chembl2835_ki + load_chembl2971_ki + load_chembl3979_ec50 + load_chembl4005_ki + load_chembl4203_ki + load_chembl4616_ec50 + load_chembl4792_ki From 
b5da8ec8eaa5f26a18829c45adc3611e4deb599c Mon Sep 17 00:00:00 2001 From: Mateusz Praski Date: Thu, 16 Oct 2025 09:56:06 +0200 Subject: [PATCH 5/6] Update documentation per PR review --- docs/modules/datasets.rst | 2 +- skfp/datasets/moleculeace/benchmark.py | 22 ++++++++++++---------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/docs/modules/datasets.rst b/docs/modules/datasets.rst index ec4e56d7..694aebdd 100644 --- a/docs/modules/datasets.rst +++ b/docs/modules/datasets.rst @@ -8,6 +8,6 @@ Functions for loading benchmark molecular datasets. :maxdepth: 2 datasets/lrgb + datasets/moleculeace datasets/moleculenet datasets/tdc - datasets/moleculeace diff --git a/skfp/datasets/moleculeace/benchmark.py b/skfp/datasets/moleculeace/benchmark.py index 66bddecf..b768eb44 100644 --- a/skfp/datasets/moleculeace/benchmark.py +++ b/skfp/datasets/moleculeace/benchmark.py @@ -126,7 +126,7 @@ def load_moleculeace_benchmark( Load the MoleculeACE benchmark datasets. MoleculeACE [1]_ datasets are varied inhibition and effective concentration targets from ChEMBL [2]_. - Activity cliff is recommended for all of them. + Activity cliffs split is recommended for all of them. For more details, see loading functions for particular datasets. Allowed individual dataset names are listed below. Dataset names are also returned (case-sensitive). @@ -192,6 +192,7 @@ def load_moleculeace_benchmark( “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022. `_ + .. [2] `B. Zdrazil et al. “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods,” Nucleic Acids Research, vol. 52, no. D1, Nov. 2023. @@ -239,9 +240,9 @@ def load_moleculeace_dataset( Load MoleculeACE dataset by name. Loads a given dataset from MoleculeACE [1]_ benchmark by its name. 
This is a proxy - for easier benchmarking, that avoids looking for individual functions. + for easier benchmarking that avoids looking for individual functions. - Dataset names here are the same as returned by `load_moleculenet_benchmark` function, + Dataset names here are the same as returned by :py:func:`.load_moleculeace_benchmark` function, and are case-sensitive. Parameters @@ -306,16 +307,17 @@ def load_moleculeace_splits( """ Load pre-generated dataset splits for the MoleculeACE benchmark. - MoleculeACE [1]_ provides two split types for stratified random - train/validation/test partitions with respect to activity cliffs: - - random - - activity_cliff + MoleculeACE [1]_ provides two stratified split types based on activity-cliff membership. + The data are split into train/test partitions: + + * ``random`` + * ``activity_cliff`` - Random splits use an 80/20 train/test split. Activity_cliff splits additionally + Random splits use an 80/20 train/test split. Activity cliffs additionally restrict the test set to molecules that are part of activity-cliff pairs. - Activity_cliff splits are recommended in the literature. + Activity cliffs splits are recommended in the literature. - Dataset names are the same as those returned by `load_moleculeace_benchmark` + Dataset names are the same as those returned by :py:func:`.load_moleculeace_benchmark` and are case-sensitive.
Parameters From 562cef10fd9636f41baa3d3af2eb0727b8c844b3 Mon Sep 17 00:00:00 2001 From: Mateusz Praski Date: Thu, 16 Oct 2025 23:40:40 +0200 Subject: [PATCH 6/6] Update documentation per PR review --- skfp/datasets/moleculeace/benchmark.py | 22 +++-- skfp/datasets/moleculeace/moleculeace.py | 120 +++++++++++------------ tests/datasets/moleculeace.py | 12 +-- 3 files changed, 80 insertions(+), 74 deletions(-) diff --git a/skfp/datasets/moleculeace/benchmark.py b/skfp/datasets/moleculeace/benchmark.py index b768eb44..451f31d4 100644 --- a/skfp/datasets/moleculeace/benchmark.py +++ b/skfp/datasets/moleculeace/benchmark.py @@ -189,12 +189,12 @@ def load_moleculeace_benchmark( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022. `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods,” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023. `_ """ @@ -272,7 +272,7 @@ def load_moleculeace_dataset( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022. `_ @@ -308,7 +308,7 @@ def load_moleculeace_splits( Load pre-generated dataset splits for the MoleculeACE benchmark. MoleculeACE [1]_ provides two stratified split types based on activity-cliff membership. 
- The data are split into train/test partitions: + The data are split into train/test partitions as one of: * ``random`` * ``activity_cliff`` @@ -350,13 +350,19 @@ def load_moleculeace_splits( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022. `_ """ - splits_suffix = {"random": "splits.json", "activity_cliff": "splits_activity.json"}[ - split_type - ] + if split_type == "random": + splits_suffix = "splits.json" + elif split_type == "activity_cliff": + splits_suffix = "splits_activity.json" + else: + raise ValueError( + f'Split type "{split_type}" not recognized, must be one of: ' + f'{{"random", "activity_cliff"}}' + ) splits = fetch_splits( data_dir, diff --git a/skfp/datasets/moleculeace/moleculeace.py b/skfp/datasets/moleculeace/moleculeace.py index 0f171b72..9b63238d 100644 --- a/skfp/datasets/moleculeace/moleculeace.py +++ b/skfp/datasets/moleculeace/moleculeace.py @@ -58,12 +58,12 @@ def load_chembl204_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -144,12 +144,12 @@ def load_chembl214_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. 
Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -230,12 +230,12 @@ def load_chembl218_ec50( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -316,12 +316,12 @@ def load_chembl219_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 
2023 `_ @@ -402,12 +402,12 @@ def load_chembl228_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -488,12 +488,12 @@ def load_chembl231_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -574,12 +574,12 @@ def load_chembl233_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. 
- “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -660,12 +660,12 @@ def load_chembl234_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -746,12 +746,12 @@ def load_chembl235_ec50( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -832,12 +832,12 @@ def load_chembl236_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 
62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -918,12 +918,12 @@ def load_chembl237_ec50( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -1004,12 +1004,12 @@ def load_chembl237_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -1090,12 +1090,12 @@ def load_chembl238_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. 
Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -1176,12 +1176,12 @@ def load_chembl239_ec50( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -1262,12 +1262,12 @@ def load_chembl244_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 
2023 `_ @@ -1348,12 +1348,12 @@ def load_chembl262_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -1434,12 +1434,12 @@ def load_chembl264_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -1520,12 +1520,12 @@ def load_chembl287_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. 
- “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -1606,12 +1606,12 @@ def load_chembl1862_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -1692,12 +1692,12 @@ def load_chembl1871_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -1778,12 +1778,12 @@ def load_chembl2034_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. 
Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -1864,12 +1864,12 @@ def load_chembl2047_ec50( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -1950,12 +1950,12 @@ def load_chembl2147_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 
2023 `_ @@ -2036,12 +2036,12 @@ def load_chembl2835_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -2122,12 +2122,12 @@ def load_chembl2971_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -2208,12 +2208,12 @@ def load_chembl3979_ec50( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. 
- “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -2294,12 +2294,12 @@ def load_chembl4005_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -2380,12 +2380,12 @@ def load_chembl4203_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -2466,12 +2466,12 @@ def load_chembl4616_ec50( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. 
Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 `_ @@ -2552,12 +2552,12 @@ def load_chembl4792_ki( References ---------- .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni - “Exposing the Limitations of Molecular Machine Learning with Activity Cliffs” + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 `_ .. [2] `B. Zdrazil et al. - “The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods” + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" Nucleic Acids Research, vol. 52, no. D1, Nov. 
2023 `_ diff --git a/tests/datasets/moleculeace.py b/tests/datasets/moleculeace.py index 9b85566b..74ea9086 100644 --- a/tests/datasets/moleculeace.py +++ b/tests/datasets/moleculeace.py @@ -190,9 +190,9 @@ def test_load_moleculeace_splits_nonexistent_dataset(split_type): with pytest.raises(InvalidParameterError) as error: load_moleculeace_splits("nonexistent", split_type) - assert str(error.value).startswith( - "The 'dataset_name' parameter of load_moleculeace_splits must be a str among" - ) + assert str(error.value).startswith( + "The 'dataset_name' parameter of load_moleculeace_splits must be a str among" + ) @pytest.mark.flaky( @@ -204,9 +204,9 @@ def test_load_moleculeace_splits_nonexistent_splits(): with pytest.raises(InvalidParameterError) as error: load_moleculeace_splits("chembl204_ki", "nonexistent") - assert str(error.value).startswith( - "The 'split_type' parameter of load_moleculeace_splits must be a str among" - ) + assert str(error.value).startswith( + "The 'split_type' parameter of load_moleculeace_splits must be a str among" + ) @pytest.mark.flaky(