diff --git a/.gitignore b/.gitignore index 06770bb4..f6995baa 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,6 @@ tests/preprocessing/input_output/data/mol_out.sdf .pip_audit_cache .python-version + +.DS_Store +.vscode diff --git a/docs/modules/datasets.rst b/docs/modules/datasets.rst index 13ab555e..694aebdd 100644 --- a/docs/modules/datasets.rst +++ b/docs/modules/datasets.rst @@ -8,5 +8,6 @@ Functions for loading benchmark molecular datasets. :maxdepth: 2 datasets/lrgb + datasets/moleculeace datasets/moleculenet datasets/tdc diff --git a/docs/modules/datasets/moleculeace.rst b/docs/modules/datasets/moleculeace.rst new file mode 100644 index 00000000..38d4cf08 --- /dev/null +++ b/docs/modules/datasets/moleculeace.rst @@ -0,0 +1,57 @@ +=========== +MoleculeACE +=========== + +.. automodule:: skfp.datasets.moleculeace + :exclude-members: load_moleculeace_benchmark + +========================================================= + +.. py:currentmodule:: skfp.datasets.moleculeace + +MoleculeACE benchmark + +.. autosummary:: + :nosignatures: + :toctree: generated/ + + load_moleculeace_benchmark + load_moleculeace_dataset + load_moleculeace_splits + +Dataset loaders + +.. autosummary:: + :nosignatures: + :toctree: generated/ + + load_chembl204_ki + load_chembl214_ki + load_chembl218_ec50 + load_chembl219_ki + load_chembl228_ki + load_chembl231_ki + load_chembl233_ki + load_chembl234_ki + load_chembl235_ec50 + load_chembl236_ki + load_chembl237_ec50 + load_chembl237_ki + load_chembl238_ki + load_chembl239_ec50 + load_chembl244_ki + load_chembl262_ki + load_chembl264_ki + load_chembl287_ki + load_chembl1862_ki + load_chembl1871_ki + load_chembl2034_ki + load_chembl2047_ec50 + load_chembl2147_ki + load_chembl2835_ki + load_chembl2971_ki + load_chembl3979_ec50 + load_chembl4005_ki + load_chembl4203_ki + load_chembl4616_ec50 + load_chembl4792_ki diff --git a/skfp/datasets/moleculeace/__init__.py b/skfp/datasets/moleculeace/__init__.py new file mode 100644 index 00000000..4361e47d --- /dev/null +++ b/skfp/datasets/moleculeace/__init__.py @@ -0,0 +1,37 @@ +from .benchmark import ( + load_moleculeace_benchmark, + load_moleculeace_dataset, + load_moleculeace_splits, +) +from .moleculeace import ( + load_chembl204_ki, + load_chembl214_ki, + load_chembl218_ec50, + load_chembl219_ki, + load_chembl228_ki, + load_chembl231_ki, + load_chembl233_ki, + load_chembl234_ki, + load_chembl235_ec50, + load_chembl236_ki, + load_chembl237_ec50, + load_chembl237_ki, + load_chembl238_ki, + load_chembl239_ec50, + load_chembl244_ki, + load_chembl262_ki, + load_chembl264_ki, + load_chembl287_ki, + load_chembl1862_ki, + load_chembl1871_ki, + load_chembl2034_ki, + load_chembl2047_ec50, + load_chembl2147_ki, + load_chembl2835_ki, + load_chembl2971_ki, + load_chembl3979_ec50, + load_chembl4005_ki, + load_chembl4203_ki, + load_chembl4616_ec50, + load_chembl4792_ki, +) diff --git a/skfp/datasets/moleculeace/benchmark.py b/skfp/datasets/moleculeace/benchmark.py new file mode 100644 index 00000000..451f31d4 --- /dev/null +++ b/skfp/datasets/moleculeace/benchmark.py @@ -0,0 +1,394 @@ +import os +from collections.abc import Iterator + +import numpy as np +import pandas as pd +from sklearn.utils._param_validation import StrOptions, validate_params + +from skfp.datasets.utils import fetch_splits + +from .moleculeace import ( + load_chembl204_ki, + load_chembl214_ki, + load_chembl218_ec50, + load_chembl219_ki, + load_chembl228_ki, + load_chembl231_ki, + load_chembl233_ki, + load_chembl234_ki, + load_chembl235_ec50, + load_chembl236_ki, + load_chembl237_ec50, + load_chembl237_ki, + load_chembl238_ki, + load_chembl239_ec50, + load_chembl244_ki, + load_chembl262_ki, + load_chembl264_ki, + load_chembl287_ki, + load_chembl1862_ki, + load_chembl1871_ki, + load_chembl2034_ki, + load_chembl2047_ec50, + load_chembl2147_ki, + load_chembl2835_ki, + load_chembl2971_ki, + load_chembl3979_ec50, + load_chembl4005_ki, + load_chembl4203_ki, + load_chembl4616_ec50, + load_chembl4792_ki, +) + +MOLECULEACE_DATASET_NAMES = [ + "chembl204_ki", + "chembl214_ki", + "chembl218_ec50", + "chembl219_ki", + "chembl228_ki", + "chembl231_ki", + "chembl233_ki", + "chembl234_ki", + "chembl235_ec50", + "chembl236_ki", + "chembl237_ec50", + "chembl237_ki", + "chembl238_ki", + "chembl239_ec50", + "chembl244_ki", + "chembl262_ki", + "chembl264_ki", + "chembl287_ki", + "chembl1862_ki", + "chembl1871_ki", + "chembl2034_ki", + "chembl2047_ec50", + "chembl2147_ki", + "chembl2835_ki", + "chembl2971_ki", + "chembl3979_ec50", + "chembl4005_ki", + "chembl4203_ki", + "chembl4616_ec50", + "chembl4792_ki", +] + +MOLECULEACE_DATASET_NAME_TO_LOADER_FUNC = { + "chembl204_ki": load_chembl204_ki, + "chembl214_ki": load_chembl214_ki, + "chembl218_ec50": load_chembl218_ec50, + "chembl219_ki": load_chembl219_ki, + "chembl228_ki": load_chembl228_ki, + "chembl231_ki": load_chembl231_ki, + "chembl233_ki": load_chembl233_ki, + "chembl234_ki": load_chembl234_ki, + "chembl235_ec50": load_chembl235_ec50, + "chembl236_ki": load_chembl236_ki, + "chembl237_ec50": load_chembl237_ec50, + "chembl237_ki": load_chembl237_ki, + "chembl238_ki": load_chembl238_ki, + "chembl239_ec50": load_chembl239_ec50, + "chembl244_ki": load_chembl244_ki, + "chembl262_ki": load_chembl262_ki, + "chembl264_ki": load_chembl264_ki, + "chembl287_ki": load_chembl287_ki, + "chembl1862_ki": load_chembl1862_ki, + "chembl1871_ki": load_chembl1871_ki, + "chembl2034_ki": load_chembl2034_ki, + "chembl2047_ec50": load_chembl2047_ec50, + "chembl2147_ki": load_chembl2147_ki, + "chembl2835_ki": load_chembl2835_ki, + "chembl2971_ki": load_chembl2971_ki, + "chembl3979_ec50": load_chembl3979_ec50, + "chembl4005_ki": load_chembl4005_ki, + "chembl4203_ki": load_chembl4203_ki, + "chembl4616_ec50": load_chembl4616_ec50, + "chembl4792_ki": load_chembl4792_ki, +} + + +@validate_params( + { + "subset": [None, list], + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_moleculeace_benchmark( + subset: list[str] | None = None, + data_dir: str | os.PathLike | None = None, + as_frames: bool = False, + verbose: bool = False, +) -> Iterator[tuple[str, pd.DataFrame]] | Iterator[tuple[str, list[str], np.ndarray]]: + """ + Load the MoleculeACE benchmark datasets. + + MoleculeACE [1]_ datasets are varied inhibition and effective concentration targets from ChEMBL [2]_. + Activity cliffs split is recommended for all of them. + + For more details, see loading functions for particular datasets. Allowed individual + dataset names are listed below. Dataset names are also returned (case-sensitive). + + - chembl204_ki + - chembl214_ki + - chembl218_ec50 + - chembl219_ki + - chembl228_ki + - chembl231_ki + - chembl233_ki + - chembl234_ki + - chembl235_ec50 + - chembl236_ki + - chembl237_ec50 + - chembl237_ki + - chembl238_ki + - chembl239_ec50 + - chembl244_ki + - chembl262_ki + - chembl264_ki + - chembl287_ki + - chembl1862_ki + - chembl1871_ki + - chembl2034_ki + - chembl2047_ec50 + - chembl2147_ki + - chembl2835_ki + - chembl2971_ki + - chembl3979_ec50 + - chembl4005_ki + - chembl4203_ki + - chembl4616_ec50 + - chembl4792_ki + + Parameters + ---------- + subset : None or list of strings + If ``None``, returns all datasets. List of strings loads only datasets with given names. + + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frames : bool, default=False + If True, returns the raw DataFrame for each dataset. Otherwise, returns SMILES + as a list of strings, and labels as a NumPy array for each dataset. + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : generator of pd.DataFrame or tuples (list[str], np.ndarray) + Loads and returns datasets with a generator. Returned types depend on the + ``as_frame`` parameter, either: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022. + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023. + `_ + """ + dataset_names = _subset_to_dataset_names(subset) + + dataset_functions = [ + MOLECULEACE_DATASET_NAME_TO_LOADER_FUNC[name] for name in dataset_names + ] + + if as_frames: + datasets = ( + (dataset_name, load_function(data_dir, as_frame=True, verbose=verbose)) + for dataset_name, load_function in zip( + dataset_names, dataset_functions, strict=False + ) + ) + else: + datasets = ( + (dataset_name, *load_function(data_dir, as_frame=False, verbose=verbose)) + for dataset_name, load_function in zip( + dataset_names, dataset_functions, strict=False + ) + ) + return datasets + + +@validate_params( + { + "dataset_name": [StrOptions(set(MOLECULEACE_DATASET_NAMES))], + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_moleculeace_dataset( + dataset_name: str, + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + """ + Load MoleculeACE dataset by name. + + Loads a given dataset from MoleculeACE [1]_ benchmark by its name. This is a proxy + for easier benchmarking that avoids looking for individual functions. + + Dataset names here are the same as returned by :py:func:`.load_moleculenet_benchmark` function, + and are case-sensitive. + + Parameters + ---------- + dataset_name : str + Name of the dataset to load. + + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns "SMILES" and labels + (dataset-dependent). Otherwise, returns SMILES as list of strings, and + labels as a NumPy array (shape and type are dataset-dependent). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns depending on the dataset + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022. + `_ + + Examples + -------- + >> from skfp.datasets.moleculeace import load_moleculeace_dataset + >> dataset = load_moleculeace_dataset("chembl204_ki") + >> dataset # doctest: +SKIP + (['CCCCCCCC(=O)OC[C@H](NC(=O)CN)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccccc1)C(=O)O', ..., ']) + """ + loader_func = MOLECULEACE_DATASET_NAME_TO_LOADER_FUNC[dataset_name] + return loader_func(data_dir, as_frame, verbose) + + +@validate_params( + { + "dataset_name": [StrOptions(set(MOLECULEACE_DATASET_NAMES))], + "split_type": [StrOptions({"random", "activity_cliff"})], + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_moleculeace_splits( + dataset_name: str, + split_type: str = "activity_cliff", + data_dir: str | os.PathLike | None = None, + as_dict: bool = False, + verbose: bool = False, +) -> tuple[list[int], list[int]] | dict[str, list[int]]: + """ + Load pre-generated dataset splits for the MoleculeACE benchmark. + + MoleculeACE [1]_ provides two stratified split types based on activity-cliff membership. + The data are split into train/test partitions as one of: + + * ``random`` + * ``activity_cliff`` + + Random splits use an 80/20 train/test split. Activity cliffs additionally + restrict the test set to molecules that are part of activity-cliff pairs. + Activity cliffs splits are recommended in the literature. + + Dataset names are the same as those returned by :py:func:`.load_moleculeace_benchmark` + and are case-sensitive. + + Parameters + ---------- + dataset_name : str + Name of the dataset to loads splits for. + + split_type: {"random", "activity_cliff"} + Type of the split to load. + + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_dict : bool, default=False + If True, returns the splits as dictionary with keys "train", "valid" and "test", + and index lists as values. Otherwise, returns three lists with splits indexes. + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : tuple(list[int], list[int], list[int]) or dict + Depending on the `as_dict` argument, one of: + - three lists of integer indexes + - dictionary with "train", "valid" and "test" keys, and values as lists with + splits indexes + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022. + `_ + """ + if split_type == "random": + splits_suffix = "splits.json" + elif split_type == "activity_cliff": + splits_suffix = "splits_activity.json" + else: + raise ValueError( + f'Split type "{split_type}" not recognized, must be one of: ' + f'{{"random", "activity_cliff"}}' + ) + + splits = fetch_splits( + data_dir, + dataset_name=f"MoleculeACE_{dataset_name}", + filename=f"{dataset_name}_{splits_suffix}", + verbose=verbose, + ) + if as_dict: + return splits + else: + return splits["train"], splits["test"] + + +def _subset_to_dataset_names(subset: list[str] | None) -> list[str]: + if subset is None: + dataset_names = MOLECULEACE_DATASET_NAMES + elif isinstance(subset, (list, set, tuple)): + for name in subset: + if name not in MOLECULEACE_DATASET_NAMES: + raise ValueError( + f"Dataset name '{name}' not recognized among MoleculeACE datasets" + ) + dataset_names = subset + else: + raise ValueError( + f'Value "{subset}" for subset not recognized, must be a list of strings' + f"with dataset names from MoleculeACE to load" + ) + return dataset_names diff --git a/skfp/datasets/moleculeace/moleculeace.py b/skfp/datasets/moleculeace/moleculeace.py new file mode 100644 index 00000000..9b63238d --- /dev/null +++ b/skfp/datasets/moleculeace/moleculeace.py @@ -0,0 +1,2587 @@ +import os + +import numpy as np +import pandas as pd +from sklearn.utils._param_validation import validate_params + +from skfp.datasets.utils import fetch_dataset, get_mol_strings_and_labels + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl204_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL204 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Prothrombin target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 2754 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl204_ki + >>> dataset = load_chembl204_ki() + >>> dataset # doctest: +SKIP + (['CC(=N)N1CCC(Oc2ccc3nc(CCC(=O)O)n(Cc4ccc5ccc(C(=N)N)cc5c4)c3c2)CC1, ..., 'CCC(=O)N1CCC[C@H]1C(=O)NCc1ccc(C(=N)N)cc1'], \ + array([-3.427, ..., -4.146])) + + >>> dataset = load_chembl204_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CC(=N)N1CCC(Oc2ccc3nc(CCC(=O)O)n(Cc4ccc5ccc(C(=N)N)cc5c4)c3c2)CC1 -3.426511 + 1 CC(=N)N1CCC(Oc2ccc3c(c2)nc(C(C)C)n3Cc2ccc3ccc(C(=N)N)cc3c2)CC1 -2.939519 + 2 CCC(C)c1nc2cc(OC3CCN(C(C)=N)CC3)ccc2n1Cc1ccc2ccc(C(=N)N)cc2c1 -3.361728 + 3 COC(=O)C(C)CN(c1ccc2c(c1)nc(C)n2Cc1ccc2ccc(C(=N)N)cc2c1)C1CCN(C(C)=N)CC1 -3.698970 + 4 CCCCc1nc2cc(OC3CCN(C(C)=N)CC3)ccc2n1Cc1ccc2ccc(C(=N)N)cc2c1 -3.301030 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl204_ki", + filename="chembl204_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl214_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL214 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the 5-hydroxytryptamine receptor 1a target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 3317 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl214_ki + >>> dataset = load_chembl214_ki() + >>> dataset # doctest: +SKIP + (['COc1ccc(NC(=O)c2ccc(-c3ccc(-c4noc(C)n4)cc3C)cc2)cc1N1CCN(C)CC1, ..., 'O=S(=O)(NCCCCCCN1CCN(c2nsc3ccccc23)CC1)c1ccc2ccccc2c1'], \ + array([-1.869, ..., -1.863])) + + >>> dataset = load_chembl214_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 COc1ccc(NC(=O)c2ccc(-c3ccc(-c4noc(C)n4)cc3C)cc2)cc1N1CCN(C)CC1 -1.869232 + 1 Nc1cccc(-c2ccc(CCN3CCN(c4cccc5cccnc45)CC3)cc2)n1 -0.477121 + 2 COc1ccc(NS(=O)(=O)c2ccc(Br)cc2)cc1N1CCN(C)CC1 -2.400002 + 3 COc1ccc(NS(=O)(=O)c2sc3ccc(Cl)cc3c2C)cc1N1CCN(C)CC1 -2.700002 + 4 CN1CCc2cccc3c2[C@H]1Cc1cccc(-c2ccccc2)c1-3 -0.255273 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl214_ki", + filename="chembl214_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl218_ec50( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL218 EC50 dataset. + + The task is to predict the half maximal effective concentration (EC50) of molecules against the Cannabinoid receptor 1 target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 1031 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl218_ec50 + >>> dataset = load_chembl218_ec50() + >>> dataset # doctest: +SKIP + (['Cn1c(C(=O)NN2CCCCC2)nc(-c2ccc(Cl)cc2)c1-c1ccc(Cl)cc1, ..., 'CCCCCc1cccc(OCCCCCCCCCCC(=O)NC2CC2)c1'], \ + array([-2.0, ..., -1.491])) + + >>> dataset = load_chembl218_ec50(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES EC50 + 0 Cn1c(C(=O)NN2CCCCC2)nc(-c2ccc(Cl)cc2)c1-c1ccc(Cl)cc1 -2.000000 + 1 Cn1c(C(=O)NC2CCCCC2)nc(-c2ccc(Cl)cc2)c1-c1ccc(Cl)cc1 -2.698970 + 2 Cn1c(C(=O)NN2CCCCC2)nc(-c2ccc(Cl)cc2Cl)c1-c1ccc(Cl)cc1 -0.698970 + 3 Cn1c(C(=O)NC2CCCCC2)nc(-c2ccc(Cl)cc2Cl)c1-c1ccc(Cl)cc1 -1.255273 + 4 N#Cc1cc(-c2ccc(Cl)cc2)c(-c2ccc(Cl)cc2Cl)nc1OCc1ccc(F)c(F)c1 -0.903090 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl218_ec50", + filename="chembl218_ec50.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl219_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL219 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the D(4) dopamine receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 1865 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl219_ki + >>> dataset = load_chembl219_ki() + >>> dataset # doctest: +SKIP + (['COc1ccccc1N1CCN(Cc2ccn(-c3ccccc3)c2)CC1, ..., 'CNc1cc(OC)c(C(=O)N[C@@H]2CCN(Cc3ccccc3)[C@@H]2C)cc1Cl'], \ + array([-0.1139, ..., 0.0655])) + + >>> dataset = load_chembl219_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 COc1ccccc1N1CCN(Cc2ccn(-c3ccccc3)c2)CC1 -0.113943 + 1 c1ccc(N2CCN(Cc3ccn(-c4ccccc4)c3)CC2)cc1 -0.602060 + 2 CC1Cc2cccc3c2N1C(=O)C(N1CCN(Cc2ccc(Cl)cc2)CC1)CC3 -0.954243 + 3 CC1(C)Cc2cccc3c2N1C(=O)C(N1CCN(Cc2ccc(Cl)cc2)CC1)CC3 -1.278754 + 4 Cc1ccc(CN2CCN(C3CCc4cccc5c4N(CC5)C3=O)CC2)cc1 -0.602060 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl219_ki", + filename="chembl219_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl228_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL228 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Sodium-dependent serotonin transporter target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 1704 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl228_ki + >>> dataset = load_chembl228_ki() + >>> dataset # doctest: +SKIP + (['CN(C)Cc1ccccc1Sc1ccc(C#N)cc1N, ..., 'CCCN(CC[C@]1(O)C[C@H](NC(=O)c2ccc3ccccc3c2)C1)[C@H]1CCc2nc(N)sc2C1'], \ + array([-0.04139, ..., -1.505])) + + >>> dataset = load_chembl228_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CN(C)Cc1ccccc1Sc1ccc(C#N)cc1N -0.041393 + 1 CN(C)Cc1ccccc1Sc1ccc(C(F)(F)F)cc1N 0.481486 + 2 COc1ccc(Sc2ccccc2CN(C)C)c(N)c1 -0.276462 + 3 CN(C)Cc1ccccc1Sc1ccc(Cl)cc1N 0.568636 + 4 Fc1ccc([C@@H]2CCNC[C@H]2COc2ccc3c(c2)OCO3)cc1 0.661986 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl228_ki", + filename="chembl228_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl231_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL231 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Histamine h1 receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 973 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl231_ki + >>> dataset = load_chembl231_ki() + >>> dataset # doctest: +SKIP + (['CN1CCN(C2=Nc3ccccc3Nc3sc(CO)cc32)CC1, ..., 'O=C(O)c1cc(-c2ccc(C3CCNCC3)cc2)cc(-n2cc(-c3ccc(Cl)s3)nn2)c1'], \ + array([-0.7782, ..., -2.23])) + + >>> dataset = load_chembl231_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CN1CCN(C2=Nc3ccccc3Nc3sc(CO)cc32)CC1 -0.778151 + 1 Cc1cc2c(s1)Nc1ccccc1N=C2N1CCN(C)CC1 -0.622900 + 2 Cc1cc2c(s1)Nc1ccccc1N=C2N1CCNCC1 -1.342423 + 3 Cc1cc2c(s1)Nc1ccccc1N=C2N1CC[N+](C)([O-])CC1 -1.939519 + 4 CC(=O)c1ccc(OCCCN2CC[C@H](NC(=O)[C@@H](N)CO)C2)cc1 -4.633468 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl231_ki", + filename="chembl231_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl233_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL233 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Mu-type opioid receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 3142 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl233_ki + >>> dataset = load_chembl233_ki() + >>> dataset # doctest: +SKIP + (['CC(c1ccccc1)N1CC[C@H]1[C@@H](N)c1cccc(Cl)c1, ..., 'CCO[C@@]12Cc3cc(-c4ccccc4)cnc3[C@@H]3Oc4c(O)ccc5c4[C@@]31CCN(CC1CC1)[C@@H]2C5'], \ + array([-4.026, ..., -2.698])) + + >>> dataset = load_chembl233_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CC(c1ccccc1)N1CC[C@H]1[C@@H](N)c1cccc(Cl)c1 -4.026125 + 1 Cc1ccc(C(c2ccc(C)cc2)N2CC[C@H]2[C@H](N)c2cccc(Cl)c2)cc1 -2.903633 + 2 COc1ccc([C@H](N)[C@@H]2CCN2C(c2ccccc2)c2ccccc2)cc1 -2.937016 + 3 N[C@H](c1cccc(Cl)c1)[C@@H]1CCN1C(c1ccc(F)cc1)c1ccc(F)cc1 -3.337659 + 4 N[C@H](c1cccc(Cl)c1)[C@@H]1CCN1C(c1cccc(Cl)c1)c1cccc(Cl)c1 -3.854852 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl233_ki", + filename="chembl233_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl234_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL234 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the D(3) dopamine receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 3657 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl234_ki + >>> dataset = load_chembl234_ki() + >>> dataset # doctest: +SKIP + (['CN1C2CCC1CC(OC(c1ccc(F)cc1)c1ccc(F)cc1)C2, ..., 'CNc1cc(OC)c(C(=O)N[C@@H]2CCN(Cc3ccccc3)[C@@H]2C)cc1Cl'], \ + array([-2.161, ..., -0.07188])) + + >>> dataset = load_chembl234_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CN1C2CCC1CC(OC(c1ccc(F)cc1)c1ccc(F)cc1)C2 -2.161368 + 1 O=C(NCCCN1CCN(c2cccc(Cl)c2Cl)CC1)c1cccc2c1-c1ccccc1C2=O -1.556303 + 2 c1ccc(N2CCN(CCCn3c4ccccc4c4ccccc43)CC2)cc1 -3.383815 + 3 Oc1nc2c(N3CCN(Cc4ccccc4)CC3)cccc2[nH]1 -1.752048 + 4 O=C(NCCCN1CCN(c2ccccc2)CC1)c1cccc2c1-c1ccccc1C2=O -2.633468 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl234_ki", + filename="chembl234_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl235_ec50( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL235 EC50 dataset. + + The task is to predict the half maximal effective concentration (EC50) of molecules against the Peroxisome proliferator-activated receptor gamma target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 2349 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl235_ec50 + >>> dataset = load_chembl235_ec50() + >>> dataset # doctest: +SKIP + (['CC(/C=C/C(F)=C(/C)c1cc(C(C)(C)C)cc(C(C)(C)C)c1OCC(F)(F)F)=C\C(=O)O, ..., 'O=C(O)Cc1cc(Br)c(Oc2cc(I)c(O)c(I)c2)c(I)c1'], \ + array([-1.324, ..., -2.477])) + + >>> dataset = load_chembl235_ec50(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES EC50 + 0 CC(/C=C/C(F)=C(/C)c1cc(C(C)(C)C)cc(C(C)(C)C)c1OCC(F)(F)F)=C\C(=O)O -1.324282 + 1 CCCOc1c(/C(C)=C\C=C\C(C)=C\C(=O)O)cc(C(C)C)cc1C(F)(F)C(F)(F)F -1.343409 + 2 C/C(=C/C=C/C(C)=C/C(=O)O)c1cc(-c2cccs2)cc(C(C)C)c1OCC(F)F -0.993436 + 3 CCC(Cc1ccc(OC)c(C(=O)NCc2ccc(OCCc3ccccc3)cc2)c1)C(=O)O -3.477121 + 4 CCCCC(Cc1ccc(OC)c(C(=O)NCc2ccc(C(F)(F)F)cc2)c1)C(=O)O -3.397940 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl235_ec50", + filename="chembl235_ec50.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl236_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL236 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Delta-type opioid receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 2598 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl236_ki + >>> dataset = load_chembl236_ki() + >>> dataset # doctest: +SKIP + (['CC(c1ccccc1)N1CC[C@H]1[C@@H](N)c1cccc(Cl)c1, ..., 'CCO[C@@]12Cc3cc(-c4ccccc4)cnc3[C@@H]3Oc4c(O)ccc5c4[C@@]31CCN(CC1CC1)[C@@H]2C5'], \ + array([-4.592, ..., -0.8739])) + + >>> dataset = load_chembl236_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CC(c1ccccc1)N1CC[C@H]1[C@@H](N)c1cccc(Cl)c1 -4.592399 + 2 Cc1ccc(C(c2ccc(C)cc2)N2CC[C@H]2[C@H](N)c2cccc(Cl)c2)cc1 -3.699924 + 4 COc1ccc([C@H](N)[C@@H]2CCN2C(c2ccccc2)c2ccccc2)cc1 -3.465234 + 5 N[C@H](c1cccc(Cl)c1)[C@@H]1CCN1C(c1ccc(F)cc1)c1ccc(F)cc1 -3.870989 + 6 N[C@H](c1cccc(Cl)c1)[C@@H]1CCN1C(c1cccc(Cl)c1)c1cccc(Cl)c1 -3.432809 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl236_ki", + filename="chembl236_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl237_ec50( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL237 EC50 dataset. + + The task is to predict the half maximal effective concentration (EC50) of molecules against the Kappa-type opioid receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 955 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl237_ec50 + >>> dataset = load_chembl237_ec50() + >>> dataset # doctest: +SKIP + (['C=CCN1CC[C@]23c4c5ccc(O)c4O[C@H]2C(=O)CC[C@@]3(O)[C@H]1C5, ..., 'Oc1ccc2c3c1O[C@H]1c4ncc(-c5ccccc5)cc4C[C@@]4(OCCCC5CCCCC5)[C@@H](C2)N(CC2CC2)CC[C@]314'], \ + array([-0.9191, ..., -1.538])) + + >>> dataset = load_chembl237_ec50(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES EC50 + 1 C=CCN1CC[C@]23c4c5ccc(O)c4O[C@H]2C(=O)CC[C@@]3(O)[C@H]1C5 -0.919078 + 2 CN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@]24CC[C@@]3(C[C@H]2C(C)(C)C(C)(C)O4)C1C5 -1.320146 + 3 CO[C@@]12CCC3(C[C@H]1[C@@](C)(O)C(C)(C)C)[C@H]1Cc4ccc(O)c5c4C3(CCN1C)[C@H]2O5 -0.380211 + 4 Nc1nc2cc3c(cc2s1)C[C@@H]1[C@@H]2CCCC[C@]32CCN1CC1CC1 -0.380211 + 5 CN1CCC23c4c5ccc(O)c4OC2c2nc(N)ncc2CC3(O)C1C5 -3.031408 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl237_ec50", + filename="chembl237_ec50.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl237_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL237 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Kappa-type opioid receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 2603 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl237_ki + >>> dataset = load_chembl237_ki() + >>> dataset # doctest: +SKIP + (['CC(c1ccccc1)N1CC[C@H]1[C@@H](N)c1cccc(Cl)c1, ..., 'CCO[C@@]12Cc3cc(-c4ccccc4)cnc3[C@@H]3Oc4c(O)ccc5c4[C@@]31CCN(CC1CC1)[C@@H]2C5'], \ + array([-3.613, ..., -2.401])) + + >>> dataset = load_chembl237_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CC(c1ccccc1)N1CC[C@H]1[C@@H](N)c1cccc(Cl)c1 -3.612678 + 1 Cc1ccc(C(c2ccc(C)cc2)N2CC[C@H]2[C@H](N)c2cccc(Cl)c2)cc1 -3.265054 + 2 COc1ccc([C@H](N)[C@@H]2CCN2C(c2ccccc2)c2ccccc2)cc1 -3.127429 + 3 N[C@H](c1cccc(Cl)c1)[C@@H]1CCN1C(c1ccc(F)cc1)c1ccc(F)cc1 -3.350248 + 4 N[C@H](c1cccc(Cl)c1)[C@@H]1CCN1C(c1cccc(Cl)c1)c1cccc(Cl)c1 -3.780821 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl237_ki", + filename="chembl237_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl238_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL238 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Sodium-dependent dopamine transporter target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 1052 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl238_ki + >>> dataset = load_chembl238_ki() + >>> dataset # doctest: +SKIP + (['CN1CCC(O)(c2ccc(Cl)c(Cl)c2)C([C@@H](O)c2ccc(Cl)c(Cl)c2)C1, ..., 'C[C@H]1CN(CC[S+](O)C(c2ccc(F)cc2)c2ccc(F)cc2)C[C@@H](C)N1CC(O)Cc1ccccc1'], \ + array([-3.617, ..., -0.873])) + + >>> dataset = load_chembl238_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CN1CCC(O)(c2ccc(Cl)c(Cl)c2)C([C@@H](O)c2ccc(Cl)c(Cl)c2)C1 -3.617000 + 1 CN1CCC(O)(c2ccc(Cl)c(Cl)c2)C(C(=O)c2ccc(Cl)c(Cl)c2)C1 -1.037426 + 2 Cc1ccc(C2OC(=O)OC3(c4ccc(C)cc4)CCN(C)CC23)cc1 -3.913284 + 3 Cc1ccc([C@H](O)C2CN(C)CCC2(O)c2ccc(C)cc2)cc1 -4.027350 + 4 CN1CCC(O)(c2ccc(F)cc2)C(C(=O)c2ccc(F)cc2)C1 -3.755875 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl238_ki", + filename="chembl238_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl239_ec50( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL239 EC50 dataset. + + The task is to predict the half maximal effective concentration (EC50) of molecules against the Peroxisome proliferator-activated receptor alpha target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 1721 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl239_ec50 + >>> dataset = load_chembl239_ec50() + >>> dataset # doctest: +SKIP + (['CCC(Cc1ccc(OC)c(C(=O)NCc2ccc(OCCc3ccccc3)cc2)c1)C(=O)O, ..., 'CC(C)(Oc1ccc(CCOc2ccc(/N=N/c3ccc(Cl)cc3)cc2)cc1)C(=O)O'], \ + array([-3.431, ..., -2.58])) + + >>> dataset = load_chembl239_ec50(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES EC50 + 0 CCC(Cc1ccc(OC)c(C(=O)NCc2ccc(OCCc3ccccc3)cc2)c1)C(=O)O -3.431364 + 1 CC[C@@H](Cc1ccc(OC)c(C(=O)NCc2ccc(Oc3ccc(F)cc3)cc2)c1)C(=O)O -0.964024 + 2 CCCCC(Cc1ccc(OC)c(C(=O)NCc2ccc(C(F)(F)F)cc2)c1)C(=O)O -3.000000 + 3 CCC(Cc1ccc(OC)c(C(=O)NCCc2ccc(C(F)(F)F)cc2)c1)C(=O)O -2.869232 + 4 CCC(Cc1ccc(OC)c(C(=O)NCc2ccc(OC(F)(F)F)cc2)c1)C(=O)O -1.633468 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl239_ec50", + filename="chembl239_ec50.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl244_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL244 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Coagulation factor x target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 3097 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl244_ki + >>> dataset = load_chembl244_ki() + >>> dataset # doctest: +SKIP + (['CC(=N)N1CCC(Oc2ccc3nc(CCC(=O)O)n(Cc4ccc5ccc(C(=N)N)cc5c4)c3c2)CC1, ..., 'CC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CO)C(=O)N[C@H](C=O)CCCNC(=N)N'], \ + array([-0.1139, ..., -2.556])) + + >>> dataset = load_chembl244_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CC(=N)N1CCC(Oc2ccc3nc(CCC(=O)O)n(Cc4ccc5ccc(C(=N)N)cc5c4)c3c2)CC1 -0.113943 + 1 CC(=N)N1CCC(Oc2ccc3c(c2)nc(C(C)C)n3Cc2ccc3ccc(C(=N)N)cc3c2)CC1 -0.301030 + 2 CCC(C)c1nc2cc(OC3CCN(C(C)=N)CC3)ccc2n1Cc1ccc2ccc(C(=N)N)cc2c1 -0.518514 + 3 CC1CCN(C(=O)[C@H](Cc2cccc(C(=N)N)c2)NS(=O)(=O)c2c(C(C)C)cc(C(C)C)cc2C(C)C)CC1 -3.301000 + 4 COC(=O)[C@H]1Cc2ccccc2CN1C(=O)[C@H](Cc1cccc(C(=N)N)c1)NS(=O)(=O)c1ccc2ccccc2c1 -4.431000 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl244_ki", + filename="chembl244_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl262_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL262 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Glycogen synthase kinase-3 beta target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 856 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl262_ki + >>> dataset = load_chembl262_ki() + >>> dataset # doctest: +SKIP + (['Cc1nc(N)sc1-c1ccnc(Nc2cccc([N+](=O)[O-])c2)n1, ..., 'CC(C)(C#N)c1cccc(-c2ccnc3[nH]ccc23)n1'], \ + array([-1.301, ..., -2.322])) + + >>> dataset = load_chembl262_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 Cc1nc(N)sc1-c1ccnc(Nc2cccc([N+](=O)[O-])c2)n1 -1.30103 + 1 Cc1ccc2c(-c3ccnc(Nc4cccc(C(F)(F)F)c4)n3)c(-c3ccc(F)cc3)nn2n1 -1.30103 + 2 Cc1ccc2c(-c3ccnc(Nc4ccc(F)c(F)c4)n3)c(-c3ccc(F)cc3)nn2n1 -1.00000 + 3 Cc1ccc2c(-c3ccnc(Nc4ccc5c(c4)OCCO5)n3)c(-c3ccc(F)cc3)nn2n1 -1.00000 + 4 Cc1ccc2c(-c3ccnc(Nc4ccc(Cl)c(C(F)(F)F)c4)n3)c(-c3ccc(F)cc3)nn2n1 -1.69897 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl262_ki", + filename="chembl262_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl264_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL264 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Histamine h3 receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 2862 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl264_ki + >>> dataset = load_chembl264_ki() + >>> dataset # doctest: +SKIP + (['CC(=O)c1ccc(OCCCc2c[nH]cn2)cc1, ..., 'CC(C)(C)c1ccc(OCCCCCCN2CCCCCC2)cc1'], \ + array([-1.94, ..., -2.919])) + + >>> dataset = load_chembl264_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CC(=O)c1ccc(OCCCc2c[nH]cn2)cc1 -1.939519 + 1 c1ccc(COCCCc2c[nH]cn2)cc1 -0.415974 + 2 CC(=O)c1ccc(SCCc2c[nH]cn2)cc1 -0.041393 + 3 c1ccc(OCCCc2c[nH]cn2)cc1 -1.431364 + 4 CC(=O)c1ccc(SCCCc2c[nH]cn2)cc1 -1.255273 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl264_ki", + filename="chembl264_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl287_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL287 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Sigma non-opioid intracellular receptor 1 target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 1328 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl287_ki + >>> dataset = load_chembl287_ki() + >>> dataset # doctest: +SKIP + (['O=S1(=O)c2ccccc2CCC12CCN(Cc1ccccc1)CC2, ..., 'Cc1[nH]c2cc(C(F)(F)F)ccc2c(=O)c1CN(C)Cc1ccccc1'], \ + array([-1.301, ..., -1.949])) + + >>> dataset = load_chembl287_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 O=S1(=O)c2ccccc2CCC12CCN(Cc1ccccc1)CC2 -1.301030 + 1 COc1ccc(N2C[C@H](CN3CCC(O)(c4ccsc4)CC3)OC2=O)cc1 -1.531479 + 2 COc1ccc(N2C[C@H](CN3CCC(O)(c4ccc5c(c4)OCO5)CC3)OC2=O)cc1 -1.278754 + 3 CNC(=O)CC1Cc2ccccc2C2(CCN(Cc3ccccc3)CC2)O1 -2.230449 + 4 OCC1OC2(CCN(Cc3ccccc3)CC2)c2ccccc21 -0.752816 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl287_ki", + filename="chembl287_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl1862_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL1862 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Tyrosine-protein kinase abl1 target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 794 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl1862_ki + >>> dataset = load_chembl1862_ki() + >>> dataset # doctest: +SKIP + (['Nc1[nH]cnc2nnc(-c3ccc(Cl)cc3)c1-2, ..., 'CCCCNc1ncnc2c1cnn2CC(Cl)c1ccccc1'], \ + array([-2.699, ..., -3.3])) + + >>> dataset = load_chembl1862_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 Nc1[nH]cnc2nnc(-c3ccc(Cl)cc3)c1-2 -2.69897 + 1 Cc1ccc(N2NC(=O)/C(=C/c3ccc(-c4ccc(C)c(Cl)c4)o3)C2=O)cc1C -3.69897 + 2 O=C1NN(c2ccc(Cl)c(Cl)c2)C(=O)/C1=C\c1cccc(OCc2ccccc2)c1 -3.00000 + 3 O=C1NN(c2ccc(I)cc2)C(=O)/C1=C\c1cc2c(cc1Br)OCO2 -3.39794 + 4 O=C1NN(c2ccc(I)cc2)C(=O)/C1=C\c1ccc(N2CCOCC2)cc1 -4.30103 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl1862_ki", + filename="chembl1862_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl1871_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL1871 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Androgen receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 659 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl1871_ki + >>> dataset = load_chembl1871_ki() + >>> dataset # doctest: +SKIP + (['CC1=CC(C)(C)Nc2ccc3c(c21)/C(=C/c1ccsc1)Oc1ccc(F)cc1-3, ..., 'CN(C[C@](C)(O)C(=O)Nc1ccc(C#N)c(C(F)(F)F)c1)c1ccc(C#N)c(-c2ccccc2)c1'], \ + array([-2.825, ..., -1.892])) + + >>> dataset = load_chembl1871_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CC1=CC(C)(C)Nc2ccc3c(c21)/C(=C/c1ccsc1)Oc1ccc(F)cc1-3 -2.825426 + 1 CCc1ccccc1/C=C1\Oc2ccc(F)cc2-c2ccc3c(c21)C(C)=CC(C)(C)N3 -3.201124 + 2 CC1=CC(C)(C)Nc2ccc3c(c21)/C(=C/c1ccccc1N(C)C)Oc1ccc(F)cc1-3 -2.913284 + 3 CC1=CC(C)(C)Nc2ccc3c(c21)/C(=C/c1ccccc1)Oc1c(F)cccc1-3 -3.163161 + 4 CC(=O)O[C@]1(C(C)=O)CC[C@H]2[C@@H]3C[C@H](C)C4=CC(=O)CC[C@]4(C)[C@H]3CC[C@@]21C -0.462398 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl1871_ki", + filename="chembl1871_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl2034_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL2034 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Glucocorticoid receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 750 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl2034_ki + >>> dataset = load_chembl2034_ki() + >>> dataset # doctest: +SKIP + (['CC1=CC(C)(C)Nc2ccc3c(c21)/C(=C/c1ccsc1)Oc1ccc(F)cc1-3, ..., 'NS(=O)(=O)C[C@H]1COc2cc(F)ccc2N1C(=O)c1ccc2c(c1)NCCO2'], \ + array([-1.924, ..., -3.1])) + + >>> dataset = load_chembl2034_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CC1=CC(C)(C)Nc2ccc3c(c21)/C(=C/c1ccsc1)Oc1ccc(F)cc1-3 -1.924279 + 1 CCc1ccccc1/C=C1\Oc2ccc(F)cc2-c2ccc3c(c21)C(C)=CC(C)(C)N3 -2.431364 + 2 CC1=CC(C)(C)Nc2ccc3c(c21)/C(=C/c1ccccc1N(C)C)Oc1ccc(F)cc1-3 -2.692847 + 3 CC1=CC(C)(C)Nc2ccc3c(c21)/C(=C/c1ccccc1)Oc1c(F)cccc1-3 -2.506505 + 4 CC(=O)O[C@]1(C(C)=O)CC[C@H]2[C@@H]3C[C@H](C)C4=CC(=O)CC[C@]4(C)[C@H]3CC[C@@]21C -1.120574 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl2034_ki", + filename="chembl2034_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl2047_ec50( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL2047 EC50 dataset. + + The task is to predict the half maximal effective concentration (EC50) of molecules against the Bile acid receptor target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 631 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl2047_ec50 + >>> dataset = load_chembl2047_ec50() + >>> dataset # doctest: +SKIP + (['C[C@H](CCC(=O)NCC(=O)O)[C@H]1CC[C@H]2[C@H]3[C@H](CC[C@@]21C)[C@@]1(C)CC[C@@H](O)C[C@H]1C[C@H]3O, ..., 'CC(C)c1onc(-c2c(Cl)cccc2Cl)c1COc1ccc(CNc2ccc(CC(=O)O)cc2)c(Cl)c1'], \ + array([-3.477, ..., -2.973])) + + >>> dataset = load_chembl2047_ec50(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES EC50 + 0 C[C@H](CCC(=O)NCC(=O)O)[C@H]1CC[C@H]2[C@H]3[C@H](CC[C@@]21C)[C@@]1(C)CC[C@@H](O)C[C@H]1C[C@H]3O -3.477121 + 1 C[C@H](CCC(=O)O)C1CC[C@H]2[C@H]3[C@H](CC[C@]12C)[C@@]1(C)CC[C@@H](O)CC1[C@@H](C)[C@H]3O -2.875061 + 3 CCC[C@@H]1C2C[C@H](O)CC[C@]2(C)[C@H]2CC[C@]3(C)C([C@H](C)CCC(=O)O)CC[C@H]3[C@@H]2[C@@H]1O -3.045323 + 4 CC(C)c1onc(-c2c(Cl)cccc2Br)c1COc1ccc(/C=C/c2cccc(C(=O)O)c2)c(Cl)c1 -1.079181 + 5 Cc1cc(OCc2c(-c3c(Cl)cccc3Cl)noc2C(C)C)ccc1/C=C/c1cccc(C(=O)O)c1 -1.672098 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl2047_ec50", + filename="chembl2047_ec50.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl2147_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL2147 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Serine/threonine-protein kinase pim-1 target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 1456 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl2147_ki + >>> dataset = load_chembl2147_ki() + >>> dataset # doctest: +SKIP + (['FC(F)(F)c1cccc(-c2nnc3ccc(NC4CCCCC4)cn23)c1, ..., 'NC(=O)c1cc(Cl)c2c(Cl)c(C#CC3CNCCO3)n([C@H]3CCCNC3)c2n1'], \ + array([-1.041, ..., 0.04576])) + + >>> dataset = load_chembl2147_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 FC(F)(F)c1cccc(-c2nnc3ccc(NC4CCCCC4)cn23)c1 -1.041393 + 1 Cc1ccc2[nH]c(=O)c(CC(=O)O)c(-c3ccccc3)c2c1 -3.653213 + 2 O=C(O)c1cccc(Nc2nc(-c3ccc(O)cc3O)cs2)c1 -3.531479 + 3 O=C(O)c1cccc2c(-c3ccccc3)c(-c3ccccc3)[nH]c12 -2.740363 + 4 CCc1ccc(C2C(C(C)=O)=C(O)C(=O)N2CCc2c[nH]c3ccccc23)cc1 -3.322219 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl2147_ki", + filename="chembl2147_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl2835_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL2835 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Tyrosine-protein kinase jak1 target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 615 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl2835_ki + >>> dataset = load_chembl2835_ki() + >>> dataset # doctest: +SKIP + (['C[C@@H]1CCN(C(=O)CC#N)C[C@@H]1N(C)c1ncnc2[nH]ccc12, ..., 'Cc1cnc(Nc2ccc(OCCN3CCCC3)cc2)nc1Nc1cccc(S(=O)(=O)NC(C)(C)C)c1'], \ + array([0.1549, ..., -2.021])) + + >>> dataset = load_chembl2835_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 C[C@@H]1CCN(C(=O)CC#N)C[C@@H]1N(C)c1ncnc2[nH]ccc12 0.154902 + 1 C[C@@H]1CCN(C(=O)CC#N)C[C@@H]1n1cnc2cnc3[nH]ccc3c21 0.301030 + 2 C[C@@H]1CCN(Cc2ccccc2)C[C@@H]1N(C)c1ncnc2[nH]ccc12 -2.785330 + 3 C[C@@H]1CCN(Cc2ccccc2)C[C@@H]1n1cnc2cnc3[nH]ccc3c21 -1.079181 + 4 N#CCC(=O)N1CCC[C@@H](n2cnc3cnc4[nH]ccc4c32)C1 0.397940 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl2835_ki", + filename="chembl2835_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl2971_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL2971 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Tyrosine-protein kinase jak2 target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 976 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl2971_ki + >>> dataset = load_chembl2971_ki() + >>> dataset # doctest: +SKIP + (['NC(=O)Nc1sc(-c2ccc(F)cc2)cc1C(N)=O, ..., 'Cc1cc(Nc2nc(N[C@@H](C)c3ccc(F)cc3)c(C#N)cc2F)n[nH]1'], \ + array([-0.699, ..., 0.3468])) + + >>> dataset = load_chembl2971_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 NC(=O)Nc1sc(-c2ccc(F)cc2)cc1C(N)=O -0.698970 + 1 O[C@H]1CC[C@H](Nc2ccc3nnc(-c4cccc(C(F)(F)F)c4)n3n2)CC1 -3.380211 + 2 c1ccc(-c2ncnc3[nH]ccc23)cc1 -2.683947 + 3 Clc1cnc2[nH]cc(-c3ccccc3)c2c1 -2.414973 + 4 CCC1Nc2ccccc2-c2ccnc3[nH]cc1c23 -3.230449 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl2971_ki", + filename="chembl2971_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl3979_ec50( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL3979 EC50 dataset. + + The task is to predict the half maximal effective concentration (EC50) of molecules against the Peroxisome proliferator-activated receptor delta target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 1125 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl3979_ec50 + >>> dataset = load_chembl3979_ec50() + >>> dataset # doctest: +SKIP + (['CCC(Cc1ccc(OC)c(C(=O)NCCc2ccc(C(F)(F)F)cc2)c1)C(=O)O, ..., 'CC(C)c1onc(-c2c(Cl)cccc2Cl)c1COc1ccc(CNc2ccc(CC(=O)O)cc2)c(Cl)c1'], \ + array([-3.176, ..., -3.176])) + + >>> dataset = load_chembl3979_ec50(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES EC50 + 0 CCC(Cc1ccc(OC)c(C(=O)NCCc2ccc(C(F)(F)F)cc2)c1)C(=O)O -3.176091 + 1 CCC(Cc1ccc(OC)c(C(=O)NCc2ccc(OC(F)(F)F)cc2)c1)C(=O)O -2.954243 + 2 CCC(Cc1ccc(OC)c(CCCc2ccc(C(F)(F)F)cc2)c1)C(=O)O -2.806180 + 3 CCSC(Cc1ccc(OC)c(C(=O)NCc2ccc(C(F)(F)F)cc2)c1)C(=O)O -3.477121 + 4 CCOC(Cc1ccc(OC)c(C(=O)NCc2ccc(C(F)(F)F)cc2)c1)C(=O)O -3.477121 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl3979_ec50", + filename="chembl3979_ec50.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl4005_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL4005 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Phosphatidylinositol 4,5-bisphosphate 3-kinase catalytic subunit alpha isoform target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 960 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl4005_ki + >>> dataset = load_chembl4005_ki() + >>> dataset # doctest: +SKIP + (['COC[C@H]1OC(=O)c2coc3c2[C@@]1(C)C1=C(C3=O)[C@@H]2CCC(=O)[C@@]2(C)C[C@H]1OC(C)=O, ..., 'CC(C)n1nc(-c2ccc3oc(N)nc3c2)c2c(N)ncnc21'], \ + array([-2.079, ..., -1.447])) + + >>> dataset = load_chembl4005_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 COC[C@H]1OC(=O)c2coc3c2[C@@]1(C)C1=C(C3=O)[C@@H]2CCC(=O)[C@@]2(C)C[C@H]1OC(C)=O -2.079181 + 1 O=c1cc(N2CCOCC2)oc2c(-c3ccccc3)cccc12 -3.778151 + 2 CS(=O)(=O)N1CCN(Cc2cc3nc(-c4cccc5[nH]ncc45)nc(N4CCOCC4)c3s2)CC1 -0.806180 + 3 COc1ccc(NC(=O)c2c(C)ccc3c(N)nc(C)nc23)cn1 -2.000000 + 4 COc1ccc(NC(=O)c2cc(C)cc3c(N)nc(C)nc23)cn1 0.301030 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl4005_ki", + filename="chembl4005_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl4203_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL4203 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Dual specificity protein kinase clk4 target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 731 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl4203_ki + >>> dataset = load_chembl4203_ki() + >>> dataset # doctest: +SKIP + (['O=c1[nH]cnc2c1sc1c(Cl)ccc(Cl)c12, ..., 'O=C(c1cccc(-c2cnc3[nH]ccc3c2)c1)N1CCOCC1'], \ + array([-1.977, ..., -3.8])) + + >>> dataset = load_chembl4203_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 O=c1[nH]cnc2c1sc1c(Cl)ccc(Cl)c12 -1.976808 + 1 Nc1ncnc2onc(-c3ccc(NC(=O)Nc4cccc(C(F)(F)F)c4)cc3)c12 -2.400002 + 2 O=c1[nH]cnc2c(-c3ccccc3)c(C(F)(F)F)sc12 -3.299999 + 3 O=C1Nc2ccccc2Nc2cc(-c3ccncc3F)ccc21 -1.400020 + 4 Cc1cc(N2CCOCC2)cc2[nH]c(-c3c(NCC(O)c4cccc(Cl)c4)cc[nH]c3=O)nc12 -1.700011 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl4203_ki", + filename="chembl4203_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl4616_ec50( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL4616 EC50 dataset. + + The task is to predict the half maximal effective concentration (EC50) of molecules against the Growth hormone secretagogue receptor type 1 target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 682 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl4616_ec50 + >>> dataset = load_chembl4616_ec50() + >>> dataset # doctest: +SKIP + (['CCCCCCCC(=O)OC[C@H](NC(=O)CN)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccccc1)C(=O)O, ..., 'CC(=O)N1CCC[C@H](NC(=O)[C@H]2CN(S(=O)(=O)c3ccccc3)C[C@@H]2NC(=O)c2cc(-c3ccccc3Cl)on2)C1'], \ + array([-1.857, ..., -2.111])) + + >>> dataset = load_chembl4616_ec50(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES EC50 + 0 CCCCCCCC(=O)OC[C@H](NC(=O)CN)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccccc1)C(=O)O -1.857332 + 2 CC(C)(N)C(=O)N[C@H](COCc1ccccc1)C(=O)N1CCC2(CC1)CN(S(C)(=O)=O)c1ccccc12 0.072578 + 3 NC(=O)CN(CCc1ccccc1)C(=O)[C@@H](Cc1ccc2ccccc2c1)NC(=O)[C@@H](Cc1ccc2ccccc2c1)NC(=O)C1CCNCC1 0.468521 + 4 CC(C)N(CCNC(=O)C1c2ccc(Oc3cccc(F)c3)cc2CCN1C(=O)OC(C)(C)C)C(C)C -0.633468 + 5 CC(C)N(CCNC(=O)C1c2ccc(Oc3ccc(Cl)cc3)cc2CCN1C(=O)OC(C)(C)C)C(C)C 0.136677 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl4616_ec50", + filename="chembl4616_ec50.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) + + +@validate_params( + { + "data_dir": [None, str, os.PathLike], + "as_frame": ["boolean"], + "verbose": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_chembl4792_ki( + data_dir: str | os.PathLike | None = None, + as_frame: bool = False, + verbose: bool = False, +) -> pd.DataFrame | tuple[list[str]] | np.ndarray: + r""" + Load the ChEMBL4792 Ki dataset. + + The task is to predict the inhibitor constant (Ki) of molecules against the Orexin receptor type 2 target [1]_ [2]_. + + + ================== ============== + Tasks 1 + Task type regression + Total samples 1471 + Recommended split activity_cliff + Recommended metric RMSE + ================== ============== + + Parameters + ---------- + data_dir : {None, str, path-like}, default=None + Path to the root data directory. If ``None``, currently set scikit-learn directory + is used, by default `$HOME/scikit_learn_data`. + + as_frame : bool, default=False + If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise, + returns SMILES as list of strings, and labels as a NumPy array (1D integer binary + vector). + + verbose : bool, default=False + If True, progress bar will be shown for downloading or loading files. + + Returns + ------- + data : pd.DataFrame or tuple(list[str], np.ndarray) + Depending on the ``as_frame`` argument, one of: + - Pandas DataFrame with columns: "SMILES", "label" + - tuple of: list of strings (SMILES), NumPy array (labels) + + References + ---------- + .. [1] `D. van Tilborg, A. Alenicheva, and F. Grisoni + "Exposing the Limitations of Molecular Machine Learning with Activity Cliffs" + Journal of Chemical Information and Modeling, vol. 62, no. 23, pp. 5938–5951, Dec. 2022 + `_ + + .. [2] `B. Zdrazil et al. + "The ChEMBL Database in 2023: a drug discovery platform spanning multiple bioactivity data types and time periods" + Nucleic Acids Research, vol. 52, no. D1, Nov. 2023 + `_ + + Examples + -------- + >>> from skfp.datasets.moleculenet import load_chembl4792_ki + >>> dataset = load_chembl4792_ki() + >>> dataset # doctest: +SKIP + (['CC1(C)OC[C@H](NC(=O)Nc2ccc(Br)cc2Cl)[C@H](c2ccccc2)O1, ..., 'CC(/C=C/c1ccccc1)=N/Nc1nc(Nc2ccccc2)nc(-n2nc(C)cc2C)n1'], \ + array([-0.8, ..., -4.25])) + + >>> dataset = load_chembl4792_ki(as_frame=True) + >>> dataset.head() # doctest: +NORMALIZE_WHITESPACE + SMILES Ki + 0 CC1(C)OC[C@H](NC(=O)Nc2ccc(Br)cc2Cl)[C@H](c2ccccc2)O1 -0.800029 + 1 Cc1cc(Br)ccc1NC(=O)N[C@H]1COC(C)(C)O[C@H]1c1ccccc1 -1.599992 + 2 Cc1ccc(Cl)c(NC(=O)N[C@H]2COC(C)(C)O[C@H]2c2ccccc2)c1 -1.800029 + 3 Cc1ccc(NC(=O)N[C@H]2COC(C)(C)O[C@H]2c2ccccc2)c(C)c1 -2.099991 + 4 CC1(C)OC[C@H](NC(=O)Nc2cc(Cl)ccc2Cl)[C@H](c2ccccc2)O1 -1.800029 + """ + df = fetch_dataset( + data_dir, + dataset_name="MoleculeACE_chembl4792_ki", + filename="chembl4792_ki.csv", + verbose=verbose, + ) + return df if as_frame else get_mol_strings_and_labels(df) diff --git a/tests/datasets/moleculeace.py b/tests/datasets/moleculeace.py new file mode 100644 index 00000000..74ea9086 --- /dev/null +++ b/tests/datasets/moleculeace.py @@ -0,0 +1,262 @@ +import pytest +from sklearn.utils._param_validation import InvalidParameterError + +from skfp.datasets.moleculeace import ( + load_chembl204_ki, + load_chembl214_ki, + load_chembl218_ec50, + load_chembl219_ki, + load_chembl228_ki, + load_chembl231_ki, + load_chembl233_ki, + load_chembl234_ki, + load_chembl235_ec50, + load_chembl236_ki, + load_chembl237_ec50, + load_chembl237_ki, + load_chembl238_ki, + load_chembl239_ec50, + load_chembl244_ki, + load_chembl262_ki, + load_chembl264_ki, + load_chembl287_ki, + load_chembl1862_ki, + load_chembl1871_ki, + load_chembl2034_ki, + load_chembl2047_ec50, + load_chembl2147_ki, + load_chembl2835_ki, + load_chembl2971_ki, + load_chembl3979_ec50, + load_chembl4005_ki, + load_chembl4203_ki, + load_chembl4616_ec50, + load_chembl4792_ki, + load_moleculeace_benchmark, + load_moleculeace_dataset, + load_moleculeace_splits, +) +from skfp.datasets.moleculeace.benchmark import MOLECULEACE_DATASET_NAMES +from tests.datasets.test_utils import run_basic_dataset_checks + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +def test_load_moleculeace_benchmark(): + benchmark_full = load_moleculeace_benchmark(as_frames=True) + benchmark_names = [name for name, df in benchmark_full] + assert benchmark_names == MOLECULEACE_DATASET_NAMES + + benchmark_full_tuples = load_moleculeace_benchmark(as_frames=False) + benchmark_names = [name for name, smiles, y in benchmark_full_tuples] + assert benchmark_names == MOLECULEACE_DATASET_NAMES + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +def test_load_moleculeace_benchmark_subset(): + dataset_names = ["chembl4005_ki", "chembl204_ki", "chembl235_ec50"] + benchmark = load_moleculeace_benchmark(subset=dataset_names, as_frames=True) + benchmark_names = [name for name, df in benchmark] + assert benchmark_names == dataset_names + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +def test_load_moleculeace_benchmark_wrong_subset(): + dataset_names = ["chembl4005_ki", "Nonexistent"] + with pytest.raises(ValueError) as exc_info: + load_moleculeace_benchmark(subset=dataset_names, as_frames=True) + + assert "Dataset name 'Nonexistent' not recognized" in str(exc_info) + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +@pytest.mark.parametrize("split_type", ["random", "activity_cliff"]) +@pytest.mark.parametrize("dataset_name", MOLECULEACE_DATASET_NAMES) +def test_load_moleculeace_splits(dataset_name, split_type): + train, test = load_moleculeace_splits(dataset_name, split_type) + assert isinstance(train, list) + assert len(train) > 0 + assert all(isinstance(idx, int) for idx in train) + + assert isinstance(test, list) + assert len(test) > 0 + assert all(isinstance(idx, int) for idx in test) + + assert len(train) > len(test) + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +@pytest.mark.parametrize("split_type", ["random", "activity_cliff"]) +@pytest.mark.parametrize("dataset_name", MOLECULEACE_DATASET_NAMES) +def test_load_moleculeace_splits_as_dict(dataset_name, split_type): + train, test = load_moleculeace_splits(dataset_name, split_type) + split_idxs = load_moleculeace_splits(dataset_name, split_type, as_dict=True) + assert isinstance(split_idxs, dict) + assert set(split_idxs.keys()) == {"train", "test"} + assert split_idxs["train"] == train + assert split_idxs["test"] == test + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +@pytest.mark.parametrize( + "dataset_name, dataset_length", + [ + ("chembl204_ki", 2754), + ("chembl214_ki", 3317), + ("chembl218_ec50", 1031), + ("chembl219_ki", 1865), + ("chembl228_ki", 1704), + ("chembl231_ki", 973), + ("chembl233_ki", 3142), + ("chembl234_ki", 3657), + ("chembl235_ec50", 2349), + ("chembl236_ki", 2598), + ("chembl237_ec50", 955), + ("chembl237_ki", 2603), + ("chembl238_ki", 1052), + ("chembl239_ec50", 1721), + ("chembl244_ki", 3097), + ("chembl262_ki", 856), + ("chembl264_ki", 2862), + ("chembl287_ki", 1328), + ("chembl1862_ki", 794), + ("chembl1871_ki", 659), + ("chembl2034_ki", 750), + ("chembl2047_ec50", 631), + ("chembl2147_ki", 1456), + ("chembl2835_ki", 615), + ("chembl2971_ki", 976), + ("chembl3979_ec50", 1125), + ("chembl4005_ki", 960), + ("chembl4203_ki", 731), + ("chembl4616_ec50", 682), + ("chembl4792_ki", 1471), + ], +) +def test_load_moleculeace_splits_lengths(dataset_name, dataset_length): + train, test = load_moleculeace_splits(dataset_name, split_type="random") + loaded_length = len(train) + len(test) + assert loaded_length == dataset_length + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +@pytest.mark.parametrize("dataset_name", MOLECULEACE_DATASET_NAMES) +def test_load_moleculeace_splits_activity_cliffs(dataset_name): + random_train, random_test = load_moleculeace_splits( + dataset_name, split_type="random" + ) + activity_train, activity_test = load_moleculeace_splits( + dataset_name, split_type="activity_cliff" + ) + + assert set(random_train) == set(activity_train) + assert set(random_test) > set(activity_test) + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +@pytest.mark.parametrize("split_type", ["random", "activity_cliff"]) +def test_load_moleculeace_splits_nonexistent_dataset(split_type): + with pytest.raises(InvalidParameterError) as error: + load_moleculeace_splits("nonexistent", split_type) + + assert str(error.value).startswith( + "The 'dataset_name' parameter of load_moleculeace_splits must be a str among" + ) + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +def test_load_moleculeace_splits_nonexistent_splits(): + with pytest.raises(InvalidParameterError) as error: + load_moleculeace_splits("chembl204_ki", "nonexistent") + + assert str(error.value).startswith( + "The 'split_type' parameter of load_moleculeace_splits must be a str among" + ) + + +@pytest.mark.flaky( + reruns=100, + reruns_delay=5, + only_rerun=["LocalEntryNotFoundError", "FileNotFoundError"], +) +@pytest.mark.parametrize( + "dataset_name, load_func, expected_length, num_tasks, task_type", + [ + ("chembl204_ki", load_chembl204_ki, 2754, 1, "regression"), + ("chembl214_ki", load_chembl214_ki, 3317, 1, "regression"), + ("chembl218_ec50", load_chembl218_ec50, 1031, 1, "regression"), + ("chembl219_ki", load_chembl219_ki, 1865, 1, "regression"), + ("chembl228_ki", load_chembl228_ki, 1704, 1, "regression"), + ("chembl231_ki", load_chembl231_ki, 973, 1, "regression"), + ("chembl233_ki", load_chembl233_ki, 3142, 1, "regression"), + ("chembl234_ki", load_chembl234_ki, 3657, 1, "regression"), + ("chembl235_ec50", load_chembl235_ec50, 2349, 1, "regression"), + ("chembl236_ki", load_chembl236_ki, 2598, 1, "regression"), + ("chembl237_ec50", load_chembl237_ec50, 955, 1, "regression"), + ("chembl237_ki", load_chembl237_ki, 2603, 1, "regression"), + ("chembl238_ki", load_chembl238_ki, 1052, 1, "regression"), + ("chembl239_ec50", load_chembl239_ec50, 1721, 1, "regression"), + ("chembl244_ki", load_chembl244_ki, 3097, 1, "regression"), + ("chembl262_ki", load_chembl262_ki, 856, 1, "regression"), + ("chembl264_ki", load_chembl264_ki, 2862, 1, "regression"), + ("chembl287_ki", load_chembl287_ki, 1328, 1, "regression"), + ("chembl1862_ki", load_chembl1862_ki, 794, 1, "regression"), + ("chembl1871_ki", load_chembl1871_ki, 659, 1, "regression"), + ("chembl2034_ki", load_chembl2034_ki, 750, 1, "regression"), + ("chembl2047_ec50", load_chembl2047_ec50, 631, 1, "regression"), + ("chembl2147_ki", load_chembl2147_ki, 1456, 1, "regression"), + ("chembl2835_ki", load_chembl2835_ki, 615, 1, "regression"), + ("chembl2971_ki", load_chembl2971_ki, 976, 1, "regression"), + ("chembl3979_ec50", load_chembl3979_ec50, 1125, 1, "regression"), + ("chembl4005_ki", load_chembl4005_ki, 960, 1, "regression"), + ("chembl4203_ki", load_chembl4203_ki, 731, 1, "regression"), + ("chembl4616_ec50", load_chembl4616_ec50, 682, 1, "regression"), + ("chembl4792_ki", load_chembl4792_ki, 1471, 1, "regression"), + ], +) +def test_load_dataset(dataset_name, load_func, expected_length, num_tasks, task_type): + smiles_list, y = load_func() + df = load_moleculeace_dataset(dataset_name, as_frame=True) + run_basic_dataset_checks( + smiles_list, + y, + df, + expected_length=expected_length, + num_tasks=num_tasks, + task_type=task_type, + )