Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion skfp/datasets/lrgb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
from .benchmark import load_lrgb_mol_benchmark, load_lrgb_mol_splits
from .benchmark import (
load_lrgb_mol_benchmark,
load_lrgb_mol_dataset,
load_lrgb_mol_splits,
)
from .peptides_func import load_peptides_func
from .peptides_struct import load_peptides_struct
94 changes: 93 additions & 1 deletion skfp/datasets/lrgb/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def load_lrgb_mol_benchmark(
both, following LRGB [1]_. See paper for details on stratification. AUPRC metric
is recommended for Peptides-func, and MAE for Peptides-struct.

Dataset names are also returned (case-sensitive): "Peptides-func" and "Peptides-struct".
Dataset names are also returned (case-sensitive): "Peptides-func", "Peptides-struct".

Parameters
----------
Expand Down Expand Up @@ -111,6 +111,98 @@ def load_lrgb_mol_benchmark(
return datasets_gen


@validate_params(
    {
        "dataset_name": [StrOptions({"Peptides-func", "Peptides-struct"})],
        "data_dir": [None, str, os.PathLike],
        "mol_type": [StrOptions({"SMILES", "aminoseq"})],
        "standardize_labels": ["boolean"],
        "as_frame": ["boolean"],
        "verbose": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def load_lrgb_mol_dataset(
    dataset_name: str,
    data_dir: Optional[Union[str, os.PathLike]] = None,
    mol_type: str = "SMILES",
    standardize_labels: bool = True,
    as_frame: bool = False,
    verbose: bool = False,
) -> Union[pd.DataFrame, tuple[list[str], np.ndarray]]:
    """
    Load LRGB molecular dataset by name.

    Loads a given dataset from LRGB benchmark [1]_ by its name. This is a proxy
    for easier benchmarking, that avoids looking for individual functions.

    Dataset names are (case-sensitive): "Peptides-func", "Peptides-struct".

    Parameters
    ----------
    dataset_name : {"Peptides-func", "Peptides-struct"}
        Name of the dataset to load.

    data_dir : {None, str, path-like}, default=None
        Path to the root data directory. If ``None``, currently set scikit-learn
        directory is used, by default `$HOME/scikit_learn_data`.

    mol_type : {"SMILES", "aminoseq"}, default="SMILES"
        Which molecule representation to return, either SMILES strings or
        amino acid sequences.

    standardize_labels : bool, default=True
        Whether to standardize labels to mean 0 and standard deviation 1 for
        Peptides-struct, following the recommendation from the original
        paper [1]_. Otherwise, the raw property values are returned.
        This argument is not forwarded to the Peptides-func loader, so it has
        no effect for that dataset.

    as_frame : bool, default=False
        If True, returns the raw DataFrame with molecules and labels
        (columns are dataset-dependent). Otherwise, returns molecules as list
        of strings, and labels as a NumPy array (shape and type are
        dataset-dependent).

    verbose : bool, default=False
        If True, progress bar will be shown for downloading or loading files.

    Returns
    -------
    data : pd.DataFrame or tuple(list[str], np.ndarray)
        Depending on the ``as_frame`` argument, one of:
        - Pandas DataFrame with columns depending on the dataset
        - tuple of: list of strings (molecules), NumPy array (labels)

    References
    ----------
    .. [1] `Dwivedi, Vijay Prakash, et al.
        "Long Range Graph Benchmark"
        Advances in Neural Information Processing Systems 35 (2022): 22326-22340
        <https://proceedings.neurips.cc/paper_files/paper/2022/hash/8c3c666820ea055a77726d66fc7d447f-Abstract-Datasets_and_Benchmarks.html>`_

    Examples
    --------
    >>> from skfp.datasets.lrgb import load_lrgb_mol_dataset
    >>> smiles, labels = load_lrgb_mol_dataset("Peptides-func")  # doctest: +SKIP
    >>> df = load_lrgb_mol_dataset("Peptides-struct", as_frame=True)  # doctest: +SKIP
    """
    # Peptides-func takes no label standardization option, hence the
    # shorter argument list for that loader.
    if dataset_name == "Peptides-func":
        return load_peptides_func(data_dir, mol_type, as_frame, verbose)

    # validate_params restricts dataset_name to the two known names, so the
    # only remaining possibility here is "Peptides-struct".
    return load_peptides_struct(
        data_dir, mol_type, standardize_labels, as_frame, verbose
    )


@validate_params(
{
"dataset_name": [StrOptions({"Peptides-func", "Peptides-struct"})],
Expand Down
6 changes: 5 additions & 1 deletion skfp/datasets/moleculenet/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from .bace import load_bace
from .bbbp import load_bbbp
from .benchmark import load_moleculenet_benchmark, load_ogb_splits
from .benchmark import (
load_moleculenet_benchmark,
load_moleculenet_dataset,
load_ogb_splits,
)
from .clintox import load_clintox
from .esol import load_esol
from .freesolv import load_freesolv
Expand Down
144 changes: 110 additions & 34 deletions skfp/datasets/moleculenet/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,35 @@
from .tox21 import load_tox21
from .toxcast import load_toxcast

# Mapping from case-sensitive MoleculeNet dataset name to its loader function.
# Insertion order is significant: it defines the order of
# MOLECULENET_DATASET_NAMES below.
MOLECULENET_DATASET_NAME_TO_LOADER_FUNC = dict(
    [
        ("ESOL", load_esol),
        ("FreeSolv", load_freesolv),
        ("Lipophilicity", load_lipophilicity),
        ("BACE", load_bace),
        ("BBBP", load_bbbp),
        ("HIV", load_hiv),
        ("ClinTox", load_clintox),
        ("MUV", load_muv),
        ("SIDER", load_sider),
        ("Tox21", load_tox21),
        ("ToxCast", load_toxcast),
        ("PCBA", load_pcba),
    ]
)

# All supported dataset names, in the same order as the mapping above.
# Derived from the mapping keys so the two constants cannot get out of sync.
MOLECULENET_DATASET_NAMES = list(MOLECULENET_DATASET_NAME_TO_LOADER_FUNC)


@validate_params(
{
Expand Down Expand Up @@ -104,22 +133,9 @@ def load_moleculenet_benchmark(
"""
dataset_names = _subset_to_dataset_names(subset)

dataset_name_to_func = {
"ESOL": load_esol,
"FreeSolv": load_freesolv,
"Lipophilicity": load_lipophilicity,
"BACE": load_bace,
"BBBP": load_bbbp,
"HIV": load_hiv,
"ClinTox": load_clintox,
"MUV": load_muv,
"SIDER": load_sider,
"Tox21": load_tox21,
"ToxCast": load_toxcast,
"PCBA": load_pcba,
}

dataset_functions = [dataset_name_to_func[name] for name in dataset_names]
dataset_functions = [
MOLECULENET_DATASET_NAME_TO_LOADER_FUNC[name] for name in dataset_names
]

if as_frames:
# generator of tuples (dataset_name, DataFrame)
Expand All @@ -139,24 +155,84 @@ def load_moleculenet_benchmark(

@validate_params(
    {
        # Allowed names come from the shared module-level list, so this
        # validator stays in sync with the loader mapping used below.
        "dataset_name": [StrOptions(set(MOLECULENET_DATASET_NAMES))],
        "data_dir": [None, str, os.PathLike],
        "as_frame": ["boolean"],
        "verbose": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def load_moleculenet_dataset(
    dataset_name: str,
    data_dir: Optional[Union[str, os.PathLike]] = None,
    as_frame: bool = False,
    verbose: bool = False,
) -> Union[pd.DataFrame, tuple[list[str], np.ndarray]]:
    """
    Load MoleculeNet dataset by name.

    Loads a given dataset from MoleculeNet [1]_ benchmark by its name. This is a proxy
    for easier benchmarking, that avoids looking for individual functions.

    Dataset names here are the same as returned by `load_moleculenet_benchmark` function,
    and are case-sensitive.

    Parameters
    ----------
    dataset_name : {"ESOL", "FreeSolv", "Lipophilicity", "BACE", "BBBP", "HIV", "ClinTox",
        "MUV", "SIDER", "Tox21", "ToxCast", "PCBA"}
        Name of the dataset to load.

    data_dir : {None, str, path-like}, default=None
        Path to the root data directory. If ``None``, currently set scikit-learn directory
        is used, by default `$HOME/scikit_learn_data`.

    as_frame : bool, default=False
        If True, returns the raw DataFrame with columns "SMILES" and labels
        (dataset-dependent). Otherwise, returns SMILES as list of strings, and
        labels as a NumPy array (shape and type are dataset-dependent).

    verbose : bool, default=False
        If True, progress bar will be shown for downloading or loading files.

    Returns
    -------
    data : pd.DataFrame or tuple(list[str], np.ndarray)
        Depending on the ``as_frame`` argument, one of:
        - Pandas DataFrame with columns depending on the dataset
        - tuple of: list of strings (SMILES), NumPy array (labels)

    References
    ----------
    .. [1] `Zhenqin Wu et al.
        "MoleculeNet: a benchmark for molecular machine learning"
        Chem. Sci., 2018,9, 513-530
        <https://pubs.rsc.org/en/content/articlelanding/2018/sc/c7sc02664a>`_

    Examples
    --------
    >>> from skfp.datasets.moleculenet import load_moleculenet_dataset
    >>> dataset = load_moleculenet_dataset("BBBP")
    >>> dataset  # doctest: +SKIP
    (['[Cl].CC(C)NCC(O)COc1cccc2ccccc12', ..., '[N+](=NCC(=O)N[C@@H]([C@H](O)C1=CC=C([N+]([O-])=O)C=C1)CO)=[N-]'], \
array([1, 1, 1, ..., 1, 1, 1]))

    >>> dataset = load_moleculenet_dataset("BBBP", as_frame=True)
    >>> dataset.head()  # doctest: +NORMALIZE_WHITESPACE
                                                  SMILES  label
    0                   [Cl].CC(C)NCC(O)COc1cccc2ccccc12      1
    1           C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl      1
    2  c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...      1
    3                   C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C      1
    4  Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...      1
    """
    # dataset_name has already been validated against the known names, so
    # this lookup cannot raise KeyError for validated calls.
    loader_func = MOLECULENET_DATASET_NAME_TO_LOADER_FUNC[dataset_name]
    return loader_func(data_dir, as_frame, verbose)


@validate_params(
{
"dataset_name": [StrOptions(set(MOLECULENET_DATASET_NAMES))],
"data_dir": [None, str, os.PathLike],
"as_frame": ["boolean"],
"verbose": ["boolean"],
Expand Down
File renamed without changes.
26 changes: 0 additions & 26 deletions skfp/datasets/tdc/adme/__init__.py

This file was deleted.

Loading