Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion docs/modules/datasets/tdc.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ ADME dataset loaders
adme.load_b3db_regression
adme.load_bioavailability_ma
adme.load_caco2_wang
adme.load_clearance_hepatocyte_az
adme.load_clearance_microsome_az
adme.load_cyp1a2_veith
adme.load_cyp2c19_veith
Expand Down
1 change: 0 additions & 1 deletion skfp/datasets/tdc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
load_b3db_regression,
load_bioavailability_ma,
load_caco2_wang,
load_clearance_hepatocyte_az,
load_clearance_microsome_az,
load_cyp1a2_veith,
load_cyp2c9_substrate_carbonmangels,
Expand Down
82 changes: 0 additions & 82 deletions skfp/datasets/tdc/adme.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,88 +314,6 @@ def load_caco2_wang(
return df if as_frame else get_mol_strings_and_labels(df)


@validate_params(
    {
        "data_dir": [None, str, os.PathLike],
        "as_frame": ["boolean"],
        "verbose": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def load_clearance_hepatocyte_az(
    data_dir: str | os.PathLike | None = None,
    as_frame: bool = False,
    verbose: bool = False,
) -> pd.DataFrame | tuple[list[str], np.ndarray]:
    """
    Load the hepatocyte subset of Clearance AstraZeneca dataset.

    The task is to predict drug clearance, i.e. the volume of plasma cleared of
    a drug over a specified time period. It measures the rate at which the active
    drug is removed from the body [1]_ [2]_ [3]_. Clearance outcomes differ between
    experiments performed with human hepatocytes (HHEP) and human liver microsomes
    (HLM), the two main in vitro systems used in metabolic stability and inhibition
    studies [2]_. This subset of the Clearance dataset includes measurements from
    hepatocyte studies.

    This dataset is a part of "excretion" subset of ADME tasks.

    ==================   ====================
    Tasks                1
    Task type            regression
    Total samples        1213
    Recommended split    scaffold
    Recommended metric   Spearman correlation
    ==================   ====================

    Parameters
    ----------
    data_dir : {None, str, path-like}, default=None
        Path to the root data directory. If ``None``, currently set scikit-learn directory
        is used, by default `$HOME/scikit_learn_data`.

    as_frame : bool, default=False
        If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise,
        returns SMILES as list of strings, and labels as a NumPy array (1D vector of
        regression targets).

    verbose : bool, default=False
        If True, progress bar will be shown for downloading or loading files.

    Returns
    -------
    data : pd.DataFrame or tuple(list[str], np.ndarray)
        Depending on the ``as_frame`` argument, one of:
        - Pandas DataFrame with columns: "SMILES", "label"
        - tuple of: list of strings (SMILES), NumPy array (labels)

    References
    ----------
    .. [1] `AstraZeneca.
        "Experimental in vitro Dmpk and physicochemical data on a set of publicly disclosed compounds"
        (2016)
        <https://www.ebi.ac.uk/chembl/explore/document/CHEMBL3301361>`_

    .. [2] `Di, Li, et al.
        "Mechanistic insights from comparing intrinsic clearance values
        between human liver microsomes and hepatocytes to guide drug design"
        European Journal of Medicinal Chemistry 57 (2012): 441-448.
        <https://doi.org/10.1016/j.ejmech.2012.06.043>`_

    .. [3] `Huang, Kexin, et al.
        "Therapeutics Data Commons: Machine Learning Datasets and Tasks for Drug Discovery and Development"
        Proceedings of Neural Information Processing Systems, NeurIPS Datasets and Benchmarks, 2021
        <https://openreview.net/forum?id=8nvgnORnoWr>`_
    """
    dataset = fetch_dataset(
        data_dir,
        dataset_name="TDC_clearance_hepatocyte_az",
        filename="tdc_clearance_hepatocyte_az.csv",
        verbose=verbose,
    )

    # The raw frame is handed back untouched when requested; otherwise it is
    # unpacked into the (SMILES, labels) pair by the shared helper.
    if as_frame:
        return dataset
    return get_mol_strings_and_labels(dataset)


@validate_params(
{
"data_dir": [None, str, os.PathLike],
Expand Down
5 changes: 0 additions & 5 deletions skfp/datasets/tdc/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
load_b3db_regression,
load_bioavailability_ma,
load_caco2_wang,
load_clearance_hepatocyte_az,
load_clearance_microsome_az,
load_cyp1a2_veith,
load_cyp2c9_substrate_carbonmangels,
Expand Down Expand Up @@ -53,7 +52,6 @@
"b3db_regression",
"bioavailability_ma",
"caco2_wang",
"clearance_hepatocyte_az",
"clearance_microsome_az",
"cyp1a2_veith",
"cyp2c19_veith",
Expand Down Expand Up @@ -95,7 +93,6 @@
"b3db_regression": load_b3db_regression,
"bioavailability_ma": load_bioavailability_ma,
"caco2_wang": load_caco2_wang,
"clearance_hepatocyte_az": load_clearance_hepatocyte_az,
"clearance_microsome_az": load_clearance_microsome_az,
"cyp1a2_veith": load_cyp1a2_veith,
"cyp2c19_veith": load_cyp2c19_veith,
Expand Down Expand Up @@ -170,7 +167,6 @@ def load_tdc_benchmark(
- "b3db_regression"
- "bioavailability_ma"
- "caco2_wang"
- "clearance_hepatocyte_az"
- "clearance_microsome_az"
- "cyp1a2_veith"
- "cyp2c19_veith"
Expand Down Expand Up @@ -404,7 +400,6 @@ def _subset_to_dataset_names(subset: str | list[str] | None) -> list[str]:
"b3db_regression",
"bioavailability_ma",
"caco2_wang",
"clearance_hepatocyte_az",
"clearance_microsome_az",
"cyp1a2_veith",
"cyp2c19_veith",
Expand Down
9 changes: 0 additions & 9 deletions tests/datasets/tdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
load_b3db_regression,
load_bioavailability_ma,
load_caco2_wang,
load_clearance_hepatocyte_az,
load_clearance_microsome_az,
load_cyp1a2_veith,
load_cyp2c9_substrate_carbonmangels,
Expand Down Expand Up @@ -171,7 +170,6 @@ def test_load_ogb_splits_as_dict(dataset_name):
("b3db_regression", 942),
("ppbr_az", 1614),
("half_life_obach", 667),
("clearance_hepatocyte_az", 1213),
("clearance_microsome_az", 1102),
("hlm", 6013),
("rlm", 5590),
Expand Down Expand Up @@ -285,13 +283,6 @@ def test_load_tdc_splits_nonexistent_dataset():
("b3db_regression", load_b3db_regression, 942, 1, "regression"),
("ppbr_az", load_ppbr_az, 1614, 1, "regression"),
("half_life_obach", load_half_life_obach, 667, 1, "regression"),
(
"clearance_hepatocyte_az",
load_clearance_hepatocyte_az,
1213,
1,
"regression",
),
("clearance_microsome_az", load_clearance_microsome_az, 1102, 1, "regression"),
("hlm", load_hlm, 6013, 1, "binary_classification"),
("rlm", load_rlm, 5590, 1, "binary_classification"),
Expand Down