MLCIL · j-adamczyk · Mar 10, 2026 · Mar 8, 2026
@@ -29,7 +29,6 @@ ADME dataset loaders
     adme.load_b3db_regression
     adme.load_bioavailability_ma
     adme.load_caco2_wang
-    adme.load_clearance_hepatocyte_az
     adme.load_clearance_microsome_az
     adme.load_cyp1a2_veith
     adme.load_cyp2c19_veith

@@ -3,7 +3,6 @@
     load_b3db_regression,
     load_bioavailability_ma,
     load_caco2_wang,
-    load_clearance_hepatocyte_az,
     load_clearance_microsome_az,
     load_cyp1a2_veith,
     load_cyp2c9_substrate_carbonmangels,

@@ -314,88 +314,6 @@ def load_caco2_wang(
     return df if as_frame else get_mol_strings_and_labels(df)
 
 
-@validate_params(
-    {
-        "data_dir": [None, str, os.PathLike],
-        "as_frame": ["boolean"],
-        "verbose": ["boolean"],
-    },
-    prefer_skip_nested_validation=True,
-)
-def load_clearance_hepatocyte_az(
-    data_dir: str | os.PathLike | None = None,
-    as_frame: bool = False,
-    verbose: bool = False,
-) -> pd.DataFrame | tuple[list[str], np.ndarray]:
-    """
-    Load the hepatocyte subset of Clearance AstraZeneca dataset.
-
-    The task is to predict drug clearance.
-    It is defined as the volume of plasma cleared of a drug over a specified time period
-    and it measures the rate at which the active drug is removed from the body [1]_ [2]_ [3]_.
-    Many studies [2]_ show various clearance outcomes of experiments performed with
-    human hepatocytes (HHEP) and human liver microsomes (HLM) which are two main
-    in vitro systems used in metabolic stability and inhibition studies.
-    This subset od the Clearance dataset includes measurements from hepatocyte studies.
-
-    This dataset is a part of "excretion" subset of ADME tasks.
-
-    ==================  ====================
-    Tasks                                  1
-    Task type                     regression
-    Total samples                       1213
-    Recommended split               scaffold
-    Recommended metric  Spearman correlation
-    ==================  ====================
-
-    Parameters
-    ----------
-    data_dir : {None, str, path-like}, default=None
-        Path to the root data directory. If ``None``, currently set scikit-learn directory
-        is used, by default `$HOME/scikit_learn_data`.
-
-    as_frame : bool, default=False
-        If True, returns the raw DataFrame with columns: "SMILES", "label". Otherwise,
-        returns SMILES as list of strings, and labels as a NumPy array (1D integer binary
-        vector).
-
-    verbose : bool, default=False
-        If True, progress bar will be shown for downloading or loading files.
-
-    Returns
-    -------
-    data : pd.DataFrame or tuple(list[str], np.ndarray)
-        Depending on the ``as_frame`` argument, one of:
-        - Pandas DataFrame with columns: "SMILES", "label"
-        - tuple of: list of strings (SMILES), NumPy array (labels)
-
-    References
-    ----------
-    .. [1] `AstraZeneca.
-        "Experimental in vitro Dmpk and physicochemical data on a set of publicly disclosed compounds"
-        (2016)
-        <https://www.ebi.ac.uk/chembl/explore/document/CHEMBL3301361>`_
-
-    .. [2] `Di, Li, et al.
-        "Mechanistic insights from comparing intrinsic clearance values
-        between human liver microsomes and hepatocytes to guide drug design"
-        European Journal of Medicinal Chemistry 57 (2012): 441-448.
-        <https://doi.org/10.1016/j.ejmech.2012.06.043>`_
-
-    .. [3] `Huang, Kexin, et al.
-        "Therapeutics Data Commons: Machine Learning Datasets and Tasks for Drug Discovery and Development"
-        Proceedings of Neural Information Processing Systems, NeurIPS Datasets and Benchmarks, 2021
-        <https://openreview.net/forum?id=8nvgnORnoWr>`_
-    """
-    df = fetch_dataset(
-        data_dir,
-        dataset_name="TDC_clearance_hepatocyte_az",
-        filename="tdc_clearance_hepatocyte_az.csv",
-        verbose=verbose,
-    )
-    return df if as_frame else get_mol_strings_and_labels(df)
-
-
 @validate_params(
     {
         "data_dir": [None, str, os.PathLike],

@@ -12,7 +12,6 @@
     load_b3db_regression,
     load_bioavailability_ma,
     load_caco2_wang,
-    load_clearance_hepatocyte_az,
     load_clearance_microsome_az,
     load_cyp1a2_veith,
     load_cyp2c9_substrate_carbonmangels,
@@ -53,7 +52,6 @@
     "b3db_regression",
     "bioavailability_ma",
     "caco2_wang",
-    "clearance_hepatocyte_az",
     "clearance_microsome_az",
     "cyp1a2_veith",
     "cyp2c19_veith",
@@ -95,7 +93,6 @@
     "b3db_regression": load_b3db_regression,
     "bioavailability_ma": load_bioavailability_ma,
     "caco2_wang": load_caco2_wang,
-    "clearance_hepatocyte_az": load_clearance_hepatocyte_az,
     "clearance_microsome_az": load_clearance_microsome_az,
     "cyp1a2_veith": load_cyp1a2_veith,
     "cyp2c19_veith": load_cyp2c19_veith,
@@ -170,7 +167,6 @@ def load_tdc_benchmark(
     - "b3db_regression"
     - "bioavailability_ma"
     - "caco2_wang"
-    - "clearance_hepatocyte_az"
     - "clearance_microsome_az"
     - "cyp1a2_veith"
     - "cyp2c19_veith"
@@ -404,7 +400,6 @@ def _subset_to_dataset_names(subset: str | list[str] | None) -> list[str]:
         "b3db_regression",
         "bioavailability_ma",
         "caco2_wang",
-        "clearance_hepatocyte_az",
         "clearance_microsome_az",
         "cyp1a2_veith",
         "cyp2c19_veith",

@@ -8,7 +8,6 @@
     load_b3db_regression,
     load_bioavailability_ma,
     load_caco2_wang,
-    load_clearance_hepatocyte_az,
     load_clearance_microsome_az,
     load_cyp1a2_veith,
     load_cyp2c9_substrate_carbonmangels,
@@ -171,7 +170,6 @@ def test_load_ogb_splits_as_dict(dataset_name):
         ("b3db_regression", 942),
         ("ppbr_az", 1614),
         ("half_life_obach", 667),
-        ("clearance_hepatocyte_az", 1213),
         ("clearance_microsome_az", 1102),
         ("hlm", 6013),
         ("rlm", 5590),
@@ -285,13 +283,6 @@ def test_load_tdc_splits_nonexistent_dataset():
         ("b3db_regression", load_b3db_regression, 942, 1, "regression"),
         ("ppbr_az", load_ppbr_az, 1614, 1, "regression"),
         ("half_life_obach", load_half_life_obach, 667, 1, "regression"),
-        (
-            "clearance_hepatocyte_az",
-            load_clearance_hepatocyte_az,
-            1213,
-            1,
-            "regression",
-        ),
         ("clearance_microsome_az", load_clearance_microsome_az, 1102, 1, "regression"),
         ("hlm", load_hlm, 6013, 1, "binary_classification"),
         ("rlm", load_rlm, 5590, 1, "binary_classification"),