daisybio · JudithBernett · Mar 6, 2025 · Mar 3, 2025 · Mar 4, 2025 · Mar 4, 2025
diff --git a/.gitignore b/.gitignore
@@ -5,7 +5,7 @@ data/mapping
 data/GDSC1
 data/GDSC2
 data/CCLE
-data/Toy_Data
+data/TOYv1
 data/CTRPv1
 data/CTRPv2
 

diff --git a/docs/quickstart.rst b/docs/quickstart.rst
@@ -3,12 +3,12 @@ Quickstart
 
 Make sure you have installed DrEvalPy and its dependencies (see `Installation <./installation.html>`_).
 
-To make sure the pipeline runs, you can use the fast models NaiveDrugMeanPredictor and NaivePredictor on the Toy_Data
+To make sure the pipeline runs, you can use the fast models NaiveDrugMeanPredictor and NaivePredictor on the TOYv1
 dataset with the LPO test mode.
 
 .. code-block:: bash
 
-    python run_suite.py --run_id my_first_run --models NaiveDrugMeanPredictor --baselines NaivePredictor --dataset Toy_Data --test_mode LPO
+    python run_suite.py --run_id my_first_run --models NaiveDrugMeanPredictor --baselines NaivePredictor --dataset TOYv1 --test_mode LPO
 
 This will train the two baseline models on a subset of gene expression features and drug fingerprint features to
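-predict IC50 values of the GDSC1 database. It will evaluate in "LPO" which is the leave-pairs-out splitting strategy
+predict IC50 values of the CTRPv2 database. It will evaluate in "LPO" which is the leave-pairs-out splitting strategy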

diff --git a/docs/usage.rst b/docs/usage.rst
@@ -156,14 +156,21 @@ We provide commonly used datasets to evaluate your model on (GDSC1, GDSC2, CCLE,
 +-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
 | Dataset Name      | Number of Drugs | Number of Cell Lines| Description                                                                                                           |
 +===================+=================+=====================+=======================================================================================================================+
-| GDSC1             | 345             | 987                 | The Genomics of Drug Sensitivity in Cancer (GDSC) dataset version 1.                                                  |
+| GDSC1             | 378             | 970                 | The Genomics of Drug Sensitivity in Cancer (GDSC) dataset version 1.                                                  |
 +-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
-| GDSC2             | 192             | 809                 | The Genomics of Drug Sensitivity in Cancer (GDSC) dataset version 2.                                                  |
+| GDSC2             | 287             | 969                 | The Genomics of Drug Sensitivity in Cancer (GDSC) dataset version 2.                                                  |
 +-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
-| CCLE              | 18              | 471                 | The Cancer Cell Line Encyclopedia (CCLE) dataset. The response data will soon be replaced with the data from CTRPv2.  |
+| CCLE              | 24              | 503                 | The Cancer Cell Line Encyclopedia (CCLE) dataset.                                                                     |
 +-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
-| Toy_Data          | 40              | 98                  | A toy dataset for testing purposes.                                                                                   |
+| CTRPv1            | 354             | 243                 | The Cancer Therapeutics Response Portal (CTRP) dataset version 1.                                                     |
 +-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
+| CTRPv2            | 546             | 886                 | The Cancer Therapeutics Response Portal (CTRP) dataset version 2.                                                     |
++-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
+| TOYv1             | 36              | 90                  | A toy dataset for testing purposes subsetted from CTRPv2.                                                             |
++-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
+| TOYv2             | 36              | 90                  | A second toy dataset for cross study testing purposes. 80 cell lines and 32 drugs overlap TOYv1.                      |
++-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
+
 
 If using the ``--curve_curator`` option with these datasets, the desired measure provided with the ``--measure`` option is appended with "_curvecurator", e.g. "IC50_curvecurator".
 In the provided datasets, these are the measures calculated with the same fitting procedure using CurveCurator. To use the measures reported from the original publications of the

diff --git a/drevalpy/datasets/loader.py b/drevalpy/datasets/loader.py
@@ -23,7 +23,7 @@ def load_gdsc1(
 
     :param path_data: Path to the dataset.
     :param file_name: File name of the dataset.
-    :param measure: The name of the column containing the measure to predict, default = "LN_IC50"
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
 
     :param dataset_name: Name of the dataset.
     :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
@@ -49,7 +49,7 @@ def load_gdsc2(path_data: str = "data", measure: str = "LN_IC50_curvecurator", f
 
     :param path_data: Path to the dataset.
     :param file_name: File name of the dataset.
-    :param measure: The name of the column containing the measure to predict, default = "LN_IC50"
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
 
     :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
     """
@@ -64,7 +64,7 @@ def load_ccle(
 
     :param path_data: Path to the dataset.
     :param file_name: File name of the dataset.
-    :param measure: The name of the column containing the measure to predict, default = "LN_IC50"
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
 
     :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
     """
@@ -84,17 +84,19 @@ def load_ccle(
     )
 
 
-def load_toy(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
+def _load_toy(
+    path_data: str = "data", measure: str = "LN_IC50_curvecurator", dataset_name="TOYv1"
+) -> DrugResponseDataset:
     """
-    Loads small Toy dataset, subsampled from GDSC1.
+    Loads small Toy dataset, subsampled from CTRPv2 or GDSC2.
 
     :param path_data: Path to the dataset.
-    :param measure: The name of the column containing the measure to predict, default = "response"
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
+    :param dataset_name: Name of the dataset. Either "TOYv1" or "TOYv2".
 
     :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
     """
-    dataset_name = "Toy_Data"
-    path = os.path.join(path_data, dataset_name, "toy_data.csv")
+    path = os.path.join(path_data, dataset_name, f"{dataset_name}.csv")
     if not os.path.exists(path):
         download_dataset(dataset_name, path_data, redownload=True)
     response_data = pd.read_csv(path, dtype={"pubchem_id": str})
@@ -107,13 +109,37 @@ def load_toy(path_data: str = "data", measure: str = "LN_IC50_curvecurator") ->
     )
 
 
+def load_toyv1(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
+    """
+    Load the TOYv1 toy dataset, which is subsampled from CTRPv2.
+
+    :param path_data: Directory in which the dataset is stored or into which it is downloaded.
+    :param measure: Name of the response column to predict, default = "LN_IC50_curvecurator"
+
+    :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
+    """
+    return _load_toy(path_data=path_data, measure=measure, dataset_name="TOYv1")
+
+
+def load_toyv2(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
+    """
+    Load the TOYv2 toy dataset, which is subsampled from GDSC2 and meant for testing cross study prediction.
+
+    :param path_data: Directory in which the dataset is stored or into which it is downloaded.
+    :param measure: Name of the response column to predict, default = "LN_IC50_curvecurator"
+
+    :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
+    """
+    return _load_toy(path_data=path_data, measure=measure, dataset_name="TOYv2")
+
+
 def _load_ctrpv(version: str, path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
     """
     Load CTRPv1 dataset.
 
     :param version: The version of the CTRP dataset to load.
     :param path_data: Path to location of CTRPv1 dataset
-    :param measure: The name of the column containing the measure to predict, default = "response"
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
 
     :return: DrugResponseDataset containing response, cell line IDs, and drug IDs
     """
@@ -171,7 +197,8 @@ def load_custom(path_data: str | Path, measure: str = "response") -> DrugRespons
     "GDSC1": load_gdsc1,
     "GDSC2": load_gdsc2,
     "CCLE": load_ccle,
-    "Toy_Data": load_toy,
+    "TOYv1": load_toyv1,
+    "TOYv2": load_toyv2,
     "CTRPv1": load_ctrpv1,
     "CTRPv2": load_ctrpv2,
 }
@@ -184,7 +211,7 @@ def load_dataset(
     """
     Load a dataset based on the dataset name.
 
-    :param dataset_name: The name of the dataset to load. Can be one of ('GDSC1', 'GDSC2', 'CCLE', or 'Toy_Data')
+    :param dataset_name: The dataset to load. One of ('GDSC1', 'GDSC2', 'CCLE', 'CTRPv1', 'CTRPv2', 'TOYv1', 'TOYv2')
         to download provided datasets, or any other name to allow for custom datasets.
     :param path_data: The parent path in which custom or downloaded datasets should be located, or in which raw
         viability data is to be found for fitting with CurveCurator (see param curve_curator for details).

diff --git a/drevalpy/datasets/utils.py b/drevalpy/datasets/utils.py
@@ -21,7 +21,7 @@ def download_dataset(
     """
-    Download the latets dataset from Zenodo.
+    Download the latest dataset from Zenodo.
 
-    :param dataset_name: dataset name, e.g., "GDSC1", "GDSC2", "CCLE" or "Toy_Data"
+    :param dataset_name: dataset name, from "GDSC1", "GDSC2", "CCLE", "CTRPv1", "CTRPv2", "TOYv1", "TOYv2"
     :param data_path: where to save the data
     :param redownload: whether to redownload the data
     :raises HTTPError: if the download fails

diff --git a/drevalpy/models/SuperFELTR/hyperparameters.yaml b/drevalpy/models/SuperFELTR/hyperparameters.yaml
@@ -11,21 +11,24 @@ SuperFELTR:
   expression_var_threshold:
     GDSC1: 0.1
     GDSC2: 0.1
-    Toy_Data: 0.03
+    TOYv1: 0.03
+    TOYv2: 0.03
     CCLE: 0.1
     CTRPv1: 0.1
     CTRPv2: 0.1
   mutation_var_threshold:
     GDSC1: 0.1
     GDSC2: 0.1
-    Toy_Data: 0.05
+    TOYv1: 0.05
+    TOYv2: 0.05
     CCLE: 0.1
     CTRPv1: 0.1
     CTRPv2: 0.1
   cnv_var_threshold:
     GDSC1: 0.7
     GDSC2: 0.7
-    Toy_Data: 0.6
+    TOYv1: 0.6
+    TOYv2: 0.6
     CCLE: 0.7
     CTRPv1: 0.7
     CTRPv2: 0.7

diff --git a/drevalpy/utils.py b/drevalpy/utils.py
@@ -341,8 +341,9 @@ def get_datasets(
     """
     Load the response data and cross-study datasets.
 
-    :param dataset_name: The name of the dataset to load. Can be one of ('GDSC1', 'GDSC2', 'CCLE', or 'Toy_Data')
-        to download provided datasets, or any other name to allow for custom datasets.
+    :param dataset_name: The name of the dataset to load. Can be one of ('GDSC1', 'GDSC2', 'CCLE', 'CTRPv1',
+        'CTRPv2', 'TOYv1', 'TOYv2')
+        to download provided datasets, or any other name to use a custom dataset.
     :param cross_study_datasets: list of cross-study datasets. CurveCurator is not applicable to these. If you wish
         to provide custom cross_study_datasets, you have to invoke curve fitting manually using
         drevalpy.datasets.curvecurator.fit_curves

diff --git a/tests/individual_models/conftest.py b/tests/individual_models/conftest.py
@@ -3,7 +3,7 @@
 import pytest
 
 from drevalpy.datasets.dataset import DrugResponseDataset, FeatureDataset
-from drevalpy.datasets.loader import load_toy
+from drevalpy.datasets.loader import load_toyv1
 from drevalpy.models.utils import (
     get_multiomics_feature_dataset,
     load_cl_ids_from_csv,
@@ -20,13 +20,13 @@ def sample_dataset() -> tuple[DrugResponseDataset, FeatureDataset, FeatureDatase
     :returns: drug_response, cell_line_input, drug_input
     """
     path_data = "../data"
-    drug_response = load_toy(path_data)
+    drug_response = load_toyv1(path_data)
     drug_response.remove_nan_responses()
-    cell_line_input = get_multiomics_feature_dataset(data_path=path_data, dataset_name="Toy_Data", gene_lists=None)
-    cell_line_ids = load_cl_ids_from_csv(path=path_data, dataset_name="Toy_Data")
+    cell_line_input = get_multiomics_feature_dataset(data_path=path_data, dataset_name="TOYv1", gene_lists=None)
+    cell_line_ids = load_cl_ids_from_csv(path=path_data, dataset_name="TOYv1")
     cell_line_input.add_features(cell_line_ids)
     # Load the drug features
-    drug_ids = load_drug_ids_from_csv(data_path=path_data, dataset_name="Toy_Data")
-    drug_input = load_drug_fingerprint_features(data_path=path_data, dataset_name="Toy_Data")
+    drug_ids = load_drug_ids_from_csv(data_path=path_data, dataset_name="TOYv1")
+    drug_input = load_drug_fingerprint_features(data_path=path_data, dataset_name="TOYv1")
     drug_input.add_features(drug_ids)
     return drug_response, cell_line_input, drug_input
diff --git a/tests/individual_models/test_literature_models.py b/tests/individual_models/test_literature_models.py
@@ -123,8 +123,8 @@ def test_dipk(
     hpam_combi["epochs"] = 1
     hpam_combi["epochs_autoencoder"] = 1
     model.build_model(hpam_combi)
-    drug_input = model.load_drug_features(data_path="../data", dataset_name="Toy_Data")  # type: ignore
-    cell_line_input = model.load_cell_line_features(data_path="../data", dataset_name="Toy_Data")
+    drug_input = model.load_drug_features(data_path="../data", dataset_name="TOYv1")  # type: ignore
+    cell_line_input = model.load_cell_line_features(data_path="../data", dataset_name="TOYv1")
 
     cell_lines_to_keep = cell_line_input.identifiers
     drugs_to_keep = drug_input.identifiers

diff --git a/tests/test_available_data.py b/tests/test_available_data.py
@@ -10,7 +10,8 @@ def test_factory() -> None:
     assert "GDSC1" in AVAILABLE_DATASETS
     assert "GDSC2" in AVAILABLE_DATASETS
     assert "CCLE" in AVAILABLE_DATASETS
-    assert "Toy_Data" in AVAILABLE_DATASETS
+    assert "TOYv1" in AVAILABLE_DATASETS
+    assert "TOYv2" in AVAILABLE_DATASETS
     assert "CTRPv1" in AVAILABLE_DATASETS
     assert "CTRPv2" in AVAILABLE_DATASETS
-    assert len(AVAILABLE_DATASETS) == 6
+    assert len(AVAILABLE_DATASETS) == 7
@@ -51,8 +52,15 @@ def test_ctrpv2():
     assert len(ctrpv2) == 395024
 
 
-def test_toy_data():
-    """Test the Toy_Data dataset."""
-    tempdir = tempfile.TemporaryDirectory()
-    toy_data = AVAILABLE_DATASETS["Toy_Data"](path_data=tempdir.name)
-    assert len(toy_data) == 3426
+def test_toyv1():
+    """Test that TOYv1 is downloaded into an empty directory and has the expected length."""
+    with tempfile.TemporaryDirectory() as tempdir:  # cleaned up on exit, even when the assertion fails
+        toyv1 = AVAILABLE_DATASETS["TOYv1"](path_data=tempdir)
+        assert len(toyv1) == 2680
+
+
+def test_toyv2():
+    """Test that TOYv2 is downloaded into an empty directory and has the expected length."""
+    with tempfile.TemporaryDirectory() as tempdir:  # cleaned up on exit, even when the assertion fails
+        toyv2 = AVAILABLE_DATASETS["TOYv2"](path_data=tempdir)
+        assert len(toyv2) == 2837
diff --git a/tests/test_drp_model.py b/tests/test_drp_model.py
@@ -147,6 +147,23 @@ def test_load_and_reduce_gene_features(gene_list: Optional[str]) -> None:
         assert "The following genes are missing from the dataset GDSC1_small" in str(valerr.value)
 
 
+def test_order_load_and_reduce_gene_features() -> None:
+    """Test that the gene order is the same after loading and reducing gene features of TOYv1 and TOYv2."""
+    # TODO move to cross study tests where TOYv1 and TOYv2 are available!!!
+    gene_list = "gene_expression_genes_intersection.csv"
+    a = load_and_reduce_gene_features("gene_expression", gene_list, "data", "TOYv1")
+    b = load_and_reduce_gene_features("gene_expression", gene_list, "data", "TOYv2")
+    # assert the meta info (=gene names) are the same; array_equal also fails cleanly on a length mismatch
+    assert np.array_equal(a.meta_info["gene_expression"], b.meta_info["gene_expression"])
+    # assert the shape of the features for a random cell line is actually the same;
+    # the generator is seeded so that a failing draw can be reproduced
+    rng = np.random.default_rng(0)
+    random_cell_line_a = rng.choice(a.identifiers)
+    random_cell_line_b = rng.choice(b.identifiers)
+    shape_a = a.features[random_cell_line_a]["gene_expression"].shape
+    assert shape_a == b.features[random_cell_line_b]["gene_expression"].shape
+
+
 def test_iterate_features() -> None:
     """Test the iteration over features."""
     df = pd.DataFrame({"GeneA": [1, 2, 3, 2], "GeneB": [4, 5, 6, 2], "GeneC": [7, 8, 9, 2]})

diff --git a/tests/test_run_suite.py b/tests/test_run_suite.py
@@ -15,7 +15,7 @@
     [
         {
             "run_id": "test_run",
-            "dataset_name": "Toy_Data",
+            "dataset_name": "TOYv1",
             "models": ["NaiveCellLineMeanPredictor"],
             "baselines": ["NaiveDrugMeanPredictor"],
             "test_mode": ["LPO"],
@@ -53,7 +53,7 @@ def test_run_suite(args):
         evaluation_results_per_drug,
         evaluation_results_per_cell_line,
         true_vs_pred,
-    ) = parse_results(path_to_results=os.path.join(temp_dir.name, args.run_id), dataset="Toy_Data")
+    ) = parse_results(path_to_results=os.path.join(temp_dir.name, args.run_id), dataset="TOYv1")
 
     (
         evaluation_results,