daisybio · JudithBernett · Mar 6, 2025 · Mar 3, 2025 · Mar 4, 2025 · Mar 4, 2025
diff --git a/.gitignore b/.gitignore
@@ -5,7 +5,8 @@ data/mapping
 data/GDSC1
 data/GDSC2
 data/CCLE
-data/Toy_Data
+data/TOYv1
+data/TOYv2
 data/CTRPv1
 data/CTRPv2
 

diff --git a/docs/conf.py b/docs/conf.py
@@ -56,9 +56,9 @@
 # the built documents.
 #
 # The short X.Y version.
-version = "1.2.3"
+version = "1.2.4"
 # The full version, including alpha/beta/rc tags.
-release = "1.2.3"
+release = "1.2.4"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/docs/quickstart.rst b/docs/quickstart.rst
@@ -3,12 +3,12 @@ Quickstart
 
 Make sure you have installed DrEvalPy and its dependencies (see `Installation <./installation.html>`_).
 
-To make sure the pipeline runs, you can use the fast models NaiveDrugMeanPredictor and NaivePredictor on the Toy_Data
+To make sure the pipeline runs, you can use the fast models NaiveDrugMeanPredictor and NaivePredictor on the TOYv1 (subset of CTRPv2) or TOYv2 (subset of GDSC2)
 dataset with the LPO test mode.
 
 .. code-block:: bash
 
-    python run_suite.py --run_id my_first_run --models NaiveDrugMeanPredictor --baselines NaivePredictor --dataset Toy_Data --test_mode LPO
+    python run_suite.py --run_id my_first_run --models NaiveDrugMeanPredictor --baselines NaivePredictor --dataset TOYv1 --test_mode LPO
 
 This will train the two baseline models on a subset of gene expression features and drug fingerprint features to
 predict IC50 values of the GDSC1 database. It will evaluate in "LPO" which is the leave-pairs-out splitting strategy

diff --git a/docs/usage.rst b/docs/usage.rst
@@ -156,14 +156,21 @@ We provide commonly used datasets to evaluate your model on (GDSC1, GDSC2, CCLE,
 +-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
 | Dataset Name      | Number of Drugs | Number of Cell Lines| Description                                                                                                           |
 +===================+=================+=====================+=======================================================================================================================+
-| GDSC1             | 345             | 987                 | The Genomics of Drug Sensitivity in Cancer (GDSC) dataset version 1.                                                  |
+| GDSC1             | 378             | 970                 | The Genomics of Drug Sensitivity in Cancer (GDSC) dataset version 1.                                                  |
 +-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
-| GDSC2             | 192             | 809                 | The Genomics of Drug Sensitivity in Cancer (GDSC) dataset version 2.                                                  |
+| GDSC2             | 287             | 969                 | The Genomics of Drug Sensitivity in Cancer (GDSC) dataset version 2.                                                  |
 +-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
-| CCLE              | 18              | 471                 | The Cancer Cell Line Encyclopedia (CCLE) dataset. The response data will soon be replaced with the data from CTRPv2.  |
+| CCLE              | 24              | 503                 | The Cancer Cell Line Encyclopedia (CCLE) dataset.                                                                     |
 +-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
-| Toy_Data          | 40              | 98                  | A toy dataset for testing purposes.                                                                                   |
+| CTRPv1            | 354             | 243                 | The Cancer Therapeutics Response Portal (CTRP) dataset version 1.                                                     |
 +-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
+| CTRPv2            | 546             | 886                 | The Cancer Therapeutics Response Portal (CTRP) dataset version 2.                                                     |
++-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
+| TOYv1             | 36              | 90                  | A toy dataset for testing purposes subsetted from CTRPv2.                                                             |
++-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
+| TOYv2             | 36              | 90                  | A second toy dataset for cross study testing purposes. 80 cell lines and 32 drugs overlap TOYv1.                      |
++-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
+
 
 If using the ``--curve_curator`` option with these datasets, the desired measure provided with the ``--measure`` option is appended with "_curvecurator", e.g. "IC50_curvecurator".
 In the provided datasets, these are the measures calculated with the same fitting procedure using CurveCurator. To use the measures reported from the original publications of the

diff --git a/drevalpy/datasets/dataset.py b/drevalpy/datasets/dataset.py
@@ -22,6 +22,7 @@
 import numpy as np
 import pandas as pd
 from sklearn.base import TransformerMixin
+from sklearn.feature_selection import VarianceThreshold
 from sklearn.model_selection import GroupKFold, train_test_split
 
 from ..pipeline_function import pipeline_function
@@ -1002,6 +1003,9 @@ def fit_transform_features(self, train_ids: np.ndarray, transformer: Transformer
         # Collect all features of the view for fitting the scaler
         train_features = np.vstack([self.features[identifier][view] for identifier in train_ids])
         transformer.fit(train_features)
+        if isinstance(transformer, VarianceThreshold):
+            mask = transformer.get_support()
+            self.meta_info[view] = self.meta_info[view][mask]
 
         # Apply transformation and scaling to each feature vector
         for identifier in self.features:

diff --git a/drevalpy/datasets/loader.py b/drevalpy/datasets/loader.py
@@ -23,7 +23,7 @@ def load_gdsc1(
 
     :param path_data: Path to the dataset.
     :param file_name: File name of the dataset.
-    :param measure: The name of the column containing the measure to predict, default = "LN_IC50"
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
 
     :param dataset_name: Name of the dataset.
     :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
@@ -49,7 +49,7 @@ def load_gdsc2(path_data: str = "data", measure: str = "LN_IC50_curvecurator", f
 
     :param path_data: Path to the dataset.
     :param file_name: File name of the dataset.
-    :param measure: The name of the column containing the measure to predict, default = "LN_IC50"
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
 
     :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
     """
@@ -64,7 +64,7 @@ def load_ccle(
 
     :param path_data: Path to the dataset.
     :param file_name: File name of the dataset.
-    :param measure: The name of the column containing the measure to predict, default = "LN_IC50"
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
 
     :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
     """
@@ -84,17 +84,19 @@ def load_ccle(
     )
 
 
-def load_toy(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
+def _load_toy(
+    path_data: str = "data", measure: str = "LN_IC50_curvecurator", dataset_name="TOYv1"
+) -> DrugResponseDataset:
     """
-    Loads small Toy dataset, subsampled from GDSC1.
+    Loads small Toy dataset, subsampled from CTRPv2 or GDSC2.
 
     :param path_data: Path to the dataset.
-    :param measure: The name of the column containing the measure to predict, default = "response"
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
+    :param dataset_name: Name of the dataset. Either "TOYv1" or "TOYv2".
 
     :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
     """
-    dataset_name = "Toy_Data"
-    path = os.path.join(path_data, dataset_name, "toy_data.csv")
+    path = os.path.join(path_data, dataset_name, f"{dataset_name}.csv")
     if not os.path.exists(path):
         download_dataset(dataset_name, path_data, redownload=True)
     response_data = pd.read_csv(path, dtype={"pubchem_id": str})
@@ -107,13 +109,37 @@ def load_toy(path_data: str = "data", measure: str = "LN_IC50_curvecurator") ->
     )
 
 
+def load_toyv1(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
+    """
+    Loads small Toy dataset, subsampled from CTRPv2.
+
+    :param path_data: Path to the dataset.
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
+
+    :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
+    """
+    return _load_toy(path_data, measure, "TOYv1")
+
+
+def load_toyv2(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
+    """
+    Loads small Toy dataset, subsampled from GDSC2. Can be used to test cross study prediction.
+
+    :param path_data: Path to the dataset.
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
+
+    :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
+    """
+    return _load_toy(path_data, measure, "TOYv2")
+
+
 def _load_ctrpv(version: str, path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
     """
     Load CTRPv1 dataset.
 
     :param version: The version of the CTRP dataset to load.
     :param path_data: Path to location of CTRPv1 dataset
-    :param measure: The name of the column containing the measure to predict, default = "response"
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
 
     :return: DrugResponseDataset containing response, cell line IDs, and drug IDs
     """
@@ -171,7 +197,8 @@ def load_custom(path_data: str | Path, measure: str = "response") -> DrugRespons
     "GDSC1": load_gdsc1,
     "GDSC2": load_gdsc2,
     "CCLE": load_ccle,
-    "Toy_Data": load_toy,
+    "TOYv1": load_toyv1,
+    "TOYv2": load_toyv2,
     "CTRPv1": load_ctrpv1,
     "CTRPv2": load_ctrpv2,
 }
@@ -184,7 +211,7 @@ def load_dataset(
     """
     Load a dataset based on the dataset name.
 
-    :param dataset_name: The name of the dataset to load. Can be one of ('GDSC1', 'GDSC2', 'CCLE', or 'Toy_Data')
+    :param dataset_name: The name of the dataset to load. Can be one of ('GDSC1', 'GDSC2', 'CCLE', 'TOYv1', or 'TOYv2')
         to download provided datasets, or any other name to allow for custom datasets.
     :param path_data: The parent path in which custom or downloaded datasets should be located, or in which raw
         viability data is to be found for fitting with CurveCurator (see param curve_curator for details).

diff --git a/drevalpy/datasets/utils.py b/drevalpy/datasets/utils.py
@@ -21,7 +21,7 @@ def download_dataset(
     """
     Download the latets dataset from Zenodo.
 
-    :param dataset_name: dataset name, e.g., "GDSC1", "GDSC2", "CCLE" or "Toy_Data"
+    :param dataset_name: dataset name, from "GDSC1", "GDSC2", "CCLE", "CTRPv1", "CTRPv2", "TOYv1", "TOYv2"
     :param data_path: where to save the data
     :param redownload: whether to redownload the data
     :raises HTTPError: if the download fails

diff --git a/drevalpy/models/DIPK/dipk.py b/drevalpy/models/DIPK/dipk.py
@@ -20,7 +20,7 @@
 
 from drevalpy.datasets.dataset import DrugResponseDataset, FeatureDataset
 from drevalpy.models.drp_model import DRPModel
-from drevalpy.models.utils import load_and_reduce_gene_features
+from drevalpy.models.utils import load_and_select_gene_features
 
 from .data_utils import CollateFn, DIPKDataset, get_data, load_bionic_features
 from .gene_expression_encoder import GeneExpressionEncoder, encode_gene_expression, train_gene_expession_autoencoder
@@ -263,13 +263,28 @@ def predict(
         :param cell_line_input: input data associated with the cell line
         :param drug_input: input data associated with the drug
         :return: predicted response values
-        :raises ValueError: if drug_input is None or if the model is not initialized
+        :raises ValueError: if drug_input is None or if the model is not initialized or
+            if the gene expression encoder is not initialized
         """
         if drug_input is None:
             raise ValueError("DIPK model requires drug features.")
         if not isinstance(self.model, Predictor):
             raise ValueError("DIPK model not initialized.")
 
+        # Encode gene expression data if this has not been done yet (e.g., for cross-study predictions)
+        if self.gene_expression_encoder is None:
+            raise ValueError("Gene expression encoder is not initialized.")
+        random_cell_line = next(iter(cell_line_input.features.keys()))
+        if (
+            len(cell_line_input.features[random_cell_line]["gene_expression"])
+            != self.gene_expression_encoder.latent_dim
+        ):
+            print("Encoding gene expression data for cross study prediction")
+            cell_line_input.apply(
+                lambda x: encode_gene_expression(x, self.gene_expression_encoder),  # type: ignore[arg-type]
+                view="gene_expression",
+            )  # type: ignore[arg-type]
+
         # Load data
         collate = CollateFn(train=False)
         test_samples = get_data(
@@ -310,9 +325,11 @@ def load_cell_line_features(self, data_path: str, dataset_name: str) -> FeatureD
         :param dataset_name: path to the dataset
         :returns: cell line features
         """
-        gene_expression = load_and_reduce_gene_features(
+        # we use the intersection of all genes that are present
+        # in the gene expression features of all datasets
+        gene_expression = load_and_select_gene_features(
             feature_type="gene_expression",
-            gene_list=None,
+            gene_list="gene_expression_intersection",
             data_path=data_path,
             dataset_name=dataset_name,
         )

diff --git a/drevalpy/models/DIPK/gene_expression_encoder.py b/drevalpy/models/DIPK/gene_expression_encoder.py
@@ -11,9 +11,6 @@
 from torch.nn import functional
 from torch.utils.data import DataLoader, Dataset
 
-ldim = 512
-hdim = [2048, 1024]
-
 
 class GeneExpressionEncoder(nn.Module):
     """Gene expression encoder.
@@ -22,7 +19,7 @@ class GeneExpressionEncoder(nn.Module):
     DIPK model https://github.com/user15632/DIPK.
     """
 
-    def __init__(self, input_dim, latent_dim=ldim, h_dims=None, drop_out_rate=0.3):
+    def __init__(self, input_dim, latent_dim=512, h_dims=None, drop_out_rate=0.3):
         """Initialize the gene expression encoder.
 
         :param input_dim: input dimension
@@ -32,7 +29,7 @@ def __init__(self, input_dim, latent_dim=ldim, h_dims=None, drop_out_rate=0.3):
         """
         super().__init__()
         if h_dims is None:
-            h_dims = hdim
+            h_dims = [2048, 1024]
         hidden_dims = deepcopy(h_dims)
         hidden_dims.insert(0, input_dim)
         modules = []
@@ -47,6 +44,7 @@ def __init__(self, input_dim, latent_dim=ldim, h_dims=None, drop_out_rate=0.3):
             )
         self.encoder = nn.Sequential(*modules)
         self.bottleneck = nn.Linear(hidden_dims[-1], latent_dim)
+        self.latent_dim = latent_dim
 
     def forward(self, input):
         """Forward pass of the gene expression encoder.
@@ -62,7 +60,7 @@ def forward(self, input):
 class GeneExpressionDecoder(nn.Module):
     """Gene expression decoder."""
 
-    def __init__(self, input_dim, latent_dim=ldim, h_dims=None, drop_out_rate=0.3):
+    def __init__(self, input_dim, latent_dim=512, h_dims=None, drop_out_rate=0.3):
         """Initialize the gene expression decoder.
 
         :param input_dim: input dimension
@@ -72,7 +70,7 @@ def __init__(self, input_dim, latent_dim=ldim, h_dims=None, drop_out_rate=0.3):
         """
         super().__init__()
         if h_dims is None:
-            h_dims = hdim
+            h_dims = [2048, 1024]
         hidden_dims = deepcopy(h_dims)
         hidden_dims.insert(0, input_dim)
         self.decoder_input = nn.Linear(latent_dim, hidden_dims[-1])