daisybio
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 1 deletion b/‎.gitignore‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎docs/quickstart.rst‎
Lines changed: 2 additions & 2 deletions b/‎docs/quickstart.rst‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎docs/usage.rst‎
Lines changed: 11 additions & 4 deletions b/‎docs/usage.rst‎
Lines changed: 11 additions & 4 deletions
diff --git a/‎drevalpy/datasets/loader.py‎
Lines changed: 38 additions & 11 deletions b/‎drevalpy/datasets/loader.py‎
Lines changed: 38 additions & 11 deletions
diff --git a/‎drevalpy/datasets/utils.py‎
Lines changed: 1 addition & 1 deletion b/‎drevalpy/datasets/utils.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎drevalpy/models/DIPK/dipk.py‎
Lines changed: 13 additions & 1 deletion b/‎drevalpy/models/DIPK/dipk.py‎
Lines changed: 13 additions & 1 deletion
diff --git a/‎drevalpy/models/DIPK/gene_expression_encoder.py‎
Lines changed: 5 additions & 7 deletions b/‎drevalpy/models/DIPK/gene_expression_encoder.py‎
Lines changed: 5 additions & 7 deletions
diff --git a/‎drevalpy/models/MOLIR/molir.py‎
Lines changed: 13 additions & 1 deletion b/‎drevalpy/models/MOLIR/molir.py‎
Lines changed: 13 additions & 1 deletion
diff --git a/‎drevalpy/models/SuperFELTR/hyperparameters.yaml‎
Lines changed: 6 additions & 3 deletions b/‎drevalpy/models/SuperFELTR/hyperparameters.yaml‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎drevalpy/models/SuperFELTR/superfeltr.py‎
Lines changed: 9 additions & 0 deletions b/‎drevalpy/models/SuperFELTR/superfeltr.py‎
Lines changed: 9 additions & 0 deletions
@@ -5,7 +5,8 @@ data/mapping
 data/GDSC1
 data/GDSC2
 data/CCLE
-data/Toy_Data
+data/TOYv1
+data/TOYv2
 data/CTRPv1
 data/CTRPv2
 
 
@@ -3,12 +3,12 @@ Quickstart
 
 Make sure you have installed DrEvalPy and its dependencies (see `Installation <./installation.html>`_).
 
-To make sure the pipeline runs, you can use the fast models NaiveDrugMeanPredictor and NaivePredictor on the Toy_Data
+To make sure the pipeline runs, you can use the fast models NaiveDrugMeanPredictor and NaivePredictor on the TOYv1 (subset of CTRPv2) or TOYv2 (subset of GDSC2)
 dataset with the LPO test mode.
 
 .. code-block:: bash
 
-    python run_suite.py --run_id my_first_run --models NaiveDrugMeanPredictor --baselines NaivePredictor --dataset Toy_Data --test_mode LPO
+    python run_suite.py --run_id my_first_run --models NaiveDrugMeanPredictor --baselines NaivePredictor --dataset TOYv1 --test_mode LPO
 
 This will train the two baseline models on a subset of gene expression features and drug fingerprint features to
 predict IC50 values of the TOYv1 dataset (a subset of CTRPv2). It will evaluate in "LPO" which is the leave-pairs-out splitting strategy
 
@@ -156,14 +156,21 @@ We provide commonly used datasets to evaluate your model on (GDSC1, GDSC2, CCLE,
 +-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
 | Dataset Name      | Number of Drugs | Number of Cell Lines| Description                                                                                                           |
 +===================+=================+=====================+=======================================================================================================================+
-| GDSC1             | 345             | 987                 | The Genomics of Drug Sensitivity in Cancer (GDSC) dataset version 1.                                                  |
+| GDSC1             | 378             | 970                 | The Genomics of Drug Sensitivity in Cancer (GDSC) dataset version 1.                                                  |
 +-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
-| GDSC2             | 192             | 809                 | The Genomics of Drug Sensitivity in Cancer (GDSC) dataset version 2.                                                  |
+| GDSC2             | 287             | 969                 | The Genomics of Drug Sensitivity in Cancer (GDSC) dataset version 2.                                                  |
 +-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
-| CCLE              | 18              | 471                 | The Cancer Cell Line Encyclopedia (CCLE) dataset. The response data will soon be replaced with the data from CTRPv2.  |
+| CCLE              | 24              | 503                 | The Cancer Cell Line Encyclopedia (CCLE) dataset.                                                                     |
 +-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
-| Toy_Data          | 40              | 98                  | A toy dataset for testing purposes.                                                                                   |
+| CTRPv1            | 354             | 243                 | The Cancer Therapeutics Response Portal (CTRP) dataset version 1.                                                     |
 +-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
+| CTRPv2            | 546             | 886                 | The Cancer Therapeutics Response Portal (CTRP) dataset version 2.                                                     |
++-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
+| TOYv1             | 36              | 90                  | A toy dataset for testing purposes subsetted from CTRPv2.                                                             |
++-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
+| TOYv2             | 36              | 90                  | A second toy dataset for cross study testing purposes. 80 cell lines and 32 drugs overlap TOYv1.                      |
++-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
+
 
 If using the ``--curve_curator`` option with these datasets, the desired measure provided with the ``--measure`` option is appended with "_curvecurator", e.g. "IC50_curvecurator".
 In the provided datasets, these are the measures calculated with the same fitting procedure using CurveCurator. To use the measures reported from the original publications of the
 
@@ -23,7 +23,7 @@ def load_gdsc1(
 
     :param path_data: Path to the dataset.
     :param file_name: File name of the dataset.
-    :param measure: The name of the column containing the measure to predict, default = "LN_IC50"
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
 
     :param dataset_name: Name of the dataset.
     :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
@@ -49,7 +49,7 @@ def load_gdsc2(path_data: str = "data", measure: str = "LN_IC50_curvecurator", f
 
     :param path_data: Path to the dataset.
     :param file_name: File name of the dataset.
-    :param measure: The name of the column containing the measure to predict, default = "LN_IC50"
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
 
     :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
     """
@@ -64,7 +64,7 @@ def load_ccle(
 
     :param path_data: Path to the dataset.
     :param file_name: File name of the dataset.
-    :param measure: The name of the column containing the measure to predict, default = "LN_IC50"
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
 
     :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
     """
@@ -84,17 +84,19 @@ def load_ccle(
     )
 
 
-def load_toy(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
+def _load_toy(
+    path_data: str = "data", measure: str = "LN_IC50_curvecurator", dataset_name="TOYv1"
+) -> DrugResponseDataset:
     """
-    Loads small Toy dataset, subsampled from GDSC1.
+    Loads small Toy dataset, subsampled from CTRPv2 or GDSC2.
 
     :param path_data: Path to the dataset.
-    :param measure: The name of the column containing the measure to predict, default = "response"
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
+    :param dataset_name: Name of the dataset. Either "TOYv1" or "TOYv2".
 
     :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
     """
-    dataset_name = "Toy_Data"
-    path = os.path.join(path_data, dataset_name, "toy_data.csv")
+    path = os.path.join(path_data, dataset_name, f"{dataset_name}.csv")
     if not os.path.exists(path):
         download_dataset(dataset_name, path_data, redownload=True)
     response_data = pd.read_csv(path, dtype={"pubchem_id": str})
@@ -107,13 +109,37 @@ def load_toy(path_data: str = "data", measure: str = "LN_IC50_curvecurator") ->
     )
 
 
+def load_toyv1(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
+    """
+    Loads small Toy dataset, subsampled from CTRPv2.
+
+    :param path_data: Path to the dataset.
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
+
+    :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
+    """
+    return _load_toy(path_data, measure, "TOYv1")
+
+
+def load_toyv2(path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
+    """
+    Loads small Toy dataset, subsampled from GDSC2. Can be used to test cross study prediction.
+
+    :param path_data: Path to the dataset.
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
+
+    :return: DrugResponseDataset containing response, cell line IDs, and drug IDs.
+    """
+    return _load_toy(path_data, measure, "TOYv2")
+
+
 def _load_ctrpv(version: str, path_data: str = "data", measure: str = "LN_IC50_curvecurator") -> DrugResponseDataset:
     """
     Load the CTRP dataset of the given version.
 
     :param version: The version of the CTRP dataset to load.
     :param path_data: Path to location of the CTRP dataset
-    :param measure: The name of the column containing the measure to predict, default = "response"
+    :param measure: The name of the column containing the measure to predict, default = "LN_IC50_curvecurator"
 
     :return: DrugResponseDataset containing response, cell line IDs, and drug IDs
     """
@@ -171,7 +197,8 @@ def load_custom(path_data: str | Path, measure: str = "response") -> DrugRespons
     "GDSC1": load_gdsc1,
     "GDSC2": load_gdsc2,
     "CCLE": load_ccle,
-    "Toy_Data": load_toy,
+    "TOYv1": load_toyv1,
+    "TOYv2": load_toyv2,
     "CTRPv1": load_ctrpv1,
     "CTRPv2": load_ctrpv2,
 }
@@ -184,7 +211,7 @@ def load_dataset(
     """
     Load a dataset based on the dataset name.
 
-    :param dataset_name: The name of the dataset to load. Can be one of ('GDSC1', 'GDSC2', 'CCLE', or 'Toy_Data')
+    :param dataset_name: The name of the dataset to load. Can be one of ('GDSC1', 'GDSC2', 'CCLE', 'TOYv1', or 'TOYv2')
         to download provided datasets, or any other name to allow for custom datasets.
     :param path_data: The parent path in which custom or downloaded datasets should be located, or in which raw
         viability data is to be found for fitting with CurveCurator (see param curve_curator for details).
 
@@ -21,7 +21,7 @@ def download_dataset(
     """
     Download the latest dataset from Zenodo.
 
-    :param dataset_name: dataset name, e.g., "GDSC1", "GDSC2", "CCLE" or "Toy_Data"
+    :param dataset_name: dataset name, from "GDSC1", "GDSC2", "CCLE", "CTRPv1", "CTRPv2", "TOYv1", "TOYv2"
     :param data_path: where to save the data
     :param redownload: whether to redownload the data
     :raises HTTPError: if the download fails
 
@@ -270,6 +270,18 @@ def predict(
         if not isinstance(self.model, Predictor):
             raise ValueError("DIPK model not initialized.")
 
+        # Encode gene expression data if this has not been done yet (e.g., for cross-study predictions)
+        random_cell_line = next(iter(cell_line_input.features.keys()))
+        if (
+            len(cell_line_input.features[random_cell_line]["gene_expression"])
+            != self.gene_expression_encoder.latent_dim
+        ):
+            print("Encoding gene expression data for cross study prediction")
+            cell_line_input.apply(
+                lambda x: encode_gene_expression(x, self.gene_expression_encoder),  # type: ignore[arg-type]
+                view="gene_expression",
+            )  # type: ignore[arg-type]
+
         # Load data
         collate = CollateFn(train=False)
         test_samples = get_data(
@@ -314,7 +326,7 @@ def load_cell_line_features(self, data_path: str, dataset_name: str) -> FeatureD
         # in the gene expression features of all datasets
         gene_expression = load_and_reduce_gene_features(
             feature_type="gene_expression",
-            gene_list="gene_expression_genes_intercept_all_datasets" if dataset_name != "Toy_Data" else None,
+            gene_list="gene_expression_intersection",
             data_path=data_path,
             dataset_name=dataset_name,
         )
 
@@ -11,9 +11,6 @@
 from torch.nn import functional
 from torch.utils.data import DataLoader, Dataset
 
-ldim = 512
-hdim = [2048, 1024]
-
 
 class GeneExpressionEncoder(nn.Module):
     """Gene expression encoder.
@@ -22,7 +19,7 @@ class GeneExpressionEncoder(nn.Module):
     DIPK model https://github.com/user15632/DIPK.
     """
 
-    def __init__(self, input_dim, latent_dim=ldim, h_dims=None, drop_out_rate=0.3):
+    def __init__(self, input_dim, latent_dim=512, h_dims=None, drop_out_rate=0.3):
         """Initialize the gene expression encoder.
 
         :param input_dim: input dimension
@@ -32,7 +29,7 @@ def __init__(self, input_dim, latent_dim=ldim, h_dims=None, drop_out_rate=0.3):
         """
         super().__init__()
         if h_dims is None:
-            h_dims = hdim
+            h_dims = [2048, 1024]
         hidden_dims = deepcopy(h_dims)
         hidden_dims.insert(0, input_dim)
         modules = []
@@ -47,6 +44,7 @@ def __init__(self, input_dim, latent_dim=ldim, h_dims=None, drop_out_rate=0.3):
             )
         self.encoder = nn.Sequential(*modules)
         self.bottleneck = nn.Linear(hidden_dims[-1], latent_dim)
+        self.latent_dim = latent_dim
 
     def forward(self, input):
         """Forward pass of the gene expression encoder.
@@ -62,7 +60,7 @@ def forward(self, input):
 class GeneExpressionDecoder(nn.Module):
     """Gene expression decoder."""
 
-    def __init__(self, input_dim, latent_dim=ldim, h_dims=None, drop_out_rate=0.3):
+    def __init__(self, input_dim, latent_dim=512, h_dims=None, drop_out_rate=0.3):
         """Initialize the gene expression decoder.
 
         :param input_dim: input dimension
@@ -72,7 +70,7 @@ def __init__(self, input_dim, latent_dim=ldim, h_dims=None, drop_out_rate=0.3):
         """
         super().__init__()
         if h_dims is None:
-            h_dims = hdim
+            h_dims = [2048, 1024]
         hidden_dims = deepcopy(h_dims)
         hidden_dims.insert(0, input_dim)
         self.decoder_input = nn.Linear(latent_dim, hidden_dims[-1])
 
@@ -143,7 +143,15 @@ def predict(
         :param cell_line_input: cell line omics features
         :param drug_input: drug features, not needed
         :returns: Predicted drug response
+        :raises ValueError: If the model was not trained
         """
+        if (
+            (self.gene_expression_features is None)
+            or (self.mutations_features is None)
+            or (self.copy_number_variation_features is None)
+        ):
+            raise ValueError("MOLIR Model not trained, please train the model first.")
+
         input_data = self.get_feature_matrices(
             cell_line_ids=cell_line_ids,
             drug_ids=drug_ids,
@@ -156,6 +164,10 @@ def predict(
             input_data["copy_number_variation_gistic"],
         )
 
+        # Filter out features that were not present during training
+        # This is necessary because the feature order might have changed
+        # or more features are available
+        # impute missing features with zeros
         for key, features in {
             "gene_expression": self.gene_expression_features,
             "mutations": self.mutations_features,
@@ -199,7 +211,7 @@ def load_cell_line_features(self, data_path: str, dataset_name: str) -> FeatureD
             gene_lists={
                 "gene_expression": "gene_expression_intersection",
                 "mutations": "mutations_intersection",
-                "copy_number_variation_gistic": "copy_number_variation_intersection",
+                "copy_number_variation_gistic": "copy_number_variation_gistic_intersection",
             },
             omics=self.cell_line_views,
         )
 
@@ -11,21 +11,24 @@ SuperFELTR:
   expression_var_threshold:
     GDSC1: 0.1
     GDSC2: 0.1
-    Toy_Data: 0.03
+    TOYv1: 0.03
+    TOYv2: 0.03
     CCLE: 0.1
     CTRPv1: 0.1
     CTRPv2: 0.1
   mutation_var_threshold:
     GDSC1: 0.1
     GDSC2: 0.1
-    Toy_Data: 0.05
+    TOYv1: 0.05
+    TOYv2: 0.05
     CCLE: 0.1
     CTRPv1: 0.1
     CTRPv2: 0.1
   cnv_var_threshold:
     GDSC1: 0.7
     GDSC2: 0.7
-    Toy_Data: 0.6
+    TOYv1: 0.6
+    TOYv2: 0.6
     CCLE: 0.7
     CTRPv1: 0.7
     CTRPv2: 0.7
 
@@ -201,6 +201,13 @@ def predict(
         :returns: predicted drug response
         :raises ValueError: if drug_input is not None
         """
+        if (
+            self.gene_expression_features is None
+            or self.mutations_features is None
+            or self.copy_number_variation_features is None
+        ):
+            raise ValueError("Model was not trained, no features available.")
+
         if drug_input is not None:
             raise ValueError("SuperFELTR is a single drug model and does not require drug input.")
 
@@ -216,6 +223,8 @@ def predict(
             input_data["copy_number_variation_gistic"],
         )
 
+        # make cross study prediction possible by selecting only the features that were used during training
+        # missing features are imputed with zeros
         for key, features in {
             "gene_expression": self.gene_expression_features,
             "mutations": self.mutations_features,