daisybio
diff --git a/‎.github/workflows/build_package.yml‎
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/build_package.yml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎create_report.py‎
Lines changed: 4 additions & 2 deletions b/‎create_report.py‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎docs/conf.py‎
Lines changed: 2 additions & 2 deletions b/‎docs/conf.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎drevalpy/datasets/dataset.py‎
Lines changed: 0 additions & 4 deletions b/‎drevalpy/datasets/dataset.py‎
Lines changed: 0 additions & 4 deletions
diff --git a/‎drevalpy/models/DIPK/dipk.py‎
Lines changed: 4 additions & 2 deletions b/‎drevalpy/models/DIPK/dipk.py‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎drevalpy/models/MOLIR/molir.py‎
Lines changed: 13 additions & 38 deletions b/‎drevalpy/models/MOLIR/molir.py‎
Lines changed: 13 additions & 38 deletions
diff --git a/‎drevalpy/models/MOLIR/utils.py‎
Lines changed: 71 additions & 0 deletions b/‎drevalpy/models/MOLIR/utils.py‎
Lines changed: 71 additions & 0 deletions
diff --git a/‎drevalpy/models/SimpleNeuralNetwork/multiomics_neural_network.py‎
Lines changed: 0 additions & 2 deletions b/‎drevalpy/models/SimpleNeuralNetwork/multiomics_neural_network.py‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎drevalpy/models/SuperFELTR/hyperparameters.yaml‎
Lines changed: 0 additions & 24 deletions b/‎drevalpy/models/SuperFELTR/hyperparameters.yaml‎
Lines changed: 0 additions & 24 deletions
diff --git a/‎drevalpy/models/SuperFELTR/superfeltr.py‎
Lines changed: 15 additions & 39 deletions b/‎drevalpy/models/SuperFELTR/superfeltr.py‎
Lines changed: 15 additions & 39 deletions
@@ -22,7 +22,8 @@ jobs:
 
       - name: Install Poetry
         run: |
-          pip install poetry
+          pipx install poetry
+          pipx inject poetry poetry-plugin-export
           poetry --version
 
       - name: Build package
 
@@ -305,14 +305,16 @@ def draw_per_grouping_algorithm_plots(
         t_vs_p=true_vs_pred,
     )
     """
-    For debugging:
+    #For debugging:
     evaluation_results = pd.read_csv(
         f'results/{run_id}/evaluation_results.csv', index_col=0
     )
     evaluation_results_per_drug = pd.read_csv(
         f'results/{run_id}/evaluation_results_per_drug.csv', index_col=0
     )
-    evaluation_results_per_cell_line = None
+    evaluation_results_per_cell_line = pd.read_csv(
+        f'results/{run_id}/evaluation_results_per_cl.csv', index_col=0
+    )
     true_vs_pred = pd.read_csv(
         f'results/{run_id}/true_vs_pred.csv', index_col=0
     )
 
@@ -56,9 +56,9 @@
 # the built documents.
 #
 # The short X.Y version.
-version = "1.2.5"
+version = "1.2.6"
 # The full version, including alpha/beta/rc tags.
-release = "1.2.5"
+release = "1.2.6"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
 
@@ -22,7 +22,6 @@
 import numpy as np
 import pandas as pd
 from sklearn.base import TransformerMixin
-from sklearn.feature_selection import VarianceThreshold
 from sklearn.model_selection import GroupKFold, train_test_split
 
 from ..pipeline_function import pipeline_function
@@ -1003,9 +1002,6 @@ def fit_transform_features(self, train_ids: np.ndarray, transformer: Transformer
         # Collect all features of the view for fitting the scaler
         train_features = np.vstack([self.features[identifier][view] for identifier in train_ids])
         transformer.fit(train_features)
-        if isinstance(transformer, VarianceThreshold):
-            mask = transformer.get_support()
-            self.meta_info[view] = self.meta_info[view][mask]
 
         # Apply transformation and scaling to each feature vector
         for identifier in self.features:
 
@@ -313,8 +313,10 @@ def predict(
                     bionic=bionic_features,
                     molgnet_mask=molgnet_mask,
                 )
-                predictions += torch.squeeze(prediction).cpu().tolist()
-
+                if prediction.numel() > 1:
+                    predictions += torch.squeeze(prediction).cpu().tolist()
+                else:
+                    predictions += [prediction.item()]
         return np.array(predictions)
 
     def load_cell_line_features(self, data_path: str, dataset_name: str) -> FeatureDataset:
 
@@ -9,13 +9,12 @@
 from typing import Any
 
 import numpy as np
-from sklearn.feature_selection import VarianceThreshold
 from sklearn.preprocessing import StandardScaler
 
 from ...datasets.dataset import DrugResponseDataset, FeatureDataset
 from ..drp_model import DRPModel
 from ..utils import get_multiomics_feature_dataset
-from .utils import MOLIModel, get_dimensions_of_omics_data
+from .utils import MOLIModel, filter_and_sort_omics, get_dimensions_of_omics_data, select_features_for_view
 
 
 class MOLIR(DRPModel):
@@ -76,8 +75,9 @@ def train(
         """
         Initializes and trains the model.
 
-        First, the gene expression data is reduced using a variance threshold (0.05) and standardized. Then,
-        the model is initialized with the hyperparameters and the dimensions of the gene expression, mutation and
+        First, the gene expression data is reduced to its 1000 most variable genes and standardized. Previously, a
+        variance threshold (0.05) was used for the reduction; it was replaced to avoid issues with the threshold.
+        Then, the model is initialized with the hyperparameters and the dimensions of the gene expression, mutation and
         copy number variation data. If there is no training data, the model is set to None (and predictions will be
         skipped as well). If there is not enough training data, the predictions will be made on the randomly
         initialized model.
@@ -89,11 +89,10 @@ def train(
         :param model_checkpoint_dir: directory to save the model checkpoints
         """
         if len(output) > 0:
-            selector_gex = VarianceThreshold(0.05)
-            cell_line_input.fit_transform_features(
-                train_ids=np.unique(output.cell_line_ids),
-                transformer=selector_gex,
+            cell_line_input = select_features_for_view(
                 view="gene_expression",
+                cell_line_input=cell_line_input,
+                output=output,
             )
             self.gene_expression_features = cell_line_input.meta_info["gene_expression"]
             self.mutations_features = cell_line_input.meta_info["mutations"]
@@ -145,6 +144,9 @@ def predict(
         :returns: Predicted drug response
         :raises ValueError: If the model was not trained
         """
+        if self.model is None:
+            print("No model trained, will predict NA.")
+            return np.array([np.nan] * len(cell_line_ids))
         if (
             (self.gene_expression_features is None)
             or (self.mutations_features is None)
@@ -164,37 +166,10 @@ def predict(
             input_data["copy_number_variation_gistic"],
         )
 
-        # Filter out features that were not present during training
-        # This is necessary because the feature order might have changed
-        # or more features are available
-        # impute missing features with zeros
-        for key, features in {
-            "gene_expression": self.gene_expression_features,
-            "mutations": self.mutations_features,
-            "copy_number_variation_gistic": self.copy_number_variation_features,
-        }.items():
-            if key == "gene_expression":
-                values = gene_expression
-            elif key == "mutations":
-                values = mutations
-            else:
-                values = cnvs
-            if values.shape[1] != len(features):
-                new_value = np.zeros((values.shape[0], len(features)))
-                lookup_table = {feature: i for i, feature in enumerate(cell_line_input.meta_info[key])}
-                for i, feature in enumerate(features):
-                    if feature in lookup_table:
-                        new_value[:, i] = values[:, lookup_table[feature]]
-                if key == "gene_expression":
-                    gene_expression = new_value
-                elif key == "mutations":
-                    mutations = new_value
-                else:
-                    cnvs = new_value
+        (gene_expression, mutations, cnv) = filter_and_sort_omics(
+            model=self, gene_expression=gene_expression, mutations=mutations, cnvs=cnvs, cell_line_input=cell_line_input
+        )
 
-        if self.model is None:
-            print("No model trained, will predict NA.")
-            return np.array([np.nan] * len(cell_line_ids))
         return self.model.predict(gene_expression, mutations, cnvs)
 
     def load_cell_line_features(self, data_path: str, dataset_name: str) -> FeatureDataset:
 
@@ -18,6 +18,7 @@
 from torch.utils.data import DataLoader, Dataset
 
 from drevalpy.datasets.dataset import DrugResponseDataset, FeatureDataset
+from drevalpy.models.drp_model import DRPModel
 
 
 class RegressionDataset(Dataset):
@@ -205,6 +206,76 @@ def get_dimensions_of_omics_data(cell_line_input: FeatureDataset) -> tuple[int,
     return dim_gex, dim_mut, dim_cnv
 
 
+def filter_and_sort_omics(
+    model: DRPModel,  # MOLIR or SuperFELTR
+    gene_expression: np.ndarray,
+    mutations: np.ndarray,
+    cnvs: np.ndarray,
+    cell_line_input: FeatureDataset,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Aligns the omics matrices used for prediction with the features the model was trained on.
+
+    For each view, the columns are brought into the order of the training features. Features that were present
+    during training but are missing in the new data are imputed with zeros; features unknown to the model are
+    dropped. A view is returned unchanged only if its feature names already match the training features exactly,
+    so a changed feature order is also corrected when the number of features happens to be identical
+    (cross-study setting).
+
+    :param model: either MOLIR or SuperFELTR self; has to provide gene_expression_features, mutations_features,
+        and copy_number_variation_features
+    :param gene_expression: new gene expression data from which to predict
+    :param mutations: new mutation data from which to predict
+    :param cnvs: new copy number variation data from which to predict
+    :param cell_line_input: needed for meta information (feature names of the new data)
+    :return: filtered and sorted gene expression, mutations, and copy number variation data
+    """
+    views = {
+        "gene_expression": (gene_expression, model.gene_expression_features),  # type: ignore
+        "mutations": (mutations, model.mutations_features),  # type: ignore
+        "copy_number_variation_gistic": (cnvs, model.copy_number_variation_features),  # type: ignore
+    }
+    aligned = {}
+    for key, (values, features) in views.items():
+        available = cell_line_input.meta_info[key]
+        # Comparing only the number of columns would miss reordered or exchanged features of equal count.
+        if values.shape[1] == len(features) and np.array_equal(available, features):
+            aligned[key] = values
+            continue
+        new_value = np.zeros((values.shape[0], len(features)))
+        lookup_table = {feature: i for i, feature in enumerate(available)}
+        for i, feature in enumerate(features):
+            # features missing in the new data keep their zero column
+            if feature in lookup_table:
+                new_value[:, i] = values[:, lookup_table[feature]]
+        aligned[key] = new_value
+    return aligned["gene_expression"], aligned["mutations"], aligned["copy_number_variation_gistic"]
+
+
+def select_features_for_view(
+    view: str,  # "gene_expression", "mutations", or "copy_number_variation_gistic"
+    cell_line_input: FeatureDataset,
+    output: DrugResponseDataset,
+    n_features: int = 1000,
+) -> FeatureDataset:
+    """
+    Selects the features with the highest variance for one omics view.
+
+    The variances are computed only on the cell lines that occur in the training data. The selection is then
+    applied in place to the feature vectors of all cell lines in cell_line_input and to the meta information
+    (feature names) of the view. If the view has fewer than n_features features, all of them are kept.
+
+    :param view: either "gene_expression", "mutations", or "copy_number_variation_gistic"
+    :param cell_line_input: the omics data of the cell lines; it is modified in place
+    :param output: the training dataset containing the response output
+    :param n_features: maximum number of features to keep, defaults to 1000
+    :return: the modified cell line input with the n_features features with the highest variance
+    :raises ValueError: if n_features is smaller than 1
+    """
+    if n_features < 1:
+        raise ValueError(f"n_features must be at least 1, got {n_features}.")
+    train_ids = np.unique(output.cell_line_ids)
+    train_features = np.vstack([cell_line_input.features[identifier][view] for identifier in train_ids])
+    variances = np.var(train_features, axis=0)
+    # A boolean mask (instead of the sorted indices) keeps the selected features in their original column order.
+    mask = np.zeros(len(variances), dtype=bool)
+    mask[np.argsort(variances)[::-1][:n_features]] = True
+    cell_line_input.meta_info[view] = cell_line_input.meta_info[view][mask]
+    for identifier in cell_line_input.features:
+        cell_line_input.features[identifier][view] = cell_line_input.features[identifier][view][mask]
+    return cell_line_input
+
+
 class MOLIEncoder(nn.Module):
     """
     Encoders of the MOLIR model, which is identical to the encoders of the original MOLI model.
 
@@ -34,7 +34,6 @@ def __init__(self):
         self.model = None
         self.hyperparameters = None
         self.pca = None
-        self.methylation_features = None
 
     @classmethod
     def get_model_name(cls) -> str:
@@ -83,7 +82,6 @@ def train(
             [cell_line_input.features[id_]["methylation"] for id_ in np.unique(output.cell_line_ids)],
             axis=0,
         )
-        self.methylation_features = cell_line_input.meta_info["methylation"]
 
         self.pca.n_components = min(self.pca.n_components, len(unique_methylation))
         self.pca = self.pca.fit(unique_methylation)
 
@@ -8,29 +8,5 @@ SuperFELTR:
   out_dim_mutation_encoder: 32
   out_dim_cnv_encoder: 64
   epochs: 30
-  expression_var_threshold:
-    GDSC1: 0.1
-    GDSC2: 0.1
-    TOYv1: 0.03
-    TOYv2: 0.03
-    CCLE: 0.1
-    CTRPv1: 0.1
-    CTRPv2: 0.1
-  mutation_var_threshold:
-    GDSC1: 0.1
-    GDSC2: 0.1
-    TOYv1: 0.05
-    TOYv2: 0.05
-    CCLE: 0.1
-    CTRPv1: 0.1
-    CTRPv2: 0.1
-  cnv_var_threshold:
-    GDSC1: 0.7
-    GDSC2: 0.7
-    TOYv1: 0.6
-    TOYv2: 0.6
-    CCLE: 0.7
-    CTRPv1: 0.7
-    CTRPv2: 0.7
   margin: 1.0
   learning_rate: 0.01
@@ -19,11 +19,10 @@
 
 import numpy as np
 import pytorch_lightning as pl
-from sklearn.feature_selection import VarianceThreshold
 
 from ...datasets.dataset import DrugResponseDataset, FeatureDataset
 from ..drp_model import DRPModel
-from ..MOLIR.utils import get_dimensions_of_omics_data, make_ranges
+from ..MOLIR.utils import filter_and_sort_omics, get_dimensions_of_omics_data, make_ranges, select_features_for_view
 from ..utils import get_multiomics_feature_dataset
 from .utils import SuperFELTEncoder, SuperFELTRegressor, train_superfeltr_model
 
@@ -201,6 +200,9 @@ def predict(
         :returns: predicted drug response
         :raises ValueError: if drug_input is not None
         """
+        if self.expr_encoder is None or self.mut_encoder is None or self.cnv_encoder is None or self.regressor is None:
+            print("No training data was available, predicting NA")
+            return np.array([np.nan] * len(cell_line_ids))
         if (
             self.gene_expression_features is None
             or self.mutations_features is None
@@ -223,35 +225,10 @@ def predict(
             input_data["copy_number_variation_gistic"],
         )
 
-        # make cross study prediction possible by selecting only the features that were used during training
-        # missing features are imputed with zeros
-        for key, features in {
-            "gene_expression": self.gene_expression_features,
-            "mutations": self.mutations_features,
-            "copy_number_variation_gistic": self.copy_number_variation_features,
-        }.items():
-            if key == "gene_expression":
-                values = gene_expression
-            elif key == "mutations":
-                values = mutations
-            else:
-                values = cnvs
-            if values.shape[1] != len(features):
-                new_value = np.zeros((values.shape[0], len(features)))
-                lookup_table = {feature: i for i, feature in enumerate(cell_line_input.meta_info[key])}
-                for i, feature in enumerate(features):
-                    if feature in lookup_table:
-                        new_value[:, i] = values[:, lookup_table[feature]]
-                if key == "gene_expression":
-                    gene_expression = new_value
-                elif key == "mutations":
-                    mutations = new_value
-                else:
-                    cnvs = new_value
+        (gene_expression, mutations, cnvs) = filter_and_sort_omics(
+            model=self, gene_expression=gene_expression, mutations=mutations, cnvs=cnvs, cell_line_input=cell_line_input
+        )
 
-        if self.expr_encoder is None or self.mut_encoder is None or self.cnv_encoder is None or self.regressor is None:
-            print("No training data was available, predicting NA")
-            return np.array([np.nan] * len(cell_line_ids))
         if self.best_checkpoint is None:
             print("Not enough training data provided for SuperFELTR Regressor. Predicting with random initialization.")
             return self.regressor.predict(gene_expression, mutations, cnvs)
@@ -260,21 +237,20 @@ def predict(
 
     def _feature_selection(self, output: DrugResponseDataset, cell_line_input: FeatureDataset) -> FeatureDataset:
         """
-        Feature selection for all omics data using the predefined variance thresholds.
+        Feature selection for all omics data.
+
+        Originally, this was done with VarianceThreshold but as data can vary and hence the thresholds are not
+        universally applicable, we now changed it to select the top 1000 variable features for each omics data.
 
         :param output: training data associated with the response output
         :param cell_line_input: cell line omics features
         :returns: cell line omics features with selected features
         """
-        thresholds = {
-            "gene_expression": self.hyperparameters["expression_var_threshold"][output.dataset_name],
-            "mutations": self.hyperparameters["mutation_var_threshold"][output.dataset_name],
-            "copy_number_variation_gistic": self.hyperparameters["cnv_var_threshold"][output.dataset_name],
-        }
         for view in self.cell_line_views:
-            selector = VarianceThreshold(thresholds[view])
-            cell_line_input.fit_transform_features(
-                train_ids=np.unique(output.cell_line_ids), transformer=selector, view=view
+            cell_line_input = select_features_for_view(
+                view=view,
+                cell_line_input=cell_line_input,
+                output=output,
             )
         self.gene_expression_features = cell_line_input.meta_info["gene_expression"]
         self.mutations_features = cell_line_input.meta_info["mutations"]
Original file line number	Diff line number	Diff line change
`@@ -305,14 +305,16 @@ def draw_per_grouping_algorithm_plots(`
`305`	`305`	`t_vs_p=true_vs_pred,`
`306`	`306`	`)`
`307`	`307`	`"""`
`308`		`- For debugging:`
	`308`	`+ #For debugging:`
`309`	`309`	`evaluation_results = pd.read_csv(`
`310`	`310`	`f'results/{run_id}/evaluation_results.csv', index_col=0`
`311`	`311`	`)`
`312`	`312`	`evaluation_results_per_drug = pd.read_csv(`
`313`	`313`	`f'results/{run_id}/evaluation_results_per_drug.csv', index_col=0`
`314`	`314`	`)`
`315`		`- evaluation_results_per_cell_line = None`
	`315`	`+ evaluation_results_per_cell_line = pd.read_csv(`
	`316`	`+ f'results/{run_id}/evaluation_results_per_cl.csv', index_col=0`
	`317`	`+ )`
`316`	`318`	`true_vs_pred = pd.read_csv(`
`317`	`319`	`f'results/{run_id}/true_vs_pred.csv', index_col=0`
`318`	`320`	`)`