From d99bb17546f33d163fa45af821d2e21da82779c4 Mon Sep 17 00:00:00 2001
From: PascalIversen
Date: Sun, 1 Dec 2024 16:05:38 +0100
Subject: [PATCH 01/11] featuredataset from csv
---
drevalpy/datasets/dataset.py | 38 ++++++++++++++++++++++++++++--------
1 file changed, 30 insertions(+), 8 deletions(-)
diff --git a/drevalpy/datasets/dataset.py b/drevalpy/datasets/dataset.py
index 528e8f67..b9516616 100644
--- a/drevalpy/datasets/dataset.py
+++ b/drevalpy/datasets/dataset.py
@@ -17,7 +17,7 @@
import os
from abc import ABC, abstractmethod
from pathlib import Path
-from typing import Any, Callable
+from typing import Any, Callable, Optional
import networkx as nx
import numpy as np
@@ -728,17 +728,39 @@ class FeatureDataset(Dataset):
@classmethod
def from_csv(
- cls: type["FeatureDataset"], input_file: str | Path, dataset_name: str = "unknown"
- ) -> "FeatureDataset":
+ cls: type["FeatureDataset"],
+ path_to_csv: str | Path,
+ view_name: str = "unknown",
+ id_column: str = "id",
+ drop_columns: Optional[list[str]] = None,
+ ):
"""
Load a feature dataset from a csv file.
- This function creates a FeatureDataset from a provided input file in csv format.
- :param input_file: Path to the csv file containing the data to be loaded
- :param dataset_name: Optional name to associate the dataset with, default = "unknown"
- :raises NotImplementedError: This method is currently not implemented.
+ :param path_to_csv: path to the csv file containing the data to be loaded
+ :param view_name: name of the view (e.g. gene_expression)
+ :param id_column: name of the column containing the identifiers
+ :param drop_columns: list of columns to drop (e.g. other identifier columns)
+ :returns: FeatureDataset object containing data from provided csv file.
"""
- raise NotImplementedError
+ data = pd.read_csv(path_to_csv)
+ ids = data[id_column].values
+ data_features = data.drop(columns=[id_column] + (drop_columns or []))
+ features = {}
+ for identifier in ids:
+ features_for_instance = data_features.loc[data_features[id_column] == identifier].values
+
+ if len(features_for_instance) > 1:
+
+ features_for_instance = features_for_instance[0]
+
+ print(
+ f"{view_name} FeatureDataset.from_csv: Multiple features for identifier {identifier}. Using first."
+ )
+
+ features[identifier] = {view_name: features_for_instance}
+
+ return cls(features=features)
@property
def meta_info(self) -> dict[str, Any]:
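A minimal usage sketch of the classmethod introduced here (file and column names are illustrative, not from the repository; the id-column lookup in this version is corrected in the next patch):

    from drevalpy.datasets.dataset import FeatureDataset

    # One row per instance: an identifier column plus numeric feature columns.
    # Any extra identifier columns can be dropped via drop_columns.
    gene_expression = FeatureDataset.from_csv(
        "gene_expression.csv",
        view_name="gene_expression",
        id_column="cell_line_id",
        drop_columns=["cosmic_id"],
    )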
From f9a73c2a60238598827d5aba759162fb85f46067 Mon Sep 17 00:00:00 2001
From: PascalIversen
Date: Mon, 2 Dec 2024 10:16:13 +0100
Subject: [PATCH 02/11] save load
---
drevalpy/datasets/dataset.py | 59 ++++++++++++++++++-----------
tests/test_dataset.py | 73 +++++++++++++++++++++++++++++-------
2 files changed, 96 insertions(+), 36 deletions(-)
diff --git a/drevalpy/datasets/dataset.py b/drevalpy/datasets/dataset.py
index b9516616..c79c7515 100644
--- a/drevalpy/datasets/dataset.py
+++ b/drevalpy/datasets/dataset.py
@@ -730,12 +730,16 @@ class FeatureDataset(Dataset):
def from_csv(
cls: type["FeatureDataset"],
path_to_csv: str | Path,
- view_name: str = "unknown",
- id_column: str = "id",
+ id_column: str,
+ view_name: str,
drop_columns: Optional[list[str]] = None,
):
- """
- Load a feature dataset from a csv file.
+ """Load a one-view feature dataset from a csv file.
+
+ The rows of the csv file represent the instances (cell lines or drugs),
+ the columns represent the features. A column named id_column contains the identifiers of the instances.
+ All unrelated columns (e.g. other id columns) should be passed via drop_columns;
+ they will be removed from the dataset.
:param path_to_csv: path to the csv file containing the data to be loaded
:param view_name: name of the view (e.g. gene_expression)
@@ -745,22 +749,42 @@ def from_csv(
"""
data = pd.read_csv(path_to_csv)
ids = data[id_column].values
- data_features = data.drop(columns=[id_column] + (drop_columns or []))
+ data_features = data.drop(columns=(drop_columns or []))
+ data_features = data_features.set_index(id_column)
+ # remove duplicate feature rows (rows with the same index)
+ data_features = data_features[~data_features.index.duplicated(keep="first")]
features = {}
+
for identifier in ids:
- features_for_instance = data_features.loc[data_features[id_column] == identifier].values
+ features_for_instance = data_features.loc[identifier].values
+ features[identifier] = {view_name: features_for_instance}
- if len(features_for_instance) > 1:
+ return cls(features=features)
- features_for_instance = features_for_instance[0]
+ def to_csv(self, path: str | Path, id_column: str, view_name: str, **kwargs):
+ """
+ Save the feature dataset to a CSV file.
- print(
- f"{view_name} FeatureDataset.from_csv: Multiple features for identifier {identifier}. Using first."
- )
+ :param path: Path to the CSV file.
+ :param id_column: Name of the column containing the identifiers.
+ :param view_name: Name of the view (e.g., gene_expression).
+ :param kwargs: Additional arguments for pandas to_csv function.
- features[identifier] = {view_name: features_for_instance}
+ :raises ValueError: If the view is not found for an identifier.
+ """
+ data = []
+ for identifier, feature_dict in self.features.items():
+ # Get the feature vector for the specified view
+ if view_name in feature_dict:
+ row = {id_column: identifier}
+ row.update({f"feature_{i}": value for i, value in enumerate(feature_dict[view_name])})
+ data.append(row)
+ else:
+ raise ValueError(f"View {view_name!r} not found for identifier {identifier!r}.")
- return cls(features=features)
+ # Convert to DataFrame and save to CSV
+ df = pd.DataFrame(data)
+ df.to_csv(path, index=False, **kwargs)
@property
def meta_info(self) -> dict[str, Any]:
@@ -820,15 +844,6 @@ def __init__(
raise AssertionError(f"Meta keys {meta_info.keys()} not in view names {self.view_names}")
self._meta_info = meta_info
- def save(self, path: str):
- """
- Saves the feature dataset to data.
-
- :param path: path to the dataset
- :raises NotImplementedError: if method is not implemented
- """
- raise NotImplementedError("save method not implemented")
-
def randomize_features(self, views_to_randomize: str | list[str], randomization_type: str) -> None:
"""
Randomizes the feature vectors.
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 35ab3f54..6ccbcb79 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -5,6 +5,7 @@
import networkx as nx
import numpy as np
+import pandas as pd
import pytest
from flaky import flaky
@@ -526,20 +527,6 @@ def test_invariant_randomization_graph(graph_dataset: FeatureDataset) -> None:
)
-def test_feature_dataset_save_and_load(sample_dataset: FeatureDataset) -> None:
- """
- Test if the save and load methods work correctly.
-
- :param sample_dataset: sample FeatureDataset
- """
- tmp = tempfile.NamedTemporaryFile()
- with pytest.raises(NotImplementedError):
- sample_dataset.save(path=tmp.name)
-
- with pytest.raises(NotImplementedError):
- FeatureDataset.from_csv(tmp.name)
-
-
def test_add_features(sample_dataset: FeatureDataset, graph_dataset: FeatureDataset) -> None:
"""
Test if the add_features method works correctly.
@@ -551,3 +538,61 @@ def test_add_features(sample_dataset: FeatureDataset, graph_dataset: FeatureData
assert sample_dataset.meta_info is not None
assert "molecular_graph" in sample_dataset.meta_info
assert "molecular_graph" in sample_dataset.view_names
+
+
+def test_feature_dataset_csv_methods():
+ """Test the `from_csv` and `to_csv` methods of the FeatureDataset class."""
+ # Create temporary directory for testing
+ with tempfile.TemporaryDirectory() as temp_dir:
+ temp_dir = Path(temp_dir)
+
+ # Create test data
+ test_csv_path = temp_dir / "test_features.csv"
+ data = {
+ "id": ["A", "B", "C"],
+ "feature_1": [1.0, 2.0, 3.0],
+ "feature_2": [4.0, 5.0, 6.0],
+ }
+ df = pd.DataFrame(data)
+ df.to_csv(test_csv_path, index=False)
+
+ # Test `from_csv` method
+ view_name = "example_view"
+ feature_dataset = FeatureDataset.from_csv(
+ path_to_csv=test_csv_path, id_column="id", view_name=view_name, drop_columns=None
+ )
+
+ # Validate loaded data
+ assert set(feature_dataset.identifiers) == {"A", "B", "C"}, "Identifiers mismatch."
+ assert feature_dataset.view_names == [view_name], "View names mismatch."
+ expected_features = {
+ "A": {"example_view": np.array([1.0, 4.0])},
+ "B": {"example_view": np.array([2.0, 5.0])},
+ "C": {"example_view": np.array([3.0, 6.0])},
+ }
+ for identifier in expected_features:
+ np.testing.assert_array_equal(
+ feature_dataset.features[identifier][view_name],
+ expected_features[identifier][view_name],
+ f"Feature mismatch for identifier {identifier}.",
+ )
+
+ # Test `to_csv` method
+ output_csv_path = temp_dir / "output_features.csv"
+ feature_dataset.to_csv(path=output_csv_path, id_column="id", view_name=view_name)
+
+ # Validate saved data
+ saved_df = pd.read_csv(output_csv_path)
+ expected_saved_df = pd.DataFrame(
+ {
+ "id": ["A", "B", "C"],
+ "feature_0": [1.0, 2.0, 3.0],
+ "feature_1": [4.0, 5.0, 6.0],
+ }
+ )
+ pd.testing.assert_frame_equal(
+ saved_df,
+ expected_saved_df,
+ check_dtype=False, # Relax dtype check for cross-platform compatibility
+ obj="Saved CSV data",
+ )
From 8e0d02f84245a5ae1f91a26708aa7cbdce4b13ed Mon Sep 17 00:00:00 2001
From: PascalIversen
Date: Mon, 2 Dec 2024 10:24:32 +0100
Subject: [PATCH 03/11] to_csv abstract
---
drevalpy/datasets/dataset.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drevalpy/datasets/dataset.py b/drevalpy/datasets/dataset.py
index c79c7515..0a09e689 100644
--- a/drevalpy/datasets/dataset.py
+++ b/drevalpy/datasets/dataset.py
@@ -46,7 +46,7 @@ def from_csv(cls: type["Dataset"], input_file: str | Path, dataset_name: str = "
"""
@abstractmethod
- def save(self, path: str):
+ def to_csv(self, path: str):
"""
Saves the dataset to data.
@@ -226,7 +226,7 @@ def to_dataframe(self) -> pd.DataFrame:
data["predictions"] = self.predictions
return pd.DataFrame(data)
- def save(self, path: str | Path):
+ def to_csv(self, path: str | Path):
"""
Stores the drug response dataset on disk.
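For orientation, the abstract contract after this rename, sketched in isolation (not the full repository class):

    from abc import ABC, abstractmethod

    class Dataset(ABC):
        @abstractmethod
        def to_csv(self, path: str):
            """Saves the dataset to a CSV file."""

    # Concrete subclasses (DrugResponseDataset, FeatureDataset) now implement
    # to_csv(); the old save() name is gone from the interface.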
From 3123ecf6ba01f08fc104d689d037e3c33cc113e2 Mon Sep 17 00:00:00 2001
From: PascalIversen
Date: Mon, 2 Dec 2024 10:30:20 +0100
Subject: [PATCH 04/11] remove save()
---
drevalpy/datasets/dataset.py | 2 +-
tests/test_dataset.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/drevalpy/datasets/dataset.py b/drevalpy/datasets/dataset.py
index 0a09e689..90a531de 100644
--- a/drevalpy/datasets/dataset.py
+++ b/drevalpy/datasets/dataset.py
@@ -412,7 +412,7 @@ def save_splits(self, path: str):
]:
if mode in split:
split_path = os.path.join(path, f"cv_split_{i}_{mode}.csv")
- split[mode].save(path=split_path)
+ split[mode].to_csv(path=split_path)
def load_splits(self, path: str) -> None:
"""
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 6ccbcb79..d485c8d5 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -30,7 +30,7 @@ def test_response_dataset_load() -> None:
response=data["response"],
)
dataset_path = Path("dataset.csv")
- dataset.save(dataset_path)
+ dataset.to_csv(dataset_path)
del dataset
# Load the dataset
dataset = DrugResponseDataset.from_csv(dataset_path)
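The updated test round-trips a dataset through the renamed method; in isolation that looks roughly like this (the cell_line_ids/drug_ids keyword names are assumed from the test fixture, only response is visible in the hunk above):

    import numpy as np
    from pathlib import Path
    from drevalpy.datasets.dataset import DrugResponseDataset

    dataset = DrugResponseDataset(
        cell_line_ids=np.array(["CL1", "CL2"]),
        drug_ids=np.array(["D1", "D2"]),
        response=np.array([0.5, 0.7]),
    )
    dataset.to_csv(Path("dataset.csv"))  # formerly dataset.save(...)
    reloaded = DrugResponseDataset.from_csv(Path("dataset.csv"))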
From 1f798a8e7c9e61443e62acbeded21dbd97d421c4 Mon Sep 17 00:00:00 2001
From: PascalIversen
Date: Mon, 2 Dec 2024 10:51:33 +0100
Subject: [PATCH 05/11] to_csv
---
drevalpy/experiment.py | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/drevalpy/experiment.py b/drevalpy/experiment.py
index 98a2e0f1..3807fc83 100644
--- a/drevalpy/experiment.py
+++ b/drevalpy/experiment.py
@@ -150,7 +150,7 @@ def drug_response_experiment(
raise ValueError("No cv splits found.")
for split_index, split in enumerate(response_data.cv_splits):
- print(f"################# FOLD {split_index+1}/{len(response_data.cv_splits)} " f"#################")
+ print(f"################# FOLD {split_index + 1}/{len(response_data.cv_splits)} " f"#################")
prediction_file = os.path.join(predictions_path, f"predictions_split_{split_index}.csv")
@@ -226,7 +226,7 @@ def drug_response_experiment(
single_drug_id=(drug_id if model_name in SINGLE_DRUG_MODEL_FACTORY else None),
)
- test_dataset.save(prediction_file)
+ test_dataset.to_csv(prediction_file)
else:
print(f"Split {split_index} already exists. Skipping.")
with open(
@@ -363,7 +363,7 @@ def consolidate_single_drug_model_predictions(
# Robustness predictions
for trial in range(n_trials_robustness):
robustness_path = os.path.join(single_drug_prediction_path, "robustness")
- f = f"robustness_{trial+1}_split_{split}.csv"
+ f = f"robustness_{trial + 1}_split_{split}.csv"
if trial not in predictions["robustness"]:
predictions["robustness"][trial] = []
predictions["robustness"][trial].append(
@@ -411,7 +411,7 @@ def consolidate_single_drug_model_predictions(
os.path.join(
out_path,
"robustness",
- f"robustness_{trial+1}_split_{split}.csv",
+ f"robustness_{trial + 1}_split_{split}.csv",
)
)
@@ -535,7 +535,7 @@ def cross_study_prediction(
dataset._response = response_transformation.inverse_transform(dataset.response)
else:
dataset._predictions = np.array([])
- dataset.save(
+ dataset.to_csv(
os.path.join(
path_out,
"cross_study",
@@ -609,10 +609,10 @@ def robustness_test(
robustness_test_path = os.path.join(path_out, "robustness")
os.makedirs(robustness_test_path, exist_ok=True)
for trial in range(n_trials):
- print(f"Running robustness test trial {trial+1}/{n_trials}")
+ print(f"Running robustness test trial {trial + 1}/{n_trials}")
trial_file = os.path.join(
robustness_test_path,
- f"robustness_{trial+1}_split_{split_index}.csv",
+ f"robustness_{trial + 1}_split_{split_index}.csv",
)
if not os.path.isfile(trial_file):
robustness_train_predict(
@@ -666,7 +666,7 @@ def robustness_train_predict(
early_stopping_dataset=early_stopping_dataset,
response_transformation=response_transformation,
)
- test_dataset.save(trial_file)
+ test_dataset.to_csv(trial_file)
def randomization_test(
@@ -804,7 +804,7 @@ def randomize_train_predict(
cl_features=cl_features_rand,
drug_features=drug_features_rand,
)
- test_dataset_rand.save(randomization_test_file)
+ test_dataset_rand.to_csv(randomization_test_file)
def split_early_stopping(
From 3a9ad4f9fc237f58d8b2870ae03ba5487ad97e69 Mon Sep 17 00:00:00 2001
From: PascalIversen
Date: Mon, 2 Dec 2024 12:15:57 +0100
Subject: [PATCH 06/11] signature from_csv
---
drevalpy/datasets/dataset.py | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/drevalpy/datasets/dataset.py b/drevalpy/datasets/dataset.py
index 90a531de..f266ece8 100644
--- a/drevalpy/datasets/dataset.py
+++ b/drevalpy/datasets/dataset.py
@@ -36,21 +36,23 @@ class Dataset(ABC):
@classmethod
@abstractmethod
- def from_csv(cls: type["Dataset"], input_file: str | Path, dataset_name: str = "unknown") -> "Dataset":
+ def from_csv(cls: type["Dataset"], input_file: str | Path, **kwargs) -> "Dataset":
"""
Loads the dataset from data.
:param input_file: Path to the csv file containing the data to be loaded
- :param dataset_name: Optional name to associate the dataset with, default = "unknown"
+ :param kwargs: additional keyword arguments
+
:returns: Dataset object containing data from provided csv file.
"""
@abstractmethod
- def to_csv(self, path: str):
+ def to_csv(self, path: str, **kwargs) -> None:
"""
Saves the dataset to data.
:param path: path to the dataset
+ :param kwargs: additional keyword arguments
"""
@@ -761,14 +763,13 @@ def from_csv(
return cls(features=features)
- def to_csv(self, path: str | Path, id_column: str, view_name: str, **kwargs):
+ def to_csv(self, path: str | Path, id_column: str, view_name: str):
"""
Save the feature dataset to a CSV file.
:param path: Path to the CSV file.
:param id_column: Name of the column containing the identifiers.
:param view_name: Name of the view (e.g., gene_expression).
- :param kwargs: Additional arguments for pandas to_csv function.
:raises ValueError: If the view is not found for an identifier.
"""
@@ -784,7 +785,7 @@ def to_csv(self, path: str | Path, id_column: str, view_name: str, **kwargs):
# Convert to DataFrame and save to CSV
df = pd.DataFrame(data)
- df.to_csv(path, index=False, **kwargs)
+ df.to_csv(path, index=False)
@property
def meta_info(self) -> dict[str, Any]:
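Taken together, the abstract interface after this patch is deliberately loose; a sketch of the resulting contract:

    from abc import ABC, abstractmethod
    from pathlib import Path

    class Dataset(ABC):
        @classmethod
        @abstractmethod
        def from_csv(cls, input_file: str | Path, **kwargs) -> "Dataset":
            """Loads the dataset from a csv file."""

        @abstractmethod
        def to_csv(self, path: str, **kwargs) -> None:
            """Saves the dataset to a csv file."""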
From 4e11690509ec168bf716b3f65bcf5ae9964953e9 Mon Sep 17 00:00:00 2001
From: PascalIversen
Date: Mon, 2 Dec 2024 13:52:22 +0100
Subject: [PATCH 07/11] removes unnecessary dataset super class
---
drevalpy/datasets/dataset.py | 30 ++----------------------------
1 file changed, 2 insertions(+), 28 deletions(-)
diff --git a/drevalpy/datasets/dataset.py b/drevalpy/datasets/dataset.py
index f266ece8..bd621d88 100644
--- a/drevalpy/datasets/dataset.py
+++ b/drevalpy/datasets/dataset.py
@@ -15,7 +15,6 @@
import copy
import os
-from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Callable, Optional
@@ -31,32 +30,7 @@
np.set_printoptions(threshold=6)
-class Dataset(ABC):
- """Abstract wrapper class for datasets."""
-
- @classmethod
- @abstractmethod
- def from_csv(cls: type["Dataset"], input_file: str | Path, **kwargs) -> "Dataset":
- """
- Loads the dataset from data.
-
- :param input_file: Path to the csv file containing the data to be loaded
- :param kwargs: additional keyword arguments
-
- :returns: Dataset object containing data from provided csv file.
- """
-
- @abstractmethod
- def to_csv(self, path: str, **kwargs) -> None:
- """
- Saves the dataset to data.
-
- :param path: path to the dataset
- :param kwargs: additional keyword arguments
- """
-
-
-class DrugResponseDataset(Dataset):
+class DrugResponseDataset:
"""Drug response dataset."""
_response: np.ndarray
@@ -722,7 +696,7 @@ def _leave_group_out_cv(
return cv_sets
-class FeatureDataset(Dataset):
+class FeatureDataset:
"""Class for feature datasets."""
_meta_info: dict[str, Any] = {}
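With the superclass gone, the two dataset classes share only a to_csv/from_csv naming convention, so their signatures are free to diverge, as they already had: FeatureDataset.to_csv needs view and id information that the shared abstract signature could only paper over with **kwargs. A sketch (the features keyword is confirmed by from_csv above):

    import numpy as np
    from drevalpy.datasets.dataset import FeatureDataset

    features = FeatureDataset(features={"CL1": {"gene_expression": np.array([1.0, 2.0])}})
    features.to_csv("features.csv", id_column="cell_line_id", view_name="gene_expression")
    # DrugResponseDataset.to_csv, by contrast, takes only a path.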
From 8f36a2cd4ea5f36b457f421cd0fdb2961216c372 Mon Sep 17 00:00:00 2001
From: PascalIversen
Date: Mon, 9 Dec 2024 10:22:57 +0100
Subject: [PATCH 08/11] removed all optionals with highly experimental script
---
drevalpy/datasets/dataset.py | 4 +-
drevalpy/experiment.py | 68 ++++++++++-----------
drevalpy/models/MOLIR/utils.py | 9 ++-
drevalpy/models/baselines/sklearn_models.py | 4 +-
drevalpy/models/drp_model.py | 16 ++---
drevalpy/models/utils.py | 7 +--
drevalpy/utils.py | 5 +-
drevalpy/visualization/utils.py | 4 +-
8 files changed, 56 insertions(+), 61 deletions(-)
diff --git a/drevalpy/datasets/dataset.py b/drevalpy/datasets/dataset.py
index bd621d88..7c086be0 100644
--- a/drevalpy/datasets/dataset.py
+++ b/drevalpy/datasets/dataset.py
@@ -16,7 +16,7 @@
import copy
import os
from pathlib import Path
-from typing import Any, Callable, Optional
+from typing import Any, Callable
import networkx as nx
import numpy as np
@@ -708,7 +708,7 @@ def from_csv(
path_to_csv: str | Path,
id_column: str,
view_name: str,
- drop_columns: Optional[list[str]] = None,
+ drop_columns: list[str | None] = None,
):
"""Load a one-view feature dataset from a csv file.
diff --git a/drevalpy/experiment.py b/drevalpy/experiment.py
index 3807fc83..dcf0bcf6 100644
--- a/drevalpy/experiment.py
+++ b/drevalpy/experiment.py
@@ -4,7 +4,7 @@
import os
import shutil
import warnings
-from typing import Any, Optional
+from typing import Any
import numpy as np
import pandas as pd
@@ -23,16 +23,16 @@
def drug_response_experiment(
models: list[type[DRPModel]],
response_data: DrugResponseDataset,
- baselines: Optional[list[type[DRPModel]]] = None,
- response_transformation: Optional[TransformerMixin] = None,
+ baselines: list[type[DRPModel]] | None = None,
+ response_transformation: TransformerMixin | None = None,
run_id: str = "",
test_mode: str = "LPO",
metric: str = "RMSE",
n_cv_splits: int = 5,
multiprocessing: bool = False,
- randomization_mode: Optional[list[str]] = None,
+ randomization_mode: list[str] | None = None,
randomization_type: str = "permutation",
- cross_study_datasets: Optional[list[DrugResponseDataset]] = None,
+ cross_study_datasets: list[DrugResponseDataset] | None = None,
n_trials_robustness: int = 0,
path_out: str = "results/",
overwrite: bool = False,
@@ -287,7 +287,7 @@ def consolidate_single_drug_model_predictions(
n_cv_splits: int,
results_path: str,
cross_study_datasets: list[DrugResponseDataset],
- randomization_mode: Optional[list[str]] = None,
+ randomization_mode: list[str] | None = None,
n_trials_robustness: int = 0,
out_path: str = "",
) -> None:
@@ -427,7 +427,7 @@ def consolidate_single_drug_model_predictions(
def load_features(
model: DRPModel, path_data: str, dataset: DrugResponseDataset
-) -> tuple[FeatureDataset, Optional[FeatureDataset]]:
+) -> tuple[FeatureDataset, FeatureDataset | None]:
"""
Load and reduce cell line and drug features for a given dataset.
@@ -448,11 +448,11 @@ def cross_study_prediction(
test_mode: str,
train_dataset: DrugResponseDataset,
path_data: str,
- early_stopping_dataset: Optional[DrugResponseDataset],
- response_transformation: Optional[TransformerMixin],
+ early_stopping_dataset: DrugResponseDataset | None,
+ response_transformation: TransformerMixin | None,
path_out: str,
split_index: int,
- single_drug_id: Optional[str] = None,
+ single_drug_id: str | None = None,
) -> None:
"""
Run the drug response prediction experiment on a cross-study dataset to assess the generalizability of the model.
@@ -484,7 +484,7 @@ def cross_study_prediction(
cell_lines_to_keep = cl_features.identifiers if cl_features is not None else None
- drugs_to_keep: Optional[np.ndarray] = None
+ drugs_to_keep: np.ndarray | None = None
if single_drug_id is not None:
drugs_to_keep = np.array([single_drug_id])
elif drug_features is not None:
@@ -584,10 +584,10 @@ def robustness_test(
path_data: str,
train_dataset: DrugResponseDataset,
test_dataset: DrugResponseDataset,
- early_stopping_dataset: Optional[DrugResponseDataset],
+ early_stopping_dataset: DrugResponseDataset | None,
path_out: str,
split_index: int,
- response_transformation: Optional[TransformerMixin] = None,
+ response_transformation: TransformerMixin | None = None,
):
"""
Run robustness tests for the given model and dataset.
@@ -634,11 +634,11 @@ def robustness_train_predict(
trial_file: str,
train_dataset: DrugResponseDataset,
test_dataset: DrugResponseDataset,
- early_stopping_dataset: Optional[DrugResponseDataset],
+ early_stopping_dataset: DrugResponseDataset | None,
model: DRPModel,
hpam_set: dict,
path_data: str,
- response_transformation: Optional[TransformerMixin] = None,
+ response_transformation: TransformerMixin | None = None,
):
"""
Train and predict for the robustness test.
@@ -676,11 +676,11 @@ def randomization_test(
path_data: str,
train_dataset: DrugResponseDataset,
test_dataset: DrugResponseDataset,
- early_stopping_dataset: Optional[DrugResponseDataset],
+ early_stopping_dataset: DrugResponseDataset | None,
path_out: str,
split_index: int,
randomization_type: str = "permutation",
- response_transformation=Optional[TransformerMixin],
+ response_transformation: TransformerMixin | None = None,
) -> None:
"""
Run randomization tests for the given model and dataset.
@@ -745,8 +745,8 @@ def randomize_train_predict(
path_data: str,
train_dataset: DrugResponseDataset,
test_dataset: DrugResponseDataset,
- early_stopping_dataset: Optional[DrugResponseDataset],
- response_transformation: Optional[TransformerMixin],
+ early_stopping_dataset: DrugResponseDataset | None,
+ response_transformation: TransformerMixin | None,
) -> None:
"""
Randomize the features for a given view and run the model.
@@ -783,12 +783,12 @@ def randomize_train_predict(
)
return
- cl_features_rand: Optional[FeatureDataset] = None
+ cl_features_rand: FeatureDataset | None = None
if cl_features is not None:
cl_features_rand = cl_features.copy()
cl_features_rand.randomize_features(view, randomization_type=randomization_type) # type: ignore[union-attr]
- drug_features_rand: Optional[FeatureDataset] = None
+ drug_features_rand: FeatureDataset | None = None
if drug_features is not None:
drug_features_rand = drug_features.copy()
drug_features_rand.randomize_features(view, randomization_type=randomization_type) # type: ignore[union-attr]
@@ -837,10 +837,10 @@ def train_and_predict(
path_data: str,
train_dataset: DrugResponseDataset,
prediction_dataset: DrugResponseDataset,
- early_stopping_dataset: Optional[DrugResponseDataset] = None,
- response_transformation: Optional[TransformerMixin] = None,
- cl_features: Optional[FeatureDataset] = None,
- drug_features: Optional[FeatureDataset] = None,
+ early_stopping_dataset: DrugResponseDataset | None = None,
+ response_transformation: TransformerMixin | None = None,
+ cl_features: FeatureDataset | None = None,
+ drug_features: FeatureDataset | None = None,
) -> DrugResponseDataset:
"""
Train the model and predict the response for the prediction dataset.
@@ -926,8 +926,8 @@ def train_and_evaluate(
path_data: str,
train_dataset: DrugResponseDataset,
validation_dataset: DrugResponseDataset,
- early_stopping_dataset: Optional[DrugResponseDataset] = None,
- response_transformation: Optional[TransformerMixin] = None,
+ early_stopping_dataset: DrugResponseDataset | None = None,
+ response_transformation: TransformerMixin | None = None,
metric: str = "rmse",
) -> dict[str, float]:
"""
@@ -960,8 +960,8 @@ def hpam_tune(
train_dataset: DrugResponseDataset,
validation_dataset: DrugResponseDataset,
hpam_set: list[dict],
- early_stopping_dataset: Optional[DrugResponseDataset] = None,
- response_transformation: Optional[TransformerMixin] = None,
+ early_stopping_dataset: DrugResponseDataset | None = None,
+ response_transformation: TransformerMixin | None = None,
metric: str = "RMSE",
path_data: str = "data",
) -> dict:
@@ -1019,9 +1019,9 @@ def hpam_tune_raytune(
model: DRPModel,
train_dataset: DrugResponseDataset,
validation_dataset: DrugResponseDataset,
- early_stopping_dataset: Optional[DrugResponseDataset],
+ early_stopping_dataset: DrugResponseDataset | None,
hpam_set: list[dict],
- response_transformation: Optional[TransformerMixin] = None,
+ response_transformation: TransformerMixin | None = None,
metric: str = "RMSE",
ray_path: str = "raytune",
path_data: str = "data",
@@ -1094,7 +1094,7 @@ def make_model_list(models: list[type[DRPModel]], response_data: DrugResponseDat
@pipeline_function
-def get_model_name_and_drug_id(model_name: str) -> tuple[str, Optional[str]]:
+def get_model_name_and_drug_id(model_name: str) -> tuple[str, str | None]:
"""
Get the model name and drug id from the model name.
@@ -1119,8 +1119,8 @@ def get_model_name_and_drug_id(model_name: str) -> tuple[str, Optional[str]]:
@pipeline_function
def get_datasets_from_cv_split(
- split: dict[str, DrugResponseDataset], model_class: type[DRPModel], model_name: str, drug_id: Optional[str] = None
-) -> tuple[DrugResponseDataset, DrugResponseDataset, Optional[DrugResponseDataset], DrugResponseDataset]:
+ split: dict[str, DrugResponseDataset], model_class: type[DRPModel], model_name: str, drug_id: str | None = None
+) -> tuple[DrugResponseDataset, DrugResponseDataset, DrugResponseDataset | None, DrugResponseDataset]:
"""
Get train, validation, (early stopping), and test datasets from the CV split.
diff --git a/drevalpy/models/MOLIR/utils.py b/drevalpy/models/MOLIR/utils.py
index c544ee6a..4f7d5c2e 100644
--- a/drevalpy/models/MOLIR/utils.py
+++ b/drevalpy/models/MOLIR/utils.py
@@ -9,7 +9,6 @@
import os
import random
import secrets
-from typing import Optional
import numpy as np
import pytorch_lightning as pl
@@ -69,7 +68,7 @@ def generate_triplets_indices(
y: np.ndarray,
positive_range: float,
negative_range: float,
- random_seed: Optional[int] = None,
+ random_seed: int | None = None,
) -> tuple[np.ndarray, np.ndarray]:
"""
Generates triplets for the MOLIR model.
@@ -155,8 +154,8 @@ def create_dataset_and_loaders(
batch_size: int,
output_train: DrugResponseDataset,
cell_line_input: FeatureDataset,
- output_earlystopping: Optional[DrugResponseDataset] = None,
-) -> tuple[DataLoader, Optional[DataLoader]]:
+ output_earlystopping: DrugResponseDataset | None = None,
+) -> tuple[DataLoader, DataLoader | None]:
"""
Creates the RegressionDataset (torch Dataset) and the DataLoader for the training and validation data.
@@ -322,7 +321,7 @@ def fit(
self,
output_train: DrugResponseDataset,
cell_line_input: FeatureDataset,
- output_earlystopping: Optional[DrugResponseDataset] = None,
+ output_earlystopping: DrugResponseDataset | None = None,
patience: int = 5,
) -> None:
"""
diff --git a/drevalpy/models/baselines/sklearn_models.py b/drevalpy/models/baselines/sklearn_models.py
index 84a67e02..09fd417c 100644
--- a/drevalpy/models/baselines/sklearn_models.py
+++ b/drevalpy/models/baselines/sklearn_models.py
@@ -1,7 +1,5 @@
"""Contains sklearn baseline models: ElasticNet, RandomForest, SVM."""
-from typing import Optional
-
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNet, Lasso, Ridge
@@ -121,7 +119,7 @@ def load_cell_line_features(self, data_path: str, dataset_name: str) -> FeatureD
dataset_name=dataset_name,
)
- def load_drug_features(self, data_path: str, dataset_name: str) -> Optional[FeatureDataset]:
+ def load_drug_features(self, data_path: str, dataset_name: str) -> FeatureDataset | None:
"""
Load the drug features, in this case the fingerprints.
diff --git a/drevalpy/models/drp_model.py b/drevalpy/models/drp_model.py
index ad642384..3b4b6798 100644
--- a/drevalpy/models/drp_model.py
+++ b/drevalpy/models/drp_model.py
@@ -9,7 +9,7 @@
import inspect
import os
from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any
import numpy as np
import yaml
@@ -154,7 +154,7 @@ def load_cell_line_features(self, data_path: str, dataset_name: str) -> FeatureD
"""
@abstractmethod
- def load_drug_features(self, data_path: str, dataset_name: str) -> Optional[FeatureDataset]:
+ def load_drug_features(self, data_path: str, dataset_name: str) -> FeatureDataset | None:
"""
Load the drug features before the train/predict method is called.
@@ -170,12 +170,12 @@ def load_drug_features(self, data_path: str, dataset_name: str) -> Optional[Feat
def get_concatenated_features(
self,
- cell_line_view: Optional[str],
- drug_view: Optional[str],
+ cell_line_view: str | None,
+ drug_view: str | None,
cell_line_ids_output: np.ndarray,
drug_ids_output: np.ndarray,
- cell_line_input: Optional[FeatureDataset],
- drug_input: Optional[FeatureDataset],
+ cell_line_input: FeatureDataset | None,
+ drug_input: FeatureDataset | None,
) -> np.ndarray:
"""
Concatenates the features to an input matrix X for the given cell line and drug views.
@@ -228,8 +228,8 @@ def get_feature_matrices(
self,
cell_line_ids: np.ndarray,
drug_ids: np.ndarray,
- cell_line_input: Optional[FeatureDataset],
- drug_input: Optional[FeatureDataset],
+ cell_line_input: FeatureDataset | None,
+ drug_input: FeatureDataset | None,
) -> dict[str, np.ndarray]:
"""
Returns the feature matrices for the given cell line and drug ids by retrieving the correct views.
diff --git a/drevalpy/models/utils.py b/drevalpy/models/utils.py
index 27d1e545..3723db1c 100644
--- a/drevalpy/models/utils.py
+++ b/drevalpy/models/utils.py
@@ -2,7 +2,6 @@
import os.path
import warnings
-from typing import Optional
import numpy as np
import pandas as pd
@@ -24,7 +23,7 @@ def load_cl_ids_from_csv(path: str, dataset_name: str) -> FeatureDataset:
def load_and_reduce_gene_features(
feature_type: str,
- gene_list: Optional[str],
+ gene_list: str | None,
data_path: str,
dataset_name: str,
) -> FeatureDataset:
@@ -138,8 +137,8 @@ def load_drug_fingerprint_features(data_path: str, dataset_name: str) -> Feature
def get_multiomics_feature_dataset(
data_path: str,
dataset_name: str,
- gene_list: Optional[str] = "drug_target_genes_all_drugs",
- omics: Optional[list[str]] = None,
+ gene_list: str | None = "drug_target_genes_all_drugs",
+ omics: list[str | None] = None,
) -> FeatureDataset:
"""
Get multiomics feature dataset for the given list of OMICs.
diff --git a/drevalpy/utils.py b/drevalpy/utils.py
index 9dd99884..c31b058b 100644
--- a/drevalpy/utils.py
+++ b/drevalpy/utils.py
@@ -2,7 +2,6 @@
import argparse
import os
-from typing import Optional
from sklearn.base import TransformerMixin
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
@@ -315,7 +314,7 @@ def get_datasets(
measure: str = "response",
curve_curator: bool = False,
cores: int = 1,
-) -> tuple[DrugResponseDataset, Optional[list[DrugResponseDataset]]]:
+) -> tuple[DrugResponseDataset, list[DrugResponseDataset] | None]:
"""
Load the response data and cross-study datasets.
@@ -350,7 +349,7 @@ def get_datasets(
@pipeline_function
-def get_response_transformation(response_transformation: str) -> Optional[TransformerMixin]:
+def get_response_transformation(response_transformation: str) -> TransformerMixin | None:
"""
Get the sklearn response transformation object of choice.
diff --git a/drevalpy/visualization/utils.py b/drevalpy/visualization/utils.py
index bdd3cfae..a643b12e 100644
--- a/drevalpy/visualization/utils.py
+++ b/drevalpy/visualization/utils.py
@@ -4,7 +4,7 @@
import pathlib
import re
import shutil
-from typing import Optional, TextIO
+from typing import TextIO
import importlib_resources
import pandas as pd
@@ -267,7 +267,7 @@ def _evaluate_per_group(
df: pd.DataFrame,
group_by: str,
norm_group_eval_results: dict[str, dict[str, float]],
- eval_results_per_group: Optional[pd.DataFrame],
+ eval_results_per_group: pd.DataFrame | None,
model: str,
) -> tuple[dict[str, dict[str, float]], pd.DataFrame]:
"""
From c7830714696a854caf85849b34293789bf3842a5 Mon Sep 17 00:00:00 2001
From: PascalIversen
Date: Mon, 9 Dec 2024 10:45:13 +0100
Subject: [PATCH 09/11] fix type
---
drevalpy/datasets/dataset.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drevalpy/datasets/dataset.py b/drevalpy/datasets/dataset.py
index 7c086be0..bef2fa06 100644
--- a/drevalpy/datasets/dataset.py
+++ b/drevalpy/datasets/dataset.py
@@ -708,7 +708,7 @@ def from_csv(
path_to_csv: str | Path,
id_column: str,
view_name: str,
- drop_columns: list[str | None] = None,
+ drop_columns: list[str] | None = None,
):
"""Load a one-view feature dataset from a csv file.
From b84e12c1158d20a8939aa8d98d5a7b5b105e35d5 Mon Sep 17 00:00:00 2001
From: PascalIversen
Date: Mon, 9 Dec 2024 11:03:10 +0100
Subject: [PATCH 10/11] isort
---
drevalpy/utils.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/drevalpy/utils.py b/drevalpy/utils.py
index eb1e8547..7093d4c6 100644
--- a/drevalpy/utils.py
+++ b/drevalpy/utils.py
@@ -1,10 +1,8 @@
"""Utility functions for the evaluation pipeline."""
import argparse
-import os
from pathlib import Path
-
from sklearn.base import TransformerMixin
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
From 6f5ffe951e0bb18ffe121785f70fb5d4dd8d4c4a Mon Sep 17 00:00:00 2001
From: PascalIversen
Date: Mon, 9 Dec 2024 11:18:09 +0100
Subject: [PATCH 11/11] some typing fixes
---
drevalpy/models/utils.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drevalpy/models/utils.py b/drevalpy/models/utils.py
index 3723db1c..b7208361 100644
--- a/drevalpy/models/utils.py
+++ b/drevalpy/models/utils.py
@@ -138,7 +138,7 @@ def get_multiomics_feature_dataset(
data_path: str,
dataset_name: str,
gene_list: str | None = "drug_target_genes_all_drugs",
- omics: list[str | None] = None,
+ omics: list[str] | None = None,
) -> FeatureDataset:
"""
Get multiomics feature dataset for the given list of OMICs.