Commit 27ca39b — Merge pull request #238 from daisybio/development (v.1.3.4)

2 parents: 1d26871 + c4df4ac

17 files changed: +704 −586 lines

.github/pull_request_template.md

Lines changed: 23 additions & 1 deletion

@@ -7,7 +7,29 @@
 - [ ] This comment contains a description of changes (with reason)
 - [ ] Referenced issue is linked
 - [ ] If you've fixed a bug or added code that should be tested, add tests!
-- [ ] Documentation in `docs` is updated
+- [ ] Documentation in `docs` is updated. If you've created a new file, add it to the API documentation pages.
+
+<!-- Only applies to PRs for a new version release, delete the lines that don't apply -->
+
+**Version release checklist**
+
+- [ ] Update the version in pyproject.toml
+- [ ] Update version/release in docs/conf.py
+- [ ] Run `poetry update` to get the latest package versions. This will update the poetry.lock file.
+- [ ] Run `poetry export --without-hashes --without development -f requirements.txt -o requirements.txt` to update the requirements.txt file.
+- [ ] (If one of the sphinx packages has been updated, you also need to update docs/requirements.txt.)
+- [ ] (If poetry itself was updated, update that in the Dockerfile.)
+- [ ] If you updated the python version:
+  - [ ] Update the Dockerfile so that it always runs on the latest python version. Watch out: the `builder` is the full python, the `runtime` is a slim python build.
+  - [ ] Update the python version in .github/workflows/: run_tests.yml, build_package.yml, publish_docs.yml, python-package.yml
+  - [ ] Update the python version in noxfile.py
+  - [ ] Update the documentation: contributing.rst, installation.rst
+
+Then,
+
+1. Open a PR from development to main with these changes.
+2. Wait for a review and merge.
+3. Create a new release on GitHub with the version number. Update the release notes with the changes made in this version.

 **Description of changes**

.github/workflows/build_package.yml

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ jobs:
     strategy:
       matrix:
         os: [macos-latest, ubuntu-latest, windows-latest]
-        python: ["3.11", "3.12"]
+        python: ["3.11", "3.12", "3.13"]

     steps:
       - uses: actions/checkout@v4

.github/workflows/run_tests.yml

Lines changed: 6 additions & 6 deletions

@@ -16,12 +16,12 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - { python-version: "3.12", os: ubuntu-latest, session: "pre-commit" }
-          - { python-version: "3.12", os: ubuntu-latest, session: "mypy" }
-          - { python-version: "3.12", os: ubuntu-latest, session: "tests" }
-          - { python-version: "3.12", os: windows-latest, session: "typeguard" }
-          - { python-version: "3.12", os: ubuntu-latest, session: "xdoctest" }
-          - { python-version: "3.12", os: ubuntu-latest, session: "docs-build" }
+          - { python-version: "3.13", os: ubuntu-latest, session: "pre-commit" }
+          - { python-version: "3.13", os: ubuntu-latest, session: "mypy" }
+          - { python-version: "3.13", os: ubuntu-latest, session: "tests" }
+          - { python-version: "3.13", os: windows-latest, session: "typeguard" }
+          - { python-version: "3.13", os: ubuntu-latest, session: "xdoctest" }
+          - { python-version: "3.13", os: ubuntu-latest, session: "docs-build" }

     env:
       NOXSESSION: ${{ matrix.session }}

docs/conf.py

Lines changed: 2 additions & 2 deletions

@@ -56,9 +56,9 @@
 # the built documents.
 #
 # The short X.Y version.
-version = "1.3.3"
+version = "1.3.4"
 # The full version, including alpha/beta/rc tags.
-release = "1.3.3"
+release = "1.3.4"

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

docs/drevalpy.datasets.rst

Lines changed: 1 addition & 1 deletion

@@ -20,7 +20,7 @@ Loaders
 CurveCurator
 ------------

-.. automodule:: drevalpy.datasets.curvec
+.. automodule:: drevalpy.datasets.curvecurator
    :members:
    :undoc-members:
    :show-inheritance:

docs/drevalpy.models.baselines.rst

Lines changed: 8 additions & 0 deletions

@@ -17,6 +17,14 @@ Sklearn Models
    :undoc-members:
    :show-inheritance:

+Single-Drug Elastic Net
+-----------------------------------------------------------
+
+.. automodule:: drevalpy.models.baselines.singledrug_elastic_net
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 Single-Drug Random Forest
 -----------------------------------------------------------

docs/drevalpy.visualization.rst

Lines changed: 4 additions & 4 deletions

@@ -9,10 +9,10 @@ Outplot
    :undoc-members:
    :show-inheritance:

-Correlation comparison scatter plot
+Comparison scatter plot
 -------------------------------------------------

-.. automodule:: drevalpy.visualization.corr_comp_scatter
+.. automodule:: drevalpy.visualization.comp_scatter
    :members:
    :undoc-members:
    :show-inheritance:

@@ -25,10 +25,10 @@ Critical difference plot
    :undoc-members:
    :show-inheritance:

-HTML tables
+Cross study tables
 ------------------------------------------

-.. automodule:: drevalpy.visualization.html_tables
+.. automodule:: drevalpy.visualization.cross_study_tables
    :members:
    :undoc-members:
    :show-inheritance:

drevalpy/datasets/curvecurator.py

Lines changed: 11 additions & 2 deletions

@@ -123,13 +123,22 @@ def _exec_curvecurator(output_dir: Path, batched: bool = True):
     of configs specified in <output_dir>/configlist.txt and consecutively executing each
     CurveCurator run. If False, run a single CurveCurator run (this can be used for
     parallelisation).
+    :raises RuntimeError: If CurveCurator fails to execute, the error message is printed to stdout and stderr.
     """
     if batched:
         command = ["CurveCurator", str(output_dir / "configlist.txt"), "--mad", "--batch"]
     else:
         command = ["CurveCurator", str(output_dir / "config.toml"), "--mad"]
-    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    process.communicate()
+    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    stdout, stderr = process.communicate()
+
+    if process.returncode != 0:
+        print("CurveCurator stdout:")
+        print(stdout)
+        print("CurveCurator stderr:")
+        print(stderr)
+
+        raise RuntimeError(f"CurveCurator failed with exit code {process.returncode}")


 def _calc_ic50(model_params_df: pd.DataFrame):
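The change above stops failures from being silently swallowed: `communicate()` alone discards the exit status. A minimal standalone sketch of the same pattern, using a hypothetical `run_checked` helper (not part of the codebase):

```python
import subprocess
import sys


def run_checked(command: list[str]) -> str:
    """Run a command, echo its output on failure, and raise RuntimeError."""
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        print("stdout:", stdout)
        print("stderr:", stderr)
        raise RuntimeError(f"Command failed with exit code {process.returncode}")
    return stdout


# A failing subprocess now surfaces its exit code instead of being ignored.
out = run_checked([sys.executable, "-c", "print('ok')"])
```

`text=True` makes `communicate()` return `str` instead of `bytes`, so the captured output can be printed directly.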

drevalpy/datasets/dataset.py

Lines changed: 25 additions & 13 deletions

@@ -817,6 +817,7 @@ def from_csv(
         view_name: str,
         drop_columns: list[str] | None = None,
         transpose: bool = False,
+        extract_meta_info: bool = True,
     ):
         """Load a one-view feature dataset from a csv file.

@@ -830,44 +831,55 @@
         :param id_column: name of the column containing the identifiers
         :param drop_columns: list of columns to drop (e.g. other identifier columns)
         :param transpose: if True, the csv is transposed, i.e. the rows become columns and vice versa
+        :param extract_meta_info: if True, extracts meta information from the dataset, e.g. gene names for gene expression
         :returns: FeatureDataset object containing data from provided csv file.
         """
         data = pd.read_csv(path_to_csv).T if transpose else pd.read_csv(path_to_csv)
         data[id_column] = data[id_column].astype(str)
         ids = data[id_column].values
         data_features = data.drop(columns=(drop_columns or []))
         data_features = data_features.set_index(id_column)
-        # remove duplicate feature rows (rows with the same index)
         data_features = data_features[~data_features.index.duplicated(keep="first")]
         features = {}

         for identifier in ids:
             features_for_instance = data_features.loc[identifier].values
             features[identifier] = {view_name: features_for_instance}

-        return cls(features=features)
+        meta_info = {}
+        if extract_meta_info:
+            meta_info = {view_name: list(data_features.columns)}
+
+        return cls(features=features, meta_info=meta_info)

     def to_csv(self, path: str | Path, id_column: str, view_name: str):
         """
-        Save the feature dataset to a CSV file.
+        Save the feature dataset to a CSV file. If meta_info is available for the view and valid,
+        it will be written as column names.

         :param path: Path to the CSV file.
         :param id_column: Name of the column containing the identifiers.
-        :param view_name: Name of the view (e.g., gene_expression).
-
-        :raises ValueError: If the view is not found for an identifier.
+        :param view_name: Name of the view.
         """
         data = []
+        feature_names = None
+
         for identifier, feature_dict in self.features.items():
-            # Get the feature vector for the specified view
-            if view_name in feature_dict:
-                row = {id_column: identifier}
-                row.update({f"feature_{i}": value for i, value in enumerate(feature_dict[view_name])})
-                data.append(row)
-            else:
+            vector = feature_dict.get(view_name)
+            if vector is None:
                 raise ValueError(f"View {view_name!r} not found for identifier {identifier!r}.")

-        # Convert to DataFrame and save to CSV
+            if feature_names is None:
+                meta_names = self.meta_info.get(view_name)
+                if isinstance(meta_names, list) and len(meta_names) == len(vector):
+                    feature_names = meta_names
+                else:
+                    feature_names = [f"feature_{i}" for i in range(len(vector))]
+
+            row = {id_column: identifier}
+            row.update({name: value for name, value in zip(feature_names, vector)})
+            data.append(row)

         df = pd.DataFrame(data)
         df.to_csv(path, index=False)
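The column-naming logic of the new `to_csv` can be exercised in isolation. A sketch with a hypothetical `features_to_frame` helper (the real method lives on `FeatureDataset`): `meta_info` names are used when they match the vector length, otherwise it falls back to generic `feature_<i>` names.

```python
import pandas as pd


def features_to_frame(features: dict, meta_info: dict, view_name: str, id_column: str) -> pd.DataFrame:
    """Build a wide table, preferring meta_info column names over feature_<i>."""
    data = []
    feature_names = None
    for identifier, feature_dict in features.items():
        vector = feature_dict.get(view_name)
        if vector is None:
            raise ValueError(f"View {view_name!r} not found for identifier {identifier!r}.")
        if feature_names is None:
            meta_names = meta_info.get(view_name)
            if isinstance(meta_names, list) and len(meta_names) == len(vector):
                feature_names = meta_names  # e.g. gene names for gene expression
            else:
                feature_names = [f"feature_{i}" for i in range(len(vector))]
        row = {id_column: identifier}
        row.update(dict(zip(feature_names, vector)))
        data.append(row)
    return pd.DataFrame(data)


features = {"CL1": {"gene_expression": [1.0, 2.0]}, "CL2": {"gene_expression": [3.0, 4.0]}}
meta_info = {"gene_expression": ["TP53", "BRCA1"]}
df = features_to_frame(features, meta_info, "gene_expression", "cell_line_id")
```

With `meta_info` present, the CSV round-trips through `from_csv` with meaningful column names instead of `feature_0`, `feature_1`, ….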

drevalpy/experiment.py

Lines changed: 22 additions & 10 deletions

@@ -302,7 +302,7 @@ def drug_response_experiment(
             path_data=path_data,
             model_checkpoint_dir=model_checkpoint_dir,
             metric=metric,
-            result_path=final_model_path,
+            final_model_path=final_model_path,
             test_mode=test_mode,
             val_ratio=0.1,
             hyperparameter_tuning=hyperparameter_tuning,

@@ -585,7 +585,7 @@ def cross_study_prediction(
            drug_input=drug_input,
        )
        if response_transformation:
-            dataset._response = response_transformation.inverse_transform(dataset.response)
+            dataset.inverse_transform(response_transformation)
    else:
        dataset._predictions = np.array([])
    dataset.to_csv(

@@ -993,18 +993,23 @@ def train_and_predict(
    )

    if len(prediction_dataset) > 0:
+        drug_input = drug_features.copy() if drug_features is not None else None
        prediction_dataset._predictions = model.predict(
            cell_line_ids=prediction_dataset.cell_line_ids,
            drug_ids=prediction_dataset.drug_ids,
            cell_line_input=cl_features.copy(),
            drug_input=drug_input,
        )

-        if response_transformation:
-            prediction_dataset.inverse_transform(response_transformation)
    else:
        prediction_dataset._predictions = np.array([])

+    if response_transformation:
+        train_dataset.inverse_transform(response_transformation)
+        prediction_dataset.inverse_transform(response_transformation)
+        if early_stopping_dataset is not None:
+            early_stopping_dataset.inverse_transform(response_transformation)
+
    return prediction_dataset

@@ -1016,7 +1021,7 @@ def train_and_evaluate(
    validation_dataset: DrugResponseDataset,
    early_stopping_dataset: DrugResponseDataset | None = None,
    response_transformation: TransformerMixin | None = None,
-    metric: str = "rmse",
+    metric: str = "RMSE",
    model_checkpoint_dir: str = "TEMPORARY",
) -> dict[str, float]:
    """

@@ -1283,15 +1288,14 @@ def generate_data_saving_path(model_name, drug_id, result_path, suffix) -> str:
    return model_path


-@pipeline_function
def train_final_model(
    model_class: type[DRPModel],
    full_dataset: DrugResponseDataset,
    response_transformation: TransformerMixin,
    path_data: str,
    model_checkpoint_dir: str,
    metric: str,
-    result_path: str,
+    final_model_path: str,
    test_mode: str = "LCO",
    val_ratio: float = 0.1,
    hyperparameter_tuning: bool = True,

@@ -1314,7 +1318,7 @@
    :param path_data: path to data directory
    :param model_checkpoint_dir: checkpoint dir for intermediate tuning models
    :param metric: metric for tuning, e.g., "RMSE"
-    :param result_path: path to results
+    :param final_model_path: path to final_model save directory
    :param test_mode: split logic for validation (LCO, LDO, LTO, LPO)
    :param val_ratio: validation size ratio
    :param hyperparameter_tuning: whether to perform hyperparameter tuning

@@ -1356,17 +1360,25 @@
    print(f"Best hyperparameters for final model: {best_hpams}")
    train_dataset.add_rows(validation_dataset)
    train_dataset.shuffle(random_state=42)
+    if response_transformation:
+        train_dataset.fit_transform(response_transformation)
+        if early_stopping_dataset is not None:
+            early_stopping_dataset.transform(response_transformation)

    model.build_model(hyperparameters=best_hpams)
+    drug_features = drug_features.copy() if drug_features is not None else None
    model.train(
        output=train_dataset,
        output_earlystopping=early_stopping_dataset,
-        cell_line_input=cl_features,
+        cell_line_input=cl_features.copy(),
        drug_input=drug_features,
        model_checkpoint_dir=model_checkpoint_dir,
    )
+    if response_transformation:
+        train_dataset.inverse_transform(response_transformation)
+        if early_stopping_dataset is not None:
+            early_stopping_dataset.inverse_transform(response_transformation)

-    final_model_path = os.path.join(result_path, "final_model")
    os.makedirs(final_model_path, exist_ok=True)
    model.save(final_model_path)
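The moved transform calls follow the usual scikit-learn round-trip: the `TransformerMixin` is fit on the training responses, every dataset it touched is scaled before training, and all of them are mapped back afterwards so results live on the original response scale. A standalone sketch with `StandardScaler` as a stand-in (any `TransformerMixin` with `inverse_transform` works):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

# Fit on the training responses, train/predict on the scaled values,
# then map everything back to the original response scale.
scaler = StandardScaler()
train_response = np.array([[0.5], [1.5], [2.5]])
scaled = scaler.fit_transform(train_response)      # what the model trains on
restored = scaler.inverse_transform(scaled)        # what gets reported/saved
```

Forgetting the `inverse_transform` on the training or early-stopping datasets (as the old code did) would leave them on the transformed scale after `train_and_predict` returns.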
