Feature/curvecurator #84
Merged
Commits (17):
1e09922 added CurveCurator logic (picciama)
1f01a14 Merge branch 'development' into feature/curvecurator (picciama)
a813b66 annotated pipeline defs, sep curvecurator logic (picciama)
6b9c28c fixed curvecuration, and added unittest (picciama)
b49c6df fixed mypy (picciama)
4c8ad80 updated versions to 3.11 (picciama)
2e8c796 updated version to 3.11 (picciama)
6e29e73 cleaned up imports (picciama)
bbc6026 fixed malformed sphinx config (picciama)
c0ae3d6 fixed typeguard issues with load_dataset call (picciama)
4bd219d docstring for curvecurator, calc ic50, measure arg (picciama)
8c65e2a added curve_curator option to load_dataset (picciama)
56ead0d added cores/measures args, path_data simplified (picciama)
c91a747 fix typo (picciama)
47c08b9 fix missing args in test_suite (picciama)
6b200c3 explicitly setting measure for toy_data (picciama)
6c84254 fix missing string conversion (picciama)
This file was deleted.
Changed file (Sphinx docs requirements):

```diff
@@ -1,5 +1,4 @@
-sphinx-autobuild==2024.10.3 ; python_version >= "3.9" and python_full_version <= "3.13.0"
-sphinx-autodoc-typehints==2.3.0 ; python_version >= "3.9" and python_full_version <= "3.13.0"
-sphinx-click==6.0.0 ; python_version >= "3.9" and python_full_version <= "3.13.0"
-sphinx-rtd-theme==3.0.2 ; python_version >= "3.9" and python_full_version <= "3.13.0"
--e .
+sphinx-autobuild==2024.10.3 ; python_version >= "3.11" and python_version < "3.13"
+sphinx-autodoc-typehints==2.5.0 ; python_version >= "3.11" and python_version < "3.13"
+sphinx-click==6.0.0 ; python_version >= "3.11" and python_version < "3.13"
+sphinx-rtd-theme==3.0.2 ; python_version >= "3.11" and python_version < "3.13"
```
New file added (186 lines):

```python
"""Contains all functions required for CurveCurator fitting."""

import subprocess
from pathlib import Path

import numpy as np
import pandas as pd
import toml

from ..pipeline_function import pipeline_function


def _prepare_raw_data(curve_df: pd.DataFrame, output_dir: str | Path):
    required_columns = ["dose", "response", "sample", "drug"]
    if not all(col in curve_df.columns for col in required_columns):
        raise ValueError(f"Missing columns in viability data. Required columns are {required_columns}.")
    if "replicate" in curve_df.columns:
        required_columns.append("replicate")
    curve_df = curve_df[required_columns]
    n_replicates = 1
    conc_columns = ["dose"]
    has_multicol_index = False
    if "replicate" in curve_df.columns:
        n_replicates = curve_df["replicate"].nunique()
        conc_columns.append("replicate")
        has_multicol_index = True

    # One row per (sample, drug) curve, one column per dose (and replicate).
    df = curve_df.pivot(index=["sample", "drug"], columns=conc_columns, values="response")

    # Insert a control experiment (dose 0, response normalized to 1.0) per replicate.
    for i in range(n_replicates):
        df.insert(0, (0.0, n_replicates - i), 1.0)

    concentrations = df.columns.sort_values()
    df = df[concentrations]

    experiments = np.arange(df.shape[1])
    df.insert(0, "Name", df.index.map(lambda x: f"{x[0]}|{x[1]}"))
    df.columns = ["Name"] + [f"Raw {i}" for i in experiments]

    curvecurator_folder = Path(output_dir)
    curvecurator_folder.mkdir(exist_ok=True, parents=True)
    df.to_csv(curvecurator_folder / "curvecurator_input.tsv", sep="\t", index=False)

    if has_multicol_index:
        doses = [pair[0] for pair in concentrations]
    else:
        doses = concentrations.to_list()
    return len(experiments), doses, n_replicates, len(df)


def _prepare_toml(filename: str, n_exp: int, n_replicates: int, doses: list[float], dataset_name: str, cores: int):
    config = {
        "Meta": {
            "id": filename,
            "description": dataset_name,
            "condition": "drug",
            "treatment_time": "72 h",
        },
        "Experiment": {
            "experiments": list(range(n_exp)),
            "doses": doses,
            "dose_scale": "1e-06",
            "dose_unit": "M",
            "control_experiment": list(range(n_replicates)),
            "measurement_type": "OTHER",
            "data_type": "OTHER",
            "search_engine": "OTHER",
            "search_engine_version": "0",
        },
        "Paths": {
            "input_file": "curvecurator_input.tsv",
            "curves_file": "curves.txt",
            "normalization_file": "norm.txt",
            "mad_file": "mad.txt",
            "dashboard": "dashboard.html",
        },
        "Processing": {
            "available_cores": cores,
            "max_missing": max(len(doses) - 5, 0),
            "imputation": False,
            "normalization": False,
        },
        "Curve Fit": {
            "type": "OLS",
            "speed": "exhaustive",
            "max_iterations": 1000,
            "interpolation": False,
            "control_fold_change": True,
        },
        "F Statistic": {
            "optimized_dofs": True,
            "alpha": 0.05,
            "fc_lim": 0.45,
        },
    }
    return config


def _exec_curvecurator(output_dir: Path):
    command = ["CurveCurator", str(output_dir / "config.toml"), "--mad"]
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    process.communicate()


@pipeline_function
def preprocess(input_file: str | Path, output_dir: str | Path, dataset_name: str, cores: int):
    """
    Preprocess raw viability data and create config.toml for use with CurveCurator.

    :param input_file: Path to the csv file containing the raw viability data
    :param output_dir: Path to store all files in, including the preprocessed data, the config.toml
        for CurveCurator, CurveCurator's output files, and the postprocessed data
    :param dataset_name: Name of the dataset
    :param cores: The number of cores to be used for fitting the curves with CurveCurator.
        This parameter is written into the config.toml, capped at the number of curves to fit
        (min(n_curves, cores))
    """
    input_file = Path(input_file)
    output_dir = Path(output_dir)
    curve_df = pd.read_csv(input_file)

    n_exp, doses, n_replicates, n_curves_to_fit = _prepare_raw_data(curve_df, output_dir)
    cores = min(n_curves_to_fit, cores)

    config = _prepare_toml(input_file.name, n_exp, n_replicates, doses, dataset_name, cores)
    with open(output_dir / "config.toml", "w") as f:
        toml.dump(config, f)


@pipeline_function
def postprocess(output_folder: str | Path, dataset_name: str):
    """
    Postprocess the CurveCurator output file.

    This function reads the curves.txt file created by CurveCurator, which contains the
    fitted curve parameters, and postprocesses it for use by drevalpy.

    :param output_folder: Path to the output folder of CurveCurator containing the curves.txt file.
    :param dataset_name: The name of the dataset, used to name the postprocessed <dataset_name>.csv file
    """
    output_folder = Path(output_folder)
    required_columns = {
        "Name": "Name",
        "pEC50": "response",
        "pEC50 Error": "pEC50Error",
        "Curve Slope": "Slope",
        "Curve Front": "Front",
        "Curve Back": "Back",
        "Curve Fold Change": "FoldChange",
        "Curve AUC": "AUC",
        "Curve R2": "R2",
        "Curve P_Value": "pValue",
        "Curve Relevance Score": "RelevanceScore",
        "Curve F_Value": "fValue",
        "Curve Log P_Value": "negLog10pValue",
        "Signal Quality": "SignalQuality",
        "Curve RMSE": "RMSE",
        "Curve F_Value SAM Corrected": "fValueSAMCorrected",
        "Curve Regulation": "Regulation",
    }
    fitted_curve_data = pd.read_csv(output_folder / "curves.txt", sep="\t", usecols=required_columns).rename(
        columns=required_columns
    )
    fitted_curve_data[["cell_line_id", "drug_id"]] = fitted_curve_data.Name.str.split("|", expand=True)
    fitted_curve_data.to_csv(output_folder / f"{dataset_name}.csv", index=False)


def fit_curves(input_file: str | Path, output_dir: str | Path, dataset_name: str, cores: int):
    """
    Fit curves for the provided raw viability data.

    This function reads viability data in a predefined input format, preprocesses the data
    to be readable by CurveCurator, fits curves to the data using CurveCurator, and postprocesses
    the fitted data into the format required by drevalpy.

    :param input_file: Path to the file containing the raw viability data
    :param output_dir: Path to store all files in, including the preprocessed data, the config.toml
        for CurveCurator, CurveCurator's output files, and the postprocessed data
    :param dataset_name: The name of the dataset, used to name the postprocessed <dataset_name>.csv file
    :param cores: The number of cores to be used for fitting the curves with CurveCurator.
        This parameter is written into the config.toml, capped at the number of curves to fit
        (min(n_curves, cores))
    """
    preprocess(input_file, output_dir, dataset_name, cores)
    _exec_curvecurator(Path(output_dir))
    postprocess(output_dir, dataset_name)
```
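To illustrate the expected input format and the wide layout the preprocessing step builds, here is a minimal self-contained sketch. The toy sample/drug/dose values are invented for illustration, and it mirrors the pivot logic of `_prepare_raw_data` in simplified form (single replicate, no file output):

```python
import pandas as pd

# Toy viability data in the long format expected by preprocess/fit_curves:
# one row per (sample, drug, dose) measurement.
curve_df = pd.DataFrame({
    "dose":     [0.1, 1.0, 10.0, 0.1, 1.0, 10.0],
    "response": [0.9, 0.6, 0.2, 0.95, 0.7, 0.3],
    "sample":   ["cellA"] * 3 + ["cellB"] * 3,
    "drug":     ["drugX"] * 6,
})

# Wide layout: one row per (sample, drug) curve, one column per dose,
# plus a control column at dose 0 with response normalized to 1.0.
df = curve_df.pivot(index=["sample", "drug"], columns="dose", values="response")
df.insert(0, 0.0, 1.0)  # control experiment
df = df[df.columns.sort_values()]
df.insert(0, "Name", df.index.map(lambda x: f"{x[0]}|{x[1]}"))
df.columns = ["Name"] + [f"Raw {i}" for i in range(df.shape[1] - 1)]
print(df)
```

Each `Raw i` column corresponds to one experiment in the generated config.toml, and the `Name` column encodes the `cell_line_id|drug_id` pair that postprocess later splits back apart.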