
Commit ce5145b

Merge pull request #84 from daisybio/feature/curvecurator
Feature/curvecurator
2 parents 15b6f72 + 6c84254 commit ce5145b

18 files changed, +1195 -746 lines changed


.github/workflows/build_package.yml

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ jobs:
     strategy:
       matrix:
         os: [macos-latest, ubuntu-latest, windows-latest]
-        python: ["3.9", "3.10"]
+        python: ["3.11", "3.12"]
 
     steps:
       - uses: actions/checkout@v4

.github/workflows/publish_docs.yml

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: 3.11
 
       - name: Install pip
         run: |

.github/workflows/run_tests.yml

Lines changed: 8 additions & 8 deletions

@@ -16,14 +16,14 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - { python-version: "3.10", os: ubuntu-latest, session: "pre-commit" }
-          - { python-version: "3.10", os: ubuntu-latest, session: "safety" }
-          - { python-version: "3.10", os: ubuntu-latest, session: "mypy" }
-          - { python-version: "3.10", os: ubuntu-latest, session: "tests" }
-          - { python-version: "3.10", os: windows-latest, session: "tests" }
-          - { python-version: "3.10", os: ubuntu-latest, session: "typeguard" }
-          - { python-version: "3.10", os: ubuntu-latest, session: "xdoctest" }
-          - { python-version: "3.10", os: ubuntu-latest, session: "docs-build" }
+          - { python-version: "3.11", os: ubuntu-latest, session: "pre-commit" }
+          - { python-version: "3.11", os: ubuntu-latest, session: "safety" }
+          - { python-version: "3.11", os: ubuntu-latest, session: "mypy" }
+          - { python-version: "3.11", os: ubuntu-latest, session: "tests" }
+          - { python-version: "3.11", os: windows-latest, session: "tests" }
+          - { python-version: "3.11", os: ubuntu-latest, session: "typeguard" }
+          - { python-version: "3.11", os: ubuntu-latest, session: "xdoctest" }
+          - { python-version: "3.11", os: ubuntu-latest, session: "docs-build" }
 
     env:
       NOXSESSION: ${{ matrix.session }}

docs/API.rst

Lines changed: 0 additions & 29 deletions
This file was deleted.

docs/conf.py

Lines changed: 3 additions & 5 deletions

@@ -16,9 +16,7 @@
 
 from jinja2.defaults import DEFAULT_FILTERS
 
-import drevalpy
-
-sys.path.insert(0, os.path.abspath("../"))
+sys.path.insert(0, os.path.abspath(".."))
 
 
 # -- General configuration ---------------------------------------------
@@ -58,9 +56,9 @@
 # the built documents.
 #
 # The short X.Y version.
-version = drevalpy.__version__
+version = "1.0.10"
 # The full version, including alpha/beta/rc tags.
-release = drevalpy.__version__
+release = "1.0.10"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

docs/requirements.txt

Lines changed: 4 additions & 5 deletions

@@ -1,5 +1,4 @@
-sphinx-autobuild==2024.10.3 ; python_version >= "3.9" and python_full_version <= "3.13.0"
-sphinx-autodoc-typehints==2.3.0 ; python_version >= "3.9" and python_full_version <= "3.13.0"
-sphinx-click==6.0.0 ; python_version >= "3.9" and python_full_version <= "3.13.0"
-sphinx-rtd-theme==3.0.2 ; python_version >= "3.9" and python_full_version <= "3.13.0"
--e .
+sphinx-autobuild==2024.10.3 ; python_version >= "3.11" and python_version < "3.13"
+sphinx-autodoc-typehints==2.5.0 ; python_version >= "3.11" and python_version < "3.13"
+sphinx-click==6.0.0 ; python_version >= "3.11" and python_version < "3.13"
+sphinx-rtd-theme==3.0.2 ; python_version >= "3.11" and python_version < "3.13"

drevalpy/datasets/curvecurator.py

Lines changed: 224 additions & 0 deletions

@@ -0,0 +1,224 @@
+"""
+Contains all functions required for CurveCurator fitting.
+
+CurveCurator publication:
+Bayer, F.P., Gander, M., Kuster, B. et al. CurveCurator: a recalibrated F-statistic to assess,
+classify, and explore significance of dose–response curves. Nat Commun 14, 7902 (2023).
+https://doi.org/10.1038/s41467-023-43696-z
+
+CurveCurator applies a recalibrated F-statistic for p-value estimation of 4-parameter log-logistic
+regression fits. In drevalpy, this can be used to generate training data of higher quality, since
+quality measures such as the p-value, R2, or relevance score can be used to filter out low-quality
+viability measurements.
+"""
+
+import subprocess
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import toml
+
+from ..pipeline_function import pipeline_function
+
+
+def _prepare_raw_data(curve_df: pd.DataFrame, output_dir: str | Path):
+    required_columns = ["dose", "response", "sample", "drug"]
+    if not all(col in curve_df.columns for col in required_columns):
+        raise ValueError(f"Missing columns in viability data. Required columns are {required_columns}.")
+    if "replicate" in curve_df.columns:
+        required_columns.append("replicate")
+    curve_df = curve_df[required_columns]
+    n_replicates = 1
+    conc_columns = ["dose"]
+    has_multicol_index = False
+    if "replicate" in curve_df.columns:
+        n_replicates = curve_df["replicate"].nunique()
+        conc_columns.append("replicate")
+        has_multicol_index = True
+
+    # One row per (sample, drug) curve, one column per dose (and replicate).
+    df = curve_df.pivot(index=["sample", "drug"], columns=conc_columns, values="response")
+
+    # Prepend a zero-dose control column with viability 1.0 for each replicate.
+    for i in range(n_replicates):
+        df.insert(0, (0.0, n_replicates - i), 1.0)
+
+    concentrations = df.columns.sort_values()
+    df = df[concentrations]
+
+    experiments = np.arange(df.shape[1])
+    df.insert(0, "Name", df.index.map(lambda x: f"{x[0]}|{x[1]}"))
+    df.columns = ["Name"] + [f"Raw {i}" for i in experiments]
+
+    curvecurator_folder = Path(output_dir)
+    curvecurator_folder.mkdir(exist_ok=True, parents=True)
+    df.to_csv(curvecurator_folder / "curvecurator_input.tsv", sep="\t", index=False)
+
+    if has_multicol_index:
+        doses = [pair[0] for pair in concentrations]
+    else:
+        doses = concentrations.to_list()
+    return len(experiments), doses, n_replicates, len(df)
+
+
+def _prepare_toml(filename: str, n_exp: int, n_replicates: int, doses: list[float], dataset_name: str, cores: int):
+    config = {
+        "Meta": {
+            "id": filename,
+            "description": dataset_name,
+            "condition": "drug",
+            "treatment_time": "72 h",
+        },
+        "Experiment": {
+            "experiments": list(range(n_exp)),
+            "doses": doses,
+            "dose_scale": "1e-06",
+            "dose_unit": "M",
+            "control_experiment": list(range(n_replicates)),
+            "measurement_type": "OTHER",
+            "data_type": "OTHER",
+            "search_engine": "OTHER",
+            "search_engine_version": "0",
+        },
+        "Paths": {
+            "input_file": "curvecurator_input.tsv",
+            "curves_file": "curves.txt",
+            "normalization_file": "norm.txt",
+            "mad_file": "mad.txt",
+            "dashboard": "dashboard.html",
+        },
+        "Processing": {
+            "available_cores": cores,
+            "max_missing": max(len(doses) - 5, 0),
+            "imputation": False,
+            "normalization": False,
+        },
+        "Curve Fit": {
+            "type": "OLS",
+            "speed": "exhaustive",
+            "max_iterations": 1000,
+            "interpolation": False,
+            "control_fold_change": True,
+        },
+        "F Statistic": {
+            "optimized_dofs": True,
+            "alpha": 0.05,
+            "fc_lim": 0.45,
+        },
+    }
+    return config
+
+
+def _exec_curvecurator(output_dir: Path):
+    # Invoke the CurveCurator command-line tool on the generated config.
+    command = ["CurveCurator", str(output_dir / "config.toml"), "--mad"]
+    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    process.communicate()
+
+
+def _calc_ic50(model_params_df: pd.DataFrame):
+    """
+    Calculate the IC50 from a fitted model.
+
+    This function expects a dataframe that was processed in the postprocess function, containing
+    the columns "Front", "Back", "Slope", and "pEC50". It calculates the IC50 for all models in the
+    dataframe in closed form and adds the column IC50_curvecurator to the input dataframe.
+
+    :param model_params_df: a dataframe containing the fitted parameters
+    """
+
+    def ic50(front, back, slope, pec50):
+        return (np.log10((front - back) / (0.5 + back)) - slope * pec50) / slope
+
+    front = model_params_df["Front"].values
+    back = model_params_df["Back"].values
+    slope = model_params_df["Slope"].values
+    pec50 = model_params_df["pEC50"].values
+
+    model_params_df["IC50_curvecurator"] = ic50(front, back, slope, pec50)
+
+
+@pipeline_function
+def preprocess(input_file: str | Path, output_dir: str | Path, dataset_name: str, cores: int):
+    """
+    Preprocess raw viability data and create the config.toml for use with CurveCurator.
+
+    :param input_file: Path to the csv file containing the raw viability data
+    :param output_dir: Path to store all the files to, including the preprocessed data, the config.toml
+        for CurveCurator, CurveCurator's output files, and the postprocessed data
+    :param dataset_name: Name of the dataset
+    :param cores: The number of cores to be used for fitting the curves with CurveCurator.
+        The value written into the config.toml is capped at the number of curves to fit,
+        i.e. min(n_curves, cores).
+    """
+    input_file = Path(input_file)
+    output_dir = Path(output_dir)
+    curve_df = pd.read_csv(input_file)
+
+    n_exp, doses, n_replicates, n_curves_to_fit = _prepare_raw_data(curve_df, output_dir)
+    cores = min(n_curves_to_fit, cores)
+
+    config = _prepare_toml(input_file.name, n_exp, n_replicates, doses, dataset_name, cores)
+    with open(output_dir / "config.toml", "w") as f:
+        toml.dump(config, f)
+
+
+@pipeline_function
+def postprocess(output_folder: str | Path, dataset_name: str):
+    """
+    Postprocess the CurveCurator output file.
+
+    This function reads the curves.txt file created by CurveCurator, which contains the
+    fitted curve parameters, and postprocesses it into the format used by drevalpy.
+
+    :param output_folder: Path to the output folder of CurveCurator containing the curves.txt file.
+    :param dataset_name: The name of the dataset, used to name the postprocessed <dataset_name>.csv file
+    """
+    output_folder = Path(output_folder)
+    # Map CurveCurator's column names to the names used by drevalpy.
+    required_columns = {
+        "Name": "Name",
+        "pEC50": "pEC50",
+        "pEC50 Error": "pEC50Error",
+        "Curve Slope": "Slope",
+        "Curve Front": "Front",
+        "Curve Back": "Back",
+        "Curve Fold Change": "FoldChange",
+        "Curve AUC": "AUC",
+        "Curve R2": "R2",
+        "Curve P_Value": "pValue",
+        "Curve Relevance Score": "RelevanceScore",
+        "Curve F_Value": "fValue",
+        "Curve Log P_Value": "negLog10pValue",
+        "Signal Quality": "SignalQuality",
+        "Curve RMSE": "RMSE",
+        "Curve F_Value SAM Corrected": "fValueSAMCorrected",
+        "Curve Regulation": "Regulation",
+    }
+    fitted_curve_data = pd.read_csv(output_folder / "curves.txt", sep="\t", usecols=required_columns).rename(
+        columns=required_columns
+    )
+    fitted_curve_data[["cell_line_id", "drug_id"]] = fitted_curve_data.Name.str.split("|", expand=True)
+    # In CurveCurator, EC50 = 10^(-pEC50).
+    fitted_curve_data["EC50_curvecurator"] = np.power(10, -fitted_curve_data["pEC50"].values)
+    _calc_ic50(fitted_curve_data)
+    fitted_curve_data.to_csv(output_folder / f"{dataset_name}.csv", index=False)
+
+
+def fit_curves(input_file: str | Path, output_dir: str | Path, dataset_name: str, cores: int):
+    """
+    Fit curves for the provided raw viability data.
+
+    This function reads viability data in a predefined input format, preprocesses the data
+    to be readable by CurveCurator, fits curves to the data using CurveCurator, and postprocesses
+    the fitted data into the format required by drevalpy.
+
+    :param input_file: Path to the file containing the raw viability data
+    :param output_dir: Path to store all the files to, including the preprocessed data, the config.toml
+        for CurveCurator, CurveCurator's output files, and the postprocessed data
+    :param dataset_name: The name of the dataset, used to name the postprocessed <dataset_name>.csv file
+    :param cores: The number of cores to be used for fitting the curves with CurveCurator.
+        The value written into the config.toml is capped at the number of curves to fit,
+        i.e. min(n_curves, cores).
+    """
+    preprocess(input_file, output_dir, dataset_name, cores)
+    _exec_curvecurator(Path(output_dir))
+    postprocess(output_dir, dataset_name)
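
Taken together, the new module is driven through fit_curves, and the resulting table can then be filtered on the quality measures named in the module docstring. A minimal usage sketch, assuming CurveCurator is installed and on the PATH; the file names and the 0.05 / 0.8 thresholds are illustrative choices, not values from this commit:

from pathlib import Path

import pandas as pd

from drevalpy.datasets.curvecurator import fit_curves

# Fit curves end to end: preprocess -> CurveCurator CLI -> postprocess.
# "viability.csv" is a hypothetical file with the required
# dose, response, sample, and drug columns.
fit_curves(
    input_file="viability.csv",
    output_dir="curvecurator_out",
    dataset_name="my_dataset",
    cores=8,
)

# postprocess() wrote <dataset_name>.csv next to CurveCurator's outputs;
# keep only confidently fitted curves before using them as training data.
curves = pd.read_csv(Path("curvecurator_out") / "my_dataset.csv")
high_quality = curves[(curves["pValue"] < 0.05) & (curves["R2"] > 0.8)]
high_quality.to_csv("my_dataset_filtered.csv", index=False)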

drevalpy/datasets/dataset.py

Lines changed: 6 additions & 2 deletions

@@ -66,7 +66,10 @@ class DrugResponseDataset(Dataset):
 
     @classmethod
     def from_csv(
-        cls: type["DrugResponseDataset"], input_file: str | Path, dataset_name: str = "unknown"
+        cls: type["DrugResponseDataset"],
+        input_file: str | Path,
+        dataset_name: str = "unknown",
+        measure: str = "response",
     ) -> "DrugResponseDataset":
         """
         Load a dataset from a csv file.
@@ -80,6 +83,7 @@ def from_csv(
 
         :param input_file: Path to the csv file containing the data to be loaded
         :param dataset_name: Optional name to associate the dataset with, default = "unknown"
+        :param measure: The name of the column containing the measure to predict, default = "response"
 
         :returns: DrugResponseDataset object containing data from provided csv file.
         """
@@ -89,7 +93,7 @@ def from_csv(
         else:
            predictions = None
        return cls(
-            response=data["response"].values,
+            response=data[measure].values,
            cell_line_ids=data["cell_line_id"].values,
            drug_ids=data["drug_id"].values,
            predictions=predictions,
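
The new measure argument connects this change to the CurveCurator output above: the postprocessed csv already contains cell_line_id and drug_id columns, so a dataset can be constructed directly on a curve-derived measure. A small sketch, continuing the hypothetical example above:

from drevalpy.datasets.dataset import DrugResponseDataset

# Train on the CurveCurator-derived EC50 instead of a plain "response" column.
dataset = DrugResponseDataset.from_csv(
    input_file="curvecurator_out/my_dataset.csv",
    dataset_name="my_dataset",
    measure="EC50_curvecurator",
)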
