|
| 1 | +""" |
| 2 | +Contains all function required for CurveCurator fitting. |
| 3 | +
|
| 4 | +CurveCurator publication: |
| 5 | +Bayer, F.P., Gander, M., Kuster, B. et al. CurveCurator: a recalibrated F-statistic to assess, |
| 6 | +classify, and explore significance of dose–response curves. Nat Commun 14, 7902 (2023). |
| 7 | +https://doi-org.eaccess.tum.edu/10.1038/s41467-023-43696-z |
| 8 | +
|
| 9 | +CurveCurator applies a recalibrated F-statistic for p-value estimation of 4-point log-logistic |
| 10 | +regression fits. In drevalpy, this can be used to generate training data with higher quality, since |
| 11 | +quality measures, such as p-value, R2, or relevance score can be used to filter out viability |
| 12 | +measurements of low quality. |
| 13 | +""" |
| 14 | + |
| 15 | +import subprocess |
| 16 | +from pathlib import Path |
| 17 | + |
| 18 | +import numpy as np |
| 19 | +import pandas as pd |
| 20 | +import toml |
| 21 | + |
| 22 | +from ..pipeline_function import pipeline_function |
| 23 | + |
| 24 | + |
| 25 | +def _prepare_raw_data(curve_df: pd.DataFrame, output_dir: str | Path): |
| 26 | + required_columns = ["dose", "response", "sample", "drug"] |
| 27 | + if not all([col in curve_df.columns for col in required_columns]): |
| 28 | + raise ValueError("Missing columns in viability data. Required columns are {required_columns}.") |
| 29 | + if "replicate" in curve_df.columns: |
| 30 | + required_columns.append("replicate") |
| 31 | + curve_df = curve_df[required_columns] |
| 32 | + n_replicates = 1 |
| 33 | + conc_columns = ["dose"] |
| 34 | + has_multicol_index = False |
| 35 | + if "replicate" in curve_df.columns: |
| 36 | + n_replicates = curve_df["replicate"].nunique() |
| 37 | + conc_columns.append("replicate") |
| 38 | + has_multicol_index = True |
| 39 | + |
| 40 | + df = curve_df.pivot(index=["sample", "drug"], columns=conc_columns, values="response") |
| 41 | + |
| 42 | + for i in range(n_replicates): |
| 43 | + df.insert(0, (0.0, n_replicates - i), 1.0) |
| 44 | + |
| 45 | + concentrations = df.columns.sort_values() |
| 46 | + df = df[concentrations] |
| 47 | + |
| 48 | + experiments = np.arange(df.shape[1]) |
| 49 | + df.insert(0, "Name", df.index.map(lambda x: f"{x[0]}|{x[1]}")) |
| 50 | + df.columns = ["Name"] + [f"Raw {i}" for i in experiments] |
| 51 | + |
| 52 | + curvecurator_folder = Path(output_dir) |
| 53 | + curvecurator_folder.mkdir(exist_ok=True, parents=True) |
| 54 | + df.to_csv(curvecurator_folder / "curvecurator_input.tsv", sep="\t", index=False) |
| 55 | + |
| 56 | + if has_multicol_index: |
| 57 | + doses = [pair[0] for pair in concentrations] |
| 58 | + else: |
| 59 | + doses = concentrations.to_list() |
| 60 | + return len(experiments), doses, n_replicates, len(df) |
| 61 | + |
| 62 | + |
| 63 | +def _prepare_toml(filename: str, n_exp: int, n_replicates: int, doses: list[float], dataset_name: str, cores: int): |
| 64 | + config = { |
| 65 | + "Meta": { |
| 66 | + "id": filename, |
| 67 | + "description": dataset_name, |
| 68 | + "condition": "drug", |
| 69 | + "treatment_time": "72 h", |
| 70 | + }, |
| 71 | + "Experiment": { |
| 72 | + "experiments": range(n_exp), |
| 73 | + "doses": doses, |
| 74 | + "dose_scale": "1e-06", |
| 75 | + "dose_unit": "M", |
| 76 | + "control_experiment": [i for i in range(n_replicates)], |
| 77 | + "measurement_type": "OTHER", |
| 78 | + "data_type": "OTHER", |
| 79 | + "search_engine": "OTHER", |
| 80 | + "search_engine_version": "0", |
| 81 | + }, |
| 82 | + "Paths": { |
| 83 | + "input_file": "curvecurator_input.tsv", |
| 84 | + "curves_file": "curves.txt", |
| 85 | + "normalization_file": "norm.txt", |
| 86 | + "mad_file": "mad.txt", |
| 87 | + "dashboard": "dashboard.html", |
| 88 | + }, |
| 89 | + "Processing": { |
| 90 | + "available_cores": cores, |
| 91 | + "max_missing": max(len(doses) - 5, 0), |
| 92 | + "imputation": False, |
| 93 | + "normalization": False, |
| 94 | + }, |
| 95 | + "Curve Fit": { |
| 96 | + "type": "OLS", |
| 97 | + "speed": "exhaustive", |
| 98 | + "max_iterations": 1000, |
| 99 | + "interpolation": False, |
| 100 | + "control_fold_change": True, |
| 101 | + }, |
| 102 | + "F Statistic": { |
| 103 | + "optimized_dofs": True, |
| 104 | + "alpha": 0.05, |
| 105 | + "fc_lim": 0.45, |
| 106 | + }, |
| 107 | + } |
| 108 | + return config |
| 109 | + |
| 110 | + |
| 111 | +def _exec_curvecurator(output_dir: Path): |
| 112 | + command = ["CurveCurator", str(output_dir / "config.toml"), "--mad"] |
| 113 | + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) |
| 114 | + process.communicate() |
| 115 | + |
| 116 | + |
| 117 | +def _calc_ic50(model_params_df: pd.DataFrame): |
| 118 | + """ |
| 119 | + Calculate the IC50 from a fitted model. |
| 120 | +
|
| 121 | + This function expects a dataframe that was processed in the postprocess function, containing |
| 122 | + the columns "Front", "Back", "Slope", "pEC50". It calculates the IC50 for all the models in the |
| 123 | + dataframe in closed form and adds the column IC50_curvecurator to the input dataframe. |
| 124 | +
|
| 125 | + :param model_params_df: a dataframe containing the fitted parameters |
| 126 | + """ |
| 127 | + |
| 128 | + def ic50(front, back, slope, pec50): |
| 129 | + return (np.log10((front - back) / (0.5 + back)) - slope * pec50) / slope |
| 130 | + |
| 131 | + front = model_params_df["Front"].values |
| 132 | + back = model_params_df["Back"].values |
| 133 | + slope = model_params_df["Slope"].values |
| 134 | + pec50 = model_params_df["pEC50"].values |
| 135 | + |
| 136 | + model_params_df["IC50_curvecurator"] = ic50(front, back, slope, pec50) |
| 137 | + |
| 138 | + |
| 139 | +@pipeline_function |
| 140 | +def preprocess(input_file: str | Path, output_dir: str | Path, dataset_name: str, cores: int): |
| 141 | + """ |
| 142 | + Preprocess raw viability data and create config.toml for use with CurveCurator. |
| 143 | +
|
| 144 | + :param input_file: Path to csv file containing the raw viability data |
| 145 | + :param output_dir: Path to store all the files to, including the preprocessed data, the config.toml |
| 146 | + for CurveCurator, CurveCurator's output files, and the postprocessed data |
| 147 | + :param dataset_name: Name of the dataset |
| 148 | + :param cores: The number of cores to be used for fitting the curves using CurveCurator. |
| 149 | + This parameter is written into the config.toml, but it is min of the number of curves to fit |
| 150 | + and the number given (min(n_curves, cores)) |
| 151 | + """ |
| 152 | + input_file = Path(input_file) |
| 153 | + output_dir = Path(output_dir) |
| 154 | + curve_df = pd.read_csv(input_file) |
| 155 | + |
| 156 | + n_exp, doses, n_replicates, n_curves_to_fit = _prepare_raw_data(curve_df, output_dir) |
| 157 | + cores = min(n_curves_to_fit, cores) |
| 158 | + |
| 159 | + config = _prepare_toml(input_file.name, n_exp, n_replicates, doses, dataset_name, cores) |
| 160 | + with open(output_dir / "config.toml", "w") as f: |
| 161 | + toml.dump(config, f) |
| 162 | + |
| 163 | + |
| 164 | +@pipeline_function |
| 165 | +def postprocess(output_folder: str | Path, dataset_name: str): |
| 166 | + """ |
| 167 | + Postprocess CurveCurator output file. |
| 168 | +
|
| 169 | + This function reads the curves.txt file created by CurveCurator, which contains the |
| 170 | + fitted curve parameters and postprocesses it to be used by drevalpy. |
| 171 | +
|
| 172 | + :param output_folder: Path to the output folder of CurveCurator containin the curves.txt file. |
| 173 | + :param dataset_name: The name of the dataset, will be used to prepend the postprocessed <dataset_name>.csv file |
| 174 | + """ |
| 175 | + output_folder = Path(output_folder) |
| 176 | + required_columns = { |
| 177 | + "Name": "Name", |
| 178 | + "pEC50": "pEC50", |
| 179 | + "pEC50 Error": "pEC50Error", |
| 180 | + "Curve Slope": "Slope", |
| 181 | + "Curve Front": "Front", |
| 182 | + "Curve Back": "Back", |
| 183 | + "Curve Fold Change": "FoldChange", |
| 184 | + "Curve AUC": "AUC", |
| 185 | + "Curve R2": "R2", |
| 186 | + "Curve P_Value": "pValue", |
| 187 | + "Curve Relevance Score": "RelevanceScore", |
| 188 | + "Curve F_Value": "fValue", |
| 189 | + "Curve Log P_Value": "negLog10pValue", |
| 190 | + "Signal Quality": "SignalQuality", |
| 191 | + "Curve RMSE": "RMSE", |
| 192 | + "Curve F_Value SAM Corrected": "fValueSAMCorrected", |
| 193 | + "Curve Regulation": "Regulation", |
| 194 | + } |
| 195 | + fitted_curve_data = pd.read_csv(Path(output_folder) / "curves.txt", sep="\t", usecols=required_columns).rename( |
| 196 | + columns=required_columns |
| 197 | + ) |
| 198 | + fitted_curve_data[["cell_line_id", "drug_id"]] = fitted_curve_data.Name.str.split("|", expand=True) |
| 199 | + fitted_curve_data["EC50_curvecurator"] = np.power( |
| 200 | + 10, -fitted_curve_data["pEC50"].values |
| 201 | + ) # in CurveCurator 10^-pEC50 = EC50 |
| 202 | + _calc_ic50(fitted_curve_data) |
| 203 | + fitted_curve_data.to_csv(output_folder / f"{dataset_name}.csv", index=None) |
| 204 | + |
| 205 | + |
| 206 | +def fit_curves(input_file: str | Path, output_dir: str | Path, dataset_name: str, cores: int): |
| 207 | + """ |
| 208 | + Fit curves for provided raw viability data. |
| 209 | +
|
| 210 | + This functions reads viability data in a predefined input format, preprocesses the data |
| 211 | + to be readable by CurveCurator, fits curves to the data using CurveCurator, and postprocesses |
| 212 | + the fitted data to a format required by drevalpy. |
| 213 | +
|
| 214 | + :param input_file: Path to the file containing the raw viability data |
| 215 | + :param output_dir: Path to store all the files to, including the preprocessed data, the config.toml |
| 216 | + for CurveCurator, CurveCurator's output files, and the postprocessed data |
| 217 | + :param dataset_name: The name of the dataset, will be used to prepend the postprocessed <dataset_name>.csv file |
| 218 | + :param cores: The number of cores to be used for fitting the curves using CurveCurator. |
| 219 | + This parameter is written into the config.toml, but it is min of the number of curves to fit |
| 220 | + and the number given (min(n_curves, cores)) |
| 221 | + """ |
| 222 | + preprocess(input_file, output_dir, dataset_name, cores) |
| 223 | + _exec_curvecurator(Path(output_dir)) |
| 224 | + postprocess(output_dir, dataset_name) |
0 commit comments