Skip to content

Commit da122ac

Browse files
committed
improve calibration speed
1 parent d7bb1b6 commit da122ac

File tree

1 file changed

+20
-49
lines changed

1 file changed

+20
-49
lines changed

im2deep/calibrate.py

Lines changed: 20 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -59,25 +59,13 @@ def get_ccs_shift(
5959
"""
6060
LOGGER.debug(f"Using charge state {use_charge_state} for CCS shift calculation.")
6161

62-
tmp_df = cal_df.copy(deep=True)
63-
tmp_ref_df = reference_dataset.copy(deep=True)
64-
65-
tmp_df["sequence"] = tmp_df["peptidoform"].apply(lambda x: x.proforma.split("\\")[0])
66-
tmp_df["charge"] = tmp_df["peptidoform"].apply(lambda x: x.precursor_charge)
67-
tmp_ref_df["sequence"] = tmp_ref_df["peptidoform"].apply(
68-
lambda x: Peptidoform(x).proforma.split("\\")[0]
69-
)
70-
tmp_ref_df["charge"] = tmp_ref_df["peptidoform"].apply(
71-
lambda x: Peptidoform(x).precursor_charge
72-
)
73-
74-
reference_tmp = tmp_ref_df[tmp_ref_df["charge"] == use_charge_state]
75-
df_tmp = tmp_df[tmp_df["charge"] == use_charge_state]
62+
reference_tmp = reference_dataset[reference_dataset["charge"] == use_charge_state]
63+
df_tmp = cal_df[cal_df["charge"] == use_charge_state]
7664
both = pd.merge(
7765
left=reference_tmp,
7866
right=df_tmp,
7967
right_on=["sequence", "charge"],
80-
left_on=["sequence", "charge"],
68+
left_on=["peptidoform", "charge"],
8169
how="inner",
8270
suffixes=("_ref", "_data"),
8371
)
@@ -90,7 +78,7 @@ def get_ccs_shift(
9078

9179
# Mean amount by which the observed CCS in the calibration data exceeds the
9280
# reference CCS; predictions need to be increased by this amount.
93-
return 0 if both.shape[0] == 0 else np.mean(both["ccs_observed"] - both["CCS"])
81+
return 0 if both.empty else np.mean(both["ccs_observed"] - both["CCS"])
9482

9583

9684
def get_ccs_shift_per_charge(cal_df: pd.DataFrame, reference_dataset: pd.DataFrame) -> ndarray:
@@ -111,25 +99,11 @@ def get_ccs_shift_per_charge(cal_df: pd.DataFrame, reference_dataset: pd.DataFra
11199
CCS shift factors per charge state.
112100
113101
"""
114-
tmp_df = cal_df.copy(deep=True)
115-
tmp_ref_df = reference_dataset.copy(deep=True)
116-
117-
tmp_df["sequence"] = tmp_df["peptidoform"].apply(lambda x: x.proforma.split("\\")[0])
118-
tmp_df["charge"] = tmp_df["peptidoform"].apply(lambda x: x.precursor_charge)
119-
tmp_ref_df["sequence"] = tmp_ref_df["peptidoform"].apply(
120-
lambda x: Peptidoform(x).proforma.split("\\")[0]
121-
)
122-
tmp_ref_df["charge"] = tmp_ref_df["peptidoform"].apply(
123-
lambda x: Peptidoform(x).precursor_charge
124-
)
125-
126-
reference_tmp = tmp_ref_df
127-
df_tmp = tmp_df
128102
both = pd.merge(
129-
left=reference_tmp,
130-
right=df_tmp,
103+
left=reference_dataset,
104+
right=cal_df,
131105
right_on=["sequence", "charge"],
132-
left_on=["sequence", "charge"],
106+
left_on=["peptidoform", "charge"],
133107
how="inner",
134108
suffixes=("_ref", "_data"),
135109
)
@@ -159,7 +133,6 @@ def calculate_ccs_shift(
159133
CCS shift factor.
160134
161135
"""
162-
cal_df["charge"] = cal_df["peptidoform"].apply(lambda x: x.precursor_charge)
163136
cal_df = cal_df[cal_df["charge"] < 7] # predictions do not go higher for IM2Deep
164137

165138
if not per_charge:
@@ -207,37 +180,35 @@ def linear_calibration(
207180
208181
"""
209182
LOGGER.info("Calibrating CCS values using linear calibration...")
183+
calibration_dataset['sequence'] = calibration_dataset['peptidoform'].apply(lambda x: x.proforma.split("\\")[0])
184+
calibration_dataset['charge'] = calibration_dataset['peptidoform'].apply(lambda x: x.precursor_charge)
185+
# reference_dataset['sequence'] = reference_dataset['peptidoform'].apply(lambda x: x.split('/')[0])
186+
reference_dataset['charge'] = reference_dataset['peptidoform'].apply(lambda x: int(x.split('/')[1]))
187+
210188
if per_charge:
189+
LOGGER.info('Getting general shift factor')
211190
general_shift = calculate_ccs_shift(
212191
calibration_dataset,
213192
reference_dataset,
214193
per_charge=False,
215194
use_charge_state=use_charge_state,
216195
)
196+
LOGGER.info('Getting shift factors per charge state')
217197
shift_factor_dict = calculate_ccs_shift(
218198
calibration_dataset, reference_dataset, per_charge=True
219199
)
220-
for charge in preds_df["charge"].unique():
221-
if charge not in shift_factor_dict:
222-
LOGGER.info(
223-
"No overlapping precursors for charge state {}. Using overall shift factor for precursors with that charge.".format(
224-
charge
225-
)
226-
)
227-
shift_factor_dict[charge] = general_shift
228-
LOGGER.info("Shift factors per charge: {}".format(shift_factor_dict))
229-
preds_df["predicted_ccs"] = preds_df.apply(
230-
lambda x: x["predicted_ccs"] + shift_factor_dict[x["charge"]], axis=1
231-
)
200+
201+
preds_df['shift'] = preds_df['charge'].map(shift_factor_dict).fillna(general_shift)
202+
preds_df['predicted_ccs'] = preds_df['predicted_ccs'] + preds_df['shift']
203+
232204
else:
233205
shift_factor = calculate_ccs_shift(
234206
calibration_dataset,
235207
reference_dataset,
236208
per_charge=False,
237209
use_charge_state=use_charge_state,
238210
)
239-
preds_df["predicted_ccs"] = preds_df.apply(
240-
lambda x: x["predicted_ccs"] + shift_factor, axis=1
241-
)
211+
preds_df['predicted_ccs'] += shift_factor
212+
242213
LOGGER.info("CCS values calibrated.")
243214
return preds_df

0 commit comments

Comments
 (0)