@@ -59,25 +59,13 @@ def get_ccs_shift(
5959 """
6060 LOGGER .debug (f"Using charge state { use_charge_state } for CCS shift calculation." )
6161
62- tmp_df = cal_df .copy (deep = True )
63- tmp_ref_df = reference_dataset .copy (deep = True )
64-
65- tmp_df ["sequence" ] = tmp_df ["peptidoform" ].apply (lambda x : x .proforma .split ("\\ " )[0 ])
66- tmp_df ["charge" ] = tmp_df ["peptidoform" ].apply (lambda x : x .precursor_charge )
67- tmp_ref_df ["sequence" ] = tmp_ref_df ["peptidoform" ].apply (
68- lambda x : Peptidoform (x ).proforma .split ("\\ " )[0 ]
69- )
70- tmp_ref_df ["charge" ] = tmp_ref_df ["peptidoform" ].apply (
71- lambda x : Peptidoform (x ).precursor_charge
72- )
73-
74- reference_tmp = tmp_ref_df [tmp_ref_df ["charge" ] == use_charge_state ]
75- df_tmp = tmp_df [tmp_df ["charge" ] == use_charge_state ]
62+ reference_tmp = reference_dataset [reference_dataset ["charge" ] == use_charge_state ]
63+ df_tmp = cal_df [cal_df ["charge" ] == use_charge_state ]
7664 both = pd .merge (
7765 left = reference_tmp ,
7866 right = df_tmp ,
7967 right_on = ["sequence" , "charge" ],
80- left_on = ["sequence " , "charge" ],
68+ left_on = ["peptidoform " , "charge" ],
8169 how = "inner" ,
8270 suffixes = ("_ref" , "_data" ),
8371 )
@@ -90,7 +78,7 @@ def get_ccs_shift(
9078
9179 # How much CCS in calibration data is larger than reference CCS, so predictions
9280 # need to be increased by this amount
93- return 0 if both .shape [ 0 ] == 0 else np .mean (both ["ccs_observed" ] - both ["CCS" ])
81+ return 0 if both .empty else np .mean (both ["ccs_observed" ] - both ["CCS" ])
9482
9583
9684def get_ccs_shift_per_charge (cal_df : pd .DataFrame , reference_dataset : pd .DataFrame ) -> ndarray :
@@ -111,25 +99,11 @@ def get_ccs_shift_per_charge(cal_df: pd.DataFrame, reference_dataset: pd.DataFra
11199 CCS shift factors per charge state.
112100
113101 """
114- tmp_df = cal_df .copy (deep = True )
115- tmp_ref_df = reference_dataset .copy (deep = True )
116-
117- tmp_df ["sequence" ] = tmp_df ["peptidoform" ].apply (lambda x : x .proforma .split ("\\ " )[0 ])
118- tmp_df ["charge" ] = tmp_df ["peptidoform" ].apply (lambda x : x .precursor_charge )
119- tmp_ref_df ["sequence" ] = tmp_ref_df ["peptidoform" ].apply (
120- lambda x : Peptidoform (x ).proforma .split ("\\ " )[0 ]
121- )
122- tmp_ref_df ["charge" ] = tmp_ref_df ["peptidoform" ].apply (
123- lambda x : Peptidoform (x ).precursor_charge
124- )
125-
126- reference_tmp = tmp_ref_df
127- df_tmp = tmp_df
128102 both = pd .merge (
129- left = reference_tmp ,
130- right = df_tmp ,
103+ left = reference_dataset ,
104+ right = cal_df ,
131105 right_on = ["sequence" , "charge" ],
132- left_on = ["sequence " , "charge" ],
106+ left_on = ["peptidoform " , "charge" ],
133107 how = "inner" ,
134108 suffixes = ("_ref" , "_data" ),
135109 )
@@ -159,7 +133,6 @@ def calculate_ccs_shift(
159133 CCS shift factor.
160134
161135 """
162- cal_df ["charge" ] = cal_df ["peptidoform" ].apply (lambda x : x .precursor_charge )
163136 cal_df = cal_df [cal_df ["charge" ] < 7 ] # predictions do not go higher for IM2Deep
164137
165138 if not per_charge :
@@ -207,37 +180,35 @@ def linear_calibration(
207180
208181 """
209182 LOGGER .info ("Calibrating CCS values using linear calibration..." )
183+ calibration_dataset ['sequence' ] = calibration_dataset ['peptidoform' ].apply (lambda x : x .proforma .split ("\\ " )[0 ])
184+ calibration_dataset ['charge' ] = calibration_dataset ['peptidoform' ].apply (lambda x : x .precursor_charge )
185+ # reference_dataset['sequence'] = reference_dataset['peptidoform'].apply(lambda x: x.split('/')[0])
186+ reference_dataset ['charge' ] = reference_dataset ['peptidoform' ].apply (lambda x : int (x .split ('/' )[1 ]))
187+
210188 if per_charge :
189+ LOGGER .info ('Getting general shift factor' )
211190 general_shift = calculate_ccs_shift (
212191 calibration_dataset ,
213192 reference_dataset ,
214193 per_charge = False ,
215194 use_charge_state = use_charge_state ,
216195 )
196+ LOGGER .info ('Getting shift factors per charge state' )
217197 shift_factor_dict = calculate_ccs_shift (
218198 calibration_dataset , reference_dataset , per_charge = True
219199 )
220- for charge in preds_df ["charge" ].unique ():
221- if charge not in shift_factor_dict :
222- LOGGER .info (
223- "No overlapping precursors for charge state {}. Using overall shift factor for precursors with that charge." .format (
224- charge
225- )
226- )
227- shift_factor_dict [charge ] = general_shift
228- LOGGER .info ("Shift factors per charge: {}" .format (shift_factor_dict ))
229- preds_df ["predicted_ccs" ] = preds_df .apply (
230- lambda x : x ["predicted_ccs" ] + shift_factor_dict [x ["charge" ]], axis = 1
231- )
200+
201+ preds_df ['shift' ] = preds_df ['charge' ].map (shift_factor_dict ).fillna (general_shift )
202+ preds_df ['predicted_ccs' ] = preds_df ['predicted_ccs' ] + preds_df ['shift' ]
203+
232204 else :
233205 shift_factor = calculate_ccs_shift (
234206 calibration_dataset ,
235207 reference_dataset ,
236208 per_charge = False ,
237209 use_charge_state = use_charge_state ,
238210 )
239- preds_df ["predicted_ccs" ] = preds_df .apply (
240- lambda x : x ["predicted_ccs" ] + shift_factor , axis = 1
241- )
211+ preds_df ['predicted_ccs' ] += shift_factor
212+
242213 LOGGER .info ("CCS values calibrated." )
243214 return preds_df
0 commit comments