|
1 | | -# TODO to be remove as already implemented in alphaDIA. |
2 | | -from typing import Tuple, Union |
| 1 | +# TODO to be removed |
3 | 2 |
|
4 | | -import numpy as np |
5 | | -import pandas as pd |
6 | | -import tqdm |
7 | | -from alphatims.bruker import TimsTOF |
| 3 | +_DEPRECATION_MSG = "has been moved to alphaviz and will be removed from alpharaw in a future version." |
8 | 4 |
|
9 | | -from alpharaw.ms_data_base import MSData_Base, ms_reader_provider |
10 | | -from alpharaw.wrappers.alphatims_wrapper import AlphaTimsWrapper |
11 | 5 |
|
12 | | -from .psm_match import PepSpecMatch |
| 6 | +def load_ms_data_tims(*args, **kwargs): |
| 7 | + raise DeprecationWarning( |
| 8 | + f"load_ms_data_tims {_DEPRECATION_MSG}" |
| 9 | + ) |
13 | 10 |
|
14 | | -alphatims_hdf_types = [ |
15 | | - "alphatims", |
16 | | - "alphatims_hdf", |
17 | | - "tims.hdf", |
18 | | -] |
19 | 11 |
|
20 | | - |
21 | | -def load_ms_data_tims( |
22 | | - ms_file: Union[str, MSData_Base, TimsTOF], |
23 | | - ms_file_type: str = "alpharaw_hdf", |
24 | | - dda: bool = False, |
25 | | - spectra_sorted_by_rt: bool = True, |
26 | | -) -> Tuple[MSData_Base, TimsTOF]: |
27 | | - """Load ms data as TimsTOF object |
28 | | -
|
29 | | - Parameters |
30 | | - ---------- |
31 | | - ms_file : str |
32 | | - ms2 file path |
33 | | -
|
34 | | - ms_file_type : str, optional |
35 | | - ms2 file type, could be |
36 | | - ["alpharaw_hdf","raw.hdf","thermo","sciex","alphapept_hdf","mgf"]. |
37 | | - Defaults to 'alpharaw_hdf' |
38 | | -
|
39 | | - dda : bool, optional |
40 | | - if it is DDA data, by default False |
41 | | -
|
42 | | - spectra_sorted_by_rt : bool, optional |
43 | | - If spectra are already sorted by RT. |
44 | | - Defaults to True |
45 | | -
|
46 | | - Returns |
47 | | - ------- |
48 | | - tuple |
49 | | - MSData_Base: alpharaw MS Data (Reader) object |
50 | | - TimsTOF: AlphaTims object |
51 | | - """ |
52 | | - if isinstance(ms_file, TimsTOF): |
53 | | - return None, ms_file |
54 | | - elif ms_file_type.lower() in alphatims_hdf_types: |
55 | | - return None, TimsTOF(ms_file) |
56 | | - else: |
57 | | - if isinstance(ms_file, MSData_Base): |
58 | | - raw_data = ms_file |
59 | | - else: |
60 | | - raw_data = ms_reader_provider.get_reader(ms_file_type) |
61 | | - raw_data.import_raw(ms_file) |
62 | | - |
63 | | - if not spectra_sorted_by_rt: |
64 | | - # RT may not be sorted in AP HDF for timsTOF after preprocessing |
65 | | - raw_data._sort_rt() |
66 | | - |
67 | | - tims_data = AlphaTimsWrapper(raw_data, dda=dda) |
68 | | - return raw_data, tims_data |
69 | | - |
70 | | - |
71 | | -class PepSpecMatch_AlphaTims(PepSpecMatch): |
72 | | - """ |
73 | | - Inherited from :class:`alpharaw.match.psm_match.PepSpecMatch`, but |
74 | | - this can be used for DIA PSM matching by selecting |
75 | | - MS2 spectra with RT (and IM) values. |
76 | | - """ |
77 | | - |
78 | | - #: RT win to get a MS2 spectrum by slicing |
79 | | - rt_sec_tol_to_slice_ms2 = 3.0 |
80 | | - |
81 | | - #: IM win to get a MS2 spectrum by slicing |
82 | | - im_tol_to_slice_ms2 = 0.03 |
83 | | - |
84 | | - #: find closest MS2 for the given RT when slicing |
85 | | - find_k_nearest_ms2_by_rt = True |
86 | | - k_rt_nearest = 1 |
87 | | - |
88 | | - #: find closest MS2 for the given IM when slicing |
89 | | - find_k_nearest_ms2_by_im = False |
90 | | - k_im_nearest = 11 |
91 | | - |
92 | | - def get_peak_df( |
93 | | - self, |
94 | | - precursor_mz: float, |
95 | | - rt_sec: float, |
96 | | - im: float = 0.0, |
97 | | - ) -> pd.DataFrame: |
98 | | - """ |
99 | | - Parameters |
100 | | - ---------- |
101 | | - precursor_mz : float |
102 | | - Precursor m/z value |
103 | | - rt_sec : float |
104 | | - RT value in seconds |
105 | | - im : float, optional |
106 | | - Ion mobility, by default 0.0 |
107 | | -
|
108 | | - Returns |
109 | | - ------- |
110 | | - pd.DataFrame |
111 | | - peak_df in alphatims DF format |
112 | | - """ |
113 | | - rt_slice = slice( |
114 | | - rt_sec - self.rt_sec_tol_to_slice_ms2, |
115 | | - rt_sec + self.rt_sec_tol_to_slice_ms2, |
116 | | - ) |
117 | | - |
118 | | - if im == 0 or self.tims_data.scan_max_index == 1: |
119 | | - im_slice = slice(None) |
120 | | - elif self.find_k_nearest_ms2_by_im and self.tims_data.scan_max_index > 1: |
121 | | - # AlphaTims without AlphaRaw for .d files |
122 | | - im_slice = self.tims_data.scan_max_index - np.searchsorted( |
123 | | - self.tims_data.mobility_values[::-1], im |
124 | | - ) |
125 | | - else: |
126 | | - im_slice = slice( |
127 | | - im - self.im_tol_to_slice_ms2, im + self.im_tol_to_slice_ms2 |
128 | | - ) |
129 | | - |
130 | | - spec_df = self.tims_data[rt_slice, im_slice, precursor_mz:precursor_mz] |
131 | | - |
132 | | - def find_k_nearest(array, val, k=3): |
133 | | - nearest = np.argmin(np.abs(array - val)) |
134 | | - if nearest <= k // 2: |
135 | | - return slice(k) |
136 | | - elif nearest >= len(array) - k // 2 - 1: |
137 | | - return slice(-k, None) |
138 | | - else: |
139 | | - return slice(nearest - k // 2, nearest + k // 2 + 1) |
140 | | - |
141 | | - if ( |
142 | | - self.find_k_nearest_ms2_by_im |
143 | | - and im > 0 |
144 | | - and self.tims_data.scan_max_index > 1 |
145 | | - ): |
146 | | - # RAW from AlphaRaw, mobility===0 in AlphaTims wrapper obj |
147 | | - scan_idxes = np.sort(spec_df.scan_indices.unique()) |
148 | | - if len(scan_idxes) > 1: # im from psm |
149 | | - scan_idxes = scan_idxes[ |
150 | | - find_k_nearest( |
151 | | - self.raw_data.spectrum_df.mobility.values[scan_idxes], |
152 | | - im, |
153 | | - self.k_im_nearest, |
154 | | - ) |
155 | | - ] |
156 | | - spec_df = spec_df[spec_df.scan_indices.isin(scan_idxes)] |
157 | | - |
158 | | - if self.find_k_nearest_ms2_by_rt: |
159 | | - rt_values = np.sort(spec_df.rt_values.unique()) |
160 | | - if len(rt_values) > 1: |
161 | | - closest_rts = rt_values[ |
162 | | - find_k_nearest(rt_values, rt_sec, self.k_rt_nearest) |
163 | | - ] |
164 | | - spec_df = spec_df[spec_df.rt_values.isin(closest_rts)] |
165 | | - |
166 | | - return spec_df |
167 | | - |
168 | | - def get_peaks( |
169 | | - self, |
170 | | - precursor_mz: float, |
171 | | - rt_sec: float, |
172 | | - im: float = 0.0, |
173 | | - ) -> tuple: |
174 | | - """ |
175 | | - Parameters |
176 | | - ---------- |
177 | | - precursor_mz : float |
178 | | - Precursor m/z value |
179 | | - rt_sec : float |
180 | | - RT value in seconds |
181 | | - im : float, optional |
182 | | - Ion mobility, by default 0.0 |
183 | | -
|
184 | | - Returns |
185 | | - ------- |
186 | | - tuple |
187 | | - np.ndarray: peak m/z values |
188 | | - np.ndarray: peak intensity values |
189 | | - """ |
190 | | - spec_df = self.get_peak_df(precursor_mz, rt_sec, im) |
191 | | - spec_df = spec_df.sort_values("mz_values").reset_index(drop=True) |
192 | | - return (spec_df.mz_values.values, spec_df.intensity_values.values) |
193 | | - |
194 | | - def load_ms_data( |
195 | | - self, |
196 | | - ms_file, |
197 | | - ms_file_type, |
198 | | - dda=False, |
199 | | - spectra_sorted_by_rt=True, |
200 | | - ): |
201 | | - self.raw_data, self.tims_data = load_ms_data_tims( |
202 | | - ms_file, ms_file_type, dda, spectra_sorted_by_rt |
203 | | - ) |
204 | | - |
205 | | - def match_ms2_one_raw( |
206 | | - self, psm_df_one_raw: pd.DataFrame, verbose: bool = False |
207 | | - ) -> tuple: |
208 | | - """ |
209 | | - Matching psm_df_one_raw against |
210 | | - self.tims_data and self.raw_data |
211 | | - after `self.load_ms_data()` |
212 | | -
|
213 | | - Parameters |
214 | | - ---------- |
215 | | - psm_df_one_raw : pd.DataFrame |
216 | | - psm dataframe |
217 | | - that contains only one raw file |
218 | | -
|
219 | | - Returns |
220 | | - ------- |
221 | | - tuple: |
222 | | - pd.DataFrame: psm dataframe with fragment index information. |
223 | | -
|
224 | | - pd.DataFrame: fragment mz dataframe. |
225 | | -
|
226 | | - pd.DataFrame: matched intensity dataframe. |
227 | | -
|
228 | | - pd.DataFrame: matched mass error dataframe. |
229 | | - np.inf if a fragment is not matched. |
230 | | -
|
231 | | - """ |
232 | | - self.psm_df = psm_df_one_raw |
233 | | - |
234 | | - psm_df_one_raw = self._add_missing_columns_to_psm_df(psm_df_one_raw) |
235 | | - |
236 | | - ( |
237 | | - fragment_mz_df, |
238 | | - matched_intensity_df, |
239 | | - matched_mz_err_df, |
240 | | - ) = self._prepare_matching_dfs() |
241 | | - |
242 | | - if ( |
243 | | - "mobility" in psm_df_one_raw.columns |
244 | | - and "mobility" in self.raw_data.spectrum_df.columns |
245 | | - ): |
246 | | - query_columns = [ |
247 | | - "frag_start_idx", |
248 | | - "frag_stop_idx", |
249 | | - "precursor_mz", |
250 | | - "rt", |
251 | | - "mobility", |
252 | | - ] |
253 | | - else: |
254 | | - query_columns = [ |
255 | | - "frag_start_idx", |
256 | | - "frag_stop_idx", |
257 | | - "precursor_mz", |
258 | | - "rt", |
259 | | - ] |
260 | | - |
261 | | - psm_iters = psm_df_one_raw[query_columns].values |
262 | | - if verbose: |
263 | | - psm_iters = tqdm.tqdm(psm_iters) |
264 | | - |
265 | | - for items in psm_iters: |
266 | | - frag_start_idx = int(items[0]) |
267 | | - frag_stop_idx = int(items[1]) |
268 | | - |
269 | | - spec_mzs, spec_intens = self.get_peaks( |
270 | | - *items[2:], |
271 | | - ) |
272 | | - self._match_one_psm( |
273 | | - spec_mzs, |
274 | | - spec_intens, |
275 | | - fragment_mz_df, |
276 | | - matched_intensity_df, |
277 | | - matched_mz_err_df, |
278 | | - frag_start_idx, |
279 | | - frag_stop_idx, |
280 | | - ) |
281 | | - return (psm_df_one_raw, fragment_mz_df, matched_intensity_df, matched_mz_err_df) |
282 | | - |
283 | | - def match_ms2_multi_raw( |
284 | | - self, |
285 | | - psm_df: pd.DataFrame, |
286 | | - ms_files: Union[dict, list], |
287 | | - ms_file_type: str = "alphatims", |
288 | | - dda: bool = False, |
289 | | - ): |
290 | | - """ |
291 | | - Matching PSM dataframe against the ms2 files in ms_files |
292 | | - This method will store matched values as attributes: |
293 | | - - self.psm_df |
294 | | - - self.fragment_mz_df |
295 | | - - self.matched_intensity_df |
296 | | - - self.matched_mz_err_df |
297 | | -
|
298 | | - Parameters |
299 | | - ---------- |
300 | | - psm_df : pd.DataFrame |
301 | | - PSM dataframe |
302 | | -
|
303 | | - ms_files : dict | list |
304 | | - if dict: {raw_name: ms2 path} |
305 | | - if list: [ms2 path1, ms2 path2] |
306 | | -
|
307 | | - ms_file_type : str, optional |
308 | | - One of ["alphatims_hdf","alpharaw_hdf","thermo","sciex","alphapept_hdf","mgf"] |
309 | | - Defaults to 'alphatims'. |
310 | | -
|
311 | | - Returns |
312 | | - ------- |
313 | | - tuple: |
314 | | - pd.DataFrame: psm dataframe with fragment index information. |
315 | | -
|
316 | | - pd.DataFrame: fragment mz dataframe. |
317 | | -
|
318 | | - pd.DataFrame: matched intensity dataframe. |
319 | | -
|
320 | | - pd.DataFrame: matched mass error dataframe. |
321 | | - np.inf if a fragment is not matched. |
322 | | -
|
323 | | - """ |
324 | | - raise NotImplementedError( |
325 | | - "Not necessary for matching multiple raw files using AlphaTims, " |
326 | | - "loop through `match_ms2_one_raw()`" |
| 12 | +class PepSpecMatch_AlphaTims: |
| 13 | + def __init__(self, *args, **kwargs): |
| 14 | + raise DeprecationWarning( |
| 15 | + f"PepSpecMatch_AlphaTims {_DEPRECATION_MSG}" |
327 | 16 | ) |
0 commit comments