Skip to content

Commit 581da04

Browse files
authored
Merge pull request #162 from compomics/refactor-parse-spectra
Refactor parsing of precursor data from spectrum files
2 parents f8cab91 + b417a6c commit 581da04

File tree

9 files changed

+318
-101
lines changed

9 files changed

+318
-101
lines changed

ms2rescore/core.py

Lines changed: 18 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,14 @@
33
from multiprocessing import cpu_count
44
from typing import Dict, Optional
55

6-
import numpy as np
76
import psm_utils.io
87
from mokapot.dataset import LinearPsmDataset
98
from psm_utils import PSMList
109

1110
from ms2rescore import exceptions
1211
from ms2rescore.feature_generators import FEATURE_GENERATORS
1312
from ms2rescore.parse_psms import parse_psms
14-
from ms2rescore.parse_spectra import get_missing_values
13+
from ms2rescore.parse_spectra import add_precursor_values
1514
from ms2rescore.report import generate
1615
from ms2rescore.rescoring_engines import mokapot, percolator
1716
from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence
@@ -62,20 +61,28 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
6261
)
6362

6463
# Add missing precursor info from spectrum file if needed
65-
psm_list = _fill_missing_precursor_info(psm_list, config)
64+
available_ms_data = add_precursor_values(
65+
psm_list, config["spectrum_path"], config["spectrum_id_pattern"]
66+
)
6667

6768
# Add rescoring features
6869
for fgen_name, fgen_config in config["feature_generators"].items():
69-
# TODO: Handle this somewhere else, more generally?
70-
if fgen_name == "maxquant" and not (psm_list["source"] == "msms").all():
71-
logger.warning(
72-
"MaxQuant feature generator requires PSMs from a MaxQuant msms.txt file. Skipping "
73-
"this feature generator."
74-
)
75-
continue
70+
# Compile configuration
7671
conf = config.copy()
7772
conf.update(fgen_config)
7873
fgen = FEATURE_GENERATORS[fgen_name](**conf)
74+
75+
# Check if required MS data is available
76+
missing_ms_data = fgen.required_ms_data - available_ms_data
77+
if missing_ms_data:
78+
logger.warning(
79+
f"Skipping feature generator {fgen_name} because required MS data is missing: "
80+
f"{missing_ms_data}. Ensure that the required MS data is present in the input "
81+
"files or disable the feature generator."
82+
)
83+
continue
84+
85+
# Add features
7986
fgen.add_features(psm_list)
8087
logger.debug(f"Adding features from {fgen_name}: {set(fgen.feature_names)}")
8188
feature_names[fgen_name] = set(fgen.feature_names)
@@ -102,6 +109,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
102109
# Write feature names to file
103110
_write_feature_names(feature_names, output_file_root)
104111

112+
# Rename PSMs to USIs if requested
105113
if config["rename_to_usi"]:
106114
logging.debug(f"Creating USIs for {len(psm_list)} PSMs")
107115
psm_list["spectrum_id"] = [psm.get_usi(as_url=False) for psm in psm_list]
@@ -183,51 +191,6 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
183191
logger.exception(e)
184192

185193

186-
def _fill_missing_precursor_info(psm_list: PSMList, config: Dict) -> PSMList:
187-
"""Fill missing precursor info from spectrum file if needed."""
188-
# Check if required
189-
# TODO: avoid hard coding feature generators in some way
190-
rt_required = ("deeplc" in config["feature_generators"]) and any(
191-
v is None or v == 0 or np.isnan(v) for v in psm_list["retention_time"]
192-
)
193-
im_required = (
194-
"ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"]
195-
) and any(v is None or v == 0 or np.isnan(v) for v in psm_list["ion_mobility"])
196-
logger.debug(f"RT required: {rt_required}, IM required: {im_required}")
197-
198-
# Add missing values
199-
if rt_required or im_required:
200-
logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
201-
get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required)
202-
203-
# Check if values are now present
204-
for value_name, required in [("retention_time", rt_required), ("ion_mobility", im_required)]:
205-
if required and (
206-
0.0 in psm_list[value_name]
207-
or None in psm_list[value_name]
208-
or np.isnan(psm_list[value_name]).any()
209-
):
210-
if all(v is None or v == 0.0 or np.isnan(v) for v in psm_list[value_name]):
211-
raise exceptions.MissingValuesError(
212-
f"Could not find any '{value_name}' values in PSM or spectrum files. Disable "
213-
f"feature generators that require '{value_name}' or ensure that the values are "
214-
"present in the input files."
215-
)
216-
else:
217-
missing_value_psms = psm_list[
218-
[v is None or np.isnan(v) for v in psm_list[value_name]]
219-
]
220-
logger.warning(
221-
f"Found {len(missing_value_psms)} PSMs with missing '{value_name}' values. "
222-
"These PSMs will be removed."
223-
)
224-
psm_list = psm_list[
225-
[v is not None and not np.isnan(v) for v in psm_list[value_name]]
226-
]
227-
228-
return psm_list
229-
230-
231194
def _filter_by_rank(psm_list: PSMList, max_rank: int, lower_score_better: bool) -> PSMList:
232195
"""Filter PSMs by rank."""
233196
psm_list.set_ranks(lower_score_better=lower_score_better)

ms2rescore/feature_generators/base.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,17 @@
11
from abc import ABC, abstractmethod
2+
from typing import Set
23

34
from psm_utils import PSMList
45

6+
from ms2rescore.parse_spectra import MSDataType
7+
58

69
class FeatureGeneratorBase(ABC):
710
"""Base class from which all feature generators must inherit."""
811

12+
# List of required MS data types for feature generation
13+
required_ms_data: Set[MSDataType] = set()
14+
915
def __init__(self, *args, **kwargs) -> None:
1016
super().__init__()
1117

ms2rescore/feature_generators/deeplc.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from psm_utils import PSMList
2828

2929
from ms2rescore.feature_generators.base import FeatureGeneratorBase
30+
from ms2rescore.parse_spectra import MSDataType
3031

3132
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
3233
logger = logging.getLogger(__name__)
@@ -35,6 +36,8 @@
3536
class DeepLCFeatureGenerator(FeatureGeneratorBase):
3637
"""DeepLC retention time-based feature generator."""
3738

39+
required_ms_data = {MSDataType.retention_time}
40+
3841
def __init__(
3942
self,
4043
*args,
@@ -138,6 +141,7 @@ def add_features(self, psm_list: PSMList) -> None:
138141
)
139142

140143
# Disable wild logging to stdout by Tensorflow, unless in debug mode
144+
141145
with contextlib.redirect_stdout(
142146
open(os.devnull, "w", encoding="utf-8")
143147
) if not self._verbose else contextlib.nullcontext():

ms2rescore/feature_generators/im2deep.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from psm_utils import PSMList
2323

2424
from ms2rescore.feature_generators.base import FeatureGeneratorBase
25+
from ms2rescore.parse_spectra import MSDataType
2526

2627
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
2728
logger = logging.getLogger(__name__)
@@ -30,6 +31,8 @@
3031
class IM2DeepFeatureGenerator(FeatureGeneratorBase):
3132
"""IM2Deep collision cross section feature generator."""
3233

34+
required_ms_data = {MSDataType.ion_mobility}
35+
3336
def __init__(
3437
self,
3538
*args,

ms2rescore/feature_generators/ionmob.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from psm_utils import Peptidoform, PSMList
2525

2626
from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException
27+
from ms2rescore.parse_spectra import MSDataType
2728

2829
try:
2930
from ionmob import __file__ as ionmob_file
@@ -55,6 +56,8 @@
5556
class IonMobFeatureGenerator(FeatureGeneratorBase):
5657
"""Ionmob collisional cross section (CCS)-based feature generator."""
5758

59+
required_ms_data = {MSDataType.ion_mobility}
60+
5861
def __init__(
5962
self,
6063
*args,

ms2rescore/feature_generators/maxquant.py

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,16 @@
2323
class MaxQuantFeatureGenerator(FeatureGeneratorBase):
2424
"""Generate MaxQuant-derived features."""
2525

26+
available_features = [
27+
"mean_error_top7",
28+
"sq_mean_error_top7",
29+
"stdev_error_top7",
30+
"ln_explained_ion_current",
31+
"ln_nterm_ion_current_ratio",
32+
"ln_cterm_ion_current_ratio",
33+
"ln_ms2_ion_current",
34+
]
35+
2636
def __init__(self, *args, **kwargs) -> None:
2737
"""
2838
Generate MaxQuant-derived features.
@@ -39,30 +49,30 @@ def __init__(self, *args, **kwargs) -> None:
3949
4050
"""
4151
super().__init__(*args, **kwargs)
52+
self._feature_names = self.available_features.copy()
4253

4354
@property
4455
def feature_names(self) -> List[str]:
45-
return [
46-
"mean_error_top7",
47-
"sq_mean_error_top7",
48-
"stdev_error_top7",
49-
"ln_explained_ion_current",
50-
"ln_nterm_ion_current_ratio",
51-
"ln_cterm_ion_current_ratio",
52-
"ln_ms2_ion_current",
53-
]
56+
return self._feature_names
5457

5558
def add_features(self, psm_list: PSMList):
5659
"""
57-
Add MS²PIP-derived features to PSMs.
60+
Add MaxQuant-derived features to PSMs.
5861
5962
Parameters
6063
----------
6164
psm_list
6265
PSMs to add features to.
6366
6467
"""
65-
logger.info("Adding MaxQuant-derived features to PSMs.")
68+
# Check if all PSMs are from MaxQuant
69+
if not self._all_psms_from_maxquant(psm_list):
70+
self._feature_names = [] # Set feature names to empty list to indicate none added
71+
logger.warning("Not all PSMs are from MaxQuant. Skipping MaxQuant feature generation.")
72+
return
73+
else:
74+
self._feature_names = self.available_features # Reset feature names
75+
logger.info("Adding MaxQuant-derived features to PSMs.")
6676

6777
# Infer mass deviations column name
6878
for column_name in [
@@ -90,6 +100,11 @@ def add_features(self, psm_list: PSMList):
90100
for psm in psm_list:
91101
psm["rescoring_features"].update(self._compute_features(psm["metadata"]))
92102

103+
@staticmethod
104+
def _all_psms_from_maxquant(psm_list):
105+
"""Check if the PSMs are from MaxQuant."""
106+
return (psm_list["source"] == "msms").all()
107+
93108
def _compute_features(self, psm_metadata):
94109
"""Compute features derived from intensities and mass errors."""
95110
features = {}

ms2rescore/feature_generators/ms2pip.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
from rich.progress import track
3939

4040
from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException
41+
from ms2rescore.parse_spectra import MSDataType
4142
from ms2rescore.utils import infer_spectrum_path
4243

4344
logger = logging.getLogger(__name__)
@@ -46,6 +47,8 @@
4647
class MS2PIPFeatureGenerator(FeatureGeneratorBase):
4748
"""Generate MS²PIP-based features."""
4849

50+
required_ms_data = {MSDataType.ms2_spectra}
51+
4952
def __init__(
5053
self,
5154
*args,

0 commit comments

Comments
 (0)