Skip to content

Commit e4f7a4b

Browse files
authored
Merge pull request #239 from CompOmics/feature/add-regex-validation
Better regex validation
2 parents e7c2d3d + 943318c commit e4f7a4b

File tree

6 files changed

+323
-32
lines changed

6 files changed

+323
-32
lines changed

ms2rescore/config_parser.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,15 @@
33
import importlib.resources
44
import json
55
import multiprocessing as mp
6+
import re
67
from argparse import Namespace
78
from pathlib import Path
89
from typing import Dict, List, Union
910

1011
try:
1112
import tomllib
1213
except ImportError:
13-
import tomli as tomllib
14+
import tomli as tomllib # type: ignore
1415

1516
from cascade_config import CascadeConfig
1617

@@ -86,6 +87,36 @@ def _validate_processes(config: Dict) -> Dict:
8687
return config
8788

8889

90+
def _validate_regular_expressions(config: Dict) -> Dict:
91+
"""Validate regular expressions in configuration."""
92+
for field in [
93+
"psm_id_pattern",
94+
"spectrum_id_pattern",
95+
"psm_id_rt_pattern",
96+
"psm_id_im_pattern",
97+
]:
98+
if config["ms2rescore"][field]:
99+
100+
# Check if valid regex
101+
try:
102+
pattern = re.compile(config["ms2rescore"][field])
103+
except re.error as e:
104+
raise MS2RescoreConfigurationError(
105+
f"Invalid regular expression provided for '{field}': {e}"
106+
) from e
107+
108+
# Check if regex has exactly one capturing group
109+
if pattern.groups != 1:
110+
raise MS2RescoreConfigurationError(
111+
f"Regular expression for '{field}' should contain exactly one "
112+
"capturing group. Please check and try again. "
113+
"See https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra "
114+
"for more information."
115+
)
116+
117+
return config
118+
119+
89120
def parse_configurations(configurations: List[Union[dict, str, Path, Namespace]]) -> Dict:
90121
"""
91122
Parse and validate MS²Rescore configuration files and CLI arguments.
@@ -142,6 +173,7 @@ def parse_configurations(configurations: List[Union[dict, str, Path, Namespace]]
142173
# Validate and infer filenames and number of parallel processes
143174
config = _validate_filenames(config)
144175
config = _validate_processes(config)
176+
config = _validate_regular_expressions(config)
145177

146178
# Convert feature_generators and rescoring_engine names to lowercase
147179
config["ms2rescore"]["feature_generators"] = {

ms2rescore/parse_psms.py

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,39 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
4545
psm_list = _remove_invalid_aa(psm_list)
4646
_find_decoys(psm_list, config["id_decoy_pattern"])
4747
_calculate_qvalues(psm_list, config["lower_score_is_better"])
48+
49+
# Parse retention time and/or ion mobility from spectrum_id if patterns provided
4850
if config["psm_id_rt_pattern"] or config["psm_id_im_pattern"]:
4951
logger.debug("Parsing retention time and/or ion mobility from PSM identifier...")
5052
_parse_values_from_spectrum_id(
5153
psm_list, config["psm_id_rt_pattern"], config["psm_id_im_pattern"]
5254
)
5355

56+
# Apply psm_id_pattern if provided
57+
if config["psm_id_pattern"]:
58+
pattern = re.compile(config["psm_id_pattern"])
59+
logger.debug("Applying 'psm_id_pattern'...")
60+
logger.debug(
61+
f"Parsing '{psm_list[0].spectrum_id}' to '{_match_psm_ids(psm_list[0].spectrum_id, pattern)}'"
62+
)
63+
new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]]
64+
65+
# Validate that the number of unique IDs remains the same
66+
if len(set(new_ids)) != len(set(psm_list["spectrum_id"])):
67+
example_old_id = psm_list["spectrum_id"][0]
68+
example_new_id = new_ids[0]
69+
raise MS2RescoreConfigurationError(
70+
"'psm_id_pattern' resulted in a different number of unique PSM IDs. "
71+
"This might indicate issues with the regex pattern. Please check and try again. "
72+
f"Example old ID: '{example_old_id}' -> new ID: '{example_new_id}'. "
73+
" See "
74+
"https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra "
75+
"for more information."
76+
)
77+
78+
# Assign new IDs
79+
psm_list["spectrum_id"] = new_ids
80+
5481
# Store scoring values for comparison later
5582
for psm in psm_list:
5683
psm.provenance_data.update(
@@ -62,6 +89,7 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
6289
}
6390
)
6491

92+
# Rename and add modifications
6593
logger.debug("Parsing modifications...")
6694
modifications_found = set(
6795
[
@@ -81,15 +109,6 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
81109
psm_list.add_fixed_modifications(config["fixed_modifications"])
82110
psm_list.apply_fixed_modifications()
83111

84-
if config["psm_id_pattern"]:
85-
pattern = re.compile(config["psm_id_pattern"])
86-
logger.debug("Applying 'psm_id_pattern'...")
87-
logger.debug(
88-
f"Parsing '{psm_list[0].spectrum_id}' to '{_match_psm_ids(psm_list[0].spectrum_id, pattern)}'"
89-
)
90-
new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]]
91-
psm_list["spectrum_id"] = new_ids
92-
93112
return psm_list
94113

95114

ms2rescore/parse_spectra.py

Lines changed: 47 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77
from typing import Optional, Set, Tuple
88

99
import numpy as np
10-
from ms2rescore_rs import get_precursor_info
10+
from ms2rescore_rs import Precursor, get_precursor_info
1111
from psm_utils import PSMList
1212

13-
from ms2rescore.exceptions import MS2RescoreError
13+
from ms2rescore.exceptions import MS2RescoreConfigurationError, MS2RescoreError
1414
from ms2rescore.utils import infer_spectrum_path
1515

1616
LOGGER = logging.getLogger(__name__)
@@ -149,6 +149,43 @@ def add_precursor_values(
149149
return available_data_types
150150

151151

152+
def _apply_spectrum_id_pattern(
153+
precursors: dict[str, Precursor], pattern: str
154+
) -> dict[str, Precursor]:
155+
"""Apply spectrum ID pattern to precursor IDs."""
156+
# Map precursor IDs using regex pattern
157+
compiled_pattern = re.compile(pattern)
158+
id_mapping = {
159+
match.group(1): spectrum_id
160+
for spectrum_id in precursors.keys()
161+
if (match := compiled_pattern.search(spectrum_id)) is not None
162+
}
163+
164+
# Validate that any IDs were matched
165+
if not id_mapping:
166+
raise MS2RescoreConfigurationError(
167+
"'spectrum_id_pattern' did not match any spectrum-file IDs. Please check and try "
168+
"again. See "
169+
"https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra "
170+
"for more information."
171+
)
172+
173+
# Validate that the same number of unique IDs were matched
174+
elif len(id_mapping) != len(precursors):
175+
new_id, old_id = next(iter(id_mapping.items()))
176+
raise MS2RescoreConfigurationError(
177+
"'spectrum_id_pattern' resulted in a different number of unique spectrum IDs. This "
178+
"indicates issues with the regex pattern. Please check and try again. "
179+
f"Example old ID: '{old_id}' -> new ID: '{new_id}'. "
180+
"See https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra "
181+
"for more information."
182+
)
183+
184+
precursors = {new_id: precursors[orig_id] for new_id, orig_id in id_mapping.items()}
185+
186+
return precursors
187+
188+
152189
def _get_precursor_values(
153190
psm_list: PSMList, spectrum_path: str, spectrum_id_pattern: Optional[str] = None
154191
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
@@ -162,23 +199,18 @@ def _get_precursor_values(
162199
spectrum_file = infer_spectrum_path(spectrum_path, run_name)
163200

164201
LOGGER.debug("Reading spectrum file: '%s'", spectrum_file)
165-
precursors = get_precursor_info(str(spectrum_file))
202+
precursors: dict[str, Precursor] = get_precursor_info(str(spectrum_file))
166203

167204
# Parse spectrum IDs with regex pattern if provided
168205
if spectrum_id_pattern:
169-
compiled_pattern = re.compile(spectrum_id_pattern)
170-
precursors = {
171-
match.group(1): precursor
172-
for spectrum_id, precursor in precursors.items()
173-
if (match := compiled_pattern.search(spectrum_id)) is not None
174-
}
175-
176-
# Ensure all PSMs have a precursor values
206+
precursors = _apply_spectrum_id_pattern(precursors, spectrum_id_pattern)
207+
208+
# Ensure all PSMs have precursor values
177209
for psm in psm_list_run:
178210
if psm.spectrum_id not in precursors:
179-
raise SpectrumParsingError(
180-
"Mismatch between PSM and spectrum file IDs. Could find precursor values "
181-
f"for PSM with ID {psm.spectrum_id} in run {run_name}.\n"
211+
raise MS2RescoreConfigurationError(
212+
"Mismatch between PSM and spectrum file IDs. Could not find precursor "
213+
f"values for PSM with ID {psm.spectrum_id} in run {run_name}.\n"
182214
"Please check that the `spectrum_id_pattern` and `psm_id_pattern` options "
183215
"are configured correctly. See "
184216
"https://ms2rescore.readthedocs.io/en/stable/userguide/configuration/#mapping-psms-to-spectra"
@@ -199,6 +231,6 @@ def _get_precursor_values(
199231

200232

201233
class SpectrumParsingError(MS2RescoreError):
    """Raised when an error occurs while parsing a spectrum file."""

tests/test_config_parser.py

Lines changed: 95 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
from pathlib import Path
22

3-
from ms2rescore.config_parser import _parse_output_path
3+
import pytest
4+
5+
from ms2rescore.config_parser import _parse_output_path, _validate_regular_expressions
6+
from ms2rescore.exceptions import MS2RescoreConfigurationError
47

58

69
def test__parse_output_path():
@@ -21,3 +24,94 @@ def test__parse_output_path():
2124

2225
for output_path, expected in test_cases:
2326
assert _parse_output_path(output_path, test_psm_file) == expected
27+
28+
29+
def test__validate_regular_expressions_valid():
    """Patterns with exactly one capturing group are accepted and returned as-is."""
    patterns = {
        "psm_id_pattern": r"scan:(\d+):.*",
        "spectrum_id_pattern": r"spectrum_(\d+)",
        "psm_id_rt_pattern": None,
        "psm_id_im_pattern": None,
    }
    config = {"ms2rescore": patterns}

    assert _validate_regular_expressions(config) == config
42+
43+
44+
def test__validate_regular_expressions_none():
    """Unset (``None``) patterns are skipped and the configuration is returned as-is."""
    patterns = {
        "psm_id_pattern": None,
        "spectrum_id_pattern": None,
        "psm_id_rt_pattern": None,
        "psm_id_im_pattern": None,
    }
    config = {"ms2rescore": patterns}

    assert _validate_regular_expressions(config) == config
56+
57+
58+
def test__validate_regular_expressions_invalid_regex():
    """A pattern with invalid regex syntax is reported as a configuration error."""
    patterns = {
        "psm_id_pattern": r"scan:(\d+",  # unbalanced parenthesis
        "spectrum_id_pattern": None,
        "psm_id_rt_pattern": None,
        "psm_id_im_pattern": None,
    }
    config = {"ms2rescore": patterns}
    expected_message = "Invalid regular expression"

    with pytest.raises(MS2RescoreConfigurationError, match=expected_message):
        _validate_regular_expressions(config)
70+
71+
72+
def test__validate_regular_expressions_no_capturing_group():
    """A pattern without any capturing group is rejected."""
    patterns = {
        "psm_id_pattern": r"scan:\d+:.*",  # zero capturing groups
        "spectrum_id_pattern": None,
        "psm_id_rt_pattern": None,
        "psm_id_im_pattern": None,
    }
    config = {"ms2rescore": patterns}
    expected_message = "should contain exactly one capturing group"

    with pytest.raises(MS2RescoreConfigurationError, match=expected_message):
        _validate_regular_expressions(config)
86+
87+
88+
def test__validate_regular_expressions_multiple_capturing_groups():
    """A pattern with more than one capturing group is rejected."""
    patterns = {
        "psm_id_pattern": r"scan:(\d+):(.*)",  # two capturing groups
        "spectrum_id_pattern": None,
        "psm_id_rt_pattern": None,
        "psm_id_im_pattern": None,
    }
    config = {"ms2rescore": patterns}
    expected_message = "should contain exactly one capturing group"

    with pytest.raises(MS2RescoreConfigurationError, match=expected_message):
        _validate_regular_expressions(config)
102+
103+
104+
def test__validate_regular_expressions_spectrum_id_pattern_invalid():
    """The capturing-group check also applies to ``spectrum_id_pattern``."""
    patterns = {
        "psm_id_pattern": None,
        "spectrum_id_pattern": r"spectrum_\d+",  # zero capturing groups
        "psm_id_rt_pattern": None,
        "psm_id_im_pattern": None,
    }
    config = {"ms2rescore": patterns}
    expected_message = "should contain exactly one capturing group"

    with pytest.raises(MS2RescoreConfigurationError, match=expected_message):
        _validate_regular_expressions(config)

tests/test_parse_psms.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import json
2+
from pathlib import Path
3+
4+
import pytest
5+
from psm_utils import PSM, PSMList
6+
7+
from ms2rescore.exceptions import MS2RescoreConfigurationError
8+
from ms2rescore.parse_psms import parse_psms
9+
10+
11+
@pytest.fixture(scope="module")
def default_config():
    """Return the ``ms2rescore`` section of the packaged default configuration."""
    cfg_path = Path(__file__).parents[1] / "ms2rescore" / "package_data" / "config_default.json"
    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # locale encoding
    with cfg_path.open(encoding="utf-8") as cfg_file:
        return json.load(cfg_file)["ms2rescore"]
16+
17+
18+
@pytest.fixture
def psm_list_factory():
    """Provide a factory that builds a ``PSMList`` with one PSM per spectrum ID."""

    def _make_psm(sid):
        # All PSMs are identical except for their spectrum ID
        return PSM(
            peptidoform="PEPTIDE/2",
            run="run1",
            spectrum_id=sid,
            retention_time=None,
            ion_mobility=None,
            precursor_mz=None,
        )

    def _factory(ids):
        psms = []
        for sid in ids:
            psms.append(_make_psm(sid))
        return PSMList(psm_list=psms)

    return _factory
36+
37+
38+
def test_psm_id_pattern_success(default_config, psm_list_factory):
    """``psm_id_pattern`` replaces each spectrum ID with its captured group."""
    psms = psm_list_factory(["scan:1:fileA", "scan:2:fileA"])
    # parse_psms raises when no decoy is present, so flag one PSM as decoy
    psms[0].is_decoy = True

    overrides = {
        "psm_id_pattern": r"scan:(\d+):.*",
        "lower_score_is_better": True,
        "psm_file": [],
        "psm_reader_kwargs": {},
        "id_decoy_pattern": None,
    }
    config = {**default_config, **overrides}

    parsed = parse_psms(config, psms)
    assert list(parsed["spectrum_id"]) == ["1", "2"]
55+
56+
57+
def test_psm_id_pattern_collapses_unique_ids(default_config, psm_list_factory):
    """A ``psm_id_pattern`` that merges distinct spectrum IDs must be rejected."""
    psm_list = psm_list_factory(["scan:1:fileA", "scan:1:fileB"])
    # Ensure at least one decoy is present so that parse_psms gets as far as the
    # 'psm_id_pattern' check instead of raising earlier for an unrelated reason
    psm_list[0].is_decoy = True
    config = dict(default_config)
    config.update(
        {
            "psm_id_pattern": r"scan:(\d+):.*",
            "lower_score_is_better": True,
            "psm_file": [],
            "psm_reader_kwargs": {},
            "id_decoy_pattern": None,
        }
    )

    # Match on the message so the test only passes for the intended error
    with pytest.raises(
        MS2RescoreConfigurationError, match="different number of unique PSM IDs"
    ):
        parse_psms(config, psm_list)

0 commit comments

Comments
 (0)