Skip to content

Commit 4742ee0

Browse files
authored
Merge pull request #124 from levitsky/fix/fragpipe-proforma
Fix Fragpipe proteoform parsing
2 parents d36a56e + dd3490b commit 4742ee0

File tree

2 files changed

+48
-23
lines changed

2 files changed

+48
-23
lines changed

psm_utils/io/fragpipe.py

Lines changed: 43 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@
1919
from typing import Any, cast
2020

2121
import pandas as pd
22+
from pyteomics.proforma import MassModification, to_proforma # type: ignore[import]
2223

2324
from psm_utils.io._base_classes import ReaderBase
2425
from psm_utils.io._utils import set_csv_field_size_limit
26+
from psm_utils.io.exceptions import InvalidModificationError
2527
from psm_utils.psm import PSM
2628
from psm_utils.psm_list import PSMList
2729

@@ -86,7 +88,7 @@ def _get_peptide_spectrum_match(self, psm_dict: dict[str, Any]) -> PSM:
8688

8789
return PSM(
8890
peptidoform=self._parse_peptidoform(
89-
psm_dict["Modified Peptide"], psm_dict["Peptide"], psm_dict["Charge"]
91+
psm_dict["Peptide"], psm_dict["Assigned Modifications"], psm_dict["Charge"]
9092
),
9193
spectrum_id=self._parse_spectrum_id(psm_dict["Spectrum"]),
9294
run=self._parse_run(psm_dict["Spectrum File"]),
@@ -108,22 +110,47 @@ def _get_peptide_spectrum_match(self, psm_dict: dict[str, Any]) -> PSM:
108110
)
109111

110112
@staticmethod
def _parse_peptidoform(peptide: str, modifications: str, charge: str | None) -> str:
    """
    Build a ProForma string from the FragPipe peptide, modification, and charge columns.

    Parameters
    ----------
    peptide
        Unmodified peptide sequence (``Peptide`` column).
    modifications
        Comma-separated modification entries (``Assigned Modifications`` column), each of
        the form ``<position><residue>(<mass>)``, ``N-term(<mass>)`` or ``C-term(<mass>)``,
        for example ``"3M(15.994915), C-term(17.034480)"``. Positions are 1-based. An empty
        value means the peptide is unmodified.
    charge
        Precursor charge. Integer-like values such as ``"2"``, ``"2.0"`` or ``2`` are
        accepted; an empty value or ``None`` omits the charge from the result.

    Returns
    -------
    str
        Peptidoform in ProForma notation, with modifications written as mass shifts.

    Raises
    ------
    InvalidModificationError
        If a modification entry is malformed, its position lies outside the peptide, or
        its residue does not match the peptide sequence at that position.

    """
    sequence: list[tuple[str, list[MassModification]]] = [(aa, []) for aa in peptide]
    n_term: list[MassModification] = []
    c_term: list[MassModification] = []

    # Normalise the charge so that "2.0" is written as "/2" and an empty charge does not
    # produce a dangling "/" in the output.
    charge_state: int | None = int(float(charge)) if charge else None

    for raw_entry in (modifications or "").split(","):
        mod_entry = raw_entry.strip()
        if not mod_entry:
            continue

        # Expected shape: "<site>(<mass>)". Require the closing parenthesis explicitly
        # instead of blindly dropping the last character.
        site, opening, remainder = mod_entry.partition("(")
        if not site or not opening or not remainder.endswith(")"):
            raise InvalidModificationError(
                f"Could not parse modification entry '{mod_entry}'."
            )
        try:
            mass = float(remainder[:-1])
        except ValueError as err:
            raise InvalidModificationError(
                f"Could not parse modification entry '{mod_entry}'."
            ) from err

        if site == "N-term":
            n_term.append(MassModification(mass))
            continue
        if site == "C-term":
            c_term.append(MassModification(mass))
            continue

        # Residue-level entry: 1-based position followed by the one-letter residue code.
        residue = site[-1]
        try:
            idx = int(site[:-1]) - 1
        except ValueError as err:
            raise InvalidModificationError(
                f"Could not parse modification entry '{mod_entry}'."
            ) from err

        if idx < 0 or idx >= len(sequence):
            raise InvalidModificationError(
                f"Modification position {idx + 1} is out of bounds for peptide of "
                f"length {len(sequence)}."
            )
        if sequence[idx][0] != residue:
            raise InvalidModificationError(
                f"Modification site residue '{residue}' does not match "
                f"peptide sequence residue '{sequence[idx][0]}' at position {idx + 1}."
            )
        sequence[idx][1].append(MassModification(mass))

    return to_proforma(sequence, n_term=n_term, c_term=c_term, charge_state=charge_state)
127154

128155
@staticmethod
129156
def _parse_spectrum_id(spectrum: str) -> str:

tests/test_io/test_fragpipe.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -42,16 +42,14 @@ def test_iter(self):
4242

4343
def test__parse_peptidoform(self):
    """Check ProForma output for modified, N-terminally modified, and unmodified peptides."""
    # Maps (peptide, assigned modifications, charge) to the expected ProForma string.
    expected_by_input = {
        (
            "LHMTNQNMEK",
            "3M(15.994915), C-term(17.034480)",
            "3",
        ): "LHM[+15.9949]TNQNMEK-[+17.0345]/3",
        ("ANIAVQR", "N-term(42.0106)", "2"): "[+42.0106]-ANIAVQR/2",
        ("IPAVTYPK", "", "2"): "IPAVTYPK/2",
    }

    reader = FragPipeReader("./tests/test_data/test_fragpipe.tsv")
    for arguments, expected in expected_by_input.items():
        observed = reader._parse_peptidoform(*arguments)
        assert observed == expected
5553

5654
def test__parse_spectrum_id(self):
5755
test_cases = [

0 commit comments

Comments
 (0)