Skip to content

Commit d562771

Browse files
authored
Merge pull request #131 from CompOmics/ATPs/main
`io.percolator`: Improve parsing of Comet-style peptidoform notation
2 parents 247471b + bd0d4db commit d562771

File tree

2 files changed

+33
-2
lines changed

2 files changed

+33
-2
lines changed

psm_utils/io/percolator.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,9 +145,19 @@ def _infer_charge_columns(fieldnames: list[str]) -> tuple[str | None, dict[int,
145145
@staticmethod
def _parse_peptidoform(percolator_peptide: str, charge: int | None) -> Peptidoform:
    """
    Convert a Percolator TSV peptide string into a Peptidoform.

    Flanking residues (``X.SEQUENCE.X``) are stripped when present, Comet-style
    terminal modification labels (``n[...]`` / ``c[...]``) are rewritten to
    ``[...]-`` / ``-[...]``, unsigned numeric labels inside square brackets get an
    explicit ``+`` sign, and a truthy ``charge`` is appended as ``/<charge>`` before
    the resulting string is passed to ``Peptidoform``.
    """
    # Strip flanking residues if the X.SEQUENCE.X layout is present
    # (e.g., R.PEPTIDE.S -> PEPTIDE); otherwise use the input unchanged.
    flanked = re.match(r"^(?:[A-Z-])?\.(.+)\.(?:[A-Z-])?$", percolator_peptide)
    sequence: str = percolator_peptide if flanked is None else flanked.group(1)

    # (pattern, replacement) pairs, applied one after another in this order.
    rewrites = (
        # Comet N-terminal label: n[42.0106]PEPTIDE -> [42.0106]-PEPTIDE
        (r"^n\[([+-]?[\w\.]*?)\]", r"[\1]-"),
        # Comet C-terminal label: PEPTIDEc[-0.9840] -> PEPTIDE-[-0.9840]
        (r"c\[([+-]?[\w\.]*?)\]$", r"-[\1]"),
        # Unsigned numeric labels get an explicit '+': [15.9949] -> [+15.9949]
        (r"\[(\d+[\.]*\d*)]", r"[+\1]"),
    )
    for pattern, replacement in rewrites:
        sequence = re.sub(pattern, replacement, sequence)

    # A missing (None) or zero charge adds no suffix.
    charge_suffix = f"/{charge}" if charge else ""
    return Peptidoform(sequence + charge_suffix)

tests/test_io/test_percolator.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,32 @@ def test__infer_charge_columns(self):
2828

2929
def test_parse_peptidoform(self):
    """Check that Percolator/Comet peptide strings yield the expected ``proforma`` string."""
    # Each entry: ((percolator_peptide, charge), expected proforma string)
    cases = [
        # Basic cases
        (("ACDEFGHR", None), "ACDEFGHR"),
        (("K.ACDEFGHR.I", 1), "ACDEFGHR/1"),
        (("K.ACDEFGHR.-", 2), "ACDEFGHR/2"),
        (("-.ACDEFGHR.I", 3), "ACDEFGHR/3"),
        (("-.ACDEFGHR.-", None), "ACDEFGHR"),
        # N-terminal modifications
        (("-.n[42.0106]ACDEFGHR.-", None), "[+42.0106]-ACDEFGHR"),
        (("n[42.0106]ACDEFGHR", None), "[+42.0106]-ACDEFGHR"),  # Without flanking
        (("-.n[43]ACDEFGHR.-", 2), "[+43]-ACDEFGHR/2"),  # Integer mass
        # C-terminal modifications
        (("-.ACDEFGHRc[-0.9840].-", None), "ACDEFGHR-[-0.984]"),
        (("ACDEFGHRc[-0.9840]", None), "ACDEFGHR-[-0.984]"),  # Without flanking
        (("-.ACDEFGHRc[17.0265].-", 2), "ACDEFGHR-[+17.0265]/2"),  # Positive C-term
        # Internal modifications
        (("-.ACDEFM[15.9949]GHR.-", None), "ACDEFM[+15.9949]GHR"),
        (("-.ACDEM[-18.010565]GHR.-", None), "ACDEM[-18.010565]GHR"),  # Negative internal
        (("-.AC[57.021]DEFGHR.-", None), "AC[+57.021]DEFGHR"),  # Carbamidomethyl
        # Multiple modifications
        (("-.n[43]ACDEFM[16]GHR.-", None), "[+43]-ACDEFM[+16]GHR"),  # N-term + internal
        (("-.ACDEFM[16]GHRc[-1].-", None), "ACDEFM[+16]GHR-[-1]"),  # Internal + C-term
        (("-.n[42]ACDEFM[16]GHRc[-1].-", 2), "[+42]-ACDEFM[+16]GHR-[-1]/2"),  # All three
        (("-.AC[57]DEM[16]GHK.-", None), "AC[+57]DEM[+16]GHK"),  # Multiple internal
        # Already has '+' sign (should not add another)
        (("-.ACDEFM[+15.9949]GHR.-", None), "ACDEFM[+15.9949]GHR"),
        (("-.n[+42.0106]ACDEFGHR.-", None), "[+42.0106]-ACDEFGHR"),
    ]
    for (peptide, charge), expected_proforma in cases:
        parsed = PercolatorTabReader._parse_peptidoform(peptide, charge)
        assert expected_proforma == parsed.proforma

0 commit comments

Comments
 (0)