1919from typing import Any , cast
2020
2121import pandas as pd
22+ from pyteomics .proforma import MassModification , to_proforma # type: ignore[import]
2223
2324from psm_utils .io ._base_classes import ReaderBase
2425from psm_utils .io ._utils import set_csv_field_size_limit
26+ from psm_utils .io .exceptions import InvalidModificationError
2527from psm_utils .psm import PSM
2628from psm_utils .psm_list import PSMList
2729
@@ -86,7 +88,7 @@ def _get_peptide_spectrum_match(self, psm_dict: dict[str, Any]) -> PSM:
8688
8789 return PSM (
8890 peptidoform = self ._parse_peptidoform (
89- psm_dict ["Modified Peptide" ], psm_dict ["Peptide " ], psm_dict ["Charge" ]
91+ psm_dict ["Peptide" ], psm_dict ["Assigned Modifications " ], psm_dict ["Charge" ]
9092 ),
9193 spectrum_id = self ._parse_spectrum_id (psm_dict ["Spectrum" ]),
9294 run = self ._parse_run (psm_dict ["Spectrum File" ]),
@@ -108,22 +110,47 @@ def _get_peptide_spectrum_match(self, psm_dict: dict[str, Any]) -> PSM:
108110 )
109111
@staticmethod
def _parse_peptidoform(peptide: str, modifications: str, charge: str | None) -> str:
    """
    Build a ProForma string from the peptide, assigned modifications, and charge columns.

    Parameters
    ----------
    peptide
        Unmodified peptide sequence; each character is treated as one residue.
    modifications
        Comma-separated modification entries of the form ``<site>(<mass>)``, where
        ``<site>`` is ``N-term``, ``C-term``, or a 1-based position immediately followed
        by the residue letter (e.g. ``5M(15.9949)``). Empty or missing values mean the
        peptide is unmodified.
    charge
        Precursor charge as read from the file (e.g. ``"2"`` or ``"2.0"``). Empty or
        ``None`` means no charge is written.

    Returns
    -------
    str
        ProForma representation of the peptidoform, as produced by ``to_proforma``.

    Raises
    ------
    InvalidModificationError
        If a modification entry cannot be parsed, points outside the peptide, or names a
        residue that differs from the one at that position in ``peptide``.
    ValueError
        If ``charge`` is non-empty but not numeric.

    """
    sequence: list[tuple[str, list[MassModification]]] = [(aa, []) for aa in peptide]
    n_term: list[MassModification] = []
    c_term: list[MassModification] = []

    # Split on the bare comma and strip, so entries parse regardless of the amount of
    # whitespace that follows the separator.
    for raw_entry in modifications.split(",") if modifications else ():
        mod_entry = raw_entry.strip()
        if not mod_entry:
            continue

        # Expect exactly "<site>(<mass>)": a closing parenthesis at the end, a single
        # opening parenthesis, and a non-empty site in front of it.
        site, paren, mass_text = mod_entry[:-1].partition("(")
        if not (mod_entry.endswith(")") and paren and site and "(" not in mass_text):
            raise InvalidModificationError(
                f"Could not parse modification entry '{mod_entry}'."
            )
        try:
            mass = float(mass_text)
        except ValueError as err:
            # Report malformed masses with the same exception type as other bad entries.
            raise InvalidModificationError(
                f"Could not parse modification entry '{mod_entry}'."
            ) from err

        if site == "N-term":
            n_term.append(MassModification(mass))
        elif site == "C-term":
            c_term.append(MassModification(mass))
        else:
            residue = site[-1]
            try:
                idx = int(site[:-1]) - 1  # positions in the file are 1-based
            except ValueError as err:
                raise InvalidModificationError(
                    f"Could not parse modification entry '{mod_entry}'."
                ) from err
            if idx < 0 or idx >= len(sequence):
                raise InvalidModificationError(
                    f"Modification position {idx + 1} is out of bounds for peptide of "
                    f"length {len(sequence)}."
                )
            if sequence[idx][0] != residue:
                raise InvalidModificationError(
                    f"Modification site residue '{residue}' does not match "
                    f"peptide sequence residue '{sequence[idx][0]}' at position {idx + 1}."
                )
            sequence[idx][1].append(MassModification(mass))

    # Normalize the charge to a plain integer (the column may hold "2" or "2.0") so that
    # the raw string is never forwarded into the ProForma charge suffix; an empty value
    # is mapped to "no charge".
    charge_state = int(float(charge)) if charge else None
    return to_proforma(sequence, n_term=n_term, c_term=c_term, charge_state=charge_state)
127154
128155 @staticmethod
129156 def _parse_spectrum_id (spectrum : str ) -> str :
0 commit comments