|
| 1 | +"""Reader for PSM files from the AlphaDIA search engine.""" |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +import csv |
| 6 | +from abc import ABC |
| 7 | +from typing import Iterable, Optional |
| 8 | + |
| 9 | +from psm_utils.io._base_classes import ReaderBase |
| 10 | +from psm_utils.io._utils import set_csv_field_size_limit |
| 11 | +from psm_utils.psm import PSM |
| 12 | +from psm_utils.psm_list import PSMList |
| 13 | + |
# NOTE(review): presumably raises the ``csv`` module's field size limit so that very long
# fields can be parsed; the helper lives in ``psm_utils.io._utils`` -- confirm there.
set_csv_field_size_limit()

# Names of the columns that ``AlphaDIAReader`` copies from each row into the
# ``rescoring_features`` of the resulting PSM; columns missing from a row are skipped.
# TODO: check whether this is the right set of columns to expose as rescoring features.
RESCORING_FEATURES = [
    "rt_observed",
    "mobility_observed",
    "mz_observed",
    "charge",
    "delta_rt",
]
| 24 | + |
| 25 | + |
class AlphaDIAReader(ReaderBase, ABC):
    """
    Reader for AlphaDIA ``precursor.tsv`` PSM files.

    Rows are parsed lazily while iterating over the reader; each row of the tab-separated
    file is converted into one :py:class:`~psm_utils.psm.PSM`.

    """

    def __init__(self, filename, *args, **kwargs):
        """
        Reader for AlphaDIA ``precursor.tsv`` file.

        Parameters
        ----------
        filename : str or Path
            Path to PSM file.
        *args, **kwargs
            Passed on unchanged to the parent class initializer.

        """
        super().__init__(filename, *args, **kwargs)
        # Stored exactly as passed in; read by ``__iter__`` and recorded as provenance data.
        self.filename = filename

    def __iter__(self) -> Iterable[PSM]:
        """Iterate over file and return PSMs one-by-one."""
        # ``newline=""`` leaves line-ending handling to the csv module, as its documentation
        # requires; otherwise newlines embedded in quoted fields may be misinterpreted.
        with open(self.filename, newline="") as msms_in:
            reader = csv.DictReader(msms_in, delimiter="\t")
            for row in reader:
                yield self._get_peptide_spectrum_match(row)

    def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
        """
        Parse a single PSM from a AlphaDIA PSM file.

        Parameters
        ----------
        psm_dict : dict
            Mapping of column name to value for a single row.

        Returns
        -------
        PSM
            The parsed peptide-spectrum match.

        Raises
        ------
        KeyError
            If a mandatory column (e.g. ``sequence``, ``decoy``, ``score``) is missing.

        """
        # Rescoring features are optional: only the columns present in this row are copied.
        rescoring_features = {ft: psm_dict[ft] for ft in RESCORING_FEATURES if ft in psm_dict}

        return PSM(
            peptidoform=self._parse_peptidoform(
                psm_dict["sequence"], psm_dict["mods"], psm_dict["mod_sites"], psm_dict["charge"]
            ),
            spectrum_id=psm_dict["frame_start"],  # TODO: needs to be checked
            run=psm_dict["run"],
            spectrum=psm_dict["frame_start"],  # TODO: needs to be checked
            # ``int(float(...))`` mirrors the charge parsing and also accepts integers that
            # were written in float notation (e.g. ``"1.0"``).
            is_decoy=bool(int(float(psm_dict["decoy"]))),
            score=psm_dict["score"],
            qvalue=psm_dict["qval"],
            pep=psm_dict["proba"],
            precursor_mz=psm_dict["mz_observed"],
            retention_time=psm_dict["rt_observed"],
            ion_mobility=psm_dict["mobility_observed"],
            protein_list=psm_dict["proteins"].split(";"),
            rank=int(float(psm_dict["rank"])) + 1,  # AlphaDIA ranks are 0-based
            source="AlphaDIA",
            provenance_data=({"alphadia_filename": str(self.filename)}),
            metadata={},
            rescoring_features=rescoring_features,
        )

    @staticmethod
    def _parse_peptidoform(sequence: str, mods: str, mod_sites, charge: Optional[str]) -> str:
        """
        Parse a peptidoform from a AlphaDIA PSM file.

        Parameters
        ----------
        sequence : str
            Unmodified amino acid sequence.
        mods : str
            Semicolon-separated modifications, each written as ``name@target``; only the
            part before ``@`` is used. Empty or non-string values mean "no modifications".
        mod_sites : str
            Semicolon-separated sites, parallel to ``mods``: ``0`` is the N-terminus,
            ``-1`` the C-terminus, and any other value the 1-based residue position.
        charge : str, optional
            Precursor charge; float notation such as ``"2.0"`` is accepted.

        Returns
        -------
        str
            Peptidoform string, e.g. ``("ACDMK", "Acetyl@Any N-term;Oxidation@M", "0;4", "2")``
            yields ``"[Acetyl]-ACDM[Oxidation]K/2"``.

        """
        # Parse modifications; a non-string value (``None`` or the NaN that pandas uses for
        # empty cells) is treated as "no modifications" instead of failing on ``.split``.
        if isinstance(mods, str) and mods:
            sequence_list = [""] + list(sequence) + [""]  # N-term, sequence, C-term
            for mod, site in zip(mods.split(";"), mod_sites.split(";")):
                site = int(site)
                name = mod.split("@")[0]
                # N-terminal modification
                if site == 0:
                    sequence_list[0] = f"[{name}]-"
                # C-terminal modification
                elif site == -1:
                    sequence_list[-1] = f"-[{name}]"
                # Sequence modification; the N-term padding makes 1-based sites line up
                else:
                    sequence_list[site] = f"{sequence_list[site]}[{name}]"
            sequence = "".join(sequence_list)

        # Add charge
        if charge:
            sequence += f"/{int(float(charge))}"

        return sequence

    @classmethod
    def from_dataframe(cls, dataframe) -> PSMList:
        """
        Create a PSMList from a AlphaDIA Pandas DataFrame.

        Parameters
        ----------
        dataframe : pandas.DataFrame
            Table with the same columns as the AlphaDIA PSM file, one PSM per row.

        Returns
        -------
        PSMList
            One PSM per row of ``dataframe``. Because no file is involved, the recorded
            ``alphadia_filename`` provenance value is an empty string.

        """
        # A single file-less reader is enough to parse every row.
        reader = cls("")
        return PSMList(
            psm_list=[
                reader._get_peptide_spectrum_match(entry)
                for entry in dataframe.to_dict(orient="records")
            ]
        )
0 commit comments