Skip to content

Commit 133559a

Browse files
authored
Merge pull request #103 from compomics/diann-io
Add I/O support for some DIA search engines
2 parents 27b8591 + 029ba11 commit 133559a

File tree

14 files changed

+695
-4
lines changed

14 files changed

+695
-4
lines changed

README.rst

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,10 @@ Supported file formats
8989
===================================================================================================================== ======================== =============== ===============
9090
File format psm_utils tag Read support Write support
9191
===================================================================================================================== ======================== =============== ===============
92+
`AlphaDIA precursors TSV <https://alphadia.readthedocs.io/en/latest/quickstart.html#output-files>`_ ``alphadia`` ✅ ❌
93+
`DIA-NN TSV <https://github.com/vdemichev/DiaNN#output>`_ ``diann`` ✅ ❌
9294
`FlashLFQ generic TSV <https://github.com/smith-chem-wisc/FlashLFQ/wiki/Identification-Input-Formats>`_ ``flashlfq`` ✅ ✅
95+
`FragPipe PSM TSV <https://fragpipe.nesvilab.org/docs/tutorial_fragpipe_outputs.html#psmtsv/>`_ ``fragpipe`` ✅ ❌
9396
`ionbot CSV <https://ionbot.cloud/>`_ ``ionbot`` ✅ ❌
9497
`OpenMS idXML <https://www.openms.de/>`_ ``idxml`` ✅ ✅
9598
`MaxQuant msms.txt <https://www.maxquant.org/>`_ ``msms`` ✅ ❌
@@ -99,10 +102,10 @@ Supported file formats
99102
`Peptide Record <https://psm-utils.readthedocs.io/en/stable/api/psm_utils.io/#module-psm_utils.io.peptide_record>`_ ``peprec`` ✅ ✅
100103
`pepXML <http://tools.proteomecenter.org/wiki/index.php?title=Formats:pepXML>`_ ``pepxml`` ✅ ❌
101104
`Percolator tab <https://github.com/percolator/percolator/wiki/Interface>`_ ``percolator`` ✅ ✅
102-
Proteome Discoverer MSF ``proteome_discoverer`` ✅ ❌
105+
`Proteome Discoverer MSF <#>`_ ``proteome_discoverer`` ✅ ❌
103106
`Sage Parquet <https://github.com/lazear/sage/blob/v0.14.7/DOCS.md#interpreting-sage-output>`_ ``sage_parquet`` ✅ ❌
104107
`Sage TSV <https://github.com/lazear/sage/blob/v0.14.7/DOCS.md#interpreting-sage-output>`_ ``sage_tsv`` ✅ ❌
105-
ProteoScape Parquet ``proteoscape`` ✅ ❌
108+
`ProteoScape Parquet <#>`_ ``proteoscape`` ✅ ❌
106109
`TSV <https://psm-utils.readthedocs.io/en/stable/api/psm_utils.io/#module-psm_utils.io.tsv>`_ ``tsv`` ✅ ✅
107110
`X!Tandem XML <https://www.thegpm.org/tandem/>`_ ``xtandem`` ✅ ❌
108111
===================================================================================================================== ======================== =============== ===============

docs/source/api/psm_utils.io.rst

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,22 @@ psm_utils.io
77

88

99

10+
psm_utils.io.alphadia
11+
#####################
12+
13+
.. automodule:: psm_utils.io.alphadia
14+
:members:
15+
:inherited-members:
16+
17+
18+
psm_utils.io.diann
19+
##################
20+
21+
.. automodule:: psm_utils.io.diann
22+
:members:
23+
:inherited-members:
24+
25+
1026
psm_utils.io.flashlfq
1127
#####################
1228

@@ -15,6 +31,14 @@ psm_utils.io.flashlfq
1531
:inherited-members:
1632

1733

34+
psm_utils.io.fragpipe
35+
#####################
36+
37+
.. automodule:: psm_utils.io.fragpipe
38+
:members:
39+
:inherited-members:
40+
41+
1842
psm_utils.io.idxml
1943
##################
2044

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
base_width_mobility base_width_rt rt_observed mobility_observed mono_ms1_intensity top_ms1_intensity sum_ms1_intensity weighted_ms1_intensity weighted_mass_deviation weighted_mass_error mz_observed mono_ms1_height top_ms1_height sum_ms1_height weighted_ms1_height isotope_intensity_correlation isotope_height_correlation n_observations intensity_correlation height_correlation intensity_fraction height_fraction intensity_fraction_weighted height_fraction_weighted mean_observation_score sum_b_ion_intensity sum_y_ion_intensity diff_b_y_ion_intensity f_masked fragment_scan_correlation template_scan_correlation fragment_frame_correlation top3_frame_correlation template_frame_correlation top3_b_ion_correlation n_b_ions top3_y_ion_correlation n_y_ions cycle_fwhm mobility_fwhm delta_frame_peak top_3_ms2_mass_error mean_ms2_mass_error n_overlapping mean_overlapping_intensity mean_overlapping_mass_error precursor_idx rank frame_center scan_center score elution_group_idx frame_start scan_stop frame_stop scan_start proteins rt_calibrated flat_frag_start_idx charge mods decoy sequence mz_library channel genes i_0 flat_frag_stop_idx i_2 i_1 i_3 mobility_library rt_library mod_sites delta_rt n_K n_R n_P _decoy proba qval _candidate_idx valid candidate_idx run mod_seq_hash mod_seq_charge_hash pg_master pg pg_qval intensity
2+
0.000000 40.673340 2800.518555 0.000001 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 894.337830 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.968887 0.845673 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 15.266385 -15.266385 1.000000 0.000000 0.000000 0.929785 0.975279 0.000000 0.000000 0.000000 0.948546 12.000000 14.244627 0.000000 -0.500000 0.132713 -0.218829 0.000000 0.000000 0.000000 10447876 0 72329 0 136.160126 5238821 71876 1 72933 0 P18899 2347.609131 59818105 3 0 SSYGSSSNDDSYGSSNNDDSYGSSNK 894.337830 0 DDR48_YEAST 0.273118 59818117 0.249391 0.348172 0.129319 0.948457 1399.216187 452.909424 1 0 0 0.000000 0.000000 0.000000 10447876 True 10447876 LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01 8562405370847133435 8562405370847133438 P18899 P18899 0.000000 190103852.035206
3+
0.000000 40.745483 1647.208252 0.000001 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 986.440491 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.991654 0.992141 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 14.408463 -14.408463 1.000000 0.000000 0.000000 0.738752 0.974915 0.000000 0.000000 0.000000 0.880488 12.000000 9.885651 0.000000 0.000000 -0.391579 -0.698411 0.000000 0.000000 0.000000 8793636 0 42431 0 122.278320 4411698 41978 1 43035 0 Q9ULU4 1670.462402 49907897 2 0 SSQGSSSSTQSAPSETASASK 986.440491 0 PKCB1_HUMAN 0.380560 49907909 0.190793 0.352861 0.075786 1.158085 387.834503 -23.254150 1 0 1 0.000000 0.000000 0.000000 8793636 True 8793636 LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01 5824087303549386971 5824087303549386973 Q9ULU4 Q9ULU4 0.000000 195496849.073322
4+
0.000000 52.349121 2678.317139 0.000001 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 905.432312 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.986449 0.931379 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 16.636572 -16.636572 1.000000 0.000000 0.000000 0.978579 0.996334 0.000000 0.000000 0.000000 0.988605 12.000000 13.867673 0.000000 0.000000 -0.432777 0.780247 0.000000 0.000000 0.000000 7132549 0 69158 0 152.012512 3581144 68554 1 69913 0 O60763 2646.791260 39980635 2 0 SSQTSGTNEQSSAIVSAR 905.432312 0 USO1_HUMAN 0.404900 39980647 0.177361 0.352328 0.065410 1.110423 1774.035034 31.525879 0 1 0 0.000000 0.000000 0.000000 7132549 True 7132549 LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01 14912031975374993231 14912031975374993233 O60763 O60763 0.000000 406414129.849395

example_files/fragpipe.psm.tsv

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Spectrum Spectrum File Peptide Modified Peptide Extended Peptide Prev AA Next AA Peptide Length Charge Retention Observed Mass Calibrated Observed Mass Observed M/Z Calibrated Observed M/Z Calculated Peptide Mass Calculated M/Z Delta Mass Expectation Hyperscore Nextscore Probability Number of Enzymatic Termini Number of Missed Cleavages Protein Start Protein End Intensity Assigned Modifications Observed Modifications Purity Is Unique Protein Protein ID Entry Name Gene Protein Description Mapped Genes Mapped Proteins
2+
LFQ_Orbitrap_AIF_Human_01.100000.100000.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml SEDCFILDHGK SEDCFILDHGK PFAQGAIK.SEDCFILDHGK.DGKIFVWK K D 11 3 3813.8638 1319.5804 1319.5807 440.8674 440.8675 1319.5815 440.8678 -0.0008 0.01264961000000 19.3701 15.5657 0.9968 2 0 328 338 0.0000 4C(57.0214) 0.00 false sp|GELS_HUMAN| GELS_HUMAN GSN sp|P06396|GELS_HUMAN
3+
LFQ_Orbitrap_AIF_Human_01.100002.100002.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml FLLEAGADQEHK KGHIEMVR.FLLEAGADQEHK.TDEMHTAI R T 12 3 3813.9346 1356.6663 1356.6665 453.2294 453.2294 1356.6672 453.2297 -0.0007 0.01950739000000 18.9370 14.3831 0.9985 2 0 419 430 0.0000 0.00 false sp|O75179|ANR17_HUMAN O75179 ANR17_HUMAN ANKRD17 Ankyrin repeat domain-containing protein 17 ANKHD1 sp|Q8IWZ3|ANKH1_HUMAN
4+
LFQ_Orbitrap_AIF_Human_01.100004.100004.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml REELSNVLAAMR REELSNVLAAM[147]R THIRAKRK.REELSNVLAAMR.KAAAKKD K K 12 3 3814.0050 1403.7197 1403.7198 468.9138 468.9139 1403.7190 468.9136 0.0008 0.00008879724000 24.3292 15.9192 0.9998 2 1 87 98 0.0000 11M(15.9949) 0.00 true sp|Q9Y3U8|RL36_HUMAN Q9Y3U8 RL36_HUMAN RPL36 Large ribosomal subunit protein eL36
5+
LFQ_Orbitrap_AIF_Human_01.100040.100040.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml LHISPSNMTNQNTPEYMEK LHISPSNM[147]TNQNTPEYMEKc[17] EYFGPDFK.LHISPSNMTNQNTPEYMEK.IKQRIFEN K I 19 3 3815.4023 2248.0256 2248.0251 750.3491 750.3490 2248.0254 750.3491 -0.0002 0.00194418200000 21.2429 21.2429 0.7143 2 0 344 362 0.0000 8M(15.9949), C-term(-0.9840) 0.00 true sp|Q92769|HDAC2_HUMAN Q92769 HDAC2_HUMAN HDAC2 Histone deacetylase 2
6+
LFQ_Orbitrap_AIF_Human_01.101373.101373.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml ANIAVQR n[43]ANIAVQR .ANIAVQR.IKREFKEV M I 7 2 3866.1475 812.4501 812.4503 407.2323 407.2324 812.4505 407.2325 -0.0002 0.11090580000000 17.1991 14.1196 0.9898 2 0 2 8 0.0000 N-term(42.0106) 0.00 true sp|P61086|UBE2K_HUMAN P61086 UBE2K_HUMAN UBE2K Ubiquitin-conjugating enzyme E2 K

psm_utils/io/__init__.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@
88

99
from rich.progress import track
1010

11+
import psm_utils.io.alphadia as alphadia
12+
import psm_utils.io.diann as diann
1113
import psm_utils.io.flashlfq as flashlfq
14+
import psm_utils.io.fragpipe as fragpipe
1215
import psm_utils.io.idxml as idxml
1316
import psm_utils.io.ionbot as ionbot
1417
import psm_utils.io.maxquant as maxquant
@@ -113,13 +116,31 @@
113116
"extension": ".parquet",
114117
"filename_pattern": r"^.*(?:_|\.)sage.parquet$",
115118
},
116-
"parquet": { # List after proteoscape and sage to avoid extension matching conflicts
119+
"fragpipe": {
120+
"reader": fragpipe.FragPipeReader,
121+
"writer": None,
122+
"extension": ".tsv",
123+
"filename_pattern": r"^.*(?:_|\.)?psm\.tsv$",
124+
},
125+
"alphadia": {
126+
"reader": alphadia.AlphaDIAReader,
127+
"writer": None,
128+
"extension": ".tsv",
129+
"filename_pattern": r"^.*(?:_|\.)?precursors\.tsv$",
130+
},
131+
"diann": {
132+
"reader": diann.DIANNTSVReader,
133+
"writer": None,
134+
"extension": ".tsv",
135+
"filename_pattern": r"^.*(?:_|\.)?diann\.tsv$",
136+
},
137+
"parquet": { # List after more specific Parquet patterns to avoid matching conflicts
117138
"reader": parquet.ParquetReader,
118139
"writer": parquet.ParquetWriter,
119140
"extension": ".parquet",
120141
"filename_pattern": r"^.*\.parquet$",
121142
},
122-
"tsv": { # List after sage to avoid extension matching conflicts
143+
"tsv": { # List after more specific TSV patterns to avoid matching conflicts
123144
"reader": tsv.TSVReader,
124145
"writer": tsv.TSVWriter,
125146
"extension": ".tsv",

psm_utils/io/alphadia.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
"""Reader for PSM files from the AlphaDIA search engine."""
2+
3+
from __future__ import annotations
4+
5+
import csv
6+
from abc import ABC
7+
from typing import Iterable, Optional
8+
9+
from psm_utils.io._base_classes import ReaderBase
10+
from psm_utils.io._utils import set_csv_field_size_limit
11+
from psm_utils.psm import PSM
12+
from psm_utils.psm_list import PSMList
13+
14+
set_csv_field_size_limit()
15+
16+
# TODO: check
17+
RESCORING_FEATURES = [
18+
"rt_observed",
19+
"mobility_observed",
20+
"mz_observed",
21+
"charge",
22+
"delta_rt",
23+
]
24+
25+
26+
class AlphaDIAReader(ReaderBase, ABC):
    """Reader that converts rows of an AlphaDIA precursor TSV file into PSM objects."""

    def __init__(self, filename, *args, **kwargs):
        """
        Reader for AlphaDIA ``precursors.tsv`` file.

        Parameters
        ----------
        filename : str or Path
            Path to PSM file.

        """
        super().__init__(filename, *args, **kwargs)
        self.filename = filename

    def __iter__(self) -> Iterable[PSM]:
        """Iterate over file and return PSMs one-by-one."""
        # The csv module expects files to be opened with newline="" so that it can handle
        # line endings (including those embedded in quoted fields) itself.
        with open(self.filename, newline="") as msms_in:
            reader = csv.DictReader(msms_in, delimiter="\t")
            for row in reader:
                yield self._get_peptide_spectrum_match(row)

    def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
        """
        Parse a single PSM from an AlphaDIA PSM file.

        Parameters
        ----------
        psm_dict : dict
            Mapping of column name to value for a single row of the file.

        Returns
        -------
        PSM
            PSM built from the row. Columns listed in ``RESCORING_FEATURES`` that are
            missing from the row are skipped; any other missing column raises ``KeyError``.

        """
        # Rescoring features are optional: only copy the ones present in this row.
        rescoring_features = {}
        for ft in RESCORING_FEATURES:
            try:
                rescoring_features[ft] = psm_dict[ft]
            except KeyError:
                continue

        return PSM(
            peptidoform=self._parse_peptidoform(
                psm_dict["sequence"], psm_dict["mods"], psm_dict["mod_sites"], psm_dict["charge"]
            ),
            spectrum_id=psm_dict["frame_start"],  # TODO: needs to be checked
            run=psm_dict["run"],
            spectrum=psm_dict["frame_start"],  # TODO: needs to be checked
            is_decoy=bool(int(psm_dict["decoy"])),
            score=psm_dict["score"],
            qvalue=psm_dict["qval"],
            pep=psm_dict["proba"],
            precursor_mz=psm_dict["mz_observed"],
            retention_time=psm_dict["rt_observed"],
            ion_mobility=psm_dict["mobility_observed"],
            protein_list=psm_dict["proteins"].split(";"),
            rank=int(psm_dict["rank"]) + 1,  # AlphaDIA ranks are 0-based
            source="AlphaDIA",
            provenance_data=({"alphadia_filename": str(self.filename)}),
            metadata={},
            rescoring_features=rescoring_features,
        )

    @staticmethod
    def _parse_peptidoform(sequence: str, mods: str, mod_sites, charge: Optional[str]) -> str:
        """
        Parse a peptidoform from an AlphaDIA PSM file.

        Parameters
        ----------
        sequence : str
            Unmodified peptide sequence.
        mods : str
            Semicolon-separated modification entries; only the part before ``@`` is used
            as the modification name. An empty value means no modifications.
        mod_sites : str
            Semicolon-separated integer positions, paired with ``mods`` in order. ``0``
            denotes the N-terminus, ``-1`` the C-terminus, and other values index the
            residues starting at 1.
        charge : str, optional
            Precursor charge; appended as ``/<charge>`` when truthy.

        Returns
        -------
        str
            Peptidoform string with bracketed modification names and optional charge.

        """
        # Parse modifications
        if mods:
            # The empty first and last elements are placeholders for terminal modifications,
            # which also makes residue positions 1-based indices into this list.
            sequence_list = [""] + list(sequence) + [""]  # N-term, sequence, C-term
            for mod, site in zip(mods.split(";"), mod_sites.split(";")):
                site = int(site)
                name = mod.split("@")[0]
                # N-terminal modification
                if site == 0:
                    sequence_list[0] = f"[{name}]-"
                # C-terminal modification
                elif site == -1:
                    sequence_list[-1] = f"-[{name}]"
                # Sequence modification
                else:
                    sequence_list[site] = f"{sequence_list[site]}[{name}]"
            sequence = "".join(sequence_list)

        # Add charge
        if charge:
            sequence += f"/{int(float(charge))}"

        return sequence

    @classmethod
    def from_dataframe(cls, dataframe) -> PSMList:
        """Create a PSMList from an AlphaDIA Pandas DataFrame."""
        # A single placeholder reader (empty filename) is enough: rows come from memory and
        # the instance is only needed to call the row parser.
        reader = cls("")
        return PSMList(
            psm_list=[
                reader._get_peptide_spectrum_match(entry)
                for entry in dataframe.to_dict(orient="records")
            ]
        )

psm_utils/io/diann.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
"""
2+
Reader for PSM files from DIA-NN
3+
4+
Reads the '.tsv' file as defined on the
5+
`DIA-NN documentation page <https://github.com/vdemichev/DiaNN/tree/1.8.1?tab=readme-ov-file#main-output-reference>`_.
6+
7+
Notes
8+
-----
9+
10+
- DIA-NN calculates q-values at both the run and library level. The run-level q-value is used as
11+
the PSM q-value.
12+
- DIA-NN currently does not return precursor m/z values.
13+
- DIA-NN currently does not support C-terminal modifications in its searches.
14+
15+
"""
16+
17+
from __future__ import annotations
18+
19+
import csv
20+
import re
21+
from typing import Iterable, Optional
22+
23+
from psm_utils.io._base_classes import ReaderBase
24+
from psm_utils.io._utils import set_csv_field_size_limit
25+
from psm_utils.psm import PSM
26+
from psm_utils.psm_list import PSMList
27+
28+
set_csv_field_size_limit()
29+
30+
RESCORING_FEATURES = [
31+
"RT",
32+
"Predicted.RT",
33+
"iRT",
34+
"Predicted.iRT",
35+
"Ms1.Profile.Corr",
36+
"Ms1.Area",
37+
"IM",
38+
"iIM",
39+
"Predicted.IM",
40+
"Predicted.iIM",
41+
]
42+
43+
44+
class DIANNTSVReader(ReaderBase):
    """Reader that converts rows of a DIA-NN TSV report into PSM objects."""

    def __init__(self, filename, *args, **kwargs) -> None:
        """
        Reader for DIA-NN '.tsv' file.

        Parameters
        ----------
        filename : str or Path
            Path to PSM file.

        """
        super().__init__(filename, *args, **kwargs)
        self.filename = filename

    def __iter__(self) -> Iterable[PSM]:
        """Iterate over file and return PSMs one-by-one."""
        # The csv module expects files to be opened with newline="" so that it can handle
        # line endings (including those embedded in quoted fields) itself.
        with open(self.filename, newline="") as msms_in:
            reader = csv.DictReader(msms_in, delimiter="\t")
            for row in reader:
                yield self._get_peptide_spectrum_match(row)

    def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
        """
        Parse a single PSM from a DIA-NN PSM file.

        Parameters
        ----------
        psm_dict : dict
            Mapping of column name to value for a single row of the report.

        Returns
        -------
        PSM
            PSM built from the row. Columns listed in ``RESCORING_FEATURES`` that are
            missing from the row are skipped; any other missing column raises ``KeyError``.

        """
        # Rescoring features are optional: only copy the ones present in this row.
        rescoring_features = {}
        for ft in RESCORING_FEATURES:
            try:
                rescoring_features[ft] = psm_dict[ft]
            except KeyError:
                continue

        return PSM(
            peptidoform=self._parse_peptidoform(
                psm_dict["Modified.Sequence"], psm_dict["Precursor.Charge"]
            ),
            spectrum_id=psm_dict["MS2.Scan"],
            run=psm_dict["Run"],
            # NOTE(review): hard-coded; presumably the report only lists targets - confirm.
            is_decoy=False,
            qvalue=psm_dict["Q.Value"],
            pep=float(psm_dict["PEP"]),
            score=float(psm_dict["CScore"]),
            precursor_mz=None,  # Not returned by DIA-NN :(
            retention_time=float(psm_dict["RT"]),
            ion_mobility=float(psm_dict["IM"]),
            protein_list=psm_dict["Protein.Ids"].split(";"),
            source="diann",
            rank=None,
            provenance_data=({"diann_filename": str(self.filename)}),
            rescoring_features=rescoring_features,
            metadata={},
        )

    @staticmethod
    def _parse_peptidoform(peptide: str, charge: Optional[str]) -> str:
        """
        Convert a DIA-NN modified sequence and charge into a peptidoform string.

        Parameters
        ----------
        peptide : str
            Modified sequence with modifications written as ``(UniMod:<id>)``.
        charge : str, optional
            Precursor charge; appended as ``/<charge>`` when truthy.

        Returns
        -------
        str
            Sequence with modifications rewritten as ``[UNIMOD:<id>]``, a hyphen after a
            leading (N-terminal) modification, and the optional charge suffix.

        """
        # Add charge
        if charge:
            peptide += f"/{int(float(charge))}"

        # Replace parentheses with square brackets and capitalize UniMod prefix
        pattern = r"\(UniMod:(\d+)\)"
        replacement = r"[UNIMOD:\1]"
        peptide = re.sub(pattern, replacement, peptide)

        # Add hyphen for N-terminal modifications
        # If [UNIMOD:n] occurs before the first amino acid, a hyphen is added before the first
        # amino acid. ``startswith`` is used instead of indexing so that an empty sequence
        # does not raise an IndexError.
        if peptide.startswith("["):
            # Hyphen after the closing bracket
            peptide = peptide.replace("]", "]-", 1)

        # C-terminal modifications are currently not supported in DIA-NN

        return peptide

    @classmethod
    def from_dataframe(cls, dataframe) -> PSMList:
        """Create a PSMList from a DIA-NN Pandas DataFrame."""
        # A single placeholder reader (empty filename) is enough: rows come from memory and
        # the instance is only needed to call the row parser.
        reader = cls("")
        return PSMList(
            psm_list=[
                reader._get_peptide_spectrum_match(entry)
                for entry in dataframe.to_dict(orient="records")
            ]
        )

0 commit comments

Comments
 (0)