Skip to content

Commit 565934a

Browse files
authored
Merge pull request #96 from compomics/feature/flashlfq-output
Add reader and writer for FlashLFQ
2 parents 83dff49 + 12b1540 commit 565934a

File tree

8 files changed

+374
-2
lines changed

8 files changed

+374
-2
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,12 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8-
## Unreleased
8+
## [1.1.0] - 2024-09-05
99

1010
### Added
1111

1212
- `Peptidoform`: Add `modified_sequence` property to return the modified sequence in ProForma format, but without charge state.
13+
- `io`: Add support for reading and writing FlashLFQ generic TSV files.
1314

1415

1516
## [1.0.1] - 2024-08-28

README.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ Supported file formats
8989
===================================================================================================================== ======================== =============== ===============
9090
File format psm_utils tag Read support Write support
9191
===================================================================================================================== ======================== =============== ===============
92+
`FlashLFQ generic TSV <https://github.com/smith-chem-wisc/FlashLFQ/wiki/Identification-Input-Formats>`_ ``flashlfq`` ✅ ✅
9293
`ionbot CSV <https://ionbot.cloud/>`_ ``ionbot`` ✅ ❌
9394
`OpenMS idXML <https://www.openms.de/>`_ ``idxml`` ✅ ✅
9495
`MaxQuant msms.txt <https://www.maxquant.org/>`_ ``msms`` ✅ ❌

docs/source/api/psm_utils.io.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ psm_utils.io
77

88

99

10+
psm_utils.io.flashlfq
11+
#####################
12+
13+
.. automodule:: psm_utils.io.flashlfq
14+
:members:
15+
:inherited-members:
16+
17+
1018
psm_utils.io.idxml
1119
##################
1220

example_files/example.flashlfq.tsv

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
File Name Scan Retention Time Precursor Charge Base Sequence Full Sequence Peptide Monoisotopic Mass Protein Accession
2+
SmallCalibratible_Yeast 24.80555 2 KAPAGGAADAAAK KAPAGGAADAAAK
3+
SmallCalibratible_Yeast 24.95372 2 KAPAAAPAASK KAPAAAPAASK
4+
SmallCalibratible_Yeast 24.77032 2 KQAIETANK KQAIETANK
5+
SmallCalibratible_Yeast 24.17319 2 RVDEGGAQDK RVDEGGAQDK
6+
SmallCalibratible_Yeast 24.26695 2 KDAEPQSDSTTSK KDAEPQSDSTTSK
7+
SmallCalibratible_Yeast 24.10798 2 EKAEAEAEK EKAEAEAEK
8+
SmallCalibratible_Yeast 24.06874 2 EKAEAEAEK EKAEAEAEK
9+
SmallCalibratible_Yeast 24.77398 2 FKEEDEKESQR FKEEDEKESQR
10+
SmallCalibratible_Yeast 24.90638 2 YDHEASSSYK YDHEASSSYK
11+
SmallCalibratible_Yeast 24.40345 3 SKDVTDSATTKK SKDVTDSATTKK
12+
SmallCalibratible_Yeast 24.71679 2 FKEEDEKESQR FKEEDEKESQR
13+
SmallCalibratible_Yeast 24.39968 2 ALKQEGAANK ALKQEGAANK
14+
SmallCalibratible_Yeast 24.67303 2 SKDVTDSATTK SKDVTDSATTK
15+
SmallCalibratible_Yeast 24.45053 2 KLEDHPK KLEDHPK
16+
SmallCalibratible_Yeast 24.77398 1 HIDAGAK HIDAGAK
17+
SmallCalibratible_Yeast 24.9022 2 YLAKEEEKK YLAKEEEKK
18+
SmallCalibratible_Yeast 24.76278 2 YAGEVSHDDK YAGEVSHDDK

psm_utils/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Common utilities for parsing and handling PSMs, and search engine results."""
22

3-
__version__ = "1.0.1"
3+
__version__ = "1.1.0"
44
__all__ = ["Peptidoform", "PSM", "PSMList"]
55

66
from warnings import filterwarnings

psm_utils/io/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from rich.progress import track
1010

11+
import psm_utils.io.flashlfq as flashlfq
1112
import psm_utils.io.idxml as idxml
1213
import psm_utils.io.ionbot as ionbot
1314
import psm_utils.io.maxquant as maxquant
@@ -28,6 +29,12 @@
2829
from psm_utils.psm_list import PSMList
2930

3031
FILETYPES = {
32+
"flashlfq": {
33+
"reader": flashlfq.FlashLFQReader,
34+
"writer": flashlfq.FlashLFQWriter,
35+
"extension": ".tsv",
36+
"filename_pattern": r"^.*\.flashlfq\.tsv$",
37+
},
3138
"ionbot": {
3239
"reader": ionbot.IonbotReader,
3340
"writer": None,

psm_utils/io/flashlfq.py

Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
"""
2+
Reader and writer for the FlashLFQ generic TSV format.
3+
4+
See the `FlashLFQ documentation <https://github.com/smith-chem-wisc/FlashLFQ/wiki/Identification-Input-Formats>`_
5+
for more information on the format.
6+
7+
Notes
8+
-----
9+
- The FlashLFQ format does not contain the actual spectrum identifier. When reading a FlashLFQ
10+
file, the spectrum identifier is set to the row number in the file.
11+
- The FlashLFQ format does not contain the precursor m/z, but the theoretical monoisotopic mass.
12+
This value is not read into the PSM object, but can be calculated from the peptidoform.
13+
- To read from a FlashLFQ file, the ``Full Sequence`` column is expected to contain a ProForma v2
14+
compatible peptidoform notation.
15+
16+
"""
17+
18+
from __future__ import annotations
19+
20+
import csv
21+
import logging
22+
from pathlib import Path
23+
from typing import Optional, Union
24+
25+
import numpy as np
26+
27+
from psm_utils.io._base_classes import ReaderBase, WriterBase
28+
from psm_utils.io._utils import set_csv_field_size_limit
29+
from psm_utils.io.exceptions import PSMUtilsIOException
30+
from psm_utils.psm import PSM
31+
from psm_utils.psm_list import PSMList
32+
33+
# Runs once at import time, before any reader or writer in this module is used.
# NOTE(review): presumably raises the csv module's field size limit so very long
# TSV fields can be parsed -- confirm in psm_utils.io._utils.
set_csv_field_size_limit()
34+
35+
# Module-level logger, named after this module so log output can be filtered per module.
LOGGER = logging.getLogger(__name__)
36+
37+
38+
class FlashLFQReader(ReaderBase):
    """
    Reader for FlashLFQ TSV format.

    PSMs are yielded one-by-one while iterating over the reader. As the format holds no
    spectrum identifier, the zero-based index of the data row is used as ``spectrum_id``.
    """

    required_columns = ["Full Sequence", "Precursor Charge"]

    # Accepted headers for the protein column, in order of preference. The example
    # FlashLFQ file uses the singular form; the plural form is still accepted.
    _protein_columns = ("Protein Accession", "Protein Accessions")

    def __iter__(self):
        """
        Iterate over file and return PSMs one-by-one.

        Raises
        ------
        PSMUtilsIOException
            If the file is empty or its header lacks any of the required columns.

        """
        # newline="" lets the csv module handle line endings itself
        with open(self.filename, "rt", newline="") as open_file:
            reader = csv.DictReader(open_file, delimiter="\t")
            # ``fieldnames`` is None for an empty file; treat that as "no columns"
            fieldnames = reader.fieldnames or []
            if not all(col in fieldnames for col in self.required_columns):
                raise PSMUtilsIOException(
                    f"FlashLFQ TSV file must contain the following columns: {self.required_columns}"
                )
            for i, row in enumerate(reader):
                yield self._parse_entry(row, spectrum_id=str(i))

    def _parse_entry(self, entry: dict, spectrum_id) -> PSM:
        """
        Parse single FlashLFQ TSV entry to :py:class:`~psm_utils.psm.PSM`.

        Parameters
        ----------
        entry
            Single row of the TSV file, mapping column names to string values.
        spectrum_id
            Identifier to assign to the PSM.

        """
        # Replace empty strings with None
        entry = {k: v if v else None for k, v in entry.items()}

        # Use the first protein column that is present and non-empty
        protein_accessions = next(
            (entry[col] for col in self._protein_columns if entry.get(col)), None
        )

        return PSM(
            peptidoform=f"{entry['Full Sequence']}/{entry['Precursor Charge']}",
            spectrum_id=spectrum_id,
            run=entry.get("File Name"),
            retention_time=entry.get("Scan Retention Time"),
            protein_list=self._parse_protein_list(protein_accessions),
        )

    @staticmethod
    def _parse_protein_list(protein_accessions: Optional[str]) -> list[str]:
        """
        Parse protein list string to list of protein accessions.

        An empty or missing value results in an empty list. Surrounding whitespace and
        empty items (e.g., from a trailing separator) are dropped.
        """
        if not protein_accessions:
            return []
        if ";" in protein_accessions:  # Docs define separator as semicolon
            accessions = protein_accessions.split(";")
        elif "|" in protein_accessions:  # Example file uses pipe
            accessions = protein_accessions.split("|")
        else:
            accessions = [protein_accessions]  # Single protein
        return [acc.strip() for acc in accessions if acc.strip()]
79+
80+
81+
class FlashLFQWriter(WriterBase):
    """
    Writer for FlashLFQ TSV format.

    PSMs can be written one-by-one with :py:meth:`write_psm` (inside a ``with`` block) or
    all at once with :py:meth:`write_file`. In both cases PSMs above the FDR threshold
    and, optionally, decoy PSMs are left out.
    """

    # Columns written to a new file, in order. The names follow the header of the
    # FlashLFQ example file.
    _DEFAULT_FIELDNAMES = (
        "File Name",
        "Base Sequence",
        "Full Sequence",
        "Peptide Monoisotopic Mass",
        "Scan Retention Time",
        "Precursor Charge",
        "Protein Accession",
    )

    def __init__(
        self,
        filename: Union[str, Path],
        *args,
        fdr_threshold: float = 0.01,
        only_targets: bool = True,
        **kwargs,
    ):
        """
        Writer for FlashLFQ TSV format.

        Parameters
        ----------
        filename
            Path to PSM file.
        fdr_threshold
            FDR threshold for filtering PSMs.
        only_targets
            If True, only target PSMs are written to file. If False, both target and decoy PSMs
            are written.

        """
        super().__init__(filename, *args, **kwargs)

        self.fdr_threshold = fdr_threshold
        self.only_targets = only_targets

        self._open_file = None
        self._writer = None
        self.fieldnames = None

    def __enter__(self) -> FlashLFQWriter:
        """Open the file for writing; append if it already exists with a header."""
        header = ""
        if Path(self.filename).is_file():
            # Get fieldnames from existing file
            with open(self.filename, "rt") as open_file:
                header = open_file.readline().strip()

        if header:
            self.fieldnames = header.split("\t")
            mode = "at"
        else:
            # New file, or existing file without a header line: start from scratch
            self.fieldnames = list(self._DEFAULT_FIELDNAMES)
            mode = "wt"

        # Open file and writer
        self._open_file = open(self.filename, mode, newline="")
        self._writer = csv.DictWriter(
            self._open_file,
            fieldnames=self.fieldnames,
            extrasaction="ignore",
            delimiter="\t",
        )

        if mode == "wt":
            self._writer.writeheader()

        return self

    def __exit__(self, *args, **kwargs) -> None:
        """Close the file, if it was opened."""
        if self._open_file is not None:
            self._open_file.close()
        self._open_file = None
        self._writer = None

    def write_psm(self, psm: PSM):
        """
        Write a single PSM to new or existing PSM file.

        PSMs with a q-value above the FDR threshold are skipped, as are decoy PSMs if
        ``only_targets`` is set. PSMs without q-value are always written.

        Parameters
        ----------
        psm
            PSM object to write.

        Raises
        ------
        PSMUtilsIOException
            If the writer is not opened in context.

        """
        if psm.qvalue is not None and psm.qvalue > self.fdr_threshold:
            return
        if self.only_targets and psm.is_decoy:
            return

        if self._writer is None:
            raise PSMUtilsIOException(
                f"`write_psm` method can only be called if `{self.__class__.__qualname__}` "
                "is opened in context (i.e., using the `with` statement)."
            )
        self._writer.writerow(self._psm_to_entry(psm))

    def write_file(self, psm_list: PSMList):
        """
        Write an entire PSMList to a new PSM file.

        Decoy PSMs are left out if ``only_targets`` is set. PSMs above the FDR threshold
        are left out, unless one or more PSMs have no q-value; in that case FDR filtering
        is skipped entirely and a warning is logged.

        Parameters
        ----------
        psm_list
            PSMList object to write to file.

        """
        # Filter out decoys
        if self.only_targets:
            # Accept both None and False; explicit dtype keeps an empty mask boolean
            target_mask = np.array([not psm.is_decoy for psm in psm_list], dtype=bool)
            # Parentheses matter: ``~mask.sum()`` would negate the integer sum instead
            LOGGER.debug(
                "Skipping %d decoy PSMs for FlashLFQ file.", (~target_mask).sum()
            )
        else:
            target_mask = np.ones(len(psm_list), dtype=bool)

        # Filter out PSMs above FDR threshold
        if any(psm.qvalue is None for psm in psm_list):
            LOGGER.warning(
                "Not all PSMs have a q-value. Skipping FDR filtering for FlashLFQ file."
            )
            fdr_mask = np.ones(len(psm_list), dtype=bool)
        else:
            fdr_mask = psm_list["qvalue"] <= self.fdr_threshold
            # Only count targets here; decoys were already reported above
            filtered_by_fdr = (~fdr_mask & target_mask).sum()
            LOGGER.debug(
                "Skipping %d PSMs above FDR threshold for FlashLFQ file.", filtered_by_fdr
            )

        filtered_psm_list = psm_list[target_mask & fdr_mask]

        # ``self.fieldnames`` is only set inside a ``with`` block; use defaults otherwise
        fieldnames = self.fieldnames or list(self._DEFAULT_FIELDNAMES)

        with open(self.filename, "wt", newline="") as f:
            writer = csv.DictWriter(
                f, fieldnames=fieldnames, delimiter="\t", extrasaction="ignore"
            )
            writer.writeheader()
            for psm in filtered_psm_list:
                writer.writerow(self._psm_to_entry(psm))

    @staticmethod
    def _psm_to_entry(psm: PSM) -> dict:
        """Convert :py:class:`~psm_utils.psm.PSM` to FlashLFQ TSV entry."""
        return {
            "File Name": psm.run,
            "Base Sequence": psm.peptidoform.sequence,
            "Full Sequence": psm.peptidoform.modified_sequence,
            "Peptide Monoisotopic Mass": psm.peptidoform.theoretical_mass,
            "Scan Retention Time": psm.retention_time,
            "Precursor Charge": psm.peptidoform.precursor_charge,
            # A missing protein list results in an empty field
            "Protein Accession": ";".join(psm.protein_list or []),
        }

0 commit comments

Comments
 (0)