Skip to content

Commit 204e8b6

Browse files
authored
Add more Py_AASequence Convenience functions (#25)
* feature: add "+" and slicing for AASequence
* feature: add more pythonic methods
* fix: slicing for modified peptide sequences
* feature: add string export method
* test: fix tests, allow for negative indices in slicing
* fix: address AI review comments
* fix: critical slicing bug
* address AI comments
1 parent 1707b48 commit 204e8b6

File tree

2 files changed

+203
-18
lines changed

2 files changed

+203
-18
lines changed

openms_python/py_aasequence.py

Lines changed: 159 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,14 @@
22

33
from __future__ import annotations
44

5-
from typing import Optional
5+
from typing import Optional, Literal
66
import pyopenms as oms
7+
import warnings
78

89

910
class Py_AASequence:
1011
"""
11-
A Pythonic wrapper around pyOpenMS AASequence.
12+
A Pythonic, immutable wrapper around pyOpenMS AASequence.
1213
1314
This class provides intuitive properties and methods for working with
1415
amino acid sequences, including common operations like reversing and
@@ -40,7 +41,7 @@ def __init__(self, native_sequence: Optional[oms.AASequence] = None):
4041
@classmethod
4142
def from_string(cls, sequence_str: str) -> Py_AASequence:
4243
"""
43-
Create AASequence from string representation.
44+
Create Py_AASequence from string representation.
4445
4546
Args:
4647
sequence_str: String representation of the amino acid sequence.
@@ -57,6 +58,20 @@ def from_string(cls, sequence_str: str) -> Py_AASequence:
5758

5859
# ==================== Pythonic Properties ====================
5960

61+
@classmethod
def from_native(cls, native_sequence: oms.AASequence) -> Py_AASequence:
    """
    Build a wrapper around an existing pyOpenMS AASequence.

    Args:
        native_sequence (oms.AASequence): The native sequence object; it is
            handed unchanged to the class constructor.

    Returns:
        Py_AASequence: New wrapper object for ``native_sequence``.
    """
    wrapped = cls(native_sequence)
    return wrapped
74+
6075
@property
6176
def native(self) -> oms.AASequence:
6277
"""Return the underlying pyOpenMS AASequence."""
@@ -204,26 +219,126 @@ def __eq__(self, other: object) -> bool:
204219
return False
205220
return self.sequence == other.sequence
206221

207-
def __getitem__(self, index):
    """
    Get residue(s) at position(s).

    Supports both single indexing and slicing, returning Py_AASequence objects.

    Args:
        index: Integer for a single residue (negative values count from the
            end), or a slice object for a subsequence. Only a step of 1 is
            supported for slices.

    Returns:
        Py_AASequence: Wrapped residue or subsequence. A slice that selects
            nothing (e.g. ``seq[5:2]``) yields an empty Py_AASequence.

    Raises:
        IndexError: If an integer index lies outside the sequence, including
            negative values below ``-len(self)``.
        ValueError: If a slice with a step other than 1 is requested.

    Example:
        >>> seq = Py_AASequence.from_string("PEPTIDE")
        >>> seq[1]    # Returns Py_AASequence("E")
        >>> seq[1:4]  # Returns Py_AASequence("EPT")
        >>> seq[-1]   # Returns Py_AASequence("E")
    """
    length = len(self)

    if isinstance(index, slice):
        start, stop, step = index.indices(length)
        if step != 1:
            raise ValueError("Step slicing is not supported for amino acid sequences")
        if stop <= start:
            # slice.indices() may return stop <= start (seq[5:2], seq[len:]).
            # Return an empty sequence instead of handing the native call a
            # zero or negative length.
            return Py_AASequence.from_native(oms.AASequence())
        return Py_AASequence.from_native(self._sequence.getSubsequence(start, stop - start))

    # Normalise a negative index once, then check BOTH bounds so that values
    # below -len(self) are rejected here instead of reaching the native call.
    position = index + length if index < 0 else index
    if not 0 <= position < length:
        raise IndexError(f"Index {index} out of range for sequence of length {length}")
    return Py_AASequence.from_native(self._sequence.getSubsequence(position, 1))
221253

222254
def __iter__(self):
    """Yield ``self[i]`` for every position, from first to last."""
    yield from (self[position] for position in range(len(self)))
258+
def __add__(self, other: Py_AASequence | str) -> Py_AASequence:
    """
    Append another sequence to this one.

    The result is built by joining the two sequence strings and parsing
    the joined text with ``from_string``.

    Args:
        other: Py_AASequence or string to place after this sequence.

    Returns:
        Py_AASequence: New concatenated sequence, or ``NotImplemented`` when
            ``other`` is neither a Py_AASequence nor a string.

    Example:
        >>> seq1 = Py_AASequence.from_string("PEP")
        >>> seq2 = Py_AASequence.from_string("TIDE")
        >>> combined = seq1 + seq2
        >>> print(combined.sequence)
        PEPTIDE
        >>> combined2 = seq1 + "TIDE"
        >>> print(combined2.sequence)
        PEPTIDE
    """
    if isinstance(other, Py_AASequence):
        tail = other.sequence
    elif isinstance(other, str):
        tail = other
    else:
        return NotImplemented
    return Py_AASequence.from_string(self.sequence + tail)
285+
286+
def __radd__(self, other: str) -> Py_AASequence:
    """
    Support ``string + Py_AASequence``.

    Args:
        other: String to place in front of this sequence.

    Returns:
        Py_AASequence: New sequence parsed from the joined text, or
            ``NotImplemented`` when ``other`` is not a string.

    Example:
        >>> seq = Py_AASequence.from_string("TIDE")
        >>> combined = "PEP" + seq
        >>> print(combined.sequence)
        PEPTIDE
    """
    if not isinstance(other, str):
        return NotImplemented
    return Py_AASequence.from_string(other + self.sequence)
300+
301+
def __mul__(self, times: int) -> Py_AASequence:
    """
    Repeat the sequence a number of times.

    Args:
        times: Number of repetitions (must be an int >= 0).

    Returns:
        Py_AASequence: New sequence parsed from the repeated sequence string.
            ``NotImplemented`` is returned for a non-int or negative
            ``times``, which Python reports to the caller as a TypeError.

    Example:
        >>> seq = Py_AASequence.from_string("PEP")
        >>> repeated = seq * 3
        >>> print(repeated.sequence)
        PEPPEPPEP
    """
    if isinstance(times, int) and times >= 0:
        return Py_AASequence.from_string(self.sequence * times)
    return NotImplemented
320+
321+
def __rmul__(self, times: int) -> Py_AASequence:
    """Support ``int * Py_AASequence`` by delegating to ``__mul__``."""
    repeated = self.__mul__(times)
    return repeated
324+
def __contains__(self, substring: str) -> bool:
    """Support ``substring in seq`` by delegating to ``has_substring``."""
    found = self.has_substring(substring)
    return found
327+
328+
def __hash__(self) -> int:
    """Hash on the sequence string so instances can be used in sets/dicts."""
    key = self.sequence
    return hash(key)
331+
332+
def __lt__(self, other: Py_AASequence) -> bool:
    """
    Order two sequences by comparing their sequence strings.

    Returns ``NotImplemented`` when ``other`` is not a Py_AASequence.
    """
    if isinstance(other, Py_AASequence):
        return self.sequence < other.sequence
    return NotImplemented
337+
def count(self, residue: str) -> int:
    """
    Count occurrences of a residue, consistent with ``str.count()``.

    The count is taken on the unmodified sequence string, so modifications
    are not taken into account; a warning is emitted on every call to say so.

    Args:
        residue: Substring to look for in the unmodified sequence.

    Returns:
        int: Number of non-overlapping occurrences of ``residue``.
    """
    # stacklevel=2 attributes the warning to the calling code rather than to
    # this line, so users can see which of their calls triggered it.
    warnings.warn("count method does not account for modifications", stacklevel=2)
    return self.unmodified_sequence.count(residue)
341+
227342
# ==================== Additional Utilities ====================
228343

229344
def get_mz(self, charge: int) -> float:
@@ -277,4 +392,37 @@ def has_suffix(self, suffix: str) -> bool:
277392
bool: True if sequence ends with suffix.
278393
"""
279394
return self._sequence.hasSuffix(oms.AASequence.fromString(suffix))
395+
396+
397+
# ===================== Exporting =======================
398+
def to_string(self, modified=True, mod_format: Literal['default', 'unimod', 'bracket'] = 'default') -> str:
    """
    Export the sequence as a string.

    Args:
        modified (bool): When false, the unmodified sequence is returned and
            ``mod_format`` is not consulted.
        mod_format (Literal['default', 'unimod', 'bracket']): Notation used
            for modifications when ``modified`` is true.
            'default' uses the native ``toString()``,
            'unimod' uses the native ``toUniModString()``,
            'bracket' uses the native ``toBracketString()``.
            Default is 'default'.

    Returns:
        str: Amino acid sequence as string.

    Raises:
        ValueError: If ``modified`` is true and ``mod_format`` is not one of
            the supported values.

    Example:
        >>> seq = Py_AASequence.from_string("PEPTIDE")
        >>> seq_str = seq.to_string()
    """
    if not modified:
        return self.unmodified_sequence

    # Guard clauses, one per supported notation; anything else is rejected.
    if mod_format == 'default':
        return self._sequence.toString()
    if mod_format == 'unimod':
        return self._sequence.toUniModString()
    if mod_format == 'bracket':
        return self._sequence.toBracketString()
    raise ValueError(f"Unsupported mod_format: {mod_format}, supported are 'default', 'unimod' and 'bracket'")

tests/test_py_aasequence.py

Lines changed: 44 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -118,25 +118,22 @@ def test_py_aasequence_iteration():
118118
seq = Py_AASequence.from_string("PEPTIDE")
119119
residues = list(seq)
120120

121-
assert residues == ["P", "E", "P", "T", "I", "D", "E"]
121+
assert [res.sequence for res in residues] == ["P", "E", "P", "T", "I", "D", "E"]
122122
assert len(residues) == 7
123123

124124

125125
def test_py_aasequence_indexing():
    """Integer indexing returns one-residue wrappers and rejects positions past the end."""
    seq = Py_AASequence.from_string("PEPTIDE")

    for position, letter in ((0, "P"), (1, "E"), (6, "E")):
        assert seq[position].sequence == letter

    # One past the last valid position must fail.
    with pytest.raises(IndexError):
        _ = seq[7]
139-
140137

141138
def test_py_aasequence_string_representation():
142139
"""Test string representations."""
@@ -258,3 +255,43 @@ def test_py_aasequence_with_native_aasequence():
258255

259256
assert seq.sequence == "PEPTIDE"
260257
assert seq.native is native
258+
259+
260+
def test_py_aasequence_to_string():
    """Exercise every export mode of to_string on an oxidised peptide."""
    peptide = Py_AASequence.from_string("PEPTIDEM(Oxidation)")

    # No arguments: modifications kept, default notation.
    assert peptide.to_string() == "PEPTIDEM(Oxidation)"

    # Modifications stripped.
    assert peptide.to_string(modified=False) == "PEPTIDEM"

    # Alternative notations for the same modification.
    assert peptide.to_string(modified=True, mod_format='bracket') == "PEPTIDEM[147]"
    assert peptide.to_string(modified=True, mod_format='unimod') == "PEPTIDEM(UniMod:35)"

    # An unknown notation is rejected.
    with pytest.raises(ValueError):
        _ = peptide.to_string(modified=True, mod_format='invalid_format')
283+
284+
285+
def test_slicing():
286+
aa_seq = Py_AASequence.from_string('PEPTIDEM(Oxidation)R')
287+
assert aa_seq[0].sequence == 'P'
288+
assert aa_seq[-1].sequence == 'R'
289+
assert aa_seq[1:4].sequence == 'EPT'
290+
assert aa_seq[-2:].sequence == 'M(Oxidation)R'
291+
292+
def test_count():
    """count() tallies residues and warns that modifications are ignored."""
    aa_seq = Py_AASequence.from_string('PEPTIDEM(Oxidation)R')
    for residue, expected in (('E', 2), ('P', 2), ('K', 0)):
        # count() warns on every call; capture it so the warning is asserted
        # instead of leaking into the test run (or failing under -W error).
        with pytest.warns(UserWarning, match="does not account for modifications"):
            assert aa_seq.count(residue) == expected
297+

0 commit comments

Comments
 (0)