Skip to content

Commit 022a9c7

Browse files
committed
Improve performance of initial sequence encoding using bytes.translate
1 parent 6b22adf commit 022a9c7

File tree

1 file changed

+40
-32
lines changed

1 file changed

+40
-32
lines changed

peptides/__init__.py

Lines changed: 40 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,29 @@
2929
Dong-Sheng Cao, Nan Xiao, Qing-Song Xu, and Alex F. Chen for ``Rcpi``.
3030
""".strip()
3131

32+
# --- Constants --------------------------------------------------------------
33+
34+
# fmt: off
35+
_CODE1 = [
36+
"A", "R", "N", "D", "C", "Q", "E", "G", "H", "I",
37+
"L", "K", "M", "F", "P", "S", "T", "W", "Y", "V",
38+
"O", "U", "B", "Z", "J", "X"
39+
]
40+
41+
# fmt: off
42+
_CODE3 = [
43+
"Ala", "Arg", "Asn", "Asp", "Cys", "Gln", "Glu", "Gly", "His", "Ile",
44+
"Leu", "Lys", "Met", "Phe", "Pro", "Ser", "Thr", "Trp", "Tyr", "Val",
45+
"Pyl", "Sec", "Asx", "Glx", "Xle", "Xaa"
46+
]
47+
48+
_TRANS = bytes(
49+
_CODE1.index(chr(x).upper()) if chr(x).upper() in _CODE1 else _CODE1.index("X")
50+
for x in range(256)
51+
)
52+
53+
54+
# --- Helper classes ---------------------------------------------------------
3255

3356
class BLOSUMIndices(typing.NamedTuple):
3457
"""The BLOSUM62-derived indices of a peptide.
@@ -543,6 +566,8 @@ class ZScales(typing.NamedTuple):
543566
z5: float
544567

545568

569+
# --- Peptide class ----------------------------------------------------------
570+
546571
class Peptide(typing.Sequence[str]):
547572
"""A sequence of amino acids.
548573
@@ -552,22 +577,6 @@ class Peptide(typing.Sequence[str]):
552577
553578
"""
554579

555-
# --- Class constants ----------------------------------------------------
556-
557-
# fmt: off
558-
_CODE1 = [
559-
"A", "R", "N", "D", "C", "Q", "E", "G", "H", "I",
560-
"L", "K", "M", "F", "P", "S", "T", "W", "Y", "V",
561-
"O", "U", "B", "Z", "J", "X"
562-
]
563-
564-
# fmt: off
565-
_CODE3 = [
566-
"Ala", "Arg", "Asn", "Asp", "Cys", "Gln", "Glu", "Gly", "His", "Ile",
567-
"Leu", "Lys", "Met", "Phe", "Pro", "Ser", "Thr", "Trp", "Tyr", "Val",
568-
"Pyl", "Sec", "Asx", "Glx", "Xle", "Xaa"
569-
]
570-
571580
# --- Class methods ------------------------------------------------------
572581

573582
@classmethod
@@ -641,11 +650,10 @@ def __init__(self, sequence: str) -> None:
641650
"""
642651
# store the sequence in text format
643652
self.sequence: str = sequence
644-
# store an encoded version of the sequence as an array of indices
645-
encoder = {aa:i for i,aa in enumerate(self._CODE1)}
646-
self.encoded = array.array('B')
647-
for i, aa in enumerate(sequence):
648-
self.encoded.append(encoder.get(aa, encoder["X"]))
653+
# encode the sequence as ASCII bytes
654+
binary = sequence.encode('ascii')
655+
# encode the peptide with the given encoding
656+
self.encoded = array.array('B', binary.translate(_TRANS))
649657

650658
def __len__(self) -> int:
651659
return len(self.sequence)
@@ -709,7 +717,7 @@ def auto_correlation(
709717
sigma = statistics.stdev(table.values())
710718
table = {k: (v - mu) / sigma for k, v in table.items()}
711719
# build look up table
712-
lut = [table.get(aa, 0.0) for aa in self._CODE1]
720+
lut = [table.get(aa, 0.0) for aa in _CODE1]
713721
# compute using Cruciani formula
714722
if numpy is None:
715723
s1 = s2 = 0.0
@@ -743,7 +751,7 @@ def auto_covariance(
743751
sigma = statistics.stdev(table.values())
744752
table = {k: (v - mu) / sigma for k, v in table.items()}
745753
# build the lookup table
746-
lut = [table.get(aa, 0.0) for aa in self._CODE1]
754+
lut = [table.get(aa, 0.0) for aa in _CODE1]
747755
# compute correlation using Cruciani formula
748756
if numpy is None:
749757
s = 0.0
@@ -784,8 +792,8 @@ def cross_covariance(
784792
table2 = {k: (v - mu2) / sigma2 for k, v in table2.items()}
785793

786794
# build the lookup table
787-
lut1 = [table1.get(aa, 0.0) for aa in self._CODE1]
788-
lut2 = [table2.get(aa, 0.0) for aa in self._CODE1]
795+
lut1 = [table1.get(aa, 0.0) for aa in _CODE1]
796+
lut2 = [table2.get(aa, 0.0) for aa in _CODE1]
789797

790798
# compute using Cruciani formula
791799
if numpy is None:
@@ -837,7 +845,7 @@ def profile(
837845
# number of residues in the peptide sequence
838846
if len(self) >= window:
839847
# build a look-up table and index values
840-
lut = [table.get(aa, default) for aa in self._CODE1]
848+
lut = [table.get(aa, default) for aa in _CODE1]
841849
if numpy is None:
842850
values = [lut[i] for i in self.encoded]
843851
else:
@@ -882,7 +890,7 @@ def counts(self) -> typing.Dict[str, int]:
882890
# some overhead.
883891
return {
884892
aa: self.sequence.count(aa)
885-
for aa in self._CODE1
893+
for aa in _CODE1
886894
}
887895

888896
def frequencies(self) -> typing.Dict[str, float]:
@@ -1067,8 +1075,8 @@ def charge(self, pH: float = 7, pKscale: str = "Lehninger") -> float:
10671075
scale_sign = tables.CHARGE["sign"]
10681076

10691077
# build a look-up table for the pKa scale and the charge sign
1070-
lut_pKa = [scale_pKa.get(aa, 0.0) for aa in self._CODE1]
1071-
lut_sign = [scale_sign.get(aa, 0.0) for aa in self._CODE1]
1078+
lut_pKa = [scale_pKa.get(aa, 0.0) for aa in _CODE1]
1079+
lut_sign = [scale_sign.get(aa, 0.0) for aa in _CODE1]
10721080

10731081
# compute charge of each amino-acid, and sum
10741082
if numpy is None:
@@ -1130,7 +1138,7 @@ def hydrophobic_moment(self, window: int = 11, angle: int = 100) -> float:
11301138
"""
11311139
window = min(window, len(self))
11321140
scale = tables.HYDROPHOBICITY["Eisenberg"]
1133-
lut = [scale.get(aa, 0.0) for aa in self._CODE1]
1141+
lut = [scale.get(aa, 0.0) for aa in _CODE1]
11341142
angles = [(angle * i) % 360 for i in range(window)]
11351143

11361144
if numpy is None:
@@ -1970,7 +1978,7 @@ def structural_class(
19701978
s = sum((pep_frequencies[x]-table[x])**2 for x in table)
19711979
distances[name] = math.sqrt(s)
19721980
elif distance == "mahalanobis" or distance == "discriminant":
1973-
x = [pep_frequencies[aa] for aa in sorted(self._CODE1[:20])]
1981+
x = [pep_frequencies[aa] for aa in sorted(_CODE1[:20])]
19741982
for name,tables in dataset.items():
19751983
if name == "all":
19761984
continue
@@ -1982,7 +1990,7 @@ def structural_class(
19821990
continue
19831991
# compute Mahalanobis distance using the components of
19841992
# the eigendecomposition
1985-
xm = [mean[aa] for aa in sorted(self._CODE1[:20])]
1993+
xm = [mean[aa] for aa in sorted(_CODE1[:20])]
19861994
y = [
19871995
sum(eivecs[i][j] * (x[j] - xm[j]) for j in range(20))
19881996
for i in range(20)

0 commit comments

Comments
 (0)