Skip to content

Commit f80222a

Browse files
committed
Fix residue three-letter to one-letter code mapping so that it also covers nonstandard amino acids
1 parent f7b5bf0 commit f80222a

File tree

2 files changed

+162
-2
lines changed

2 files changed

+162
-2
lines changed

evcouplings/compare/pdb.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
2121

2222
from evcouplings.utils.config import InvalidParameterError
23-
from evcouplings.utils.constants import AA3_to_AA1
23+
from evcouplings.utils.constants import AA3_to_AA1, AA3_to_AA1_FULL
2424
from evcouplings.utils.helpers import DefaultOrderedDict
2525
from evcouplings.utils.system import (
2626
valid_file, ResourceError, tempdir
@@ -788,7 +788,7 @@ def get_chain(self, chain, model=0, is_author_id=True):
788788
# (this should be unique and circumvents issues from 0 seqres values if selecting based on author chain ID)
789789
coord_id=lambda df: df.auth_seq_id.astype(str) + df.insertion_code,
790790
seqres_id=lambda df: df.label_seq_id.astype(str).replace("0", pd.NA).replace("", pd.NA),
791-
one_letter_code=lambda df: df.label_comp_id.map(AA3_to_AA1, na_action="ignore"),
791+
one_letter_code=lambda df: df.label_comp_id.map(AA3_to_AA1_FULL, na_action="ignore"),
792792
# note that MSE will now be labeled as HETATM, which was not the case with MMTF
793793
hetatm=lambda df: df.record_type == "HETATM",
794794
).reset_index(

evcouplings/utils/constants.py

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,163 @@
3636
AA3_to_AA1 = {
3737
v: k for k, v in AA1_to_AA3.items()
3838
}
39+
40+
"""
41+
Mapping extracted from https://github.com/steineggerlab/foldseek/blob/8979d230fb64c7089380b652758d8705493ed4a5/src/strucclustutils/GemmiWrapper.cpp#L110
42+
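with the following Python code, after manually editing the fall-through case (AA_CODES holds the text of the linked C++ mapping; code_to_symbol and symbol_to_code start out as empty dicts):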
43+
44+
for row in AA_CODES.split("\n"):
45+
row = row.strip()
46+
if "return" not in row or row.startswith("//"):
47+
continue
48+
49+
symbol = row.split('"')[1]
50+
code = row.split("return ")[1].split(";")[0].replace("'", "")
51+
52+
if code not in code_to_symbol:
53+
code_to_symbol[code] = []
54+
55+
code_to_symbol[code].append(symbol)
56+
symbol_to_code[symbol] = code
57+
"""
58+
AA3_to_AA1_FULL = {
59+
'ALA': 'A',
60+
'ARG': 'R',
61+
'ASN': 'N',
62+
'ABA': 'A',
63+
'ASP': 'D',
64+
'ASX': 'B',
65+
'CYS': 'C',
66+
'CSH': 'S',
67+
'GLN': 'Q',
68+
'GLU': 'E',
69+
'GLX': 'Z',
70+
'GLY': 'G',
71+
'HIS': 'H',
72+
'ILE': 'I',
73+
'LEU': 'L',
74+
'LYS': 'K',
75+
'MET': 'M',
76+
'MSE': 'M',
77+
'ORN': 'A',
78+
'PHE': 'F',
79+
'PRO': 'P',
80+
'SER': 'S',
81+
'THR': 'T',
82+
'TRY': 'T',
83+
'TRP': 'W',
84+
'TYR': 'Y',
85+
'UNK': 'X',
86+
'VAL': 'V',
87+
'SEC': 'C',
88+
'PYL': 'O',
89+
'SEP': 'S',
90+
'TPO': 'T',
91+
'PCA': 'E',
92+
'CSO': 'C',
93+
'PTR': 'Y',
94+
'KCX': 'K',
95+
'CSD': 'C',
96+
'LLP': 'K',
97+
'CME': 'C',
98+
'MLY': 'K',
99+
'DAL': 'A',
100+
'TYS': 'Y',
101+
'OCS': 'C',
102+
'M3L': 'K',
103+
'FME': 'M',
104+
'ALY': 'K',
105+
'HYP': 'P',
106+
'CAS': 'C',
107+
'CRO': 'T',
108+
'CSX': 'C',
109+
'DPR': 'P',
110+
'DGL': 'E',
111+
'DVA': 'V',
112+
'CSS': 'C',
113+
'DPN': 'F',
114+
'DSN': 'S',
115+
'DLE': 'L',
116+
'HIC': 'H',
117+
'NLE': 'L',
118+
'MVA': 'V',
119+
'MLZ': 'K',
120+
'CR2': 'G',
121+
'SAR': 'G',
122+
'DAR': 'R',
123+
'DLY': 'K',
124+
'YCM': 'C',
125+
'NRQ': 'M',
126+
'CGU': 'E',
127+
'0TD': 'D',
128+
'MLE': 'L',
129+
'DAS': 'D',
130+
'DTR': 'W',
131+
'CXM': 'M',
132+
'TPQ': 'Y',
133+
'DCY': 'C',
134+
'DSG': 'N',
135+
'DTY': 'Y',
136+
'DHI': 'H',
137+
'MEN': 'N',
138+
'DTH': 'T',
139+
'SAC': 'S',
140+
'DGN': 'Q',
141+
'AIB': 'A',
142+
'SMC': 'C',
143+
'IAS': 'D',
144+
'CIR': 'R',
145+
'BMT': 'T',
146+
'DIL': 'I',
147+
'FGA': 'E',
148+
'PHI': 'F',
149+
'CRQ': 'Q',
150+
'SME': 'M',
151+
'GHP': 'G',
152+
'MHO': 'M',
153+
'NEP': 'H',
154+
'TRQ': 'W',
155+
'TOX': 'W',
156+
'ALC': 'A',
157+
'SCH': 'C',
158+
'MDO': 'A',
159+
'MAA': 'A',
160+
'GYS': 'S',
161+
'MK8': 'L',
162+
'CR8': 'H',
163+
'KPI': 'K',
164+
'SCY': 'C',
165+
'DHA': 'S',
166+
'OMY': 'Y',
167+
'CAF': 'C',
168+
'0AF': 'W',
169+
'SNN': 'N',
170+
'MHS': 'H',
171+
'SNC': 'C',
172+
'PHD': 'D',
173+
'B3E': 'E',
174+
'MEA': 'F',
175+
'MED': 'M',
176+
'OAS': 'S',
177+
'GL3': 'G',
178+
'FVA': 'V',
179+
'PHL': 'F',
180+
'CRF': 'T',
181+
'BFD': 'D',
182+
'MEQ': 'Q',
183+
'DAB': 'A',
184+
'AGM': 'R',
185+
'4BF': 'Y',
186+
'B3A': 'A',
187+
'B3D': 'D',
188+
'B3K': 'K',
189+
'B3Y': 'Y',
190+
'BAL': 'A',
191+
'DBZ': 'A',
192+
'GPL': 'K',
193+
'HSK': 'H',
194+
'HY3': 'P',
195+
'HZP': 'P',
196+
'KYN': 'W',
197+
'MGN': 'Q'
198+
}

0 commit comments

Comments
 (0)