Skip to content

Commit 92c5d6f

Browse files
authored
Merge pull request #331 from debbiemarkslab/pdb-loading-fixes
Fix regressions related to structure loading
2 parents b19baff + f80222a commit 92c5d6f

File tree

7 files changed

+226
-20
lines changed

7 files changed

+226
-20
lines changed

config/sample_config_monomer.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -428,9 +428,9 @@ databases:
428428
sifts_sequence_db: /n/groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.o2.fasta
429429

430430
# Mapping onto predicted 3D structure models
431-
# modeldb_type: alphafolddb_v4
432-
# modeldb_sequence_file: /n/groups/marks/databases/alphafolddb/2022-11-14/sequences.fasta
433-
# modeldb_list_file: /n/groups/marks/databases/alphafolddb/2022-11-14/accession_ids.csv
431+
# modeldb_type: alphafolddb
432+
# modeldb_sequence_file: /n/groups/marks/databases/alphafolddb/2025-10-22/sequences.fasta
433+
# modeldb_list_file: /n/groups/marks/databases/alphafolddb/2025-10-22/accession_ids.csv
434434
# modeldb_file_dir:
435435

436436
# Paths to external tools used by evcouplings. Please refer to README.md for installation instructions and which tools are required.

evcouplings/align/tools.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,10 @@
99

1010
from collections import namedtuple
1111
import pandas as pd
12+
13+
from evcouplings.utils import TerminatedException
1214
from evcouplings.utils.system import (
13-
run, create_prefix_folders, verify_resources, temp
15+
run, create_prefix_folders, verify_resources, temp, ExternalToolError
1416
)
1517
from evcouplings.utils.config import check_required
1618

@@ -353,7 +355,21 @@ def run_jackhmmer(query, database, prefix,
353355

354356
cmd += [query, database]
355357

356-
return_code, stdout, stderr = run(cmd)
358+
return_code, stdout, stderr = run(
359+
cmd, check_returncode=False
360+
)
361+
362+
# check the return code manually here: -9 indicates an out-of-memory condition caused by a denied memory allocation
363+
# (without the job itself being terminated)
364+
if return_code != 0:
365+
message = "Call failed:\ncmd={}\nreturncode={}\nstdout={}\nstderr={}".format(
366+
cmd, return_code, stdout, stderr
367+
)
368+
369+
if return_code == -9:
370+
raise TerminatedException(message)
371+
else:
372+
raise ExternalToolError(message)
357373

358374
# also check we actually created some sort of alignment
359375
verify_resources(

evcouplings/compare/pdb.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
Authors:
55
Thomas A. Hopf
66
"""
7-
7+
import io
88
from collections import OrderedDict
99
from collections.abc import Iterable
1010
import gzip
11-
from io import BytesIO
11+
from io import BytesIO, TextIOWrapper
1212
from os import path
1313
from urllib.error import HTTPError
1414

@@ -20,13 +20,14 @@
2020
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
2121

2222
from evcouplings.utils.config import InvalidParameterError
23-
from evcouplings.utils.constants import AA3_to_AA1
23+
from evcouplings.utils.constants import AA3_to_AA1, AA3_to_AA1_FULL
2424
from evcouplings.utils.helpers import DefaultOrderedDict
2525
from evcouplings.utils.system import (
2626
valid_file, ResourceError, tempdir
2727
)
2828

2929
PDB_BCIF_DOWNLOAD_URL = "https://models.rcsb.org/{pdb_id}.bcif.gz"
30+
PDB_CIF_DOWNLOAD_URL = "https://files.rcsb.org/download/{pdb_id}.cif.gz"
3031

3132

3233
# Mapping from MMTF secondary structure codes to DSSP symbols
@@ -676,7 +677,7 @@ def from_file(cls, filename, keep_full_data=False):
676677
if binary:
677678
mode = "rb"
678679
else:
679-
mode = "r"
680+
mode = "rt"
680681

681682
with openfunc(filename, mode=mode) as f:
682683
return cls(f, binary=binary, keep_full_data=keep_full_data)
@@ -704,22 +705,36 @@ def from_id(cls, pdb_id, keep_full_data=False):
704705
"""
705706
# TODO: add proper retry logic and timeouts
706707
# TODO: add better exception handling
708+
709+
# easy toggle if we want to switch back to bCIF instead of mmCIF
707710
try:
708711
r = requests.get(
709-
PDB_BCIF_DOWNLOAD_URL.format(pdb_id=pdb_id.lower())
712+
PDB_CIF_DOWNLOAD_URL.format(pdb_id=pdb_id.lower())
710713
)
711714
except requests.exceptions.RequestException as e:
712715
raise ResourceError(
713-
"Error fetching bCIF data for {}".format(pdb_id)
716+
"Error fetching CIF data for {}".format(pdb_id)
714717
) from e
715718

716719
if not r.ok:
717720
raise ResourceError(
718721
"Did not receive valid response fetching {}".format(pdb_id)
719722
)
720723

721-
with gzip.GzipFile(fileobj=BytesIO(r.content), mode="r") as f:
722-
return cls(f, keep_full_data=keep_full_data)
724+
# bCIF:
725+
# with gzip.GzipFile(fileobj=BytesIO(r.content), mode="r") as f:
726+
# return cls(f, binary=True, keep_full_data=keep_full_data)
727+
728+
# following gzip.open() from https://github.com/python/cpython/blob/3.13/Lib/gzip.py
729+
with TextIOWrapper(
730+
gzip.GzipFile(fileobj=BytesIO(r.content), mode="r"), # noqa
731+
encoding="utf-8",
732+
errors=None,
733+
newline=None,
734+
) as f:
735+
return cls(
736+
f, binary=False, keep_full_data=keep_full_data
737+
)
723738

724739
def get_chain(self, chain, model=0, is_author_id=True):
725740
"""
@@ -773,7 +788,7 @@ def get_chain(self, chain, model=0, is_author_id=True):
773788
# (this should be unique and circumvents issues from 0 seqres values if selecting based on author chain ID)
774789
coord_id=lambda df: df.auth_seq_id.astype(str) + df.insertion_code,
775790
seqres_id=lambda df: df.label_seq_id.astype(str).replace("0", pd.NA).replace("", pd.NA),
776-
one_letter_code=lambda df: df.label_comp_id.map(AA3_to_AA1, na_action="ignore"),
791+
one_letter_code=lambda df: df.label_comp_id.map(AA3_to_AA1_FULL, na_action="ignore"),
777792
# note that MSE will now be labeled as HETATM, which was not the case with MMTF
778793
hetatm=lambda df: df.record_type == "HETATM",
779794
).reset_index(

evcouplings/compare/protocol.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
from evcouplings.visualize import pairs, misc
3737

3838
SIFTS_TABLE_FORMAT_STR = "{pdb_id}:{pdb_chain} ({coord_start}-{coord_end})"
39-
AVAILABLE_MODEL_DB_TYPES = ["alphafolddb_v4"]
39+
AVAILABLE_MODEL_DB_TYPES = ["alphafolddb"]
4040
ALPHAFOLDDB_DOWNLOAD_URL = "https://alphafold.ebi.ac.uk/files/{id}.cif"
4141

4242

@@ -664,7 +664,7 @@ def _identify_predicted_structures(**kwargs):
664664
)
665665

666666
table_callback = None
667-
if modeldb_type == "alphafolddb_v4":
667+
if modeldb_type == "alphafolddb":
668668
table_callback = lambda ali, hits: (
669669
_map_alphafold_hits(
670670
kwargs["modeldb_list_file"], set(hits.uniprot_ac)
@@ -732,7 +732,7 @@ def _load_models(model_ids, modeldb_type, structure_dir=None, raise_missing=True
732732
)
733733

734734
# implement database-specific retrieval behaviour here
735-
if modeldb_type == "alphafolddb_v4":
735+
if modeldb_type == "alphafolddb":
736736
make_download_url = lambda model_id: ALPHAFOLDDB_DOWNLOAD_URL.format(id=model_id)
737737

738738
structures = {}
@@ -835,7 +835,8 @@ def models(**kwargs):
835835
if len(sifts_map.hits) > 0:
836836
d_intra = intra_dists(
837837
sifts_map, structures, atom_filter=kwargs["atom_filter"],
838-
output_prefix=aux_prefix + "model_distmap_intra"
838+
output_prefix=aux_prefix + "model_distmap_intra",
839+
raise_missing=False
839840
)
840841

841842
residue_table_filename, dist_mat_filename = d_intra.to_file(outcfg["model_distmap_monomer"])
@@ -1057,7 +1058,8 @@ def standard(**kwargs):
10571058
if len(sifts_map.hits) > 0:
10581059
d_intra = intra_dists(
10591060
sifts_map, structures, atom_filter=kwargs["atom_filter"],
1060-
output_prefix=aux_prefix + "_distmap_intra"
1061+
output_prefix=aux_prefix + "_distmap_intra",
1062+
raise_missing=False
10611063
)
10621064

10631065
residue_table_filename, dist_mat_filename = d_intra.to_file(outcfg["distmap_monomer"])

evcouplings/utils/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,9 @@ class BailoutException(Exception):
4242
"""
4343
Exception for pipeline stopping itself (e.g. if no sequences found)
4444
"""
45+
46+
class TerminatedException(Exception):
47+
"""
48+
Exception for pipeline crashing due to an external constraint (e.g. denied memory allocation)
49+
that does not lead to the job being terminated with a signal
50+
"""

evcouplings/utils/constants.py

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,163 @@
3636
AA3_to_AA1 = {
3737
v: k for k, v in AA1_to_AA3.items()
3838
}
39+
40+
"""
41+
Mapping extracted from https://github.com/steineggerlab/foldseek/blob/8979d230fb64c7089380b652758d8705493ed4a5/src/strucclustutils/GemmiWrapper.cpp#L110
42+
with the following Python code, after manually editing the fall-through case:
43+
44+
for row in AA_CODES.split("\n"):
45+
row = row.strip()
46+
if "return" not in row or row.startswith("//"):
47+
continue
48+
49+
symbol = row.split('"')[1]
50+
code = row.split("return ")[1].split(";")[0].replace("'", "")
51+
52+
if code not in code_to_symbol:
53+
code_to_symbol[code] = []
54+
55+
code_to_symbol[code].append(symbol)
56+
symbol_to_code[symbol] = code
57+
"""
58+
AA3_to_AA1_FULL = {
59+
'ALA': 'A',
60+
'ARG': 'R',
61+
'ASN': 'N',
62+
'ABA': 'A',
63+
'ASP': 'D',
64+
'ASX': 'B',
65+
'CYS': 'C',
66+
'CSH': 'S',
67+
'GLN': 'Q',
68+
'GLU': 'E',
69+
'GLX': 'Z',
70+
'GLY': 'G',
71+
'HIS': 'H',
72+
'ILE': 'I',
73+
'LEU': 'L',
74+
'LYS': 'K',
75+
'MET': 'M',
76+
'MSE': 'M',
77+
'ORN': 'A',
78+
'PHE': 'F',
79+
'PRO': 'P',
80+
'SER': 'S',
81+
'THR': 'T',
82+
'TRY': 'T',
83+
'TRP': 'W',
84+
'TYR': 'Y',
85+
'UNK': 'X',
86+
'VAL': 'V',
87+
'SEC': 'C',
88+
'PYL': 'O',
89+
'SEP': 'S',
90+
'TPO': 'T',
91+
'PCA': 'E',
92+
'CSO': 'C',
93+
'PTR': 'Y',
94+
'KCX': 'K',
95+
'CSD': 'C',
96+
'LLP': 'K',
97+
'CME': 'C',
98+
'MLY': 'K',
99+
'DAL': 'A',
100+
'TYS': 'Y',
101+
'OCS': 'C',
102+
'M3L': 'K',
103+
'FME': 'M',
104+
'ALY': 'K',
105+
'HYP': 'P',
106+
'CAS': 'C',
107+
'CRO': 'T',
108+
'CSX': 'C',
109+
'DPR': 'P',
110+
'DGL': 'E',
111+
'DVA': 'V',
112+
'CSS': 'C',
113+
'DPN': 'F',
114+
'DSN': 'S',
115+
'DLE': 'L',
116+
'HIC': 'H',
117+
'NLE': 'L',
118+
'MVA': 'V',
119+
'MLZ': 'K',
120+
'CR2': 'G',
121+
'SAR': 'G',
122+
'DAR': 'R',
123+
'DLY': 'K',
124+
'YCM': 'C',
125+
'NRQ': 'M',
126+
'CGU': 'E',
127+
'0TD': 'D',
128+
'MLE': 'L',
129+
'DAS': 'D',
130+
'DTR': 'W',
131+
'CXM': 'M',
132+
'TPQ': 'Y',
133+
'DCY': 'C',
134+
'DSG': 'N',
135+
'DTY': 'Y',
136+
'DHI': 'H',
137+
'MEN': 'N',
138+
'DTH': 'T',
139+
'SAC': 'S',
140+
'DGN': 'Q',
141+
'AIB': 'A',
142+
'SMC': 'C',
143+
'IAS': 'D',
144+
'CIR': 'R',
145+
'BMT': 'T',
146+
'DIL': 'I',
147+
'FGA': 'E',
148+
'PHI': 'F',
149+
'CRQ': 'Q',
150+
'SME': 'M',
151+
'GHP': 'G',
152+
'MHO': 'M',
153+
'NEP': 'H',
154+
'TRQ': 'W',
155+
'TOX': 'W',
156+
'ALC': 'A',
157+
'SCH': 'C',
158+
'MDO': 'A',
159+
'MAA': 'A',
160+
'GYS': 'S',
161+
'MK8': 'L',
162+
'CR8': 'H',
163+
'KPI': 'K',
164+
'SCY': 'C',
165+
'DHA': 'S',
166+
'OMY': 'Y',
167+
'CAF': 'C',
168+
'0AF': 'W',
169+
'SNN': 'N',
170+
'MHS': 'H',
171+
'SNC': 'C',
172+
'PHD': 'D',
173+
'B3E': 'E',
174+
'MEA': 'F',
175+
'MED': 'M',
176+
'OAS': 'S',
177+
'GL3': 'G',
178+
'FVA': 'V',
179+
'PHL': 'F',
180+
'CRF': 'T',
181+
'BFD': 'D',
182+
'MEQ': 'Q',
183+
'DAB': 'A',
184+
'AGM': 'R',
185+
'4BF': 'Y',
186+
'B3A': 'A',
187+
'B3D': 'D',
188+
'B3K': 'K',
189+
'B3Y': 'Y',
190+
'BAL': 'A',
191+
'DBZ': 'A',
192+
'GPL': 'K',
193+
'HSK': 'H',
194+
'HY3': 'P',
195+
'HZP': 'P',
196+
'KYN': 'W',
197+
'MGN': 'Q'
198+
}

evcouplings/utils/pipeline.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
from evcouplings.utils.tracker import (
3333
get_result_tracker, EStatus
3434
)
35-
from evcouplings.utils import BailoutException
35+
from evcouplings.utils import BailoutException, TerminatedException
3636

3737
import evcouplings.align.protocol as ap
3838
import evcouplings.couplings.protocol as cp
@@ -524,6 +524,13 @@ def _handler(signal_, frame):
524524
message = "Pipeline bailed out of execution: {}".format(
525525
formatted_exception
526526
)
527+
elif isinstance(e, TerminatedException):
528+
# exception remapped to termination (e.g. denied memory allocation)
529+
extension = EXTENSION_TERMINATED
530+
status = EStatus.TERM
531+
message = "Terminated with exception: {}".format(
532+
formatted_exception
533+
)
527534
else:
528535
extension = EXTENSION_FAILED
529536
status = EStatus.FAIL

0 commit comments

Comments
 (0)