Merge pull request #331 from debbiemarkslab/pdb-loading-fixes

thomashopf · web-flow · commit 92c5d6faa946 · 2025-10-23T12:27:20.000+02:00
Fix regressions related to structure loading
diff --git a/config/sample_config_monomer.txt b/config/sample_config_monomer.txt
@@ -428,9 +428,9 @@ databases:
     sifts_sequence_db: /n/groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.o2.fasta
 
     # Mapping onto predicted 3D structure models
-    # modeldb_type: alphafolddb_v4
-    # modeldb_sequence_file: /n/groups/marks/databases/alphafolddb/2022-11-14/sequences.fasta
-    # modeldb_list_file: /n/groups/marks/databases/alphafolddb/2022-11-14/accession_ids.csv
+    # modeldb_type: alphafolddb
+    # modeldb_sequence_file: /n/groups/marks/databases/alphafolddb/2025-10-22/sequences.fasta
+    # modeldb_list_file: /n/groups/marks/databases/alphafolddb/2025-10-22/accession_ids.csv
     # modeldb_file_dir:
 
 # Paths to external tools used by evcouplings. Please refer to README.md for installation instructions and which tools are required.
diff --git a/evcouplings/align/tools.py b/evcouplings/align/tools.py
@@ -9,8 +9,10 @@
 
 from collections import namedtuple
 import pandas as pd
+
+from evcouplings.utils import TerminatedException
 from evcouplings.utils.system import (
-    run, create_prefix_folders, verify_resources, temp
+    run, create_prefix_folders, verify_resources, temp, ExternalToolError
 )
 from evcouplings.utils.config import check_required
 
@@ -353,7 +355,21 @@ def run_jackhmmer(query, database, prefix,
 
     cmd += [query, database]
 
-    return_code, stdout, stderr = run(cmd)
+    return_code, stdout, stderr = run(
+        cmd, check_returncode=False
+    )
+
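+    # check the return code here instead of inside run(): a return code of -9 indicates
+    # an out-of-memory condition (denied memory allocation) that did not cause the job
+    # itself to be terminated, so it is reported as a termination rather than a tool failure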
+    if return_code != 0:
+        message = "Call failed:\ncmd={}\nreturncode={}\nstdout={}\nstderr={}".format(
+            cmd, return_code, stdout, stderr
+        )
+
+        if return_code == -9:
+            raise TerminatedException(message)
+        else:
+            raise ExternalToolError(message)
 
     # also check we actually created some sort of alignment
     verify_resources(
diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py
@@ -4,11 +4,11 @@
 Authors:
   Thomas A. Hopf
 """
-
+import io
 from collections import OrderedDict
 from collections.abc import Iterable
 import gzip
-from io import BytesIO
+from io import BytesIO, TextIOWrapper
 from os import path
 from urllib.error import HTTPError
 
@@ -20,13 +20,14 @@
 from Bio.PDB.MMCIF2Dict import MMCIF2Dict
 
 from evcouplings.utils.config import InvalidParameterError
-from evcouplings.utils.constants import AA3_to_AA1
+from evcouplings.utils.constants import AA3_to_AA1, AA3_to_AA1_FULL
 from evcouplings.utils.helpers import DefaultOrderedDict
 from evcouplings.utils.system import (
     valid_file, ResourceError, tempdir
 )
 
 PDB_BCIF_DOWNLOAD_URL = "https://models.rcsb.org/{pdb_id}.bcif.gz"
+PDB_CIF_DOWNLOAD_URL = "https://files.rcsb.org/download/{pdb_id}.cif.gz"
 
 
 # Mapping from MMTF secondary structure codes to DSSP symbols
@@ -676,7 +677,7 @@ def from_file(cls, filename, keep_full_data=False):
             if binary:
                 mode = "rb"
             else:
-                mode = "r"
+                mode = "rt"
 
             with openfunc(filename, mode=mode) as f:
                 return cls(f, binary=binary, keep_full_data=keep_full_data)
@@ -704,22 +705,36 @@ def from_id(cls, pdb_id, keep_full_data=False):
         """
         # TODO: add proper retry logic and timeouts
         # TODO: add better exception handling
+
+        # fetch text mmCIF instead of bCIF; to switch back, use PDB_BCIF_DOWNLOAD_URL
+        # here and the commented-out bCIF reader further below
         try:
             r = requests.get(
-                PDB_BCIF_DOWNLOAD_URL.format(pdb_id=pdb_id.lower())
+                PDB_CIF_DOWNLOAD_URL.format(pdb_id=pdb_id.lower())
             )
         except requests.exceptions.RequestException as e:
             raise ResourceError(
-                "Error fetching bCIF data for {}".format(pdb_id)
+                "Error fetching CIF data for {}".format(pdb_id)
             ) from e
 
         if not r.ok:
             raise ResourceError(
                 "Did not receive valid response fetching {}".format(pdb_id)
             )
 
-        with gzip.GzipFile(fileobj=BytesIO(r.content), mode="r") as f:
-            return cls(f, keep_full_data=keep_full_data)
+        # bCIF:
+        #     with gzip.GzipFile(fileobj=BytesIO(r.content), mode="r") as f:
+        #        return cls(f, binary=True, keep_full_data=keep_full_data)
+
+        # wrap the decompressed byte stream for text-mode reading, following what gzip.open() does
+        # in text mode (see https://github.com/python/cpython/blob/3.13/Lib/gzip.py)
+        with TextIOWrapper(
+            gzip.GzipFile(fileobj=BytesIO(r.content), mode="r"),  # noqa
+            encoding="utf-8",
+            errors=None,
+            newline=None,
+        ) as f:
+            return cls(
+                f, binary=False, keep_full_data=keep_full_data
+            )
 
     def get_chain(self, chain, model=0, is_author_id=True):
         """
@@ -773,7 +788,7 @@ def get_chain(self, chain, model=0, is_author_id=True):
             # (this should be unique and circumvents issues from 0 seqres values if selecting based on author chain ID)
             coord_id=lambda df: df.auth_seq_id.astype(str) + df.insertion_code,
             seqres_id=lambda df: df.label_seq_id.astype(str).replace("0", pd.NA).replace("", pd.NA),
-            one_letter_code=lambda df: df.label_comp_id.map(AA3_to_AA1, na_action="ignore"),
+            one_letter_code=lambda df: df.label_comp_id.map(AA3_to_AA1_FULL, na_action="ignore"),
             # note that MSE will now be labeled as HETATM, which was not the case with MMTF
             hetatm=lambda df: df.record_type == "HETATM",
         ).reset_index(
diff --git a/evcouplings/compare/protocol.py b/evcouplings/compare/protocol.py
@@ -36,7 +36,7 @@
 from evcouplings.visualize import pairs, misc
 
 SIFTS_TABLE_FORMAT_STR = "{pdb_id}:{pdb_chain} ({coord_start}-{coord_end})"
-AVAILABLE_MODEL_DB_TYPES = ["alphafolddb_v4"]
+AVAILABLE_MODEL_DB_TYPES = ["alphafolddb"]
 ALPHAFOLDDB_DOWNLOAD_URL = "https://alphafold.ebi.ac.uk/files/{id}.cif"
 
 
@@ -664,7 +664,7 @@ def _identify_predicted_structures(**kwargs):
         )
 
     table_callback = None
-    if modeldb_type == "alphafolddb_v4":
+    if modeldb_type == "alphafolddb":
         table_callback = lambda ali, hits: (
             _map_alphafold_hits(
                 kwargs["modeldb_list_file"], set(hits.uniprot_ac)
@@ -732,7 +732,7 @@ def _load_models(model_ids, modeldb_type, structure_dir=None, raise_missing=True
         )
 
     # implement database-specific retrieval behaviour here
-    if modeldb_type == "alphafolddb_v4":
+    if modeldb_type == "alphafolddb":
         make_download_url = lambda model_id: ALPHAFOLDDB_DOWNLOAD_URL.format(id=model_id)
 
     structures = {}
@@ -835,7 +835,8 @@ def models(**kwargs):
     if len(sifts_map.hits) > 0:
         d_intra = intra_dists(
             sifts_map, structures, atom_filter=kwargs["atom_filter"],
-            output_prefix=aux_prefix + "model_distmap_intra"
+            output_prefix=aux_prefix + "model_distmap_intra",
+            raise_missing=False
         )
 
         residue_table_filename, dist_mat_filename = d_intra.to_file(outcfg["model_distmap_monomer"])
@@ -1057,7 +1058,8 @@ def standard(**kwargs):
     if len(sifts_map.hits) > 0:
         d_intra = intra_dists(
             sifts_map, structures, atom_filter=kwargs["atom_filter"],
-            output_prefix=aux_prefix + "_distmap_intra"
+            output_prefix=aux_prefix + "_distmap_intra",
+            raise_missing=False
         )
 
         residue_table_filename, dist_mat_filename = d_intra.to_file(outcfg["distmap_monomer"])
diff --git a/evcouplings/utils/__init__.py b/evcouplings/utils/__init__.py
@@ -42,3 +42,9 @@ class BailoutException(Exception):
     """
     Exception for pipeline stopping itself (e.g. if no sequences found)
     """
+
+class TerminatedException(Exception):
+    """
+    Exception for pipeline crashing due to an external constraint (e.g. denied memory allocation)
+    that does not lead to the job being terminated with a signal
+    """
diff --git a/evcouplings/utils/constants.py b/evcouplings/utils/constants.py
@@ -36,3 +36,163 @@
 AA3_to_AA1 = {
     v: k for k, v in AA1_to_AA3.items()
 }
+
+"""
+Mapping extracted from https://github.com/steineggerlab/foldseek/blob/8979d230fb64c7089380b652758d8705493ed4a5/src/strucclustutils/GemmiWrapper.cpp#L110
+using the following Python code, after manually editing the fall-through case:
+
+for row in AA_CODES.split("\n"):
+    row = row.strip()
+    if "return" not in row or row.startswith("//"):
+        continue
+
+    symbol = row.split('"')[1]
+    code = row.split("return ")[1].split(";")[0].replace("'", "")
+
+    if code not in code_to_symbol:
+        code_to_symbol[code] = []
+
+    code_to_symbol[code].append(symbol)
+    symbol_to_code[symbol] = code
+"""
+AA3_to_AA1_FULL = {
+    'ALA': 'A',
+    'ARG': 'R',
+    'ASN': 'N',
+    'ABA': 'A',
+    'ASP': 'D',
+    'ASX': 'B',
+    'CYS': 'C',
+    'CSH': 'S',
+    'GLN': 'Q',
+    'GLU': 'E',
+    'GLX': 'Z',
+    'GLY': 'G',
+    'HIS': 'H',
+    'ILE': 'I',
+    'LEU': 'L',
+    'LYS': 'K',
+    'MET': 'M',
+    'MSE': 'M',
+    'ORN': 'A',
+    'PHE': 'F',
+    'PRO': 'P',
+    'SER': 'S',
+    'THR': 'T',
+    'TRY': 'T',
+    'TRP': 'W',
+    'TYR': 'Y',
+    'UNK': 'X',
+    'VAL': 'V',
+    'SEC': 'C',
+    'PYL': 'O',
+    'SEP': 'S',
+    'TPO': 'T',
+    'PCA': 'E',
+    'CSO': 'C',
+    'PTR': 'Y',
+    'KCX': 'K',
+    'CSD': 'C',
+    'LLP': 'K',
+    'CME': 'C',
+    'MLY': 'K',
+    'DAL': 'A',
+    'TYS': 'Y',
+    'OCS': 'C',
+    'M3L': 'K',
+    'FME': 'M',
+    'ALY': 'K',
+    'HYP': 'P',
+    'CAS': 'C',
+    'CRO': 'T',
+    'CSX': 'C',
+    'DPR': 'P',
+    'DGL': 'E',
+    'DVA': 'V',
+    'CSS': 'C',
+    'DPN': 'F',
+    'DSN': 'S',
+    'DLE': 'L',
+    'HIC': 'H',
+    'NLE': 'L',
+    'MVA': 'V',
+    'MLZ': 'K',
+    'CR2': 'G',
+    'SAR': 'G',
+    'DAR': 'R',
+    'DLY': 'K',
+    'YCM': 'C',
+    'NRQ': 'M',
+    'CGU': 'E',
+    '0TD': 'D',
+    'MLE': 'L',
+    'DAS': 'D',
+    'DTR': 'W',
+    'CXM': 'M',
+    'TPQ': 'Y',
+    'DCY': 'C',
+    'DSG': 'N',
+    'DTY': 'Y',
+    'DHI': 'H',
+    'MEN': 'N',
+    'DTH': 'T',
+    'SAC': 'S',
+    'DGN': 'Q',
+    'AIB': 'A',
+    'SMC': 'C',
+    'IAS': 'D',
+    'CIR': 'R',
+    'BMT': 'T',
+    'DIL': 'I',
+    'FGA': 'E',
+    'PHI': 'F',
+    'CRQ': 'Q',
+    'SME': 'M',
+    'GHP': 'G',
+    'MHO': 'M',
+    'NEP': 'H',
+    'TRQ': 'W',
+    'TOX': 'W',
+    'ALC': 'A',
+    'SCH': 'C',
+    'MDO': 'A',
+    'MAA': 'A',
+    'GYS': 'S',
+    'MK8': 'L',
+    'CR8': 'H',
+    'KPI': 'K',
+    'SCY': 'C',
+    'DHA': 'S',
+    'OMY': 'Y',
+    'CAF': 'C',
+    '0AF': 'W',
+    'SNN': 'N',
+    'MHS': 'H',
+    'SNC': 'C',
+    'PHD': 'D',
+    'B3E': 'E',
+    'MEA': 'F',
+    'MED': 'M',
+    'OAS': 'S',
+    'GL3': 'G',
+    'FVA': 'V',
+    'PHL': 'F',
+    'CRF': 'T',
+    'BFD': 'D',
+    'MEQ': 'Q',
+    'DAB': 'A',
+    'AGM': 'R',
+    '4BF': 'Y',
+    'B3A': 'A',
+    'B3D': 'D',
+    'B3K': 'K',
+    'B3Y': 'Y',
+    'BAL': 'A',
+    'DBZ': 'A',
+    'GPL': 'K',
+    'HSK': 'H',
+    'HY3': 'P',
+    'HZP': 'P',
+    'KYN': 'W',
+    'MGN': 'Q'
+}
diff --git a/evcouplings/utils/pipeline.py b/evcouplings/utils/pipeline.py
@@ -32,7 +32,7 @@
 from evcouplings.utils.tracker import (
     get_result_tracker, EStatus
 )
-from evcouplings.utils import BailoutException
+from evcouplings.utils import BailoutException, TerminatedException
 
 import evcouplings.align.protocol as ap
 import evcouplings.couplings.protocol as cp
@@ -524,6 +524,13 @@ def _handler(signal_, frame):
             message = "Pipeline bailed out of execution: {}".format(
                 formatted_exception
             )
+        elif isinstance(e, TerminatedException):
+            # exception remapped to termination (e.g. denied memory allocation)
+            extension = EXTENSION_TERMINATED
+            status = EStatus.TERM
+            message = "Terminated with exception: {}".format(
+                formatted_exception
+            )
         else:
             extension = EXTENSION_FAILED
             status = EStatus.FAIL