Refactor mmCIF parsing helper functions to add a new (dummy) data pipeline (#62)

amorehead · web-flow · commit c1fa9aacc1f6 · 2024-07-05T17:55:13.000-07:00
* Update data_pipeline.py

* Update mmcif_parsing.py

* Create mmcif_writing.py

* Update cluster_pdb_mmcifs.py

* Update filter_pdb_mmcifs.py

* Fix bug for protein clustering ratio in `cluster_pdb_mmcifs.py`
diff --git a/alphafold3_pytorch/data/data_pipeline.py b/alphafold3_pytorch/data/data_pipeline.py
@@ -4,7 +4,7 @@
 
 import numpy as np
 
-from alphafold3_pytorch.common import amino_acid_constants
+from alphafold3_pytorch.common.biomolecule import _from_mmcif_object
 from alphafold3_pytorch.data import mmcif_parsing
 
 FeatureDict = MutableMapping[str, np.ndarray]
@@ -13,23 +13,17 @@
 def make_sequence_features(sequence: str, description: str, num_res: int) -> FeatureDict:
     """Construct a feature dict of sequence features."""
     features = {}
-    features["restype"] = amino_acid_constants.sequence_to_onehot(
-        sequence=sequence,
-        mapping=amino_acid_constants.restype_order_with_x,
-        map_unknown_to_x=True,
-    )
     features["between_segment_residues"] = np.zeros((num_res,), dtype=np.int32)
     features["domain_name"] = np.array([description.encode("utf-8")], dtype=object)
-    features["residue_index"] = np.array(range(num_res), dtype=np.int32)
     features["seq_length"] = np.array([num_res] * num_res, dtype=np.int32)
     features["sequence"] = np.array([sequence.encode("utf-8")], dtype=object)
     return features
 
 
-def make_mmcif_features(mmcif_object: mmcif_parsing.MmcifObject, chain_id: str) -> FeatureDict:
+def make_mmcif_features(mmcif_object: mmcif_parsing.MmcifObject) -> FeatureDict:
     """Make features from an mmCIF object."""
-    input_sequence = mmcif_object.chain_to_seqres[chain_id]
-    description = "_".join([mmcif_object.file_id, chain_id])
+    input_sequence = "".join(mmcif_object.chain_to_seqres[chain_id] for chain_id in mmcif_object.chain_to_seqres)
+    description = mmcif_object.file_id
     num_res = len(input_sequence)
 
     mmcif_feats = {}
@@ -42,15 +36,19 @@ def make_mmcif_features(mmcif_object: mmcif_parsing.MmcifObject, chain_id: str)
         )
     )
 
-    all_atom_positions, all_atom_mask = mmcif_parsing.get_atom_coords(
-        mmcif_object=mmcif_object, chain_id=chain_id
-    )
+    biomol = _from_mmcif_object(mmcif_object)
 
     # TODO: Expand the first bioassembly/model sequence and structure, to obtain a biologically relevant complex (AF3 Supplement, Section 2.1).
     # Reference: https://github.com/biotite-dev/biotite/blob/1045f43f80c77a0dc00865e924442385ce8f83ab/src/biotite/structure/io/pdbx/convert.py#L1441
 
-    mmcif_feats["all_atom_positions"] = all_atom_positions
-    mmcif_feats["all_atom_mask"] = all_atom_mask
+    mmcif_feats["all_atom_positions"] = biomol.atom_positions
+    mmcif_feats["all_atom_mask"] = biomol.atom_mask
+    mmcif_feats["b_factors"] = biomol.b_factors
+    mmcif_feats["chain_index"] = biomol.chain_index
+    mmcif_feats["chemid"] = biomol.chemid
+    mmcif_feats["chemtype"] = biomol.chemtype
+    mmcif_feats["residue_index"] = biomol.residue_index
+    mmcif_feats["restype"] = biomol.restype
 
     mmcif_feats["resolution"] = np.array([mmcif_object.header["resolution"]], dtype=np.float32)
 
@@ -61,3 +59,13 @@ def make_mmcif_features(mmcif_object: mmcif_parsing.MmcifObject, chain_id: str)
     mmcif_feats["is_distillation"] = np.array(0.0, dtype=np.float32)
 
     return mmcif_feats
+
+
+if __name__ == "__main__":
+    mmcif_object = mmcif_parsing.parse_mmcif_object(
+        # Load an example mmCIF file that includes
+        # protein, nucleic acid, and ligand residues.
+        filepath="data/pdb_data/mmcifs/16/316d.cif",  # relative path, so it depends on the working directory
+        file_id="316d",
+    )
+    mmcif_feats = make_mmcif_features(mmcif_object)  # the resulting feature dict is not used further here
diff --git a/alphafold3_pytorch/data/mmcif_parsing.py b/alphafold3_pytorch/data/mmcif_parsing.py
@@ -5,6 +5,7 @@
 import io
 import logging
 from collections import defaultdict
+from operator import itemgetter
 from typing import Any, Mapping, Optional, Sequence, Set, Tuple
 
 from Bio import PDB
@@ -645,3 +646,71 @@ def _get_complex_chains(
 def _is_set(data: str) -> bool:
     """Returns False if data is a special mmCIF character indicating 'unset'."""
     return data not in (".", "?")
+
+
+def parse_mmcif_object(
+    filepath: str, file_id: str, auth_chains: bool = True, auth_residues: bool = True
+) -> MmcifObject:
+    """Read the mmCIF file at `filepath` and parse it into an `MmcifObject` containing a BioPython `Structure` object as well as associated metadata; raises the first recorded parsing error if no object was produced."""
+    with open(filepath, "r") as f:  # the whole file is read into memory as text
+        mmcif_string = f.read()
+
+    parsing_result = parse(  # `file_id` and both `auth_*` flags are forwarded unchanged
+        file_id=file_id,
+        mmcif_string=mmcif_string,
+        auth_chains=auth_chains,
+        auth_residues=auth_residues,
+    )
+
+    # Crash if an error is encountered. Any parsing errors should have
+    # been dealt with beforehand (e.g., at the alignment stage).
+    if parsing_result.mmcif_object is None:  # a `None` object is treated as a parsing failure
+        raise list(parsing_result.errors.values())[0]  # only the first recorded error is raised
+
+    return parsing_result.mmcif_object
+
+
+def filter_mmcif(mmcif_object: MmcifObject) -> MmcifObject:
+    """Filter an `MmcifObject` in place based on its collected (atom/residue/chain) removal sets, then clear those sets and return the same object."""
+    model = mmcif_object.structure  # iterated below as a collection of chains
+
+    # Chains are only collected here; they are detached after the loop over `model` finishes
+    chains_to_remove = set()
+
+    for chain in model:
+        # Residues are collected as `(res_index, residue)` pairs and detached after the inner loop
+        residues_to_remove = set()
+        assert len(chain) == len(mmcif_object.chem_comp_details[chain.id]), (
+            f"Number of residues in chain {chain.id} does not match "
+            f"number of chemical component details for this chain: {len(chain)} vs. "
+            f"{len(mmcif_object.chem_comp_details[chain.id])}."
+        )
+        for res_index, residue in enumerate(chain):
+            # Atoms are collected first and detached once iteration over the residue is done
+            atoms_to_remove = set()
+            for atom in residue:
+                if atom.get_full_id() in mmcif_object.atoms_to_remove:
+                    atoms_to_remove.add(atom)
+            if len(atoms_to_remove) == len(residue):  # compared before detaching; also true for a residue with no atoms
+                residues_to_remove.add((res_index, residue))
+            for atom in atoms_to_remove:
+                residue.detach_child(atom.id)
+            if residue.get_full_id() in mmcif_object.residues_to_remove:
+                residues_to_remove.add((res_index, residue))
+        if len(residues_to_remove) == len(chain):  # compared before detaching: every residue is marked
+            chains_to_remove.add(chain)
+        for res_index, residue in sorted(residues_to_remove, key=itemgetter(0), reverse=True):  # highest index first, so a deletion never shifts an index still to be deleted
+            del mmcif_object.chem_comp_details[chain.id][res_index]
+            chain.detach_child(residue.id)
+        if chain.get_full_id() in mmcif_object.chains_to_remove:
+            chains_to_remove.add(chain)
+
+    for chain in chains_to_remove:
+        model.detach_child(chain.id)
+        mmcif_object.chem_comp_details.pop(chain.id)  # drop the per-chain details together with the chain
+
+    mmcif_object.atoms_to_remove.clear()  # the removal sets have been applied, so reset all three
+    mmcif_object.residues_to_remove.clear()
+    mmcif_object.chains_to_remove.clear()
+
+    return mmcif_object
diff --git a/alphafold3_pytorch/data/mmcif_writing.py b/alphafold3_pytorch/data/mmcif_writing.py
@@ -0,0 +1,54 @@
+"""An mmCIF file format writer."""
+
+from typing import List
+
+from alphafold3_pytorch.common.biomolecule import (
+    _from_mmcif_object,
+    get_residue_constants,
+    to_mmcif,
+)
+from alphafold3_pytorch.data.mmcif_parsing import MmcifObject
+from alphafold3_pytorch.utils.data_utils import is_polymer
+
+
+def get_unique_res_atom_names(mmcif_object: MmcifObject) -> List[List[List[str]]]:
+    """Get atom names for each (e.g. ligand) "pseudoresidue" of each residue in each chain, returned as one flat list with a single entry per residue across all chains."""
+    unique_res_atom_names = []
+    for chain in mmcif_object.structure:
+        chain_chem_comp = mmcif_object.chem_comp_details[chain.id]
+        for res, res_chem_comp in zip(chain, chain_chem_comp):  # pairs residues with their details by position; `zip` stops at the shorter of the two
+            is_polymer_residue = is_polymer(res_chem_comp.type)
+            residue_constants = get_residue_constants(res_chem_type=res_chem_comp.type)
+            if is_polymer_residue:
+                # For polymer residues, the entry is a single-element list holding `residue_constants.atom_types`.
+                atoms_to_append = [residue_constants.atom_types]
+            else:
+                # For non-polymer residues, build one inner list per atom, repeating that atom's name `atom_type_num` times.
+                atoms_to_append = [
+                    [atom.name for _ in range(residue_constants.atom_type_num)] for atom in res
+                ]
+            unique_res_atom_names.append(atoms_to_append)
+    return unique_res_atom_names
+
+
+def write_mmcif(
+    mmcif_object: MmcifObject,
+    output_filepath: str,
+    gapless_poly_seq: bool = True,
+    insert_orig_atom_names: bool = True,
+    insert_alphafold_mmcif_metadata: bool = True,
+):
+    """Write a BioPython `Structure` object to an mmCIF file at `output_filepath` using an intermediate `Biomolecule` object that is serialized with `to_mmcif`."""
+    biomol = _from_mmcif_object(mmcif_object)
+    unique_res_atom_names = (  # `None` is passed on when `insert_orig_atom_names` is false
+        get_unique_res_atom_names(mmcif_object) if insert_orig_atom_names else None
+    )
+    mmcif_string = to_mmcif(
+        biomol,
+        mmcif_object.file_id,
+        gapless_poly_seq=gapless_poly_seq,
+        insert_alphafold_mmcif_metadata=insert_alphafold_mmcif_metadata,
+        unique_res_atom_names=unique_res_atom_names,
+    )
+    with open(output_filepath, "w") as f:  # mode "w" truncates any existing file at this path
+        f.write(mmcif_string)
diff --git a/scripts/cluster_pdb_mmcifs.py b/scripts/cluster_pdb_mmcifs.py
@@ -35,9 +35,9 @@
 from sklearn.cluster import AgglomerativeClustering
 from tqdm import tqdm
 
+from alphafold3_pytorch.data import mmcif_parsing
 from alphafold3_pytorch.tensor_typing import IntType, typecheck
 from alphafold3_pytorch.utils.utils import exists, np_mode
-from scripts.filter_pdb_mmcifs import parse_mmcif_object
 
 # Constants
 
@@ -170,7 +170,7 @@ def parse_chain_sequences_and_interfaces_from_mmcif(
     """
     assert filepath.endswith(".cif"), "The input file must be an mmCIF file."
     file_id = os.path.splitext(os.path.basename(filepath))[0]
-    mmcif_object = parse_mmcif_object(filepath, file_id)
+    mmcif_object = mmcif_parsing.parse_mmcif_object(filepath, file_id)
     model = mmcif_object.structure
 
     # NOTE: After dataset filtering, only heavy (non-hydrogen) atoms remain in the structure
@@ -707,7 +707,7 @@ def cluster_interfaces(
             # Cluster proteins at 40% sequence homology
             AgglomerativeClustering(
                 n_clusters=None,
-                distance_threshold=40.0 + 1e-6,
+                distance_threshold=60.0 + 1e-6,
                 metric="precomputed",
                 linkage="complete",
             ).fit_predict(protein_dist_matrix)
diff --git a/scripts/filter_pdb_mmcifs.py b/scripts/filter_pdb_mmcifs.py