Create outline of how to do cropping with Biomolecule objects (#99)

amorehead · web-flow · commit f14b81622d04 · 2024-07-20T12:53:21.000-07:00
* Update biomolecule.py

* Update data_pipeline.py
diff --git a/alphafold3_pytorch/common/biomolecule.py b/alphafold3_pytorch/common/biomolecule.py
@@ -4,6 +4,7 @@
 import dataclasses
 import functools
 import io
+import random
 from types import ModuleType
 from typing import Any, Dict, List, Optional, Set, Tuple
 
@@ -232,6 +233,69 @@ def repeat(self, coord: np.ndarray) -> "Biomolecule":
             mmcif_metadata=self.mmcif_metadata,
         )
 
+    def crop_chains_with_masks(
+        self, chain_ids_and_lengths: List[Tuple[str, int]], crop_masks: List[np.ndarray]
+    ):
+        """Crop the chains and metadata within a Biomolecule to the residues `crop_masks` select.
+
+        Expects one mask per `chain_ids_and_lengths` entry; the cropping is not implemented yet.
+        """
+        assert len(chain_ids_and_lengths) == len(
+            crop_masks
+        ), "The number of chains and crop masks must be equal."
+        raise NotImplementedError("Chain cropping is not yet implemented.")
+
+    def contiguous_crop(self, n_res: int) -> "Biomolecule":
+        """Crop each chain, visited in random order, to one contiguous window.
+
+        :param n_res: Crop budget shared by all chains; the cropped result is returned.
+        """
+        chain_ids_and_lengths = list(collections.Counter(self.chain_id).items())
+        random.shuffle(chain_ids_and_lengths)
+        crop_masks = create_contiguous_crop_masks(chain_ids_and_lengths, n_res)
+        return self.crop_chains_with_masks(chain_ids_and_lengths, crop_masks)
+
+    def spatial_crop(self) -> "Biomolecule":
+        """
+        Crop a Biomolecule to only include polymer residues and ligand atoms near a
+        (random) reference atom within a sampled chain/interface. Not implemented yet.
+        """
+        raise NotImplementedError("Spatial cropping is not yet implemented.")
+
+    def spatial_interface_crop(self) -> "Biomolecule":
+        """
+        Intended to crop a Biomolecule to the polymer residues and ligand atoms near a
+        (random) reference atom taken from a chain interface; not implemented yet.
+        """
+        raise NotImplementedError("Spatial interface cropping is not yet implemented.")
+
+
+@typecheck
+def create_contiguous_crop_masks(
+    chain_ids_and_lengths: List[Tuple[str, int]], n_res: int
+) -> List[np.ndarray]:
+    """Create one contiguous crop mask per chain (AlphaFold-Multimer paper, Algorithm 1).
+
+    :param chain_ids_and_lengths: `(chain_id, length)` pairs, in the order to sample them.
+    :param n_res: Total number of positions to keep across all chains.
+    :return: One boolean mask of size `length` per chain, `True` on a contiguous window.
+    """
+    m_ks = []
+    n_added = 0
+    n_remaining = sum(n_k for _, n_k in chain_ids_and_lengths)  # residues in unvisited chains
+    for _, n_k in chain_ids_and_lengths:
+        n_remaining -= n_k
+        crop_size_max = min(n_res - n_added, n_k)
+        # Keep at least what the chains still to come cannot supply, so `n_res` stays reachable.
+        crop_size_min = min(n_k, max(0, n_res - (n_added + n_remaining)))
+        crop_size = random.randrange(crop_size_min, crop_size_max + 1)
+        n_added += crop_size
+        crop_start = random.randrange(0, n_k - crop_size + 1)
+        m_k = np.zeros(n_k, dtype=bool)
+        m_k[crop_start : crop_start + crop_size] = True
+        m_ks.append(m_k)
+    return m_ks
+
 
 @typecheck
 def get_residue_constants(
@@ -307,7 +371,8 @@ def get_unique_res_atom_names(
 
 @typecheck
 def _from_mmcif_object(
-    mmcif_object: mmcif_parsing.MmcifObject, chain_ids: Optional[Set[str]] = None,
+    mmcif_object: mmcif_parsing.MmcifObject,
+    chain_ids: Optional[Set[str]] = None,
 ) -> Biomolecule:
     """Takes a Biopython structure/model mmCIF object and creates a `Biomolecule` instance.
 
@@ -543,7 +608,9 @@ def _from_mmcif_object(
 
 
 @typecheck
-def from_mmcif_string(mmcif_str: str, file_id: str, chain_ids: Optional[Set[str]] = None) -> Biomolecule:
+def from_mmcif_string(
+    mmcif_str: str, file_id: str, chain_ids: Optional[Set[str]] = None
+) -> Biomolecule:
     """Takes a mmCIF string and constructs a `Biomolecule` object.
 
     WARNING: All non-standard residue types will be converted into UNK. All
diff --git a/alphafold3_pytorch/data/data_pipeline.py b/alphafold3_pytorch/data/data_pipeline.py
@@ -14,12 +14,10 @@
 FeatureDict = MutableMapping[str, np.ndarray]
 
 
-def make_sequence_features(sequence: str, description: str, num_res: int) -> FeatureDict:
+def make_sequence_features(sequence: str, description: str) -> FeatureDict:
     """Construct a feature dict of sequence features."""
     features = {}
-    features["between_segment_residues"] = np.zeros((num_res,), dtype=np.int32)
     features["domain_name"] = np.array([description.encode("utf-8")], dtype=object)
-    features["seq_length"] = np.array([num_res] * num_res, dtype=np.int32)
     features["sequence"] = np.array([sequence.encode("utf-8")], dtype=object)
     return features
 
@@ -101,22 +99,24 @@ def make_mmcif_features(
         mmcif_object.chain_to_seqres[chain_id] for chain_id in mmcif_object.chain_to_seqres
     )
     description = mmcif_object.file_id
-    num_res = len(input_sequence)
 
     mmcif_feats = {}
 
     mmcif_feats.update(
         make_sequence_features(
             sequence=input_sequence,
             description=description,
-            num_res=num_res,
         )
     )
 
     # As necessary, expand the first bioassembly/model sequence and structure, to obtain a biologically relevant complex (AF3 Supplement, Section 2.1).
     # Reference: https://github.com/biotite-dev/biotite/blob/1045f43f80c77a0dc00865e924442385ce8f83ab/src/biotite/structure/io/pdbx/convert.py#L1441
 
-    assembly = _from_mmcif_object(mmcif_object) if "assembly" in description else get_assembly(_from_mmcif_object(mmcif_object))
+    assembly = (
+        _from_mmcif_object(mmcif_object)
+        if "assembly" in description
+        else get_assembly(_from_mmcif_object(mmcif_object))
+    )
 
     mmcif_feats["all_atom_positions"] = assembly.atom_positions
     mmcif_feats["all_atom_mask"] = assembly.atom_mask
@@ -128,6 +128,8 @@ def make_mmcif_features(
     mmcif_feats["residue_index"] = assembly.residue_index
     mmcif_feats["restype"] = assembly.restype
 
+    mmcif_feats["bonds"] = mmcif_object.bonds
+
     mmcif_feats["resolution"] = np.array([mmcif_object.header["resolution"]], dtype=np.float32)
 
     mmcif_feats["release_date"] = np.array(
@@ -148,12 +150,15 @@ def make_mmcif_features(
         file_id=file_id,
     )
     mmcif_feats, assembly = make_mmcif_features(mmcif_object)
+    # cropped_assembly = assembly.contiguous_crop(384)
     mmcif_string = to_mmcif(
         assembly,
+        # cropped_assembly,
         file_id=file_id,
         gapless_poly_seq=True,
         insert_alphafold_mmcif_metadata=False,
         unique_res_atom_names=assembly.unique_res_atom_names,
+        # unique_res_atom_names=cropped_assembly.unique_res_atom_names,
     )
     with open(os.path.basename(filepath).replace(".cif", "_reconstructed.cif"), "w") as f:
         f.write(mmcif_string)