Take a step towards loading, training, and sampling with mmCIF files (#74)

amorehead · web-flow · commit bf679bfa6355 · 2024-07-12T05:28:56.000-07:00
* Update trainer.py

* Update data_pipeline.py

* Create 7a4d-assembly1.cif

* Update mmcif_writing.py

* Update test_input.py

* Update inputs.py

* Fix test-time error in `inputs.py`

* Update __init__.py

* Update __init__.py
diff --git a/alphafold3_pytorch/__init__.py b/alphafold3_pytorch/__init__.py
@@ -39,6 +39,7 @@
     BatchedAtomInput,
     MoleculeInput,
     Alphafold3Input,
+    PDBInput,
     maybe_transform_to_atom_input,
     maybe_transform_to_atom_inputs
 )
@@ -47,7 +48,8 @@
     Trainer,
     DataLoader,
     collate_inputs_to_batched_atom_input,
-    alphafold3_inputs_to_batched_atom_input
+    alphafold3_inputs_to_batched_atom_input,
+    pdb_inputs_to_batched_atom_input,
 )
 
 from alphafold3_pytorch.configs import (
@@ -90,10 +92,12 @@
     Alphafold3WithHubMixin,
     Alphafold3Config,
     AtomInput,
+    PDBInput,
     Trainer,
     TrainerConfig,
     ConductorConfig,
     create_alphafold3_from_yaml,
     create_trainer_from_yaml,
-    create_trainer_from_conductor_yaml
+    create_trainer_from_conductor_yaml,
+    pdb_inputs_to_batched_atom_input,
 ]
diff --git a/alphafold3_pytorch/data/data_pipeline.py b/alphafold3_pytorch/data/data_pipeline.py
@@ -140,7 +140,7 @@ def make_mmcif_features(
 
 
 if __name__ == "__main__":
-    filepath = "data/pdb_data/mmcifs/ak/7akd-assembly1.cif"
+    filepath = os.path.join("data", "test", "7a4d-assembly1.cif")
     file_id = os.path.splitext(os.path.basename(filepath))[0]
 
     mmcif_object = mmcif_parsing.parse_mmcif_object(
diff --git a/alphafold3_pytorch/data/mmcif_writing.py b/alphafold3_pytorch/data/mmcif_writing.py
@@ -1,10 +1,16 @@
 """An mmCIF file format writer."""
 
+import numpy as np
+
+from typing import Optional
+
 from alphafold3_pytorch.common.biomolecule import (
     _from_mmcif_object,
     to_mmcif,
 )
+from alphafold3_pytorch.data.data_pipeline import get_assembly
 from alphafold3_pytorch.data.mmcif_parsing import MmcifObject
+from alphafold3_pytorch.utils.utils import exists
 
 
 def write_mmcif(
@@ -13,9 +19,21 @@ def write_mmcif(
     gapless_poly_seq: bool = True,
     insert_orig_atom_names: bool = True,
     insert_alphafold_mmcif_metadata: bool = True,
+    sampled_atom_positions: Optional[np.ndarray] = None,
 ):
     """Write a BioPython `Structure` object to an mmCIF file using an intermediate `Biomolecule` object."""
-    biomol = _from_mmcif_object(mmcif_object)
+    biomol = (
+        _from_mmcif_object(mmcif_object)
+        if "assembly" in mmcif_object.file_id
+        else get_assembly(_from_mmcif_object(mmcif_object))
+    )
+    if exists(sampled_atom_positions):
+        atom_mask = biomol.atom_mask.astype(bool)
+        assert biomol.atom_positions[atom_mask].shape == sampled_atom_positions.shape, (
+            f"Expected sampled atom positions to have masked shape {biomol.atom_positions[atom_mask].shape}, "
+            f"but got {sampled_atom_positions.shape}."
+        )
+        biomol.atom_positions[atom_mask] = sampled_atom_positions
     unique_res_atom_names = biomol.unique_res_atom_names if insert_orig_atom_names else None
     mmcif_string = to_mmcif(
         biomol,
diff --git a/alphafold3_pytorch/inputs.py b/alphafold3_pytorch/inputs.py
@@ -1,27 +1,33 @@
 from __future__ import annotations
 
-from functools import wraps, partial
-from dataclasses import dataclass, asdict, field
-from typing import Type, Literal, Callable, List, Any, Tuple
-
+import einx
+import json
+import os
 import torch
-from torch import tensor
+
 import torch.nn.functional as F
-import einx
 
-from rdkit.Chem import AllChem as Chem
+from dataclasses import dataclass, asdict, field
+from functools import wraps, partial
+from loguru import logger
+from pdbeccdutils.core import ccd_reader
+from rdkit import Chem
 from rdkit.Chem.rdchem import Atom, Mol
+from torch import tensor
+from typing import Type, Literal, Callable, List, Any, Tuple
 
 from alphafold3_pytorch.attention import (
     pad_to_length
 )
 
-from alphafold3_pytorch.tensor_typing import (
-    typecheck,
-    beartype_isinstance,
-    Int, Bool, Float
+from alphafold3_pytorch.common.biomolecule import (
+    _from_mmcif_object,
+    get_residue_constants,
 )
 
+from alphafold3_pytorch.data import mmcif_parsing
+from alphafold3_pytorch.data.data_pipeline import get_assembly
+
 from alphafold3_pytorch.life import (
     HUMAN_AMINO_ACIDS,
     DNA_NUCLEOTIDES,
@@ -36,11 +42,46 @@
     reverse_complement_tensor
 )
 
+from alphafold3_pytorch.tensor_typing import (
+    typecheck,
+    beartype_isinstance,
+    Int, Bool, Float
+)
+
 # constants
 
 IS_MOLECULE_TYPES = 4
 ADDITIONAL_MOLECULE_FEATS = 5
 
+CCD_COMPONENTS_FILEPATH = os.path.join("data", "ccd_data", "components.cif")
+CCD_COMPONENTS_SMILES_FILEPATH = os.path.join("data", "ccd_data", "components_smiles.json")
+
+# load all SMILES strings in the PDB Chemical Component Dictionary (CCD)
+
+CCD_COMPONENTS_SMILES = None
+
+if os.path.exists(CCD_COMPONENTS_SMILES_FILEPATH):
+    logger.info(f"Loading CCD component SMILES strings from {CCD_COMPONENTS_SMILES_FILEPATH}.")
+    with open(CCD_COMPONENTS_SMILES_FILEPATH) as f:
+        CCD_COMPONENTS_SMILES = json.load(f)
+elif os.path.exists(CCD_COMPONENTS_FILEPATH):
+    logger.info(
+        f"Loading CCD components from {CCD_COMPONENTS_FILEPATH} to extract all available SMILES strings (~3 minutes, one-time only)."
+    )
+    CCD_COMPONENTS = ccd_reader.read_pdb_components_file(
+        CCD_COMPONENTS_FILEPATH,
+        sanitize=False,  # Reduce loading time
+    )
+    # Derive every SMILES string before opening the cache file: opening it with "w" first
+    # would leave an empty, unparsable JSON file behind if the extraction raised midway.
+    CCD_COMPONENTS_SMILES = {
+        ccd_code: Chem.MolToSmiles(CCD_COMPONENTS[ccd_code].component.mol)
+        for ccd_code in CCD_COMPONENTS
+    }
+    logger.info(f"Saving CCD component SMILES strings to {CCD_COMPONENTS_SMILES_FILEPATH} (one-time only).")
+    with open(CCD_COMPONENTS_SMILES_FILEPATH, "w") as f:
+        json.dump(CCD_COMPONENTS_SMILES, f)
+
 # functions
 
 def exists(v):
@@ -740,9 +781,9 @@ def get_num_atoms_per_chain(chains: List[List[Mol]]) -> List[int]:
     unrepeated_sym_ids = [
         *[*range(len(i.proteins))],
         *[*range(len(i.ss_rna))],
-        *[i for rna in i.ds_rna for i in range(2)],
+        *[i for _ in i.ds_rna for i in range(2)],
         *[*range(len(i.ss_dna))],
-        *[i for dna in i.ds_dna for i in range(2)],
+        *[i for _ in i.ds_dna for i in range(2)],
         *([0] * len(mol_ligands)),
         0
     ]
@@ -861,9 +902,112 @@ def get_num_atoms_per_chain(chains: List[List[Mol]]) -> List[int]:
 class PDBInput:
     filepath: str
 
+@typecheck
+def extract_chain_sequences_from_chemical_components(
+    chem_comps: List[mmcif_parsing.ChemComp],
+) -> Tuple[List[str], List[str], List[str], List[Mol | str]]:
+    """Split an ordered list of chemical components into per-chain sequences.
+
+    Each component is classified by the first of the substrings "peptide", "dna" or
+    "rna" found in its lowercased `type`; components matching none of them are treated
+    as ligands. A run of consecutive components mentioning the same polymer kind is
+    joined into one sequence string of one-letter codes ("X" when the component ID is
+    not in the `restype_3to1` table), whereas each ligand becomes its own SMILES entry.
+
+    NOTE(review): only component types are compared, so two directly adjacent chains
+    of the same polymer kind end up in a single sequence - confirm this is intended.
+
+    :param chem_comps: The chemical components to split, in the order they occur.
+    :return: The protein sequences, DNA sequences, RNA sequences and ligand SMILES
+        strings, in that order.
+    :raises AssertionError: If no CCD SMILES mapping was loaded at import time.
+    :raises KeyError: If a ligand's ID is missing from `CCD_COMPONENTS_SMILES`.
+    """
+    assert exists(CCD_COMPONENTS_SMILES), (
+        f"The PDB Chemical Component Dictionary (CCD) components SMILES file {CCD_COMPONENTS_SMILES_FILEPATH} does not exist. "
+        f"Please re-run this script after ensuring the preliminary CCD file {CCD_COMPONENTS_FILEPATH} has been downloaded according to this project's `README.md` file. "
+        f"After doing so, the SMILES file {CCD_COMPONENTS_SMILES_FILEPATH} will be cached locally and used for subsequent runs."
+    )
+
+    proteins, ss_dna, ss_rna, ligands = [], [], [], []
+
+    # Insertion order doubles as the matching priority of the polymer kinds
+    polymer_chains = {"peptide": proteins, "dna": ss_dna, "rna": ss_rna}
+    current_chain_seq = []
+
+    for idx, details in enumerate(chem_comps):
+        comp_type = details.type.lower()
+        residue_constants = get_residue_constants(details.type)
+        restype = residue_constants.restype_3to1.get(details.id, "X")
+        polymer_kind = next((kind for kind in polymer_chains if kind in comp_type), None)
+
+        # Ligands are never merged, so each one contributes exactly one SMILES string
+
+        if polymer_kind is None:
+            ligands.append(CCD_COMPONENTS_SMILES[details.id])
+            continue
+
+        # An empty buffer marks the first residue of a new chain
+
+        if not current_chain_seq:
+            polymer_chains[polymer_kind].append(current_chain_seq)
+        current_chain_seq.append(restype)
+
+        # Start a fresh buffer if the next residue is not of the same polymer kind
+
+        is_last = idx + 1 == len(chem_comps)
+        if not is_last and polymer_kind not in chem_comps[idx + 1].type.lower():
+            current_chain_seq = []
+
+    # Efficiently build sequence strings
+
+    proteins = ["".join(protein) for protein in proteins]
+    ss_dna = ["".join(dna) for dna in ss_dna]
+    ss_rna = ["".join(rna) for rna in ss_rna]
+
+    return proteins, ss_dna, ss_rna, ligands
+
 @typecheck
 def pdb_input_to_alphafold3_input(pdb_input: PDBInput) -> Alphafold3Input:
-    raise NotImplementedError
+    """Parse the mmCIF file referenced by `pdb_input` into an `Alphafold3Input`.
+
+    Files whose name (minus extension) contains "assembly" are used as parsed, whereas
+    any other file is additionally passed through `get_assembly`.
+    """
+    filepath = pdb_input.filepath
+    file_id, _ = os.path.splitext(os.path.basename(filepath))
+    assert os.path.exists(filepath), f"PDB input file `{filepath}` does not exist."
+
+    mmcif_object = mmcif_parsing.parse_mmcif_object(filepath=filepath, file_id=file_id)
+
+    biomol = _from_mmcif_object(mmcif_object)
+    if "assembly" not in file_id:
+        biomol = get_assembly(biomol)
+
+    # look up the chemical component behind every entry of `biomol.chemid`
+    chem_comps_by_id = {comp.id: comp for comp in biomol.chem_comp_table}
+    proteins, ss_dna, ss_rna, ligands = extract_chain_sequences_from_chemical_components(
+        [chem_comps_by_id[chemid] for chemid in biomol.chemid]
+    )
+
+    # keep only the positions selected by the boolean atom mask
+    is_present = biomol.atom_mask.astype(bool)
+    masked_atom_pos = biomol.atom_positions[is_present].astype("float32")
+
+    # TODO: Add support for AlphaFold 2-style amino/nucleic acid atom parametrization (i.e., 47 possible atom types per residue)
+
+    # TODO: Reference bonds from `biomol` instead of instantiating them within `Alphafold3Input`
+
+    # TODO: Ensure only polymer-ligand (e.g., protein/RNA/DNA-ligand) and ligand-ligand bonds
+    # (and bonds less than 2.4 Å) are referenced in `Alphafold3Input` (AF3 Supplement - Table 5, `token_bonds`)
+
+    return Alphafold3Input(
+        proteins=proteins,
+        ss_dna=ss_dna,
+        ss_rna=ss_rna,
+        ligands=ligands,
+        atom_pos=torch.from_numpy(masked_atom_pos),
+    )
 
 # the config used for keeping track of all the disparate inputs and their transforms down to AtomInput
 # this can be preprocessed or will be taken care of automatically within the Trainer during data collation
diff --git a/alphafold3_pytorch/trainer.py b/alphafold3_pytorch/trainer.py
@@ -23,6 +23,7 @@
     AtomInput,
     BatchedAtomInput,
     Alphafold3Input,
+    PDBInput,
     maybe_transform_to_atom_inputs,
     alphafold3_input_to_molecule_input
 )
@@ -200,6 +201,18 @@ def alphafold3_inputs_to_batched_atom_input(
     atom_inputs = maybe_transform_to_atom_inputs(inp)
     return collate_inputs_to_batched_atom_input(atom_inputs, **collate_kwargs)
 
+@typecheck
+def pdb_inputs_to_batched_atom_input(
+    inp: PDBInput | List[PDBInput],
+    **collate_kwargs
+) -> BatchedAtomInput:
+    """Transform one `PDBInput` (or a list of them) to atom inputs and collate them into a batch."""
+    pdb_inputs = [inp] if isinstance(inp, PDBInput) else inp
+
+    return collate_inputs_to_batched_atom_input(
+        maybe_transform_to_atom_inputs(pdb_inputs), **collate_kwargs
+    )
+
 @typecheck
 def DataLoader(
     *args,
diff --git a/data/test/7a4d-assembly1.cif b/data/test/7a4d-assembly1.cif
diff --git a/tests/test_input.py b/tests/test_input.py