lucidrains
diff --git a/alphafold3_pytorch/alphafold3.py b/alphafold3_pytorch/alphafold3.py
Lines changed: 5 additions & 1 deletion
diff --git a/alphafold3_pytorch/common/amino_acid_constants.py b/alphafold3_pytorch/common/amino_acid_constants.py
Lines changed: 4 additions & 2 deletions
diff --git a/alphafold3_pytorch/common/biomolecule.py b/alphafold3_pytorch/common/biomolecule.py
Lines changed: 26 additions & 7 deletions
diff --git a/alphafold3_pytorch/common/dna_constants.py b/alphafold3_pytorch/common/dna_constants.py
Lines changed: 5 additions & 2 deletions
diff --git a/alphafold3_pytorch/common/ligand_constants.py b/alphafold3_pytorch/common/ligand_constants.py
Lines changed: 56 additions & 2 deletions
diff --git a/alphafold3_pytorch/common/rna_constants.py b/alphafold3_pytorch/common/rna_constants.py
Lines changed: 5 additions & 3 deletions
@@ -68,6 +68,7 @@
 n - molecule sequence length
 i - molecule sequence length (source)
 j - molecule sequence length (target)
+l - present (i.e., non-missing) atom sequence length
 m - atom sequence length
 nw - windowed sequence length
 d - feature dimension
@@ -3377,9 +3378,10 @@ def forward(
         resolved_labels: Int['b n'] | None = None,
         return_loss_breakdown = False,
         return_loss: bool = None,
+        return_present_sampled_atoms: bool = False,
         num_rollout_steps: int = 20,
         rollout_show_tqdm_pbar: bool = False
-    ) -> Float['b m 3'] | Float[''] | Tuple[Float[''], LossBreakdown]:
+    ) -> Float['b m 3'] | Float['l 3'] | Float[''] | Tuple[Float[''], LossBreakdown]:
 
         atom_seq_len = atom_inputs.shape[-2]
 
@@ -3622,6 +3624,8 @@ def forward(
 
             if exists(atom_mask):
                 sampled_atom_pos = einx.where('b m, b m c, -> b m c', atom_mask, sampled_atom_pos, 0.)
+            if exists(missing_atom_mask) and return_present_sampled_atoms:
+                sampled_atom_pos = sampled_atom_pos[~missing_atom_mask]
 
             return sampled_atom_pos
 
 
@@ -1,9 +1,9 @@
 """Amino acid constants used in AlphaFold."""
 
-import numpy as np
-
 from typing import Final
 
+import numpy as np
+
 # This mapping is used when we need to store atom data in a format that requires
 # fixed atom data size for every residue (e.g. a numpy array).
 # From: https://github.com/google-deepmind/alphafold/blob/f251de6613cb478207c732bf9627b1e853c99c2f/alphafold/common/residue_constants.py#L492C1-L497C2
@@ -59,6 +59,7 @@
 atom_types_set = set(atom_types)
 atom_order = {atom_type: i for i, atom_type in enumerate(atom_types)}
 atom_type_num = len(atom_types)  # := 37 + 10 null types := 47.
+res_rep_atom_index = 1  # The index of the atom used to represent the center of the residue.
 
 
 # This is the standard residue order when coding AA type as a number.
@@ -111,6 +112,7 @@
     "W": "TRP",
     "Y": "TYR",
     "V": "VAL",
+    "X": "UNK",
 }
 
 BIOMOLECULE_CHAIN: Final[str] = "polypeptide(L)"
 
@@ -39,6 +39,7 @@
     "_struct_conn.",
 ]
 MMCIF_PREFIXES_TO_DROP_POST_AF3 = MMCIF_PREFIXES_TO_DROP_POST_PARSING + [
+    "_audit_author.",
     "_citation.",
     "_citation_author.",
 ]
@@ -53,6 +54,10 @@ class Biomolecule:
     # amino acid residues.
     atom_positions: np.ndarray  # [num_res, num_atom_type, 3]
 
+    # Name of each residue-representative atom as a string; the array holds
+    # one entry per (pseudo)residue (a.k.a. token).
+    atom_name: np.ndarray  # [num_res]
+
     # Amino-acid or nucleotide type for each residue represented as an integer
     # between 0 and 31, where:
     # 20 represents the unknown amino acid 'X';
@@ -124,6 +129,7 @@ def __add__(self, other: "Biomolecule") -> "Biomolecule":
         """Merges two `Biomolecule` instances."""
         return Biomolecule(
             atom_positions=np.concatenate([self.atom_positions, other.atom_positions], axis=0),
+            atom_name=np.concatenate([self.atom_name, other.atom_name], axis=0),
             restype=np.concatenate([self.restype, other.restype], axis=0),
             atom_mask=np.concatenate([self.atom_mask, other.atom_mask], axis=0),
             residue_index=np.concatenate([self.residue_index, other.residue_index], axis=0),
@@ -169,6 +175,7 @@ def subset_chains(self, subset_chain_ids: List[str]) -> "Biomolecule":
         chain_mask = np.isin(self.chain_index, list(subset_chain_index_mapping.keys()))
         return Biomolecule(
             atom_positions=self.atom_positions[chain_mask],
+            atom_name=self.atom_name[chain_mask],
             restype=self.restype[chain_mask],
             atom_mask=self.atom_mask[chain_mask],
             residue_index=self.residue_index[chain_mask],
@@ -203,6 +210,7 @@ def repeat(self, coord: np.ndarray) -> "Biomolecule":
         """Repeat a Biomolecule according to a (repeated) coordinate array."""
         return Biomolecule(
             atom_positions=coord.reshape(-1, 47, 3),
+            atom_name=np.tile(self.atom_name, (coord.shape[0], 1)).reshape(-1),
             restype=np.tile(self.restype, (coord.shape[0], 1)).reshape(-1),
             atom_mask=np.tile(self.atom_mask, (coord.shape[0], 1, 1)).reshape(-1, 47),
             residue_index=np.tile(self.residue_index, (coord.shape[0], 1)).reshape(-1),
@@ -258,13 +266,18 @@ def get_ligand_atom_name(atom_name: str, atom_types_set: Set[str]) -> str:
     elif len(atom_name) == 2:
         return atom_name if atom_name in atom_types_set else atom_name[0]
     elif len(atom_name) == 3:
-        return (
-            atom_name
-            if atom_name in atom_types_set
-            else (
-                atom_name[:2] if atom_name[:2] in atom_types_set else atom_name[0] + atom_name[2]
-            )
-        )
+        if atom_name in atom_types_set:
+            return atom_name
+        elif atom_name[:2] in atom_types_set:
+            return atom_name[:2]
+        elif atom_name[1:] in atom_types_set:
+            return atom_name[1:]
+        elif atom_name[0] + atom_name[2] in atom_types_set:
+            return atom_name[0] + atom_name[2]
+        elif atom_name.split("H")[0] in atom_types_set:
+            return atom_name.split("H")[0]
+        else:
+            return atom_name
     else:
         return atom_name
 
@@ -334,6 +347,7 @@ def _from_mmcif_object(
         model = models[0]
 
     atom_positions = []
+    atom_names = []
     restype = []
     chemid = []
     chemtype = []
@@ -412,6 +426,9 @@ def _from_mmcif_object(
                 chemid.append(res_chem_comp_details.id)
                 chemtype.append(residue_constants.chemtype_num)
                 atom_positions.append(pos)
+                atom_names.append(
+                    residue_constants.atom_types[residue_constants.res_rep_atom_index]
+                )
                 atom_mask.append(mask)
                 residue_index.append(res_index + 1)
                 chain_ids.append(chain.id)
@@ -448,6 +465,7 @@ def _from_mmcif_object(
                     atom_name = get_ligand_atom_name(atom.name, residue_constants.atom_types_set)
                     if atom_name not in residue_constants.atom_types_set:
                         atom_name = "ATM"
+                    atom_names.append(atom_name)
                     pos[residue_constants.atom_order[atom_name]] = atom.coord
                     mask[residue_constants.atom_order[atom_name]] = 1.0
                     res_b_factors[residue_constants.atom_order[atom_name]] = atom.bfactor
@@ -505,6 +523,7 @@ def _from_mmcif_object(
 
     return Biomolecule(
         atom_positions=np.array(atom_positions),
+        atom_name=np.array(atom_names),
         restype=np.array(restype),
         atom_mask=np.array(atom_mask),
         residue_index=np.array(residue_index),
 
@@ -1,9 +1,9 @@
 """Deoxyribonucleic acid (DNA) constants used in AlphaFold."""
 
-import numpy as np
-
 from typing import Final
 
+import numpy as np
+
 from alphafold3_pytorch.common import amino_acid_constants, rna_constants
 
 # This mapping is used when we need to store atom data in a format that requires
@@ -62,6 +62,7 @@
 atom_types_set = set(atom_types)
 atom_order = {atom_type: i for i, atom_type in enumerate(atom_types)}
 atom_type_num = len(atom_types)  # := 28 + 19 null types := 47.
+res_rep_atom_index = 11  # The index of the atom used to represent the center of the residue.
 
 
 # This is the standard residue order when coding DNA type as a number.
@@ -79,6 +80,7 @@
     "C": "DC",
     "G": "DG",
     "T": "DT",
+    "X": "DN",
 }
 
 BIOMOLECULE_CHAIN: Final[str] = "polydeoxyribonucleotide"
@@ -252,4 +254,5 @@ def _make_constants():
             compact_atom_idx = restype_name_to_compact_atom_names[resname].index(atomname)
             restype_atom47_to_compact_atom[restype, atomtype] = compact_atom_idx
 
+
 _make_constants()
@@ -1,9 +1,9 @@
 """Ligand constants used in AlphaFold."""
 
-import numpy as np
-
 from typing import Final
 
+import numpy as np
+
 from alphafold3_pytorch.common import amino_acid_constants, dna_constants
 
 # This mapping is used when we need to store atom data in a format that requires
@@ -58,9 +58,62 @@
     "ZN",
     "ATM",
 ]
+element_types = [
+    # NOTE: Taken from: https://github.com/baker-laboratory/RoseTTAFold-All-Atom/blob/c1fd92455be2a4133ad147242fc91cea35477282/rf2aa/chemical.py#L117C13-L126C18
+    "Al",
+    "As",
+    "Au",
+    "B",
+    "Be",
+    "Br",
+    "C",
+    "Ca",
+    "Cl",
+    "Co",
+    "Cr",
+    "Cu",
+    "F",
+    "Fe",
+    "Hg",
+    "I",
+    "Ir",
+    "K",
+    "Li",
+    "Mg",
+    "Mn",
+    "Mo",
+    "N",
+    "Ni",
+    "O",
+    "Os",
+    "P",
+    "Pb",
+    "Pd",
+    "Pr",
+    "Pt",
+    "Re",
+    "Rh",
+    "Ru",
+    "S",
+    "Sb",
+    "Se",
+    "Si",
+    "Sn",
+    "Tb",
+    "Te",
+    "U",
+    "W",
+    "V",
+    "Y",
+    "Zn",
+    "ATM",
+]
 atom_types_set = set(atom_types)
 atom_order = {atom_type: i for i, atom_type in enumerate(atom_types)}
 atom_type_num = len(atom_types)  # := 47.
+res_rep_atom_index = (
+    len(atom_types) - 1
+)  # := 46, the index of the atom used to represent the center of a ligand pseudoresidue.
 
 
 # All ligand residues are mapped to the unknown amino acid type index (:= 20).
@@ -109,4 +162,5 @@ def _make_constants():
             compact_atom_idx = restype_name_to_compact_atom_names[resname].index(atomname)
             restype_atom47_to_compact_atom[restype, atomtype] = compact_atom_idx
 
+
 _make_constants()
@@ -1,9 +1,9 @@
 """Ribonucleic acid (RNA) constants used in AlphaFold."""
 
-import numpy as np
-
 from typing import Final
 
+import numpy as np
+
 from alphafold3_pytorch.common import amino_acid_constants
 
 # This mapping is used when we need to store atom data in a format that requires
@@ -62,6 +62,7 @@
 atom_types_set = set(atom_types)
 atom_order = {atom_type: i for i, atom_type in enumerate(atom_types)}
 atom_type_num = len(atom_types)  # := 28 + 19 null types := 47.
+res_rep_atom_index = 12  # The index of the atom used to represent the center of the residue.
 
 
 # This is the standard residue order when coding RNA type as a number.
@@ -72,7 +73,7 @@
 restype_num = min_restype_num + len(restypes)  # := 21 + 4 := 25.
 
 
-restype_1to3 = {"A": "A", "C": "C", "G": "G", "U": "U"}
+restype_1to3 = {"A": "A", "C": "C", "G": "G", "U": "U", "X": "N"}
 
 BIOMOLECULE_CHAIN: Final[str] = "polyribonucleotide"
 POLYMER_CHAIN: Final[str] = "polymer"
@@ -244,4 +245,5 @@ def _make_constants():
             compact_atom_idx = restype_name_to_compact_atom_names[resname].index(atomname)
             restype_atom47_to_compact_atom[restype, atomtype] = compact_atom_idx
 
+
 _make_constants()