add the canonical SMILES representation for all human amino acids and nucleotides in life.py

lucidrains · lucidrains · commit be52f3cb3342 · 2024-06-25T11:15:28.000-07:00
diff --git a/alphafold3_pytorch/inputs.py b/alphafold3_pytorch/inputs.py
@@ -1,4 +1,7 @@
-from typing import Type, TypedDict, Literal, Callable
+from typing import Type, TypedDict, Literal, Callable, List
+
+from rdkit import Chem
+from rdkit.Chem.rdchem import Mol
 
 from alphafold3_pytorch.tensor_typing import (
     typecheck,
@@ -51,6 +54,33 @@ class BatchedAtomInput(TypedDict):
     pde_labels:                 Int['b n'] | None
     resolved_labels:            Int['b n'] | None
 
+# molecule input - accepting list of molecules as rdchem.Mol + the atomic lengths for how to pool into tokens
+
+@typecheck
+class MoleculeInput(TypedDict):
+    molecules:                  List[Mol]  # RDKit molecules (rdkit.Chem.rdchem.Mol)
+    molecule_atom_lens:         List[Int['t']]  # atom lengths for pooling into tokens; NOTE(review): 't' is also the template axis below - confirm this is intended
+    molecule_ids:               Int['n']
+    additional_molecule_feats:  Float['n 9']
+    templates:                  Float['t n n dt']
+    msa:                        Float['s n dm']
+    token_bonds:                Bool['n n'] | None  # this and every field below may be None
+    template_mask:              Bool['t'] | None
+    msa_mask:                   Bool['s'] | None
+    atom_pos:                   Float['m 3'] | None
+    molecule_atom_indices:      Int['n'] | None
+    distance_labels:            Int['n n'] | None
+    pae_labels:                 Int['n n'] | None
+    pde_labels:                 Int['n'] | None
+    resolved_labels:            Int['n'] | None
+
+@typecheck
+def molecule_to_atom_input(molecule_input: MoleculeInput) -> AtomInput:
+    raise NotImplementedError  # stub: MoleculeInput -> AtomInput conversion is not implemented yet
+
+def validate_molecule_input(molecule_input: MoleculeInput):
+    assert True  # placeholder: always passes, no validation of molecule_input is performed yet
+
 # residue level - single chain proteins for starters
 
 @typecheck
@@ -105,6 +135,7 @@ def single_protein_input_and_single_nucleic_acid_to_atom_input(
 # this can be preprocessed or will be taken care of automatically within the Trainer during data collation
 
 INPUT_TO_ATOM_TRANSFORM = {
+    MoleculeInput: molecule_to_atom_input,  # NOTE: molecule_to_atom_input currently raises NotImplementedError
     SingleProteinInput: single_protein_input_to_atom_input,
     SingleProteinSingleNucleicAcidInput: single_protein_input_and_single_nucleic_acid_to_atom_input
 }
diff --git a/alphafold3_pytorch/life.py b/alphafold3_pytorch/life.py
@@ -1,66 +1,96 @@
+import rdkit
+from rdkit import Chem
+
+# human amino acids
 
 HUMAN_AMINO_ACIDS = dict(
     A = dict(
-        bonds = [[0,1], [1,2], [2,3], [1,4]],
+        smile = 'CC(C(=O)O)N'  # alanine
     ),
     R = dict(
-        bonds = [[0,1], [1,2], [2,3], [2,4], [4,5], [5,6], [6,7], [7,8], [8,9], [8,10]]
+        smile = 'C(CC(C(=O)O)N)CN=C(N)N'  # arginine
     ),
     N = dict(
-        bonds = [[0,1], [1,2], [2,3], [1,4], [4,5], [5,6], [5,7]]
+        smile = 'C(C(C(=O)O)N)C(=O)N'  # asparagine
     ),
     D = dict(
-        bonds = [[0,1], [1,2], [2,3], [1,4], [4,5], [5,6], [5,7]]
+        smile = 'C(C(C(=O)O)N)C(=O)O'  # aspartic acid
     ),
     C = dict(
-        bonds = [[0,1], [1,2], [2,3], [1,4], [4,5]]
+        smile = 'C(C(C(=O)O)N)S'  # cysteine
     ),
     Q = dict(
-        bonds = [[0,1], [1,2], [2,3], [1,4], [4,5], [5,6], [6,7], [6,8]]
+        smile = 'C(CC(=O)N)C(C(=O)O)N'  # glutamine
     ),
     E = dict(
-        bonds = [[0,1], [1,2], [2,3], [1,4], [4,5], [5,6], [6,7], [7,8]]
+        smile = 'C(CC(=O)O)C(C(=O)O)N'  # glutamic acid
     ),
     G = dict(
-        bonds = [[0,1], [1,2], [2,3]]
+        smile = 'C(C(=O)O)N'  # glycine
     ),
     H = dict(
-        bonds = [[0,1], [1,2], [2,3], [1,4], [4,5], [5,6], [6,7], [7,8], [8,9], [5,9]]
+        smile = 'C1=C(NC=N1)CC(C(=O)O)N'  # histidine
     ),
     I = dict(
-        bonds = [[0,1], [1,2], [2,3], [1,4], [4,5], [5,6], [4,7]]
+        smile = 'CCC(C)C(C(=O)O)N'  # isoleucine
     ),
     L = dict(
-        bonds = [[0,1], [1,2], [2,3], [1,4], [4,5], [5,6], [5,7]]
+        smile = 'CC(C)CC(C(=O)O)N'  # leucine
     ),
     K = dict(
-        bonds = [[0,1], [1,2], [2,3], [1,4], [4,5], [5,6], [6,7], [7,8]]
+        smile = 'C(CCN)CC(C(=O)O)N'  # lysine
     ),
     M = dict(
-        bonds = [[0,1], [1,2], [2,3], [1,4], [4,5], [5,6], [6,7]]
+        smile = 'CSCCC(C(=O)O)N'  # methionine
     ),
     F = dict(
-        bonds = [[0,1], [1,2], [2,3], [1,4], [4,5], [5,6], [6,7], [7,8], [8,9], [9,10], [5,10]]
+        smile = 'C1=CC=C(C=C1)CC(C(=O)O)N'  # phenylalanine
     ),
     P = dict(
-        bonds = [[0,1], [1,2], [2,3], [1,4], [4,5], [5,6], [0,6]]
+        smile = 'C1CC(NC1)C(=O)O'  # proline
     ),
     S = dict(
-        bonds = [[0,1], [1,2], [2,3], [1,4], [4,5]]
+        smile = 'C(C(C(=O)O)N)O'  # serine
     ),
     T = dict(
-        bonds = [[0,1], [1,2], [2,3], [1,4], [4,5], [4,6]]
+        smile = 'CC(C(C(=O)O)N)O'  # threonine
     ),
     W = dict(
-        bonds = [[0,1], [1,2], [2,3], [1,4], [4,5], [5,6], [6,7], [7,8], [8,9], [9,10], [10,11], [11,12], [12, 13], [5,13], [8,13]]
+        smile = 'C1=CC=C2C(=C1)C(=CN2)CC(C(=O)O)N'  # tryptophan
     ),
     Y = dict(
-        bonds = [[0,1], [1,2], [2,3], [1,4], [4,5], [5,6], [6,7], [7,8], [8,9], [8,10], [10,11], [5,11]] 
+        smile = 'C1=CC(=CC=C1CC(C(=O)O)N)O'  # tyrosine
     ),
     V = dict(
-        bonds = [[0,1], [1,2], [2,3], [1,4], [4,5], [4,6]]
+        smile = 'CC(C)C(C(=O)O)N'  # valine
+    )
+)
+
+# nucleotides - NOTE: each entry holds the SMILES of the bare nucleobase (no sugar or phosphate)
+
+NUCLEOTIDES = dict(
+    A = dict(
+        smile = 'C1=NC2=NC=NC(=C2N1)N'  # adenine
+    ),
+    G = dict(
+        smile = 'C1=NC2=C(N1)C(=O)NC(=N2)N'  # guanine
     ),
-    _ = dict(
-        bonds = []
+    C = dict(
+        smile = 'C1=C(NC(=O)N=C1)N'  # cytosine
+    ),
+    T = dict(
+        smile = 'CC1=CNC(=O)NC1=O'  # thymine base; previously thymidine (base + deoxyribose), unlike A/G/C/U
+    ),
+    U = dict(
+        smile = 'C1=CNC(=O)NC1=O'  # uracil
     )
 )
+
+# parse each canonical SMILES into an rdkit Mol, stored on the entry under the 'rdkit_chem' key
+
+# Chem.MolFromSmiles returns None (it does not raise) on unparsable SMILES, so fail loudly at import
+for entry in (*HUMAN_AMINO_ACIDS.values(), *NUCLEOTIDES.values()):
+    mol = Chem.MolFromSmiles(entry['smile'])
+    if mol is None:
+        raise ValueError(f"could not parse SMILES {entry['smile']!r}")
+    entry['rdkit_chem'] = mol