Skip to content

Commit be52f3c

Browse files
committed
add the canonical smiles representation for all human amino acids and nucleotides in life.py
1 parent d7c0f08 commit be52f3c

File tree

2 files changed

+84
-23
lines changed

2 files changed

+84
-23
lines changed

alphafold3_pytorch/inputs.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
from typing import Type, TypedDict, Literal, Callable
1+
from typing import Type, TypedDict, Literal, Callable, List
2+
3+
from rdkit import Chem
4+
from rdkit.Chem.rdchem import Mol
25

36
from alphafold3_pytorch.tensor_typing import (
47
typecheck,
@@ -51,6 +54,33 @@ class BatchedAtomInput(TypedDict):
5154
pde_labels: Int['b n'] | None
5255
resolved_labels: Int['b n'] | None
5356

57+
# molecule input - accepting list of molecules as rdchem.Mol + the atomic lengths for how to pool into tokens
58+
59+
@typecheck
class MoleculeInput(TypedDict):
    # Input schema at the molecule level: a list of rdkit molecules plus the
    # atom lengths that say how atoms are pooled into tokens.
    #
    # Fields annotated `| None` accept None as a value.
    #
    # NOTE(review): the meaning of the shape letters (n, m, t, s, dt, dm) is
    # not defined in this view. The letter 't' is used both for
    # `molecule_atom_lens` and for `templates` / `template_mask` - confirm
    # these are really meant to share one dimension.

    # the molecules themselves, as rdkit Mol objects
    molecules: List[Mol]
    # atom lengths used to pool atoms into tokens (a list of int tensors)
    molecule_atom_lens: List[Int['t']]
    molecule_ids: Int['n']
    # trailing feature dimension is fixed at 9
    additional_molecule_feats: Float['n 9']
    templates: Float['t n n dt']
    msa: Float['s n dm']
    token_bonds: Bool['n n'] | None
    template_mask: Bool['t'] | None
    msa_mask: Bool['s'] | None
    # last dimension is 3 - presumably xyz coordinates, TODO confirm
    atom_pos: Float['m 3'] | None
    molecule_atom_indices: Int['n'] | None
    # label tensors - presumably training targets, TODO confirm
    distance_labels: Int['n n'] | None
    pae_labels: Int['n n'] | None
    pde_labels: Int['n'] | None
    resolved_labels: Int['n'] | None
76+
77+
@typecheck
def molecule_to_atom_input(molecule_input: MoleculeInput) -> AtomInput:
    """Convert a MoleculeInput into an AtomInput.

    Not implemented yet: calling this always raises.

    Raises:
        NotImplementedError: always, with a message naming this transform so
            the failure is understandable when reached through a dispatch
            table rather than a direct call.
    """
    raise NotImplementedError(
        'molecule_to_atom_input is not implemented yet - '
        'MoleculeInput cannot be converted to AtomInput'
    )
80+
81+
def validate_molecule_input(molecule_input: MoleculeInput):
    """Placeholder validator for a MoleculeInput.

    Performs no checks yet: the argument is not inspected and the function
    always returns None.
    """
    # intentionally a no-op until real validation rules are defined
    return None
83+
5484
# residue level - single chain proteins for starters
5585

5686
@typecheck
@@ -105,6 +135,7 @@ def single_protein_input_and_single_nucleic_acid_to_atom_input(
105135
# this can be preprocessed or will be taken care of automatically within the Trainer during data collation
106136

107137
# dispatch table: maps each supported input type to the `*_to_atom_input`
# function that converts it
# NOTE: molecule_to_atom_input currently raises NotImplementedError, so the
# MoleculeInput entry is registered but not usable yet
INPUT_TO_ATOM_TRANSFORM = {
    MoleculeInput: molecule_to_atom_input,
    SingleProteinInput: single_protein_input_to_atom_input,
    SingleProteinSingleNucleicAcidInput: single_protein_input_and_single_nucleic_acid_to_atom_input
}

alphafold3_pytorch/life.py

Lines changed: 52 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,66 +1,96 @@
1+
import rdkit
2+
from rdkit import Chem
3+
4+
# human amino acids
15

26
# Table of the 20 standard amino acids, keyed by one-letter code.
# Each value is its own mutable dict holding the SMILES string of the free
# amino acid under the 'smile' key, so later code can attach more per-residue
# entries without sharing state between residues.
HUMAN_AMINO_ACIDS = {
    one_letter: {'smile': smiles}
    for one_letter, smiles in (
        ('A', 'CC(C(=O)O)N'),
        ('R', 'C(CC(C(=O)O)N)CN=C(N)N'),
        ('N', 'C(C(C(=O)O)N)C(=O)N'),
        ('D', 'C(C(C(=O)O)N)C(=O)O'),
        ('C', 'C(C(C(=O)O)N)S'),
        ('Q', 'C(CC(=O)N)C(C(=O)O)N'),
        ('E', 'C(CC(=O)O)C(C(=O)O)N'),
        ('G', 'C(C(=O)O)N'),
        ('H', 'C1=C(NC=N1)CC(C(=O)O)N'),
        ('I', 'CCC(C)C(C(=O)O)N'),
        ('L', 'CC(C)CC(C(=O)O)N'),
        ('K', 'C(CCN)CC(C(=O)O)N'),
        ('M', 'CSCCC(C(=O)O)N'),
        ('F', 'C1=CC=C(C=C1)CC(C(=O)O)N'),
        ('P', 'C1CC(NC1)C(=O)O'),
        ('S', 'C(C(C(=O)O)N)O'),
        ('T', 'CC(C(C(=O)O)N)O'),
        ('W', 'C1=CC=C2C(=C1)C(=CN2)CC(C(=O)O)N'),
        ('Y', 'C1=CC(=CC=C1CC(C(=O)O)N)O'),
        ('V', 'CC(C)C(C(=O)O)N'),
    )
}
68+
69+
# nucleotides
70+
71+
# Table of the five nucleic-acid bases, keyed by one-letter code.
# Each value is its own dict holding the SMILES string of the free base
# (adenine, guanine, cytosine, thymine, uracil) under the 'smile' key.
NUCLEOTIDES = dict(
    A = dict(
        smile = 'C1=NC2=NC=NC(=C2N1)N'
    ),
    G = dict(
        smile = 'C1=NC2=C(N1)C(=O)NC(=N2)N'
    ),
    C = dict(
        smile = 'C1=C(NC(=O)N=C1)N'
    ),
    T = dict(
        # thymine (5-methyluracil) as a free base, consistent with the other
        # four entries; the previous value was thymidine, i.e. the base with
        # a deoxyribose sugar attached
        smile = 'CC1=CNC(=O)NC1=O'
    ),
    U = dict(
        smile = 'C1=CNC(=O)NC1=O'
    )
)
88+
89+
# parse each canonical SMILES into an rdkit Mol, stored under the 'rdkit_chem' key
90+
91+
# Parse every SMILES in both tables into an rdkit Mol at import time and store
# it on the entry under 'rdkit_chem'.
# Chem.MolFromSmiles returns None instead of raising when a SMILES cannot be
# parsed, so check explicitly and fail here rather than leaving a None in the
# table for later code to trip over.

for aa_code, aa_dict in HUMAN_AMINO_ACIDS.items():
    aa_mol = Chem.MolFromSmiles(aa_dict['smile'])

    if aa_mol is None:
        raise ValueError(f'invalid SMILES for amino acid {aa_code!r}: {aa_dict["smile"]!r}')

    aa_dict['rdkit_chem'] = aa_mol


for nuc_code, nuc_dict in NUCLEOTIDES.items():
    nuc_mol = Chem.MolFromSmiles(nuc_dict['smile'])

    if nuc_mol is None:
        raise ValueError(f'invalid SMILES for nucleotide {nuc_code!r}: {nuc_dict["smile"]!r}')

    nuc_dict['rdkit_chem'] = nuc_mol

0 commit comments

Comments
 (0)