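Reorder the atoms of the chainable biomolecules into a convenient layout (first atom first; last atom and hydroxyl oxygen at the end), and switch to removing the final atom of every non-terminal molecule when chaining biomolecules in the Alphafold3Input -> MoleculeInput conversion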

lucidrains · lucidrains · commit 9b7a88e86395 · 2024-07-01T09:25:44.000-07:00
diff --git a/alphafold3_pytorch/inputs.py b/alphafold3_pytorch/inputs.py
@@ -329,8 +329,7 @@ def map_int_or_string_indices_to_mol(
     entries: dict,
     indices: Int[' _'] | List[str] | str,
     mol_keyname = 'rdchem_mol',
-    remove_hydroxyl = False,
-    hydroxyl_idx_keyname = 'hydroxyl_idx',
+    chain = False,
     return_entries = False
 ) -> List[Mol] | Tuple[List[Mol], List[dict]]:
 
@@ -356,8 +355,9 @@ def map_int_or_string_indices_to_mol(
 
         mol = entry[mol_keyname]
 
-        if remove_hydroxyl and not is_last:
-            hydroxyl_idx = entry[hydroxyl_idx_keyname]
+        if chain and not is_last:
+            # the hydroxyl oxygen to remove is the last atom, since chainable molecules are reordered in life.py so that it sorts last
+            hydroxyl_idx = mol.GetNumAtoms() - 1
             mol = remove_atom_from_mol(mol, hydroxyl_idx)
 
         mols.append(mol)
@@ -423,7 +423,7 @@ def alphafold3_input_to_molecule_input(
     molecule_atom_indices = []
 
     for protein in proteins:
-        mol_peptides, protein_entries = map_int_or_string_indices_to_mol(HUMAN_AMINO_ACIDS, protein, remove_hydroxyl = True, return_entries = True)
+        mol_peptides, protein_entries = map_int_or_string_indices_to_mol(HUMAN_AMINO_ACIDS, protein, chain = True, return_entries = True)
         mol_proteins.append(mol_peptides)
 
         molecule_atom_indices.extend([entry['distogram_atom_idx'] for entry in protein_entries])
@@ -437,14 +437,14 @@ def alphafold3_input_to_molecule_input(
     mol_ss_rnas = []
 
     for seq in ss_rnas:
-        mol_seq = map_int_or_string_indices_to_mol(RNA_NUCLEOTIDES, seq, remove_hydroxyl = True)
+        mol_seq = map_int_or_string_indices_to_mol(RNA_NUCLEOTIDES, seq, chain = True)
         mol_ss_rnas.append(mol_seq)
 
         rna_ids = maybe_string_to_int(RNA_NUCLEOTIDES, seq) + rna_offset
         molecule_ids.append(rna_ids)
 
     for seq in ss_dnas:
-        mol_seq = map_int_or_string_indices_to_mol(DNA_NUCLEOTIDES, seq, remove_hydroxyl = True)
+        mol_seq = map_int_or_string_indices_to_mol(DNA_NUCLEOTIDES, seq, chain = True)
         mol_ss_dnas.append(mol_seq)
 
         dna_ids = maybe_string_to_int(DNA_NUCLEOTIDES, seq) + dna_offset
diff --git a/alphafold3_pytorch/life.py b/alphafold3_pytorch/life.py
@@ -335,14 +335,38 @@ def remove_atom_from_mol(mol: Mol, atom_idx: int) -> Mol:
 
 # initialize rdkit.Chem with canonical SMILES
 
-ALL_ENTRIES = [
+CHAINABLE_BIOMOLECULES = [
     *HUMAN_AMINO_ACIDS.values(),
     *DNA_NUCLEOTIDES.values(),
-    *RNA_NUCLEOTIDES.values(),
+    *RNA_NUCLEOTIDES.values()
+]
+
+METALS_AND_MISC = [
     *METALS.values(),
     *MISC.values(),
 ]
 
-for entry in ALL_ENTRIES:
+for entry in [*CHAINABLE_BIOMOLECULES, *METALS_AND_MISC]:
     mol = mol_from_smile(entry['smile'])
     entry['rdchem_mol'] = mol
+
+# reorder the atoms of all the chainable biomolecules (first atom first, last atom and hydroxyl oxygen at the end)
+# to simplify chaining them up and specifying the peptide or phosphodiester bonds
+
+for entry in CHAINABLE_BIOMOLECULES:
+    mol = entry['rdchem_mol']
+
+    atom_order = torch.arange(mol.GetNumAtoms())
+
+    atom_order[entry['first_atom_idx']] = -1
+    atom_order[entry['last_atom_idx']] = 1e4
+    atom_order[entry['hydroxyl_idx']] = 1e4 + 1
+
+    atom_reorder = atom_order.argsort().tolist()
+
+    mol = Chem.RenumberAtoms(mol, atom_reorder)
+
+    entry.update(
+        atom_reorder = atom_reorder,
+        rdchem_mol = mol
+    )