Skip to content

Commit 9b7a88e

Browse files
committed
Reorder the atoms in the biomolecules into a convenient format, and switch to removing the last atom when chaining up the biomolecules in Alphafold3Input -> MoleculeInput
1 parent f5b303b commit 9b7a88e

File tree

2 files changed

+34
-10
lines changed

2 files changed

+34
-10
lines changed

alphafold3_pytorch/inputs.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -329,8 +329,7 @@ def map_int_or_string_indices_to_mol(
329329
entries: dict,
330330
indices: Int[' _'] | List[str] | str,
331331
mol_keyname = 'rdchem_mol',
332-
remove_hydroxyl = False,
333-
hydroxyl_idx_keyname = 'hydroxyl_idx',
332+
chain = False,
334333
return_entries = False
335334
) -> List[Mol] | Tuple[List[Mol], List[dict]]:
336335

@@ -356,8 +355,9 @@ def map_int_or_string_indices_to_mol(
356355

357356
mol = entry[mol_keyname]
358357

359-
if remove_hydroxyl and not is_last:
360-
hydroxyl_idx = entry[hydroxyl_idx_keyname]
358+
if chain and not is_last:
359+
# hydroxyl oxygen to be removed should be the last atom
360+
hydroxyl_idx = mol.GetNumAtoms() - 1
361361
mol = remove_atom_from_mol(mol, hydroxyl_idx)
362362

363363
mols.append(mol)
@@ -423,7 +423,7 @@ def alphafold3_input_to_molecule_input(
423423
molecule_atom_indices = []
424424

425425
for protein in proteins:
426-
mol_peptides, protein_entries = map_int_or_string_indices_to_mol(HUMAN_AMINO_ACIDS, protein, remove_hydroxyl = True, return_entries = True)
426+
mol_peptides, protein_entries = map_int_or_string_indices_to_mol(HUMAN_AMINO_ACIDS, protein, chain = True, return_entries = True)
427427
mol_proteins.append(mol_peptides)
428428

429429
molecule_atom_indices.extend([entry['distogram_atom_idx'] for entry in protein_entries])
@@ -437,14 +437,14 @@ def alphafold3_input_to_molecule_input(
437437
mol_ss_rnas = []
438438

439439
for seq in ss_rnas:
440-
mol_seq = map_int_or_string_indices_to_mol(RNA_NUCLEOTIDES, seq, remove_hydroxyl = True)
440+
mol_seq = map_int_or_string_indices_to_mol(RNA_NUCLEOTIDES, seq, chain = True)
441441
mol_ss_rnas.append(mol_seq)
442442

443443
rna_ids = maybe_string_to_int(RNA_NUCLEOTIDES, seq) + rna_offset
444444
molecule_ids.append(rna_ids)
445445

446446
for seq in ss_dnas:
447-
mol_seq = map_int_or_string_indices_to_mol(DNA_NUCLEOTIDES, seq, remove_hydroxyl = True)
447+
mol_seq = map_int_or_string_indices_to_mol(DNA_NUCLEOTIDES, seq, chain = True)
448448
mol_ss_dnas.append(mol_seq)
449449

450450
dna_ids = maybe_string_to_int(DNA_NUCLEOTIDES, seq) + dna_offset

alphafold3_pytorch/life.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -335,14 +335,38 @@ def remove_atom_from_mol(mol: Mol, atom_idx: int) -> Mol:
335335

336336
# initialize rdkit.Chem with canonical SMILES
337337

338-
ALL_ENTRIES = [
338+
CHAINABLE_BIOMOLECULES = [
339339
*HUMAN_AMINO_ACIDS.values(),
340340
*DNA_NUCLEOTIDES.values(),
341-
*RNA_NUCLEOTIDES.values(),
341+
*RNA_NUCLEOTIDES.values()
342+
]
343+
344+
METALS_AND_MISC = [
342345
*METALS.values(),
343346
*MISC.values(),
344347
]
345348

346-
for entry in ALL_ENTRIES:
349+
for entry in [*CHAINABLE_BIOMOLECULES, *METALS_AND_MISC]:
347350
mol = mol_from_smile(entry['smile'])
348351
entry['rdchem_mol'] = mol
352+
353+
# reorder all the chainable biomolecules
354+
# to simplify chaining them up and specifying the peptide or phosphodiester bonds
355+
356+
for entry in CHAINABLE_BIOMOLECULES:
357+
mol = entry['rdchem_mol']
358+
359+
atom_order = torch.arange(mol.GetNumAtoms())
360+
361+
atom_order[entry['first_atom_idx']] = -1
362+
atom_order[entry['last_atom_idx']] = 1e4
363+
atom_order[entry['hydroxyl_idx']] = 1e4 + 1
364+
365+
atom_reorder = atom_order.argsort().tolist()
366+
367+
mol = Chem.RenumberAtoms(mol, atom_reorder)
368+
369+
entry.update(
370+
atom_reorder = atom_reorder,
371+
rdchem_mol = mol
372+
)

0 commit comments

Comments
 (0)