first go for a simple cache for atompair ids for residues and nucleot… (#286)

lucidrains · web-flow · commit fa1a06571d62 · 2024-09-23T09:47:33.000-07:00
* first go for a simple cache for atompair ids for residues and nucleotides

* type

* fix function

* cache in another place

* some more fixes
diff --git a/alphafold3_pytorch/inputs.py b/alphafold3_pytorch/inputs.py
@@ -8,15 +8,15 @@
 import statistics
 import traceback
 from collections import defaultdict
-from collections.abc import Iterable
 from contextlib import redirect_stderr
 from dataclasses import asdict, dataclass, field
 from datetime import datetime, timedelta
-from functools import partial
+from functools import partial, wraps
 from io import StringIO
 from itertools import groupby
 from pathlib import Path
-from retrying import retry
+
+from collections.abc import Iterable
 from beartype.typing import (
     Any,
     Callable,
@@ -28,23 +28,29 @@
     Type,
 )
 
-import einx
 import numpy as np
 import polars as pl
+
 import timeout_decorator
+from retrying import retry
+
 import torch
 import torch.nn.functional as F
+from torch import repeat_interleave, tensor
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data import Dataset
+
+import einx
 from einops import pack, rearrange
+
 from joblib import Parallel, delayed
 from loguru import logger
 from pdbeccdutils.core import ccd_reader
+
 from rdkit import Chem, RDLogger, rdBase
 from rdkit.Chem import AllChem, rdDetermineBonds
 from rdkit.Chem.rdchem import Atom, Mol
 from rdkit.Geometry import Point3D
-from torch import repeat_interleave, tensor
-from torch.nn.utils.rnn import pad_sequence
-from torch.utils.data import Dataset
 
 from alphafold3_pytorch.common import (
     amino_acid_constants,
@@ -204,6 +210,100 @@
         }
         json.dump(CCD_COMPONENTS_SMILES, f)
 
+# simple caching
+
+ATOMPAIR_IDS_CACHE = dict()
+
+@typecheck
+def maybe_cache(
+    fn,
+    *,
+    cache: dict,
+    key: str,
+    should_cache: bool = True
+) -> Callable:
+
+    if not should_cache:
+        return fn
+
+    @wraps(fn)
+    def inner(*args, **kwargs):
+        if key in cache:
+            return cache[key]
+
+        out = fn(*args, **kwargs)
+
+        cache[key] = out
+        return out
+
+    return inner
+
+# get atompair bonds functions
+
+ATOM_BOND_INDEX = {symbol: (idx + 1) for idx, symbol in enumerate(ATOM_BONDS)}
+
+@typecheck
+def get_atompair_ids(
+    mol: Mol,
+    directed_bonds: bool
+) -> Tensor | None:
+
+    coordinates = []
+    updates = []
+
+    num_atoms = mol.GetNumAtoms()
+    mol_atompair_ids = torch.zeros(num_atoms, num_atoms).long()
+
+    bonds = mol.GetBonds()
+    num_bonds = len(bonds)
+
+    num_atom_bond_types = len(ATOM_BOND_INDEX)
+    other_index = len(ATOM_BONDS) + 1
+
+    for bond in bonds:
+        atom_start_index = bond.GetBeginAtomIdx()
+        atom_end_index = bond.GetEndAtomIdx()
+
+        coordinates.extend(
+            [
+                [atom_start_index, atom_end_index],
+                [atom_end_index, atom_start_index],
+            ]
+        )
+
+        bond_type = bond.GetBondType()
+        bond_id = ATOM_BOND_INDEX.get(bond_type, other_index) + 1
+
+        # default to symmetric bond type (undirected atom bonds)
+
+        bond_to = bond_from = bond_id
+
+        # if allowing for directed bonds, assume num_atompair_embeds = (2 * num_atom_bond_types) + 1
+        # offset other edge by num_atom_bond_types
+
+        if directed_bonds:
+            bond_from += num_atom_bond_types
+
+        updates.extend([bond_to, bond_from])
+
+    if num_bonds == 0:
+        return None
+
+    coordinates = tensor(coordinates).long()
+    updates = tensor(updates).long()
+
+    # mol_atompair_ids = einx.set_at("[h w], c [2], c -> [h w]", mol_atompair_ids, coordinates, updates)
+
+    molpair_strides = tensor(mol_atompair_ids.stride())
+    flattened_coordinates = (coordinates * molpair_strides).sum(dim = -1)
+
+    packed_atompair_ids, unpack_one = pack_one(mol_atompair_ids, '*')
+    packed_atompair_ids[flattened_coordinates] = updates
+
+    mol_atompair_ids = unpack_one(packed_atompair_ids)
+
+    return mol_atompair_ids
+
 # functions
 
 
@@ -730,10 +830,6 @@ def molecule_to_atom_input(mol_input: MoleculeInput) -> AtomInput:
     atompair_ids = None
 
     if i.add_atompair_ids:
-        atom_bond_index = {symbol: (idx + 1) for idx, symbol in enumerate(ATOM_BONDS)}
-        num_atom_bond_types = len(atom_bond_index)
-
-        other_index = len(ATOM_BONDS) + 1
 
         atompair_ids = torch.zeros(total_atoms, total_atoms).long()
 
@@ -753,70 +849,35 @@ def molecule_to_atom_input(mol_input: MoleculeInput) -> AtomInput:
 
         for (
             mol,
+            mol_id,
             is_first_mol_in_chain,
             is_chainable_biomolecule,
             src_tgt_atom_indices,
             offset,
         ) in zip(
             molecules,
+            molecule_ids,
             is_first_mol_in_chains,
             is_chainable_biomolecules,
             i.src_tgt_atom_indices,
             offsets,
         ):
-            coordinates = []
-            updates = []
-
-            num_atoms = mol.GetNumAtoms()
-            mol_atompair_ids = torch.zeros(num_atoms, num_atoms).long()
-
-            bonds = mol.GetBonds()
-            num_bonds = len(bonds)
 
-            for bond in bonds:
-                atom_start_index = bond.GetBeginAtomIdx()
-                atom_end_index = bond.GetEndAtomIdx()
-
-                coordinates.extend(
-                    [
-                        [atom_start_index, atom_end_index],
-                        [atom_end_index, atom_start_index],
-                    ]
-                )
-
-                bond_type = bond.GetBondType()
-                bond_id = atom_bond_index.get(bond_type, other_index) + 1
-
-                # default to symmetric bond type (undirected atom bonds)
-
-                bond_to = bond_from = bond_id
-
-                # if allowing for directed bonds, assume num_atompair_embeds = (2 * num_atom_bond_types) + 1
-                # offset other edge by num_atom_bond_types
-
-                if i.directed_bonds:
-                    bond_from += num_atom_bond_types
-
-                updates.extend([bond_to, bond_from])
-
-            if num_bonds > 0:
-                coordinates = tensor(coordinates).long()
-                updates = tensor(updates).long()
-
-                # mol_atompair_ids = einx.set_at("[h w], c [2], c -> [h w]", mol_atompair_ids, coordinates, updates)
-
-                molpair_strides = tensor(mol_atompair_ids.stride())
-                flattened_coordinates = (coordinates * molpair_strides).sum(dim = -1)
-
-                packed_atompair_ids, unpack_one = pack_one(mol_atompair_ids, '*')
-                packed_atompair_ids[flattened_coordinates] = updates
+            maybe_cached_get_atompair_ids = maybe_cache(
+                get_atompair_ids,
+                cache = ATOMPAIR_IDS_CACHE,
+                key = f'{mol_id}:{i.directed_bonds}',
+                should_cache = is_chainable_biomolecule.item()
+            )
 
-                mol_atompair_ids = unpack_one(packed_atompair_ids)
+            mol_atompair_ids = maybe_cached_get_atompair_ids(mol, directed_bonds = i.directed_bonds)
 
             # /einx.set_at
 
-            row_col_slice = slice(offset, offset + num_atoms)
-            atompair_ids[row_col_slice, row_col_slice] = mol_atompair_ids
+            if exists(mol_atompair_ids) and mol_atompair_ids.numel() > 0:
+                num_atoms = mol.GetNumAtoms()
+                row_col_slice = slice(offset, offset + num_atoms)
+                atompair_ids[row_col_slice, row_col_slice] = mol_atompair_ids
 
             # if is chainable biomolecule
             # and not the first biomolecule in the chain, add a single covalent bond between first atom of incoming biomolecule and the last atom of the last biomolecule
@@ -1251,10 +1312,6 @@ def molecule_lengthed_molecule_input_to_atom_input(
     atompair_ids = None
 
     if i.add_atompair_ids:
-        atom_bond_index = {symbol: (idx + 1) for idx, symbol in enumerate(ATOM_BONDS)}
-        num_atom_bond_types = len(atom_bond_index)
-
-        other_index = len(ATOM_BONDS) + 1
 
         atompair_ids = torch.zeros(total_atoms, total_atoms).long()
 
@@ -1265,56 +1322,35 @@ def molecule_lengthed_molecule_input_to_atom_input(
 
         for (
             mol,
+            mol_id,
             is_first_mol_in_chain,
             is_chainable_biomolecule,
             src_tgt_atom_indices,
             offset,
         ) in zip(
             molecules,
+            molecule_ids,
             is_first_mol_in_chains,
             is_chainable_biomolecules,
             i.src_tgt_atom_indices,
             offsets,
         ):
-            coordinates = []
-            updates = []
-
-            num_atoms = mol.GetNumAtoms()
-            mol_atompair_ids = torch.zeros(num_atoms, num_atoms).long()
-
-            for bond in mol.GetBonds():
-                atom_start_index = bond.GetBeginAtomIdx()
-                atom_end_index = bond.GetEndAtomIdx()
-
-                coordinates.extend(
-                    [
-                        [atom_start_index, atom_end_index],
-                        [atom_end_index, atom_start_index],
-                    ]
-                )
-
-                bond_type = bond.GetBondType()
-                bond_id = atom_bond_index.get(bond_type, other_index) + 1
 
-                # default to symmetric bond type (undirected atom bonds)
-
-                bond_to = bond_from = bond_id
-
-                # if allowing for directed bonds, assume num_atompair_embeds = (2 * num_atom_bond_types) + 1
-                # offset other edge by num_atom_bond_types
-
-                if i.directed_bonds:
-                    bond_from += num_atom_bond_types
-
-                updates.extend([bond_to, bond_from])
+            maybe_cached_get_atompair_ids = maybe_cache(
+                get_atompair_ids,
+                cache = ATOMPAIR_IDS_CACHE,
+                key = f'{mol_id}:{i.directed_bonds}',
+                should_cache = is_chainable_biomolecule.item()
+            )
 
-            coordinates = tensor(coordinates).long()
-            updates = tensor(updates).long()
+            mol_atompair_ids = maybe_cached_get_atompair_ids(mol, directed_bonds = i.directed_bonds)
 
             # mol_atompair_ids = einx.set_at("[h w], c [2], c -> [h w]", mol_atompair_ids, coordinates, updates)
 
-            row_col_slice = slice(offset, offset + num_atoms)
-            atompair_ids[row_col_slice, row_col_slice] = mol_atompair_ids
+            if exists(mol_atompair_ids) and mol_atompair_ids.numel() > 0:
+                num_atoms = mol.GetNumAtoms()
+                row_col_slice = slice(offset, offset + num_atoms)
+                atompair_ids[row_col_slice, row_col_slice] = mol_atompair_ids
 
             # if is chainable biomolecule
             # and not the first biomolecule in the chain, add a single covalent bond between first atom of incoming biomolecule and the last atom of the last biomolecule