switch to using dataclasses for the inputs

lucidrains · lucidrains · commit 3bd8bf49c70b · 2024-06-26T09:42:21.000-07:00
diff --git a/alphafold3_pytorch/alphafold3.py b/alphafold3_pytorch/alphafold3.py
@@ -38,6 +38,11 @@
     full_pairwise_repr_to_windowed
 )
 
+from alphafold3_pytorch.inputs import (
+    IS_MOLECULE_TYPES,
+    ADDITIONAL_MOLECULE_FEATS
+)
+
 from frame_averaging_pytorch import FrameAverage
 
 from taylor_series_linear_attention import TaylorSeriesLinearAttn
@@ -98,11 +103,6 @@
 
 # constants
 
-from alphafold3_pytorch.inputs import (
-    IS_MOLECULE_TYPES,
-    ADDITIONAL_MOLECULE_FEATS
-)
-
 LinearNoBias = partial(Linear, bias = False)
 
 # helper functions
diff --git a/alphafold3_pytorch/inputs.py b/alphafold3_pytorch/inputs.py
@@ -1,11 +1,13 @@
 from functools import wraps
-from typing import Type, TypedDict, Literal, Callable, List
+from dataclasses import dataclass
+from typing import Type, Literal, Callable, List, Any
 
 from rdkit import Chem
 from rdkit.Chem.rdchem import Mol
 
 from alphafold3_pytorch.tensor_typing import (
     typecheck,
+    beartype_isinstance,
     Int, Bool, Float
 )
 
@@ -21,10 +23,17 @@
 IS_MOLECULE_TYPES = 4
 ADDITIONAL_MOLECULE_FEATS = 5
 
-# simple compose function
-# for chaining from Alphafold3Input -> MoleculeInput -> AtomInput
+# functions
+
+def exists(v):  # True for any value other than None (falsy values such as 0 or '' still count as existing)
+    return v is not None
+
+def identity(t):  # no-op transform: hands its argument back unchanged
+    return t
 
 def compose(*fns: Callable):
+    # for chaining from Alphafold3Input -> MoleculeInput -> AtomInput
+
     def inner(x, *args, **kwargs):
         for fn in fns:
             x = fn(x, *args, **kwargs)
@@ -34,30 +43,33 @@ def inner(x, *args, **kwargs):
 # atom level, what Alphafold3 accepts
 
 @typecheck
-class AtomInput(TypedDict):
+@dataclass
+class AtomInput:  # fields without a default are required; every `| None` field below defaults to None
     atom_inputs:                Float['m dai']
-    molecule_ids:               Int['n']
-    molecule_atom_lens:         Int['n']
+    molecule_ids:               Int[' n']
+    molecule_atom_lens:         Int[' n']
     atompair_inputs:            Float['m m dapi'] | Float['nw w (w*2) dapi']
     additional_molecule_feats:  Float[f'n {ADDITIONAL_MOLECULE_FEATS}']
     is_molecule_types:          Bool[f'n {IS_MOLECULE_TYPES}']
     templates:                  Float['t n n dt']
     msa:                        Float['s n dm']
-    token_bonds:                Bool['n n'] | None
-    atom_ids:                   Int['m'] | None
-    atom_parent_ids:            Int['m'] | None
-    atompair_ids:               Int['m m'] | Int['nw w (w*2)'] | None
-    template_mask:              Bool['t'] | None
-    msa_mask:                   Bool['s'] | None
-    atom_pos:                   Float['m 3'] | None
-    molecule_atom_indices:      Int['n'] | None
-    distance_labels:            Int['n n'] | None
-    pae_labels:                 Int['n n'] | None
-    pde_labels:                 Int['n'] | None
-    resolved_labels:            Int['n'] | None
+    token_bonds:                Bool['n n'] | None = None
+    atom_ids:                   Int[' m'] | None = None
+    atom_parent_ids:            Int[' m'] | None = None
+    atompair_ids:               Int['m m'] | Int['nw w (w*2)'] | None = None
+    template_mask:              Bool[' t'] | None = None
+    msa_mask:                   Bool[' s'] | None = None
+    atom_pos:                   Float['m 3'] | None = None
+    molecule_atom_indices:      Int[' n'] | None = None
+    distance_labels:            Int['n n'] | None = None
+    pae_labels:                 Int['n n'] | None = None
+    pde_labels:                 Int['n n'] | None = None  # shape annotation changed from 'n' to 'n n' in this revision
+    plddt_labels:               Int[' n'] | None = None  # field newly introduced in this revision
+    resolved_labels:            Int[' n'] | None = None
 
 @typecheck
-class BatchedAtomInput(TypedDict):
+@dataclass
+class BatchedAtomInput:
     atom_inputs:                Float['b m dai']
     molecule_ids:               Int['b n']
     molecule_atom_lens:         Int['b n']
@@ -66,38 +78,40 @@ class BatchedAtomInput(TypedDict):
     is_molecule_types:          Bool[f'b n {IS_MOLECULE_TYPES}']
     templates:                  Float['b t n n dt']
     msa:                        Float['b s n dm']
-    token_bonds:                Bool['b n n'] | None
-    atom_ids:                   Int['b m'] | None
-    atom_parent_ids:            Int['b m'] | None
-    atompair_ids:               Int['b m m'] | Int['b nw w (w*2)'] | None
-    template_mask:              Bool['b t'] | None
-    msa_mask:                   Bool['b s'] | None
-    atom_pos:                   Float['b m 3'] | None
-    molecule_atom_indices:      Int['b n'] | None
-    distance_labels:            Int['b n n'] | None
-    pae_labels:                 Int['b n n'] | None
-    pde_labels:                 Int['b n'] | None
-    resolved_labels:            Int['b n'] | None
+    token_bonds:                Bool['b n n'] | None = None
+    atom_ids:                   Int['b m'] | None = None
+    atom_parent_ids:            Int['b m'] | None = None
+    atompair_ids:               Int['b m m'] | Int['b nw w (w*2)'] | None = None
+    template_mask:              Bool['b t'] | None = None
+    msa_mask:                   Bool['b s'] | None = None
+    atom_pos:                   Float['b m 3'] | None = None
+    molecule_atom_indices:      Int['b n'] | None = None
+    distance_labels:            Int['b n n'] | None = None
+    pae_labels:                 Int['b n n'] | None = None
+    pde_labels:                 Int['b n n'] | None = None
+    plddt_labels:               Int['b n'] | None = None
+    resolved_labels:            Int['b n'] | None = None
 
 # molecule input - accepting list of molecules as rdchem.Mol + the atomic lengths for how to pool into tokens
 
 @typecheck
-class MoleculeInput(TypedDict):
+@dataclass
+class MoleculeInput:  # required fields come first; every `| None` field is optional and defaults to None
     molecules:                  List[Mol]
     molecule_token_pool_lens:   List[List[int]]
     molecule_atom_indices:      List[List[int] | None]
-    molecule_ids:               Int['n']
+    molecule_ids:               Int[' n']
     additional_molecule_feats:  Float['n 5']
     is_molecule_types:          Bool['n 4']
-    atom_pos:                   List[Float['_ 3']] | Float['m 3'] | None
     templates:                  Float['t n n dt']
-    template_mask:              Bool['t'] | None
     msa:                        Float['s n dm']
-    msa_mask:                   Bool['s'] | None
-    distance_labels:            Int['n n'] | None
-    pae_labels:                 Int['n n'] | None
-    pde_labels:                 Int['n'] | None
-    resolved_labels:            Int['n'] | None
+    atom_pos:                   List[Float['_ 3']] | Float['m 3'] | None = None  # moved below templates/msa: a dataclass raises TypeError when a required field follows a defaulted one
+    template_mask:              Bool[' t'] | None = None
+    msa_mask:                   Bool[' s'] | None = None
+    distance_labels:            Int['n n'] | None = None
+    pae_labels:                 Int['n n'] | None = None
+    pde_labels:                 Int[' n'] | None = None  # NOTE(review): AtomInput declares pde_labels as Int['n n'] — confirm which shape is intended
+    resolved_labels:            Int[' n'] | None = None
 
 @typecheck
 def molecule_to_atom_input(molecule_input: MoleculeInput) -> AtomInput:
@@ -106,41 +120,79 @@ def molecule_to_atom_input(molecule_input: MoleculeInput) -> AtomInput:
 # alphafold3 input - support polypeptides, nucleic acids, metal ions + any number of ligands + misc biomolecules
 
 @typecheck
-class Alphafold3Input(TypedDict):
-    proteins:                   List[Int['_']]
-    protein_atom_lens:          List[Int['_']]
-    nucleic_acids:              List[Int['_']]
-    nucleic_acid_atom_lens:     List[Int['_']]
-    metal_ions:                 List[int]
-    misc_molecule_ids:          List[int]
+@dataclass
+class Alphafold3Input:  # required fields come first; every `| None` field is optional and defaults to None
+    proteins:                   List[Int[' _']]
+    protein_atom_lens:          List[Int[' _']]
+    nucleic_acids:              List[Int[' _']]
+    nucleic_acid_atom_lens:     List[Int[' _']]
+    metal_ions:                 Int[' _']
+    misc_molecule_ids:          Int[' _']
     ligands:                    List[Mol | str] # can be given as smiles
-    atom_pos:                   List[Float['_ 3']] | Float['m 3'] | None
     templates:                  Float['t n n dt']
     msa:                        Float['s n dm']
-    template_mask:              Bool['t'] | None
-    msa_mask:                   Bool['s'] | None
-    distance_labels:            Int['n n'] | None
-    pae_labels:                 Int['n n'] | None
-    pde_labels:                 Int['n'] | None
-    resolved_labels:            Int['n'] | None
+    atom_pos:                   List[Float['_ 3']] | Float['m 3'] | None = None  # moved below templates/msa: a dataclass raises TypeError when a required field follows a defaulted one
+    template_mask:              Bool[' t'] | None = None
+    msa_mask:                   Bool[' s'] | None = None
+    distance_labels:            Int['n n'] | None = None
+    pae_labels:                 Int['n n'] | None = None
+    pde_labels:                 Int[' n'] | None = None  # NOTE(review): AtomInput declares pde_labels as Int['n n'] — confirm which shape is intended
+    resolved_labels:            Int[' n'] | None = None
 
 @typecheck
-def af3_input_to_molecule_input(af3_input: Alphafold3Input) -> AtomInput:
+def af3_input_to_molecule_input(af3_input: Alphafold3Input) -> MoleculeInput:  # placeholder stage; its result is passed on to molecule_to_atom_input, which takes a MoleculeInput
     raise NotImplementedError
 
+# pdb input
+
+@typecheck
+@dataclass
+class PDBInput:  # input variant described only by a file path
+    filepath: str  # presumably the path to a PDB file — confirm once the conversion is implemented
+
+@typecheck
+def pdb_input_to_alphafold3_input(pdb_input: PDBInput) -> Alphafold3Input:  # placeholder: the conversion is not implemented yet
+    raise NotImplementedError
+
 # the config used for keeping track of all the disparate inputs and their transforms down to AtomInput
 # this can be preprocessed or will be taken care of automatically within the Trainer during data collation
 
 INPUT_TO_ATOM_TRANSFORM = {
+    AtomInput: identity,  # already the target type, so it is returned as-is
     MoleculeInput: molecule_to_atom_input,
-    Alphafold3Input: compose(af3_input_to_molecule_input, molecule_to_atom_input)
+    Alphafold3Input: compose(  # stages run left to right, each fed the previous stage's result
+        af3_input_to_molecule_input,
+        molecule_to_atom_input
+    ),
+    PDBInput: compose(  # the first two stages currently raise NotImplementedError
+        pdb_input_to_alphafold3_input,
+        af3_input_to_molecule_input,
+        molecule_to_atom_input
+    )
 }
 
 # function for extending the config
 
 @typecheck
 def register_input_transform(
     input_type: Type,
-    fn: Callable[[TypedDict], AtomInput]
+    fn: Callable[[Any], AtomInput]
 ):
+    assert input_type not in INPUT_TO_ATOM_TRANSFORM, f'{input_type} is already registered'  # NOTE(review): asserts are stripped under `python -O`, where a duplicate registration would silently replace the existing transform
     INPUT_TO_ATOM_TRANSFORM[input_type] = fn
+
+# functions for transforming to atom inputs
+
+def maybe_transform_to_atom_inputs(inputs: List[Any]) -> List[AtomInput]:
+    atom_inputs = []
+    # run every input through the transform registered for its type, preserving input order
+    for i in inputs:
+        # lookup is by exact type(i): an instance of a subclass of a registered type is not matched
+        maybe_to_atom_fn = INPUT_TO_ATOM_TRANSFORM.get(type(i), None)
+        # unregistered types are rejected outright rather than passed through
+        if not exists(maybe_to_atom_fn):
+            raise TypeError(f'invalid input type {type(i)} being passed into Trainer that is not converted to AtomInput correctly')
+
+        atom_inputs.append(maybe_to_atom_fn(i))
+
+    return atom_inputs
diff --git a/alphafold3_pytorch/trainer.py b/alphafold3_pytorch/trainer.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from functools import wraps, partial
+from dataclasses import asdict
 from pathlib import Path
 
 from alphafold3_pytorch.alphafold3 import Alphafold3
@@ -10,7 +11,6 @@
 
 from alphafold3_pytorch.tensor_typing import (
     typecheck,
-    beartype_isinstance,
     Int, Bool, Float
 )
 
@@ -22,7 +22,7 @@
 from alphafold3_pytorch.inputs import (
     AtomInput,
     BatchedAtomInput,
-    INPUT_TO_ATOM_TRANSFORM
+    maybe_transform_to_atom_inputs
 )
 
 import torch
@@ -88,43 +88,32 @@ def collate_af3_inputs(
     # go through all the inputs
     # and for any that is not AtomInput, try to transform it with the registered input type to corresponding registered function
 
-    atom_inputs = []
-
-    for i in inputs:
-        if beartype_isinstance(i, AtomInput):
-            atom_inputs.append(i)
-            continue
-
-        maybe_to_atom_fn = INPUT_TO_ATOM_TRANSFORM.get(type(i), None)
-
-        if not exists(maybe_to_atom_fn):
-            raise TypeError(f'invalid input type {type(i)} being passed into Trainer that is not converted to AtomInput correctly')
-
-        atom_inputs.append(maybe_to_atom_fn(i))
+    atom_inputs = maybe_transform_to_atom_inputs(inputs)
 
     # take care of windowing the atompair_inputs and atompair_ids if they are not windowed already
 
     if exists(atoms_per_window):
         for atom_input in atom_inputs:
-            atompair_inputs = atom_input['atompair_inputs']
-            atompair_ids = atom_input.get('atompair_ids', None)
+            atompair_inputs = atom_input.atompair_inputs
+            atompair_ids = atom_input.atompair_ids
 
             atompair_inputs_is_windowed = atompair_inputs.ndim == 4
 
             if not atompair_inputs_is_windowed:
-                atom_input['atompair_inputs'] = full_pairwise_repr_to_windowed(atompair_inputs, window_size = atoms_per_window)
+                atom_input.atompair_inputs = full_pairwise_repr_to_windowed(atompair_inputs, window_size = atoms_per_window)
 
             if exists(atompair_ids):
                 atompair_ids_is_windowed = atompair_ids.ndim == 3
 
                 if not atompair_ids_is_windowed:
-                    atom_input['atompair_ids'] = full_attn_bias_to_windowed(atompair_ids, window_size = atoms_per_window)
+                    atom_input.atompair_ids = full_attn_bias_to_windowed(atompair_ids, window_size = atoms_per_window)
 
     # separate input dictionary into keys and values
 
-    keys = atom_inputs[0].keys()
-    atom_inputs = [i.values() for i in atom_inputs]
+    keys = asdict(atom_inputs[0]).keys()
+    atom_inputs = [asdict(i).values() for i in atom_inputs]
 
+    # each group zipped below holds one field's values gathered across all inputs
     outputs = []
 
     for grouped in zip(*atom_inputs):
@@ -183,7 +172,7 @@ def collate_af3_inputs(
 
     # reconstitute dictionary
 
-    batched_atom_inputs = BatchedAtomInput(tuple(zip(keys, outputs)))
+    batched_atom_inputs = BatchedAtomInput(**dict(tuple(zip(keys, outputs))))
     return batched_atom_inputs
 
 @typecheck
@@ -522,7 +511,7 @@ def __call__(
                     # model forwards
 
                     loss, loss_breakdown = self.model(
-                        **inputs,
+                        **asdict(inputs),
                         return_loss_breakdown = True
                     )
 
@@ -582,11 +571,11 @@ def __call__(
 
                     for valid_batch in self.valid_dataloader:
                         valid_loss, loss_breakdown = self.ema_model(
-                            **valid_batch,
+                            **asdict(valid_batch),
                             return_loss_breakdown = True
                         )
 
-                        valid_batch_size = valid_batch.get('atom_inputs').shape[0]
+                        valid_batch_size = valid_batch.atom_inputs.shape[0]
                         scale = valid_batch_size / self.valid_dataset_size
 
                         total_valid_loss += valid_loss.item() * scale
@@ -620,11 +609,11 @@ def __call__(
 
                 for test_batch in self.test_dataloader:
                     test_loss, loss_breakdown = self.ema_model(
-                        **test_batch,
+                        **asdict(test_batch),
                         return_loss_breakdown = True
                     )
 
-                    test_batch_size = test_batch.get('atom_inputs').shape[0]
+                    test_batch_size = test_batch.atom_inputs.shape[0]
                     scale = test_batch_size / self.test_dataset_size
 
                     total_test_loss += test_loss.item() * scale
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "alphafold3-pytorch"
-version = "0.1.69"
+version = "0.1.70"
 description = "Alphafold 3 - Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
diff --git a/tests/test_trainer.py b/tests/test_trainer.py