sketch out the general outlines for how all the disparate types of input data will be handled

lucidrains · lucidrains · commit 06ae6afcce53 · 2024-06-08T09:31:27.000-07:00
diff --git a/alphafold3_pytorch/inputs.py b/alphafold3_pytorch/inputs.py
@@ -5,7 +5,7 @@
     Int, Bool, Float
 )
 
-# constants
+# atom level, what Alphafold3 accepts
 
 @typecheck
 class AtomInput(TypedDict):
@@ -25,3 +25,33 @@ class AtomInput(TypedDict):
     pae_labels:                 Int['*b n n'] | None
     pde_labels:                 Int['*b n'] | None
     resolved_labels:            Int['*b n'] | None
+
+# residue level - single chain proteins for starters
+# presumably n = residues, m = atoms, t = templates, s = msa rows (inferred from the field names) - TODO confirm
+@typecheck
+class ProteinInput(TypedDict):
+    residue_ids:                Int['*b n']
+    residue_atom_lens:          Int['*b n']
+    templates:                  Float['*b t n n dt']
+    msa:                        Float['*b s n dm']
+    template_mask:              Bool['*b t'] | None
+    msa_mask:                   Bool['*b s'] | None
+    atom_pos:                   Float['*b m 3'] | None
+    distance_labels:            Int['*b n n'] | None
+    pae_labels:                 Int['*b n n'] | None
+    pde_labels:                 Int['*b n'] | None
+    resolved_labels:            Int['*b n'] | None
+
+@typecheck
+def single_protein_input_to_atom_input(
+    residue_input: ProteinInput
+) -> AtomInput:
+    """Convert a residue-level ProteinInput into an atom-level AtomInput; placeholder that currently always raises NotImplementedError."""
+    raise NotImplementedError
+
+# registry mapping each supported input type to the function that transforms it down to AtomInput
+# the transform can be run ahead of time as preprocessing, or is applied automatically within the Trainer during data collation
+
+INPUT_TO_ATOM_TRANSFORM = {
+    ProteinInput: single_protein_input_to_atom_input
+}
diff --git a/alphafold3_pytorch/trainer.py b/alphafold3_pytorch/trainer.py
@@ -15,7 +15,8 @@
 )
 
 from alphafold3_pytorch.inputs import (
-    AtomInput
+    AtomInput,
+    INPUT_TO_ATOM_TRANSFORM
 )
 
 import torch
@@ -77,18 +78,31 @@ def collate_af3_inputs(
     if exists(map_input_fn):
         inputs = [map_input_fn(i) for i in inputs]
 
-    # make sure all inputs are AtomInput
+    # go through all the inputs
+    # and for any that is not an AtomInput, try to convert it using the transform function registered for its input type
 
-    assert all([beartype_isinstance(i, AtomInput) for i in inputs])
+    atom_inputs = []
+
+    for i in inputs:
+        if beartype_isinstance(i, AtomInput):
+            atom_inputs.append(i)
+            continue
+        # NOTE(review): TypedDict instances are plain dicts at runtime, so type(i) may never equal a registered key - confirm
+        maybe_to_atom_fn = INPUT_TO_ATOM_TRANSFORM.get(type(i), None)
+
+        if not exists(maybe_to_atom_fn):
+            raise TypeError(f'invalid input type {type(i)} being passed into Trainer that is not converted to AtomInput correctly')
+        # append the converted input - rebinding `atom_inputs` here would discard everything collected so far
+        atom_inputs.append(maybe_to_atom_fn(i))
 
     # separate input dictionary into keys and values
 
-    keys = inputs[0].keys()
-    inputs = [i.values() for i in inputs]
+    keys = atom_inputs[0].keys()
+    atom_inputs = [i.values() for i in atom_inputs]
 
     outputs = []
 
-    for grouped in zip(*inputs):
+    for grouped in zip(*atom_inputs):
         # if all None, just return None
 
         not_none_grouped = [*filter(exists, grouped)]
@@ -144,7 +158,8 @@ def collate_af3_inputs(
 
     # reconstitute dictionary
 
-    return AtomInput(tuple(zip(keys, outputs)))
+    batched_atom_inputs = AtomInput(tuple(zip(keys, outputs)))
+    return batched_atom_inputs
 
 @typecheck
 def DataLoader(
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "alphafold3-pytorch"
-version = "0.1.44"
+version = "0.1.45"
 description = "Alphafold 3 - Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }