Add the ability to precompute an atom dataset from a PDB dataset, driven entirely by a training config, and then take a training step

lucidrains · lucidrains · commit 95c99d17f7b4 · 2024-07-28T07:39:35.000-07:00
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -1,6 +1,9 @@
 name: Pytest
 on: [push, pull_request]
 
+env:
+  TYPECHECK: True
+
 jobs:
   build:
 
diff --git a/alphafold3_pytorch/__init__.py b/alphafold3_pytorch/__init__.py
@@ -45,7 +45,6 @@
     AtomDataset,
     PDBInput,
     PDBDataset,
-    DatasetWithReturnedIndex,
     maybe_transform_to_atom_input,
     maybe_transform_to_atom_inputs,
 )
diff --git a/alphafold3_pytorch/confidence.py b/alphafold3_pytorch/confidence.py
@@ -11,7 +11,6 @@
     repeat_consecutive_with_lens
 )
 
-
 from alphafold3_pytorch.tensor_typing import (
     Float,
     Int,
diff --git a/alphafold3_pytorch/configs.py b/alphafold3_pytorch/configs.py
@@ -7,7 +7,8 @@
 
 from alphafold3_pytorch.inputs import (
     AtomDataset,
-    PDBDataset
+    PDBDataset,
+    pdb_dataset_to_atom_inputs
 )
 
 from alphafold3_pytorch.trainer import (
@@ -145,6 +146,7 @@ class DatasetConfig(BaseModelWithExtra):
     train_folder: DirectoryPath
     valid_folder: DirectoryPath | None = None
     test_folder: DirectoryPath | None = None
+    convert_pdb_to_atom: bool = False
     train_weighted_sampler: WeightedPDBSamplerConfig | None = None
     kwargs: dict = dict()
 
@@ -219,6 +221,11 @@ def create_instance(
             dataset_type = dataset_config.dataset_type
             dataset_kwargs = dataset_config.kwargs
 
+            convert_pdb_to_atom = dataset_config.convert_pdb_to_atom
+
+            if convert_pdb_to_atom:
+                assert dataset_type == 'atom', 'must be `atom` dataset_type if `convert_pdb_to_atom` is set to True'
+
             if dataset_type == 'pdb':
                 dataset_klass = PDBDataset
             elif dataset_type == 'atom':
@@ -230,6 +237,11 @@ def create_instance(
 
             if exists(train_folder):
                 assert 'dataset' not in trainer_kwargs
+
+                if convert_pdb_to_atom:
+                    pdb_dataset = PDBDataset(train_folder, **dataset_kwargs)
+                    train_folder = pdb_dataset_to_atom_inputs(pdb_dataset)
+
                 dataset = dataset_klass(train_folder, **dataset_kwargs)
                 trainer_kwargs.update(dataset = dataset)
 
diff --git a/alphafold3_pytorch/inputs.py b/alphafold3_pytorch/inputs.py
@@ -7,6 +7,7 @@
 from functools import partial
 from itertools import groupby
 from collections import defaultdict
+from collections.abc import Iterable
 from dataclasses import asdict, dataclass, field
 from typing import Any, Callable, List, Literal, Set, Tuple, Type
 
@@ -187,11 +188,12 @@ def dict(self):
 @typecheck
 def atom_input_to_file(
     atom_input: AtomInput,
-    path: str,
+    path: str | Path,
     overwrite: bool = False
 ) -> Path:
 
-    path = Path(path)
+    if isinstance(path, str):
+        path = Path(path)
 
     if not overwrite:
         assert not path.exists()
@@ -211,6 +213,53 @@ def file_to_atom_input(path: str | Path) -> AtomInput:
     atom_input_dict = torch.load(str(path))
     return AtomInput(**atom_input_dict)
 
+@typecheck
+def pdb_dataset_to_atom_inputs(
+    pdb_dataset: PDBDataset,
+    *,
+    output_atom_folder: str | Path | None = None,
+    indices: Iterable | None = None,
+    return_atom_dataset = False,
+    verbose = True
+) -> Path | AtomDataset:
+    """
+    Convert entries of a PDBDataset to AtomInput files named `{index}.pt`; returns the folder, or an AtomDataset over it.
+    The folder defaults to `<pdb folder stem>.atom-inputs` beside the pdb folder; `indices` defaults to a random permutation.
+    """
+
+    if not exists(output_atom_folder):
+        pdb_folder = Path(pdb_dataset.folder).resolve()
+        parent_folder = pdb_folder.parents[0]
+        output_atom_folder = parent_folder / f'{pdb_folder.stem}.atom-inputs'
+
+    if isinstance(output_atom_folder, str):
+        output_atom_folder = Path(output_atom_folder)
+
+    if not exists(indices):
+        indices = torch.randperm(len(pdb_dataset)).tolist()
+
+    to_atom_input_fn = compose(
+        pdb_input_to_molecule_input,
+        molecule_to_atom_input
+    )
+
+    # plain `for` loop on purpose: a `while index := next(...)` loop would stop early at the falsy index 0
+    for index in indices:
+        pdb_input = pdb_dataset[index]
+
+        atom_input = to_atom_input_fn(pdb_input)
+        atom_input_path = output_atom_folder / f'{index}.pt'
+
+        atom_input_to_file(atom_input, atom_input_path)
+
+        if verbose:
+            logger.info(f'converted pdb input with index {index} to {str(atom_input_path)}')
+
+    if not return_atom_dataset:
+        return output_atom_folder
+
+    return AtomDataset(output_atom_folder)
+
 # Atom dataset that returns a AtomInput based on folders of atom inputs stored on disk
 
 class AtomDataset(Dataset):
@@ -221,11 +270,13 @@ def __init__(
         if isinstance(folder, str):
             folder = Path(folder)
 
-        assert folder.exists() and folder.is_dir()
+        assert folder.exists() and folder.is_dir(), f'atom dataset not found at {str(folder)}'
 
         self.folder = folder
         self.files = [*folder.glob('**/*.pt')]
 
+        assert len(self) > 0, f'no valid atom .pt files found at {str(folder)}'
+
     def __len__(self):
         return len(self.files)
 
@@ -1919,19 +1970,6 @@ def pdb_input_to_molecule_input(pdb_input: PDBInput) -> MoleculeInput:
 
 # datasets
 
-# dataset wrapper for returning index along with dataset item
-# for caching logic both integrated into trainer and for precaching
-
-class DatasetWithReturnedIndex(Dataset):
-    def __init__(self, dataset: Dataset):
-        self.dataset = dataset
-
-    def __len__(self):
-        return len(self.dataset)
-
-    def __getitem__(self, idx):
-        return idx, self.dataset[idx]
-
 # PDB dataset that returns a PDBInput based on folder
 
 class PDBDataset(Dataset):
@@ -1953,7 +1991,9 @@ def __init__(
         if isinstance(folder, str):
             folder = Path(folder)
 
-        assert folder.exists() and folder.is_dir()
+        assert folder.exists() and folder.is_dir(), f'{str(folder)} does not exist for PDBDataset'
+        self.folder = folder
+
         self.files = {
             os.path.splitext(os.path.basename(file.name))[0]: file
             for file in folder.glob(os.path.join("**", "*.cif"))
@@ -1967,6 +2007,8 @@ def __init__(
         self.training = training
         self.pdb_input_kwargs = pdb_input_kwargs
 
+        assert len(self) > 0, f'no valid mmcifs / pdbs found at {str(folder)}'
+
     def __len__(self):
         """Return the number of PDB mmCIF files in the dataset."""
         return len(self.files)
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "alphafold3-pytorch"
-version = "0.2.42"
+version = "0.2.43"
 description = "Alphafold 3 - Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
diff --git a/tests/configs/trainer_with_atom_dataset_created_from_pdb.yaml b/tests/configs/trainer_with_atom_dataset_created_from_pdb.yaml
@@ -0,0 +1,61 @@
+---
+model:
+    dim_atom: 4
+    dim_atompair: 4
+    dim_input_embedder_token: 4
+    dim_single: 4
+    dim_pairwise: 4
+    dim_token: 4
+    dim_atom_inputs: 3
+    dim_atompair_inputs: 1
+    dim_template_model: 8
+    atoms_per_window: 27
+    dim_template_feats: 44
+    # duplicate `num_dist_bins: 38` entry dropped here - mapping keys must be unique; the later `null` entry is kept
+    ignore_index: -1
+    num_dist_bins: null
+    num_plddt_bins: 50
+    num_pde_bins: 64
+    num_pae_bins: 64
+    sigma_data: 16
+    diffusion_num_augmentations: 4
+    loss_confidence_weight: 0.0001
+    loss_distogram_weight: 0.01
+    loss_diffusion_weight: 4.
+    confidence_head_kwargs:
+        pairformer_depth: 1
+    template_embedder_kwargs:
+        pairformer_stack_depth: 1
+    msa_module_kwargs:
+        depth: 1
+    pairformer_stack:
+        depth: 1
+        pair_bias_attn_dim_head: 4
+        pair_bias_attn_heads: 2
+    diffusion_module_kwargs:
+        atom_encoder_depth: 1
+        token_transformer_depth: 1
+        atom_decoder_depth: 1
+        atom_decoder_kwargs:
+            attn_pair_bias_kwargs:
+                dim_head: 4
+        atom_encoder_kwargs:
+            attn_pair_bias_kwargs:
+                dim_head: 4
+num_train_steps: 1
+batch_size: 1
+grad_accum_every: 1
+valid_every: 1
+use_ema: false
+ema_decay: 0.999
+lr: 0.0001
+clip_grad_norm: 10.
+accelerator: cpu
+checkpoint_prefix: af3.ckpt.
+checkpoint_every: 1
+checkpoint_folder: ./checkpoints
+overwrite_checkpoints: false
+dataset_config:
+    dataset_type: atom
+    convert_pdb_to_atom: true
+    train_folder: ./test-folder/data/train
diff --git a/tests/test_trainer.py b/tests/test_trainer.py
@@ -12,7 +12,6 @@
 from alphafold3_pytorch import (
     Alphafold3,
     PDBDataset,
-    DatasetWithReturnedIndex,
     AtomInput,
     atom_input_to_file,
     DataLoader,
@@ -314,17 +313,6 @@ def test_collate_fn():
 
     _, breakdown = alphafold3(**asdict(batched_atom_inputs), return_loss_breakdown = True)
 
-# test use of a dataset wrapper that returns the indices, for caching
-
-def test_dataset_return_index_wrapper():
-    dataset = MockAtomDataset(5)
-    wrapped_dataset = DatasetWithReturnedIndex(dataset)
-
-    assert len(wrapped_dataset) == len(dataset)
-
-    idx, item = wrapped_dataset[3]
-    assert idx == 3 and isinstance(item, AtomInput)
-
 # test creating trainer + alphafold3 from config
 
 def test_trainer_config():
@@ -387,6 +375,21 @@ def test_trainer_config_with_atom_dataset():
 
     shutil.rmtree(atom_folder, ignore_errors = True)
 
+# test creating trainer + alphafold3 with atom dataset that is precomputed from a pdb dataset
+
+def test_trainer_config_with_atom_dataset_from_pdb_dataset(populate_mock_pdb_and_remove_test_folders):
+    """Create a Trainer from the pdb-to-atom yaml config, then invoke it once."""
+
+    # the config sets `convert_pdb_to_atom: true` together with `dataset_type: atom`
+
+    tests_dir = Path(__file__).parent
+    config_path = tests_dir / 'configs/trainer_with_atom_dataset_created_from_pdb.yaml'
+
+    trainer = create_trainer_from_yaml(config_path)
+    assert isinstance(trainer, Trainer)
+
+    trainer()
+
 # test creating trainer without model, given when creating instance
 
 def test_trainer_config_without_model():

Original file line number	Diff line number	Diff line change
`@@ -45,7 +45,6 @@`
`45`	`45`	`AtomDataset,`
`46`	`46`	`PDBInput,`
`47`	`47`	`PDBDataset,`
`48`		`- DatasetWithReturnedIndex,`
`49`	`48`	`maybe_transform_to_atom_input,`
`50`	`49`	`maybe_transform_to_atom_inputs,`
`51`	`50`	`)`
Original file line number	Diff line number	Diff line change
`@@ -11,7 +11,6 @@`
`11`	`11`	`repeat_consecutive_with_lens`
`12`	`12`	`)`
`13`	`13`
`14`		`-`
`15`	`14`	`from alphafold3_pytorch.tensor_typing import (`
`16`	`15`	`Float,`
`17`	`16`	`Int,`