@@ -2,6 +2,7 @@
 
 from functools import wraps, partial
 from dataclasses import asdict
+from contextlib import contextmanager
 from pathlib import Path
 
 from alphafold3_pytorch.alphafold3 import Alphafold3
@@ -64,6 +65,22 @@ def divisible_by(num, den): |
 def at_most_one_of(*flags: bool) -> bool:
     return sum([*map(int, flags)]) <= 1
 
+@contextmanager
+def to_device_and_back(
+    module: Module,
+    device: torch.device
+):
+    orig_device = next(module.parameters()).device
+    need_move_device = orig_device != device
+
+    if need_move_device:
+        module.to(device)
+
+    yield
+
+    if need_move_device:
+        module.to(orig_device)
+
 def cycle(dataloader: DataLoader):
     while True:
         for batch in dataloader:
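
The new to_device_and_back helper round-trips a module onto a target device for the duration of a block, and is a no-op when the module already lives there. A minimal usage sketch, assuming a CUDA machine and a toy Linear module (both illustrative, not part of the diff):

import torch
from torch.nn import Linear

module = Linear(4, 4)                                  # starts out on CPU

with to_device_and_back(module, torch.device('cuda')):
    # inside the block the parameters live on CUDA
    out = module(torch.randn(1, 4, device = 'cuda'))

# on exit the module is restored to its original device
assert next(module.parameters()).device.type == 'cpu'

Note the context manager does not wrap its yield in try/finally, so an exception raised inside the block leaves the module on the target device.
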
@@ -284,6 +301,7 @@ def __init__( |
         ema_kwargs: dict = dict(
             use_foreach = True
         ),
+        ema_on_cpu: bool = False,
         use_adam_atan2: bool = False,
         use_lion: bool = False,
         use_torch_compile: bool = False
@@ -314,9 +332,13 @@ def __init__( |
                 model,
                 beta = ema_decay,
                 include_online_model = False,
+                allow_different_devices = True,
                 **ema_kwargs
             )
 
+            self.ema_device = 'cpu' if ema_on_cpu else self.device
+            self.ema_model.to(self.ema_device)
+
         # maybe torch compile
 
         if use_torch_compile:
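
With ema_on_cpu = True the EMA shadow weights are kept in system RAM rather than on the accelerator, and allow_different_devices = True tells ema-pytorch to tolerate the online and EMA copies living on different devices. A minimal sketch of the same pattern outside the Trainer, with a toy module standing in for Alphafold3 (hyperparameters illustrative):

import torch
from torch import nn
from ema_pytorch import EMA

online = nn.Linear(8, 8).cuda()          # online model trains on the GPU

ema = EMA(
    online,
    beta = 0.999,
    include_online_model = False,
    allow_different_devices = True       # shadow may live on another device
)
ema.to('cpu')                            # shadow copy stays in system RAM

for _ in range(10):
    # ... optimizer step on the CUDA online model would go here ...
    ema.update()                         # refreshes the CPU-resident shadow

The trade-off is a host-device copy on every EMA update (and again at eval time, via to_device_and_back), in exchange for not holding a second full set of weights on the GPU.
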
@@ -437,6 +459,10 @@ def __init__( |
         self.last_loaded_train_id = None
         self.model_loaded_from_path: Path | None = None
 
+    @property
+    def device(self):
+        return self.fabric.device
+
     @property
     def is_main(self):
         return self.fabric.global_rank == 0
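
The device property mirrors is_main in delegating to Lightning Fabric, giving the EMA placement and evaluation code a single source of truth for the process' root device. A small sketch of what it resolves to, assuming lightning.fabric.Fabric is the trainer's fabric backend:

from lightning.fabric import Fabric

fabric = Fabric(accelerator = 'auto', devices = 1)
fabric.launch()
print(fabric.device)   # e.g. device(type='cuda', index=0) on a single-GPU run
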
@@ -656,7 +682,7 @@ def __call__( |
             ):
                 eval_model = default(self.ema_model, self.model)
 
-                with torch.no_grad():
+                with torch.no_grad(), to_device_and_back(eval_model, self.device):
                     eval_model.eval()
 
                     total_valid_loss = 0.
@@ -696,7 +722,7 @@ def __call__( |
         if self.is_main and self.needs_test:
             eval_model = default(self.ema_model, self.model)
 
-            with torch.no_grad():
+            with torch.no_grad(), to_device_and_back(eval_model, self.device):
                 eval_model.eval()
 
                 total_test_loss = 0.