Skip to content

Commit 936210c

Browse files
committed
Clip gradients as in the paper, and also make sure not to do gradient sync until the last accumulation step
1 parent 422efba commit 936210c

File tree

2 files changed

+9
-3
lines changed

2 files changed

+9
-3
lines changed

alphafold3_pytorch/trainer.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,15 +158,21 @@ def __call__(
158158
steps = 0
159159

160160
while steps < self.num_train_steps:
161-
for _ in range(self.grad_accum_every):
161+
162+
for grad_accum_step in range(self.grad_accum_every):
163+
is_accumulating = grad_accum_step < (self.grad_accum_every - 1)
164+
162165
inputs = next(dl)
163166

164-
loss = self.model(**inputs)
167+
with self.fabric.no_backward_sync(self.model, enabled = is_accumulating):
168+
loss = self.model(**inputs)
165169

166170
self.fabric.backward(loss / self.grad_accum_every)
167171

168172
print(f'loss: {loss.item():.3f}')
169173

174+
self.fabric.clip_gradients(self.model, self.optimizer, max_norm = self.clip_grad_norm)
175+
170176
self.optimizer.step()
171177

172178
if self.is_main:

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "alphafold3-pytorch"
3-
version = "0.0.39"
3+
version = "0.0.40"
44
description = "Alphafold 3 - Pytorch"
55
authors = [
66
{ name = "Phil Wang", email = "[email protected]" }

0 commit comments

Comments (0)