
Commit 1a45062

address #124 and #68
1 parent f317060 commit 1a45062

3 files changed: +26 −7 lines changed


alphafold3_pytorch/alphafold3.py

Lines changed: 20 additions & 5 deletions
@@ -2407,7 +2407,8 @@ def __init__(
         S_tmax = 50,
         S_noise = 1.003,
         smooth_lddt_loss_kwargs: dict = dict(),
-        weighted_rigid_align_kwargs: dict = dict()
+        weighted_rigid_align_kwargs: dict = dict(),
+        karras_formulation = False  # use the original EDM formulation from Karras et al. (Table 1 in https://arxiv.org/abs/2206.00364) - the differences are that the noise and sampling schedules are scaled by sigma_data, and the loss weight adds sigma_data in the denominator instead of multiplying by it
     ):
         super().__init__()
         self.net = net
@@ -2440,6 +2441,10 @@ def __init__(
 
         self.register_buffer('zero', torch.tensor(0.), persistent = False)
 
+        # whether to use the original karras formulation or not
+
+        self.karras_formulation = karras_formulation
+
     @property
     def device(self):
         return next(self.net.parameters()).device
@@ -2504,7 +2509,9 @@ def sample_schedule(self, num_sample_steps = None):
         sigmas = (self.sigma_max ** inv_rho + steps / (N - 1) * (self.sigma_min ** inv_rho - self.sigma_max ** inv_rho)) ** self.rho
 
         sigmas = F.pad(sigmas, (0, 1), value = 0.) # last step is sigma value of 0.
-        return sigmas
+
+        scale = 1. if self.karras_formulation else self.sigma_data
+        return sigmas * scale
 
     @torch.no_grad()
     def sample(
@@ -2573,11 +2580,17 @@ def sample(
 
     # training
 
-    def loss_weight(self, sigma):
+    def karras_loss_weight(self, sigma):
         return (sigma ** 2 + self.sigma_data ** 2) * (sigma * self.sigma_data) ** -2
 
+    def loss_weight(self, sigma):
+        """ in the AlphaFold 3 paper, sigma_data is added in the denominator rather than multiplied as in the original EDM paper """
+        return (sigma ** 2 + self.sigma_data ** 2) * (sigma + self.sigma_data) ** -2
+
     def noise_distribution(self, batch_size):
-        return (self.P_mean + self.P_std * torch.randn((batch_size,), device = self.device)).exp()
+        scale = 1. if self.karras_formulation else self.sigma_data
+
+        return (self.P_mean + self.P_std * torch.randn((batch_size,), device = self.device)).exp() * scale
 
     def forward(
         self,
@@ -2672,7 +2685,9 @@ def forward(
 
         # regular loss weight as defined in EDM paper
 
-        loss_weights = self.loss_weight(padded_sigmas)
+        loss_weight_fn = self.karras_loss_weight if self.karras_formulation else self.loss_weight
+
+        loss_weights = loss_weight_fn(padded_sigmas)
 
         losses = losses * loss_weights
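The core of the change is the loss weighting. Below is a minimal standalone sketch (not from the commit) contrasting the two formulations that karras_formulation toggles; the sigma_data value is an illustrative assumption, not the module's default, and the function names simply mirror those in the diff.

import torch

sigma_data = 16.  # illustrative value only; the module uses its own self.sigma_data

def karras_loss_weight(sigma):
    # original EDM weighting: sigma_data multiplies sigma in the denominator
    return (sigma ** 2 + sigma_data ** 2) * (sigma * sigma_data) ** -2

def loss_weight(sigma):
    # AlphaFold 3 variant: sigma_data is added to sigma in the denominator
    return (sigma ** 2 + sigma_data ** 2) * (sigma + sigma_data) ** -2

sigmas = torch.tensor([0.5, 2., 16., 64.])
print(karras_loss_weight(sigmas))  # grows without bound as sigma -> 0
print(loss_weight(sigmas))         # stays bounded as sigma -> 0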

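The flag also rescales the sampling schedule and the training noise distribution by sigma_data. Here is a standalone sketch of both, following the two hunks above; the hyperparameter values (sigma_min, sigma_max, rho, P_mean, P_std, sigma_data) are illustrative assumptions, not taken from the repository.

import torch
import torch.nn.functional as F

sigma_min, sigma_max, rho = 0.002, 80., 7.   # assumed EDM-style values for illustration
P_mean, P_std = -1.2, 1.2                    # assumed log-normal noise parameters
sigma_data = 16.                             # assumed data scale
karras_formulation = False

def sample_schedule(num_sample_steps = 32):
    N = num_sample_steps
    inv_rho = 1. / rho
    steps = torch.arange(N, dtype = torch.float32)
    sigmas = (sigma_max ** inv_rho + steps / (N - 1) * (sigma_min ** inv_rho - sigma_max ** inv_rho)) ** rho
    sigmas = F.pad(sigmas, (0, 1), value = 0.)  # last step is a sigma of 0
    scale = 1. if karras_formulation else sigma_data  # the commit scales the schedule by sigma_data
    return sigmas * scale

def noise_distribution(batch_size):
    scale = 1. if karras_formulation else sigma_data  # likewise for sampled training noise levels
    return (P_mean + P_std * torch.randn((batch_size,))).exp() * scale

print(sample_schedule(8))
print(noise_distribution(4))
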
pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "alphafold3-pytorch"
-version = "0.2.78"
+version = "0.2.79"
 description = "Alphafold 3 - Pytorch"
 authors = [
     { name = "Phil Wang", email = "[email protected]" }

tests/test_af3.py

Lines changed: 5 additions & 1 deletion
@@ -285,7 +285,10 @@ def test_sequence_local_attn():
     out = attn(atoms, attn_bias = attn_bias)
     assert out.shape == atoms.shape
 
-def test_diffusion_module():
+@pytest.mark.parametrize('karras_formulation', (True, False))
+def test_diffusion_module(
+    karras_formulation
+):
 
     seq_len = 16
 
@@ -338,6 +341,7 @@ def test_diffusion_module():
 
     edm = ElucidatedAtomDiffusion(
         diffusion_module,
+        karras_formulation = karras_formulation,
         num_sample_steps = 2
     )
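To exercise both code paths locally, the parametrized test can be run on its own (this assumes pytest and the project's test dependencies are installed):

pytest tests/test_af3.py -k test_diffusion_module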
