
Commit f66be08

add layerscale, for zeroing out the contributions from the MSA module and template embedder, so one can introduce MSAs and templates at a later stage
1 parent dc40483 commit f66be08

3 files changed (+25, -6)
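For context, here is a minimal sketch of the pattern the commit message describes: the module's output is multiplied by a learnable per-channel scale initialized to zero, so a freshly added MSA module or template embedder contributes nothing to the pairwise representation until training moves the scale away from zero. The names below (`ToyGatedModule`, `net`, the tensor shapes) are illustrative stand-ins, not the repository's API; only the `layerscale_output` construction mirrors the diff that follows.

```python
import torch
from torch import nn

class ToyGatedModule(nn.Module):
    def __init__(self, dim_pairwise = 128, layerscale_output = True):
        super().__init__()
        self.net = nn.Linear(dim_pairwise, dim_pairwise)

        # learnable per-channel scale, zero-initialized (mirrors the diff);
        # falls back to a plain 1. multiplier when the gate is disabled
        self.layerscale_output = nn.Parameter(torch.zeros(dim_pairwise)) if layerscale_output else 1.

    def forward(self, pairwise_repr):
        out = self.net(pairwise_repr)
        return out * self.layerscale_output

module = ToyGatedModule()
pairwise_repr = torch.randn(2, 16, 16, 128)

# at initialization the gated module contributes exactly zero to the
# pairwise representation, so MSA / template information can be switched
# on later without perturbing what has already been learned
assert torch.allclose(module(pairwise_repr), torch.zeros_like(pairwise_repr))
```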

README.md

Lines changed: 11 additions & 0 deletions
````diff
@@ -248,3 +248,14 @@ docker run -v .:/data --gpus all -it af3
     url = {https://api.semanticscholar.org/CorpusID:258564608}
 }
 ```
+
+```bibtex
+@article{Wang2022DeepNetST,
+    title = {DeepNet: Scaling Transformers to 1, 000 Layers},
+    author = {Hongyu Wang and Shuming Ma and Li Dong and Shaohan Huang and Dongdong Zhang and Furu Wei},
+    journal = {ArXiv},
+    year = {2022},
+    volume = {abs/2203.00555},
+    url = {https://api.semanticscholar.org/CorpusID:247187905}
+}
+```
````

alphafold3_pytorch/alphafold3.py

Lines changed: 13 additions & 5 deletions
```diff
@@ -900,7 +900,8 @@ def __init__(
         msa_pwa_heads = 8,
         msa_pwa_dim_head = 32,
         pairwise_block_kwargs: dict = dict(),
-        max_num_msa: int | None = None
+        max_num_msa: int | None = None,
+        layerscale_output: bool = True
     ):
         super().__init__()
 
@@ -947,6 +948,8 @@ def __init__(
 
         self.layers = layers
 
+        self.layerscale_output = nn.Parameter(torch.zeros(dim_pairwise)) if layerscale_output else 1.
+
     @typecheck
     def forward(
         self,
@@ -1012,7 +1015,7 @@ def forward(
             has_msa, pairwise_repr, 0.
         )
 
-        return pairwise_repr
+        return pairwise_repr * self.layerscale_output
 
 # pairformer stack
 
@@ -1214,7 +1217,8 @@ def __init__(
         dim_pairwise = 128,
         pairformer_stack_depth = 2,
         pairwise_block_kwargs: dict = dict(),
-        eps = 1e-5
+        eps = 1e-5,
+        layerscale_output = True
     ):
         super().__init__()
         self.eps = eps
@@ -1246,6 +1250,8 @@ def __init__(
             nn.ReLU()
         )
 
+        self.layerscale = nn.Parameter(torch.zeros(dim_pairwise)) if layerscale_output else 1.
+
     @typecheck
     def forward(
         self,
@@ -1299,7 +1305,7 @@ def forward(
             has_templates, out, 0.
         )
 
-        return out
+        return out * self.layerscale
 
 # diffusion related
 # both diffusion transformer as well as atom encoder / decoder
@@ -2852,6 +2858,7 @@ def __init__(
         template_embedder_kwargs: dict = dict(
             pairformer_stack_depth = 2,
             pairwise_block_kwargs = dict(),
+            layerscale_output = True,
         ),
         msa_module_kwargs: dict = dict(
             depth = 4,
@@ -2861,7 +2868,8 @@ def __init__(
             msa_pwa_dropout_row_prob = 0.15,
             msa_pwa_heads = 8,
             msa_pwa_dim_head = 32,
-            pairwise_block_kwargs = dict()
+            pairwise_block_kwargs = dict(),
+            layerscale_output = True,
         ),
         pairformer_stack: dict = dict(
             depth = 48,
```
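As a usage note, the dicts below restate the constructor defaults visible in the hunks above, with the new `layerscale_output` flag included. This is a hedged reconstruction: keys that fall outside the shown hunks are omitted, so these are not the complete defaults of the class being configured. Passing `layerscale_output = False` keeps the plain `1.` multiplier, i.e. the previous ungated behavior.

```python
# reconstructed only from the hunks above; keys outside the shown hunks are omitted

template_embedder_kwargs = dict(
    pairformer_stack_depth = 2,
    pairwise_block_kwargs = dict(),
    layerscale_output = True,   # new: zero-init gate on the template embedder output
)

msa_module_kwargs = dict(
    depth = 4,
    msa_pwa_dropout_row_prob = 0.15,
    msa_pwa_heads = 8,
    msa_pwa_dim_head = 32,
    pairwise_block_kwargs = dict(),
    layerscale_output = True,   # new: zero-init gate on the MSA module output
)
```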

pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,6 +1,6 @@
 [project]
 name = "alphafold3-pytorch"
-version = "0.1.27"
+version = "0.1.28"
 description = "Alphafold 3 - Pytorch"
 authors = [
     { name = "Phil Wang", email = "[email protected]" }
```
