@@ -99,18 +99,17 @@ def pad_to_multiple(
     return pad_at_dim(t, (0, padding_needed), dim = dim, value = value)
 
 @typecheck
-def concat_neighboring_windows(
+def concat_previous_window(
     t: Tensor,
     *,
     dim_seq: int,
     dim_window: int
 ):
-    t = pad_at_dim(t, (1, 1), dim = dim_seq, value = 0.)
+    t = pad_at_dim(t, (1, 0), dim = dim_seq, value = 0.)
 
     t = torch.cat((
-        slice_at_dim(t, slice(None, -2), dim = dim_seq),
-        slice_at_dim(t, slice(1, -1), dim = dim_seq),
-        slice_at_dim(t, slice(2, None), dim = dim_seq)
+        slice_at_dim(t, slice(None, -1), dim = dim_seq),
+        slice_at_dim(t, slice(1, None), dim = dim_seq),
     ), dim = dim_window)
 
     return t
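Note (illustration only, not part of this commit): for a concrete windowed tensor of shape (nw, w, w, d) with dim_seq = 0 and dim_window = -2, the renamed helper now pads a single zero window in front of the window-sequence dim and pairs each window with its predecessor, so the window dim doubles to 2*w instead of tripling to 3*w as before.

import torch

nw, w, d = 4, 3, 2
t = torch.randn(nw, w, w, d)

# pad_at_dim(t, (1, 0), dim = dim_seq): one zero window in front of the window-sequence dim
t_padded = torch.cat((torch.zeros(1, w, w, d), t), dim = 0)

# previous window | current window, concatenated along dim_window
out = torch.cat((t_padded[:-1], t_padded[1:]), dim = -2)

assert out.shape == (nw, w, 2 * w, d)   # was (nw, w, 3 * w, d) before this change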
@@ -121,14 +120,14 @@ def concat_neighboring_windows(
 def full_pairwise_repr_to_windowed(
     pairwise_repr: Float['... m m dp'],
     window_size: int
-) -> Float['... n w (w*3) dp']:
+) -> Float['... n w (w*2) dp']:
 
     seq_len, device = pairwise_repr.shape[-2], pairwise_repr.device
 
     padding_needed = (window_size - (seq_len % window_size)) % window_size
     pairwise_repr = F.pad(pairwise_repr, (0, 0, 0, padding_needed, 0, padding_needed), value = 0.)
     pairwise_repr = rearrange(pairwise_repr, '... (i w1) (j w2) d -> ... i j w1 w2 d', w1 = window_size, w2 = window_size)
-    pairwise_repr = concat_neighboring_windows(pairwise_repr, dim_seq = -4, dim_window = -2)
+    pairwise_repr = concat_previous_window(pairwise_repr, dim_seq = -4, dim_window = -2)
 
     # get the diagonal
 
@@ -145,7 +144,7 @@ def full_pairwise_repr_to_windowed(
 def full_attn_bias_to_windowed(
     attn_bias: Float['... m m'],
     window_size: int
-) -> Float['... n w (w*3)']:
+) -> Float['... n w (w*2)']:
 
     attn_bias = rearrange(attn_bias, '... -> ... 1')
     attn_bias = full_pairwise_repr_to_windowed(attn_bias, window_size = window_size)
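Shape check (again an illustration, not code from this file): with the previous-window-only concat, full_pairwise_repr_to_windowed returns '... n w (w*2) dp' and full_attn_bias_to_windowed returns '... n w (w*2)'. The sketch below replays the same steps for an unbatched pairwise tensor, assuming the sequence length is already a multiple of the window size and taking the i == j diagonal with a plain loop.

import torch
from einops import rearrange

m, w, dp = 16, 4, 8        # sequence length, window size, pairwise dim
n = m // w                 # number of windows (no padding needed for this choice of m)

pairwise = torch.randn(m, m, dp)
x = rearrange(pairwise, '(i w1) (j w2) d -> i j w1 w2 d', w1 = w, w2 = w)

# concat_previous_window with dim_seq = -4 (the j windows) and dim_window = -2 (w2)
x = torch.cat((torch.zeros(n, 1, w, w, dp), x), dim = 1)
x = torch.cat((x[:, :-1], x[:, 1:]), dim = -2)

# keep only the diagonal windows (i == j)
windowed = torch.stack([x[i, i] for i in range(n)])

assert windowed.shape == (n, w, 2 * w, dp)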
@@ -215,7 +214,7 @@ def forward(
         seq: Float['b i d'],
         mask: Bool['b n'] | None = None,
         context: Float['b j d'] | None = None,
-        attn_bias: Float['... i j'] | Float['... nw w (w*3)'] | None = None
+        attn_bias: Float['... i j'] | Float['... nw w (w*2)'] | None = None
 
     ) -> Float['b i d']:
 
@@ -316,7 +315,7 @@ def local_attn(
         k: Float['b h n d'],
         v: Float['b h n d'],
         mask: Bool['b n'] | None = None,
-        attn_bias: Float['... n n'] | Float['... nw w (w*3)'] | None = None
+        attn_bias: Float['... n n'] | Float['... nw w (w*2)'] | None = None
     ) -> Float['b h n d']:
         """
         simple local attention with a radius of 1 window size
@@ -345,11 +344,11 @@ def local_attn(
         # just do radius of 1 for now
         # perhaps not even necessary, and could try shifted windows (a la Swin)
 
-        k, v = tuple(pad_at_dim(t, (1, 1), dim = -2) for t in (k, v))
-        mask = F.pad(mask, (1, 1), value = False)
+        k, v = tuple(pad_at_dim(t, (1, 0), dim = -2) for t in (k, v))
+        mask = F.pad(mask, (1, 0), value = False)
 
-        k, v = tuple(torch.cat((t[..., :-2, :], t[..., 1:-1, :], t[..., 2:, :]), dim = -2) for t in (k, v))
-        mask = torch.cat((mask[..., :-2], mask[..., 1:-1], mask[..., 2:]), dim = -1)
+        k, v = tuple(torch.cat((t[..., :-1, :], t[..., 1:, :]), dim = -2) for t in (k, v))
+        mask = torch.cat((mask[..., :-1], mask[..., 1:]), dim = -1)
 
         # handle attention bias (inefficiently)
 
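Net effect on the attention math (sketch with assumed shapes, since the windowing of q/k/v happens outside this hunk): each query window of size w now scores against 2*w keys, the previous window followed by its own, which is exactly the '... nw w (w*2)' layout the windowed attn_bias annotations were changed to.

import torch

b, h, nw, w, d = 1, 2, 4, 3, 8         # assumed: batch, heads, num windows, window size, head dim

q = torch.randn(b, h, nw, w, d)        # one window of queries at a time
k = torch.randn(b, h, nw, 2 * w, d)    # keys carrying previous window | current window
bias = torch.randn(nw, w, 2 * w)       # windowed attention bias, '... nw w (w*2)'

sim = torch.einsum('bhnid,bhnjd->bhnij', q, k) + bias

assert sim.shape == (b, h, nw, w, 2 * w)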
@@ -399,7 +398,7 @@ def forward(
         k: Float['b h j d'],
         v: Float['b h j d'],
         mask: Bool['b j'] | None = None,
-        attn_bias: Float['... i j'] | Float['... nw w (w*3)'] | None = None,
+        attn_bias: Float['... i j'] | Float['... nw w (w*2)'] | None = None,
     ) -> Float['b h i d']:
 
         is_windowed_attn_bias = None
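Finally, a small usage sketch (mine; the import path and sizes are assumptions): a caller holding a full '... m m' bias can pre-window it with the helper updated above, and after this commit the layout it should pass in is '... n w (w*2)' rather than '... n w (w*3)'.

import torch
from alphafold3_pytorch.attention import full_attn_bias_to_windowed   # assumed import path

m, w = 16, 4                           # example sizes only
attn_bias = torch.randn(m, m)

windowed_bias = full_attn_bias_to_windowed(attn_bias, window_size = w)
assert windowed_bias.shape == (m // w, w, 2 * w)    # '... n w (w*2)'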