@@ -81,6 +81,15 @@ def __init__(self, dim):
     def forward(self, x):
         return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta)
 
+class MultiHeadedRMSNorm(nn.Module):
+    def __init__(self, dim, heads = 1):
+        super().__init__()
+        self.scale = dim ** 0.5
+        self.gamma = nn.Parameter(torch.ones(heads, 1, dim))
+
+    def forward(self, x):
+        return F.normalize(x, dim = -1) * self.scale * self.gamma
+
 # positional embeds
 
 class LearnedSinusoidalPosEmb(nn.Module):
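Note: the added MultiHeadedRMSNorm L2-normalizes the last dimension, rescales by sqrt(dim), and applies a learned per-head gain. A minimal sketch of its effect on a (batch, heads, seq, dim_head) tensor, with made-up shapes (illustrative only, not part of the diff):

import torch
import torch.nn.functional as F

b, h, n, d = 2, 4, 16, 32                    # hypothetical shapes
q = torch.randn(b, h, n, d)
gamma = torch.ones(h, 1, d)                  # learned per-head gain, ones at init
q_normed = F.normalize(q, dim = -1) * (d ** 0.5) * gamma   # unit norm per head, rescaled and gated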
@@ -104,6 +113,7 @@ def __init__(
         heads = 4,
         dim_head = 32,
         norm = False,
+        qk_norm = False,
         time_cond_dim = None
     ):
         super().__init__()
@@ -127,6 +137,11 @@ def __init__(
 
         self.to_qkv = nn.Linear(dim, hidden_dim * 3, bias = False)
 
+        self.qk_norm = qk_norm
+        if qk_norm:
+            self.q_norm = MultiHeadedRMSNorm(dim_head, heads)
+            self.k_norm = MultiHeadedRMSNorm(dim_head, heads)
+
         self.to_out = nn.Sequential(
             nn.Linear(hidden_dim, dim, bias = False),
             LayerNorm(dim)
@@ -148,6 +163,10 @@ def forward(
         qkv = self.to_qkv(x).chunk(3, dim = -1)
         q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)
 
+        if self.qk_norm:
+            q = self.q_norm(q)
+            k = self.k_norm(k)
+
         q = q.softmax(dim = -1)
         k = k.softmax(dim = -2)
 
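Note: in LinearAttention the normalization hits q and k right after the heads are split out and before the q/k softmaxes. A hedged usage sketch, assuming the classes in this file are in scope and with an illustrative dim (forward may additionally take a time-conditioning input when time_cond_dim is set):

attn = LinearAttention(dim = 256, qk_norm = True)   # heads = 4, dim_head = 32 come from the defaults above
out = attn(torch.randn(1, 1024, 256))               # assumes a plain x-only forward when no time conditioning is used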
@@ -169,7 +188,8 @@ def __init__(
         norm = False,
         norm_context = False,
         time_cond_dim = None,
-        flash = False
+        flash = False,
+        qk_norm = False
     ):
         super().__init__()
         hidden_dim = dim_head * heads
@@ -197,6 +217,11 @@ def __init__(
         self.to_kv = nn.Linear(dim_context, hidden_dim * 2, bias = False)
         self.to_out = nn.Linear(hidden_dim, dim, bias = False)
 
+        self.qk_norm = qk_norm
+        if qk_norm:
+            self.q_norm = MultiHeadedRMSNorm(dim_head, heads)
+            self.k_norm = MultiHeadedRMSNorm(dim_head, heads)
+
         self.attend = Attend(flash = flash)
 
     def forward(
@@ -222,6 +247,10 @@ def forward(
         qkv = (self.to_q(x), *self.to_kv(context).chunk(2, dim = -1))
         q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)
 
+        if self.qk_norm:
+            q = self.q_norm(q)
+            k = self.k_norm(k)
+
         out = self.attend(q, k, v)
 
         out = rearrange(out, 'b h n d -> b n (h d)')
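Note: the (cross-)attention class gets the same treatment, normalizing q and k per head before handing them to Attend. Constructing it with the new flag might look like the following sketch (argument values are illustrative; the remaining arguments keep the defaults from the signature above):

cross_attn = Attention(dim = 256, dim_head = 32, heads = 4, qk_norm = True, flash = False)
# out = cross_attn(x, context)  -- assumes forward takes the queried tokens plus a context sequence, per the to_kv(context) call above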