@@ -65,42 +65,20 @@ def __init__(self, fn):
     def forward(self, x, *args, **kwargs):
         return self.fn(x, *args, **kwargs) + x
 
-def Upsample(dim, dim_out = None):
-    return nn.Sequential(
-        nn.Upsample(scale_factor = 2, mode = 'nearest'),
-        nn.Conv2d(dim, default(dim_out, dim), 3, padding = 1)
-    )
-
-def Downsample(dim, dim_out = None):
-    return nn.Conv2d(dim, default(dim_out, dim), 4, 2, 1)
+# use layernorm without bias, more stable
 
 class LayerNorm(nn.Module):
     def __init__(self, dim):
         super().__init__()
-        self.g = nn.Parameter(torch.ones(1, dim, 1, 1))
-
-    def forward(self, x):
-        eps = 1e-5 if x.dtype == torch.float32 else 1e-3
-        var = torch.var(x, dim = 1, unbiased = False, keepdim = True)
-        mean = torch.mean(x, dim = 1, keepdim = True)
-        return (x - mean) * var.clamp(min = eps).rsqrt() * self.g
-
-class PreNorm(nn.Module):
-    def __init__(self, dim, fn):
-        super().__init__()
-        self.fn = fn
-        self.norm = LayerNorm(dim)
+        self.gamma = nn.Parameter(torch.ones(dim))
+        self.register_buffer("beta", torch.zeros(dim))
 
     def forward(self, x):
-        x = self.norm(x)
-        return self.fn(x)
+        return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta)
 
 # positional embeds
 
 class LearnedSinusoidalPosEmb(nn.Module):
-    """ following @crowsonkb 's lead with learned sinusoidal pos emb """
-    """ https://github.com/crowsonkb/v-diffusion-jax/blob/master/diffusion/models/danbooru_128.py#L8 """
-
     def __init__(self, dim):
         super().__init__()
         assert (dim % 2) == 0
@@ -114,54 +92,13 @@ def forward(self, x):
         fouriered = torch.cat((x, fouriered), dim = -1)
         return fouriered
 
-# building block modules
-
-class Block(nn.Module):
-    def __init__(self, dim, dim_out, groups = 8):
-        super().__init__()
-        self.proj = nn.Conv2d(dim, dim_out, 3, padding = 1)
-        self.norm = nn.GroupNorm(groups, dim_out)
-        self.act = nn.SiLU()
-
-    def forward(self, x, scale_shift = None):
-        x = self.proj(x)
-        x = self.norm(x)
-
-        if exists(scale_shift):
-            scale, shift = scale_shift
-            x = x * (scale + 1) + shift
-
-        x = self.act(x)
-        return x
-
-class ResnetBlock(nn.Module):
-    def __init__(self, dim, dim_out, *, time_emb_dim = None, groups = 8):
-        super().__init__()
-        self.mlp = nn.Sequential(
-            nn.SiLU(),
-            nn.Linear(time_emb_dim, dim_out * 2)
-        ) if exists(time_emb_dim) else None
-
-        self.block1 = Block(dim, dim_out, groups = groups)
-        self.block2 = Block(dim_out, dim_out, groups = groups)
-        self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()
-
-    def forward(self, x, time_emb = None):
-
-        scale_shift = None
-        if exists(self.mlp) and exists(time_emb):
-            time_emb = self.mlp(time_emb)
-            time_emb = rearrange(time_emb, 'b c -> b c 1 1')
-            scale_shift = time_emb.chunk(2, dim = 1)
-
-        h = self.block1(x, scale_shift = scale_shift)
-
-        h = self.block2(h)
-
-        return h + self.res_conv(x)
-
 class LinearAttention(nn.Module):
-    def __init__(self, dim, heads = 4, dim_head = 32):
+    def __init__(
+        self,
+        dim,
+        heads = 4,
+        dim_head = 32
+    ):
         super().__init__()
         self.scale = dim_head ** -0.5
         self.heads = heads
@@ -182,7 +119,6 @@ def forward(self, x):
         k = k.softmax(dim = -1)
 
         q = q * self.scale
-        v = v / (h * w)
 
         context = torch.einsum('b h d n, b h e n -> b h d e', k, v)
 
@@ -191,7 +127,12 @@ def forward(self, x):
         return self.to_out(out)
 
 class Attention(nn.Module):
-    def __init__(self, dim, heads = 4, dim_head = 32):
+    def __init__(
+        self,
+        dim,
+        heads = 4,
+        dim_head = 32
+    ):
         super().__init__()
         self.scale = dim_head ** -0.5
         self.heads = heads
@@ -212,6 +153,27 @@ def forward(self, x):
         out = rearrange(out, 'b h (x y) d -> b (h d) x y', x = h, y = w)
         return self.to_out(out)
 
+class FiLM(nn.Module):
+    def __init__(
+        self,
+        dim,
+        hidden_dim
+    ):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(dim, hidden_dim * 4),
+            nn.SiLU(),
+            nn.Linear(hidden_dim * 4, hidden_dim * 2)
+        )
+
+        nn.init.zeros_(self.net[-1].weight)
+        nn.init.zeros_(self.net[-1].bias)
+
+    def forward(self, conditions, hiddens):
+        scale, shift = self.net(conditions).chunk(2, dim = -1)
+        scale, shift = map(lambda t: rearrange(t, 'b d -> b 1 d'), (scale, shift))
+        return hiddens * (scale + 1) + shift
+
 # model
 
 class RIN(nn.Module):
@@ -352,7 +314,7 @@ def beta_linear_log_snr(t):
     return -log(expm1(1e-4 + 10 * (t ** 2)))
 
 def alpha_cosine_log_snr(t, s = 0.008):
-    return -log((torch.cos((t + s) / (1 + s) * math.pi * 0.5) ** -2) - 1, eps = 1e-5) # not sure if this accounts for beta being clipped to 0.999 in discrete version
+    return -log((torch.cos((t + s) / (1 + s) * math.pi * 0.5) ** -2) - 1, eps = 1e-5)
 
 def gamma_sigmoid_log_snr(t, start = -3, end = 3, tau = 1, clamp_min = 1e-5):
     v_start = torch.tensor(start / tau).sigmoid()
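Illustration (not part of the commit): a minimal, self-contained sketch of how the bias-free LayerNorm and the zero-initialized FiLM module introduced in this diff might be exercised. The dimension values and tensor shapes below are hypothetical, chosen only for the example, and it assumes torch and einops are installed.

import torch
from torch import nn
import torch.nn.functional as F
from einops import rearrange

class LayerNorm(nn.Module):
    # layernorm without a learned bias; beta is registered as a zero buffer
    def __init__(self, dim):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(dim))
        self.register_buffer("beta", torch.zeros(dim))

    def forward(self, x):
        return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta)

class FiLM(nn.Module):
    # feature-wise scale and shift computed from a conditioning vector;
    # the final linear layer is zero-initialized, so the module starts as an identity map
    def __init__(self, dim, hidden_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim * 4),
            nn.SiLU(),
            nn.Linear(hidden_dim * 4, hidden_dim * 2)
        )
        nn.init.zeros_(self.net[-1].weight)
        nn.init.zeros_(self.net[-1].bias)

    def forward(self, conditions, hiddens):
        scale, shift = self.net(conditions).chunk(2, dim = -1)
        scale, shift = map(lambda t: rearrange(t, 'b d -> b 1 d'), (scale, shift))
        return hiddens * (scale + 1) + shift

# hypothetical shapes: a batch of 2 sequences of 64 tokens with dim 256,
# conditioned on a 128-dim embedding
norm = LayerNorm(256)
film = FiLM(dim = 128, hidden_dim = 256)

tokens = torch.randn(2, 64, 256)
cond = torch.randn(2, 128)

out = film(cond, norm(tokens))
print(out.shape)                          # torch.Size([2, 64, 256])
assert torch.allclose(out, norm(tokens))  # identity at init, due to the zero init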