Skip to content

Commit 158c91b

Browse files
committed
complete magnitude preserving temporal unet layers for space-time karras unet
1 parent cbb294d commit 158c91b

File tree

3 files changed

+79
-18
lines changed

3 files changed

+79
-18
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,8 @@ assert noised_video.shape == denoised_video.shape
7575
- [x] expose only temporal parameters for learning, freeze everything else
7676
- [x] figure out the best way to deal with the time conditioning after temporal downsampling - instead of pytree transform at the beginning, probably will need to hook into all the modules and inspect the batch sizes
7777
- [x] handle middle modules that may have output shape as `(batch, seq, dim)`
78+
- [x] following the conclusions of Tero Karras, improvise a variant of the 4 modules with magnitude preservation
7879

79-
- [ ] following the conclusions of Tero Karras, improvise a variant of the 4 modules with magnitude preservation
8080
- [ ] test out on <a href="https://github.com/lucidrains/imagen-pytorch">imagen-pytorch</a>
8181

8282
## Citations

lumiere_pytorch/mp_lumiere.py

Lines changed: 77 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,11 @@ def compact_values(d: dict):
4949
def l2norm(t, dim = -1, eps = 1e-12):
5050
return F.normalize(t, dim = dim, eps = eps)
5151

52+
def interpolate_1d(x, length, mode = 'bilinear'):
    """
    Resample the last (temporal) axis of a (batch, channels, time) tensor
    to `length` frames with F.interpolate.

    F.interpolate's 'bilinear' mode requires a 4d input, so a singleton
    trailing spatial axis is appended, the tensor is interpolated to
    (length, 1), and the axis is removed again. The reshape is done with
    native unsqueeze/squeeze instead of an einops rearrange round-trip —
    same result, no third-party call for a trivial axis insertion.
    """
    x = x.unsqueeze(-1)                            # (b, c, t) -> (b, c, t, 1)
    x = F.interpolate(x, (length, 1), mode = mode)
    return x.squeeze(-1)                           # (b, c, length, 1) -> (b, c, length)
56+
5257
# mp activations
5358
# section 2.5
5459

@@ -85,6 +90,65 @@ def forward(self, x):
8590
weight = l2norm(self.weight, eps = self.eps) / sqrt(self.fan_in)
8691
return F.linear(x, weight)
8792

93+
# forced weight normed conv2d and linear
94+
# algorithm 1 in paper
95+
96+
class Conv2d(Module):
    """
    Bias-free 2d convolution with forced weight normalization
    (algorithm 1 of the magnitude-preserving paper).

    While training, the stored parameter is re-projected onto the unit
    hypersphere — one row per output filter — under no_grad, so the raw
    weights never drift in magnitude. The weight actually convolved is an
    l2-normalized view scaled by 1 / sqrt(fan_in), applied with 'same'
    padding.

    NOTE(review): the training-time projection normalizes each flattened
    output filter, but the forward-path l2norm runs over the weight's last
    axis only (kernel width) — confirm this asymmetry is intended.
    """

    def __init__(
        self,
        dim_in,
        dim_out,
        kernel_size,
        eps = 1e-4
    ):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(dim_out, dim_in, kernel_size, kernel_size))

        self.eps = eps
        # inputs feeding each output unit: channels * kernel area
        self.fan_in = dim_in * kernel_size ** 2

    def forward(self, x):
        if self.training:
            # snap the raw parameter back onto the unit hypersphere,
            # per output filter, without tracking gradients
            with torch.no_grad():
                flat_weight, packed_shape = pack_one(self.weight, 'o *')
                renormed = l2norm(flat_weight, eps = self.eps)
                self.weight.copy_(unpack_one(renormed, packed_shape, 'o *'))

        effective_weight = l2norm(self.weight, eps = self.eps) / sqrt(self.fan_in)
        return F.conv2d(x, effective_weight, padding = 'same')
121+
122+
class Conv1d(Module):
    """
    Bias-free 1d convolution with forced weight normalization
    (algorithm 1 of the magnitude-preserving paper), mirroring Conv2d.

    `init_dirac = True` initializes the kernel with nn.init.dirac_
    (identity-style kernel), so the layer starts close to a passthrough
    before the magnitude normalization is applied.

    While training, the stored parameter is re-projected onto the unit
    hypersphere per output filter under no_grad; the convolved weight is
    an l2-normalized view scaled by 1 / sqrt(fan_in) with 'same' padding.
    """

    def __init__(
        self,
        dim_in,
        dim_out,
        kernel_size,
        eps = 1e-4,
        init_dirac = False
    ):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(dim_out, dim_in, kernel_size))

        if init_dirac:
            nn.init.dirac_(self.weight)

        self.eps = eps
        # inputs feeding each output unit: channels * kernel width
        self.fan_in = dim_in * kernel_size

    def forward(self, x):
        if self.training:
            # snap the raw parameter back onto the unit hypersphere,
            # per output filter, without tracking gradients
            with torch.no_grad():
                flat_weight, packed_shape = pack_one(self.weight, 'o *')
                renormed = l2norm(flat_weight, eps = self.eps)
                self.weight.copy_(unpack_one(renormed, packed_shape, 'o *'))

        effective_weight = l2norm(self.weight, eps = self.eps) / sqrt(self.fan_in)
        return F.conv1d(x, effective_weight, padding = 'same')
151+
88152
# pixelnorm
89153
# equation (30)
90154

@@ -183,18 +247,18 @@ def __init__(
183247
super().__init__()
184248
self.time_dim = time_dim
185249
self.channel_last = channel_last
186-
187-
self.conv = nn.Conv1d(dim, dim, kernel_size = 3, stride = 2, padding = 1)
188-
init_bilinear_kernel_1d_(self.conv)
250+
self.conv = Conv1d(dim, dim, 3, init_dirac = True)
189251

190252
@handle_maybe_channel_last
@image_or_video_to_time
def forward(
    self,
    x
):
    """
    Halve the temporal axis: bilinear-resize time to t // 2, then apply
    the dirac-initialized magnitude-preserving conv.

    The decorators presumably fold the spatial/channel-last layout into a
    (batch, channels, time) view before this body runs — confirm against
    the decorator definitions.
    """
    time = x.shape[-1]
    assert time > 1, 'time dimension must be greater than 1 to be compressed'

    return self.conv(interpolate_1d(x, time // 2))
199263

200264
class MPTemporalUpsample(Module):
@@ -207,16 +271,16 @@ def __init__(
207271
super().__init__()
208272
self.time_dim = time_dim
209273
self.channel_last = channel_last
210-
211-
self.conv = nn.ConvTranspose1d(dim, dim, kernel_size = 3, stride = 2, padding = 1, output_padding = 1)
212-
init_bilinear_kernel_1d_(self.conv)
274+
self.conv = Conv1d(dim, dim, 3, init_dirac = True)
213275

214276
@handle_maybe_channel_last
@image_or_video_to_time
def forward(
    self,
    x
):
    """
    Double the temporal axis: bilinear-resize time to t * 2, then apply
    the dirac-initialized magnitude-preserving conv.

    The decorators presumably fold the spatial/channel-last layout into a
    (batch, channels, time) view before this body runs — confirm against
    the decorator definitions.
    """
    return self.conv(interpolate_1d(x, x.shape[-1] * 2))
221285

222286
# main modules
@@ -233,26 +297,23 @@ def __init__(
233297
mp_add_t = 0.3
234298
):
235299
super().__init__()
236-
assert is_odd(conv2d_kernel_size)
237-
assert is_odd(conv1d_kernel_size)
238-
239300
self.time_dim = time_dim
240301
self.channel_last = channel_last
241302

242303
self.spatial_conv = nn.Sequential(
243-
nn.Conv2d(dim, dim, conv2d_kernel_size, padding = conv2d_kernel_size // 2),
304+
Conv2d(dim, dim, conv2d_kernel_size, 3),
244305
MPSiLU()
245306
)
246307

247308
self.temporal_conv = nn.Sequential(
248-
nn.Conv1d(dim, dim, conv1d_kernel_size, padding = conv1d_kernel_size // 2),
309+
Conv1d(dim, dim, conv1d_kernel_size, 3),
249310
MPSiLU()
250311
)
251312

252-
self.proj_out = nn.Conv1d(dim, dim, 1)
253-
254-
nn.init.zeros_(self.proj_out.weight)
255-
nn.init.zeros_(self.proj_out.bias)
313+
self.proj_out = nn.Sequential(
314+
Conv1d(dim, dim, 1),
315+
Gain()
316+
)
256317

257318
self.residual_mp_add = MPAdd(t = mp_add_t)
258319

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
setup(
44
name = 'lumiere-pytorch',
55
packages = find_packages(exclude=[]),
6-
version = '0.0.16',
6+
version = '0.0.17',
77
license='MIT',
88
description = 'Lumiere',
99
author = 'Phil Wang',

0 commit comments

Comments
 (0)