@@ -134,6 +134,30 @@ def inner(
 
     return inner
 
+# handle channel last
+
+def handle_maybe_channel_last(fn):
+
+    @wraps(fn)
+    def inner(
+        self,
+        x,
+        *args,
+        **kwargs
+    ):
+
+        if self.channel_last:
+            x = rearrange(x, 'b ... c -> b c ...')
+
+        out = fn(self, x, *args, **kwargs)
+
+        if self.channel_last:
+            out = rearrange(out, 'b c ... -> b ... c')
+
+        return out
+
+    return inner
+
 # helpers
 
 def Sequential(*modules):
@@ -164,14 +188,17 @@ class TemporalDownsample(Module):
     def __init__(
         self,
         dim,
+        channel_last = False,
         time_dim = None
     ):
         super().__init__()
         self.time_dim = time_dim
+        self.channel_last = channel_last
 
         self.conv = nn.Conv1d(dim, dim, kernel_size = 3, stride = 2, padding = 1)
         init_bilinear_kernel_1d_(self.conv)
 
+    @handle_maybe_channel_last
     @image_or_video_to_time
     def forward(
         self,
@@ -185,14 +212,17 @@ class TemporalUpsample(Module):
     def __init__(
         self,
         dim,
+        channel_last = False,
         time_dim = None
     ):
         super().__init__()
         self.time_dim = time_dim
+        self.channel_last = channel_last
 
         self.conv = nn.ConvTranspose1d(dim, dim, kernel_size = 3, stride = 2, padding = 1, output_padding = 1)
         init_bilinear_kernel_1d_(self.conv)
 
+    @handle_maybe_channel_last
     @image_or_video_to_time
     def forward(
         self,
@@ -210,13 +240,15 @@ def __init__(
         conv2d_kernel_size = 3,
         conv1d_kernel_size = 3,
         groups = 8,
+        channel_last = False,
         time_dim = None
     ):
         super().__init__()
         assert is_odd(conv2d_kernel_size)
         assert is_odd(conv1d_kernel_size)
 
         self.time_dim = time_dim
+        self.channel_last = channel_last
 
         self.spatial_conv = nn.Sequential(
             nn.Conv2d(dim, dim, conv2d_kernel_size, padding = conv2d_kernel_size // 2),
@@ -235,6 +267,7 @@ def __init__(
         nn.init.zeros_(self.proj_out.weight)
         nn.init.zeros_(self.proj_out.bias)
 
+    @handle_maybe_channel_last
     def forward(
         self,
         x,
@@ -277,11 +310,13 @@ def __init__(
         prenorm = True,
         residual_attn = True,
         time_dim = None,
+        channel_last = False,
         **attn_kwargs
     ):
         super().__init__()
 
         self.time_dim = time_dim
+        self.channel_last = channel_last
 
         self.temporal_attns = ModuleList([])
 
@@ -304,6 +339,7 @@ def __init__(
         nn.init.zeros_(self.proj_out.weight)
         nn.init.zeros_(self.proj_out.bias)
 
+    @handle_maybe_channel_last
     def forward(
         self,
         x,
@@ -312,6 +348,9 @@ def forward(
         is_video = x.ndim == 5
         assert is_video ^ (exists(batch_size) or exists(self.time_dim)), 'either a tensor of shape (batch, channels, time, height, width) is passed in, or (batch * time, channels, height, width) along with `batch_size`'
 
+        if self.channel_last:
+            x = rearrange(x, 'b ... c -> b c ...')
+
         if is_video:
             batch_size = x.shape[0]
             x = rearrange(x, 'b c t h w -> b h w t c')
@@ -339,6 +378,9 @@ def forward(
         else:
             x = rearrange(x, 'b h w t c -> (b t) c h w')
 
+        if self.channel_last:
+            x = rearrange(x, 'b c ... -> b ... c')
+
         return x
 
 # post module hook wrapper
@@ -375,7 +417,9 @@ def __init__(
         upsample_module_names: List[str] = [],
         channels: int = 3,
         conv_inflation_kwargs: dict = dict(),
-        attn_inflation_kwargs: dict = dict()
+        attn_inflation_kwargs: dict = dict(),
+        downsample_kwargs: dict = dict(),
+        upsample_kwargs: dict = dict(),
     ):
         super().__init__()
 
@@ -421,8 +465,8 @@ def __init__(
 
         self.convs = ModuleList([ConvolutionInflationBlock(dim = shape[1], **conv_inflation_kwargs) for shape in conv_shapes])
         self.attns = ModuleList([AttentionInflationBlock(dim = shape[1], **attn_inflation_kwargs) for shape in attn_shapes])
-        self.downsamples = ModuleList([TemporalDownsample(dim = shape[1]) for shape in downsample_shapes])
-        self.upsamples = ModuleList([TemporalUpsample(dim = shape[1]) for shape in upsample_shapes])
+        self.downsamples = ModuleList([TemporalDownsample(dim = shape[1], **downsample_kwargs) for shape in downsample_shapes])
+        self.upsamples = ModuleList([TemporalUpsample(dim = shape[1], **upsample_kwargs) for shape in upsample_shapes])
 
         # insert all the temporal modules with hooks
 
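Usage sketch (not part of the commit): a minimal example of how the new `channel_last` flag on the temporal modules could be exercised, assuming the classes are importable from the package root and using illustrative dims and tensor shapes. The modules still run channel-first internally; the decorator only converts a channel-last input to channel-first before the wrapped forward and converts the output back.

# minimal sketch, assuming `lumiere_pytorch` exposes these classes at the package root
import torch
from lumiere_pytorch import TemporalDownsample, TemporalUpsample

down = TemporalDownsample(dim = 512, channel_last = True)
up = TemporalUpsample(dim = 512, channel_last = True)

# channel-last video features: (batch, time, height, width, channels) - illustrative shape
video = torch.randn(1, 8, 16, 16, 512)

downsampled = down(video)    # time halved   -> (1, 4, 16, 16, 512)
upsampled = up(downsampled)  # time restored -> (1, 8, 16, 16, 512)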