Commit 1b26c9f

address #4
1 parent b806d16 commit 1b26c9f

File tree

4 files changed: +19 -10 lines changed


lumiere_pytorch/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -2,8 +2,7 @@
     ConvolutionInflationBlock,
     AttentionInflationBlock,
     TemporalDownsample,
-    TemporalUpsample,
-    set_time_dim_
+    TemporalUpsample
 )
 
 from lumiere_pytorch.lumiere import Lumiere
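For reference, the top-level import surface after this change looks like the following (a sketch, assuming the re-exports shown in the hunk above):

# set_time_dim_ is no longer exported; Lumiere now stamps batch_dim
# internally through set_attr_on_klasses_ (see lumiere.py below)
from lumiere_pytorch import (
    ConvolutionInflationBlock,
    AttentionInflationBlock,
    TemporalDownsample,
    TemporalUpsample,
    Lumiere
)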

lumiere_pytorch/lumiere.py

Lines changed: 13 additions & 6 deletions
@@ -16,7 +16,7 @@
 import torch.nn.functional as F
 
 from beartype import beartype
-from beartype.typing import List, Tuple, Optional, Type
+from beartype.typing import List, Tuple, Optional, Type, Any
 
 from einops import rearrange, pack, unpack, repeat
 
@@ -92,14 +92,15 @@ def freeze_all_layers_(module):
 
 # function that takes in the entire text-to-video network, and sets the time dimension
 
-def set_time_dim_(
+def set_attr_on_klasses_(
     klasses: Tuple[Type[Module]],
     model: Module,
-    time_dim: int
+    attr_name: str,
+    value: Any
 ):
     for model in model.modules():
         if isinstance(model, klasses):
-            model.time_dim = time_dim
+            setattr(model, attr_name, value)
 
 # decorator for residual
 
@@ -135,7 +136,7 @@ def inner(
             batch_size = x.shape[0]
             x = rearrange(x, 'b c t h w -> b h w c t')
         else:
-            assert exists(batch_size) or exists(self.time_dim)
+            batch_size = default(batch_size, self.batch_dim)
             rearrange_kwargs = dict(b = batch_size, t = self.time_dim)
             x = rearrange(x, '(b t) c h w -> b h w c t', **compact_values(rearrange_kwargs))
 
@@ -212,6 +213,7 @@ def __init__(
         time_dim = None
     ):
         super().__init__()
+        self.batch_dim = None
         self.time_dim = time_dim
         self.channel_last = channel_last
 
@@ -236,6 +238,7 @@ def __init__(
         time_dim = None
     ):
         super().__init__()
+        self.batch_dim = None
         self.time_dim = time_dim
         self.channel_last = channel_last
 
@@ -267,6 +270,7 @@ def __init__(
         assert is_odd(conv2d_kernel_size)
         assert is_odd(conv1d_kernel_size)
 
+        self.batch_dim = None
         self.time_dim = time_dim
         self.channel_last = channel_last
 
@@ -302,6 +306,7 @@ def forward(
 
         x = self.spatial_conv(x)
 
+        batch_size = default(batch_size, self.batch_dim)
         rearrange_kwargs = compact_values(dict(b = batch_size, t = self.time_dim))
 
         assert len(rearrange_kwargs) > 0, 'either batch_size is passed in on forward, or time_dim is set on init'
@@ -335,6 +340,7 @@ def __init__(
     ):
         super().__init__()
 
+        self.batch_dim = None
         self.time_dim = time_dim
         self.channel_last = channel_last
 
@@ -376,6 +382,7 @@ def forward(
             batch_size = x.shape[0]
             x = rearrange(x, 'b c t h w -> b h w t c')
         else:
+            batch_size = default(batch_size, self.batch_dim)
            assert exists(batch_size) or exists(self.time_dim)
 
             rearrange_kwargs = dict(b = batch_size, t = self.time_dim)
@@ -579,7 +586,7 @@ def forward(
 
         # set the correct time dimension for all temporal layers
 
-        set_time_dim_(self.temporal_klasses, self, time)
+        set_attr_on_klasses_(self.temporal_klasses, self, 'batch_dim', batch)
 
         # forward all images into text-to-image model
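The renamed helper is now generic: it walks the module tree and stamps an arbitrary attribute onto every instance of the given classes, rather than hard-coding time_dim. A minimal runnable sketch of that behavior; the toy nn.Sequential network and the nn.Linear target are illustrative stand-ins, not from the repo:

from torch import nn
from typing import Any, Tuple, Type

def set_attr_on_klasses_(
    klasses: Tuple[Type[nn.Module], ...],
    model: nn.Module,
    attr_name: str,
    value: Any
):
    # walk all submodules and set the attribute in-place on matching classes
    for module in model.modules():
        if isinstance(module, klasses):
            setattr(module, attr_name, value)

# illustrative usage: nn.Linear stands in for the temporal blocks
net = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))
set_attr_on_klasses_((nn.Linear,), net, 'batch_dim', 8)

assert all(m.batch_dim == 8 for m in net.modules() if isinstance(m, nn.Linear))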
lumiere_pytorch/mp_lumiere.py

Lines changed: 4 additions & 1 deletion
@@ -290,6 +290,7 @@ def __init__(
         dropout = 0.
     ):
         super().__init__()
+        self.batch_dim = None
         self.time_dim = time_dim
         self.channel_last = channel_last
 
@@ -327,6 +328,7 @@ def forward(
 
         x = self.spatial_conv(x)
 
+        batch_size = default(batch_size, self.batch_dim)
         rearrange_kwargs = compact_values(dict(b = batch_size, t = self.time_dim))
 
         assert len(rearrange_kwargs) > 0, 'either batch_size is passed in on forward, or time_dim is set on init'
@@ -388,13 +390,14 @@ def forward(
         batch_size = None
     ):
         is_video = x.ndim == 5
+
+        batch_size = default(batch_size, self.batch_dim)
         assert is_video ^ (exists(batch_size) or exists(self.time_dim)), 'either a tensor of shape (batch, channels, time, height, width) is passed in, or (batch * time, channels, height, width) along with `batch_size`'
 
         if self.channel_last:
             x = rearrange(x, 'b ... c -> b c ...')
 
         if is_video:
-            batch_size = x.shape[0]
             x = rearrange(x, 'b c t h w -> b h w t c')
         else:
             assert exists(batch_size) or exists(self.time_dim)
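Every `batch_size = default(batch_size, self.batch_dim)` line above follows the same fallback: Lumiere stamps batch_dim onto each temporal layer at the start of its forward pass (via set_attr_on_klasses_), so a layer receiving a flattened (batch * time, ...) tensor can recover the batch dimension without the caller threading batch_size through. A sketch of the pattern, assuming exists, default, and compact_values mirror the one-line helpers in lumiere.py:

import torch
from einops import rearrange

def exists(v):
    return v is not None

def default(v, d):
    return v if exists(v) else d

def compact_values(d: dict):
    return {k: v for k, v in d.items() if exists(v)}

batch_dim = 2        # stamped on the layer by set_attr_on_klasses_
batch_size = None    # caller did not pass batch_size on forward

# flattened (batch * time, channels, height, width), batch = 2, time = 3
x = torch.randn(2 * 3, 8, 16, 16)

batch_size = default(batch_size, batch_dim)
rearrange_kwargs = compact_values(dict(b = batch_size, t = None))  # time_dim unset

x = rearrange(x, '(b t) c h w -> b h w c t', **rearrange_kwargs)
print(x.shape)  # torch.Size([2, 16, 16, 8, 3])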

setup.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 setup(
   name = 'lumiere-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.0.20',
+  version = '0.0.21',
   license='MIT',
   description = 'Lumiere',
   author = 'Phil Wang',
