 the magnitude-preserving unet proposed in https://arxiv.org/abs/2312.02696 by Karras et al.
 """
 
+from copy import deepcopy
+
 import math
 from math import sqrt, ceil
 from functools import partial
@@ -208,6 +210,7 @@ def __init__(
         attn_dim_head = 64,
         attn_res_mp_add_t = 0.3,
         attn_flash = False,
+        factorize_space_time_attn = False,
         downsample = False,
         downsample_config: Tuple[bool, bool, bool] = (True, True, True)
     ):
@@ -247,15 +250,25 @@ def __init__(
         self.res_mp_add = MPAdd(t = mp_add_t)
 
         self.attn = None
+        self.factorized_attn = factorize_space_time_attn
+
         if has_attn:
-            self.attn = Attention(
+            attn_kwargs = dict(
                 dim = dim_out,
                 heads = max(ceil(dim_out / attn_dim_head), 2),
                 dim_head = attn_dim_head,
                 mp_add_t = attn_res_mp_add_t,
                 flash = attn_flash
             )
 
+            if factorize_space_time_attn:
+                self.attn = nn.ModuleList([
+                    Attention(**attn_kwargs, only_space = True),
+                    Attention(**attn_kwargs, only_time = True),
+                ])
+            else:
+                self.attn = Attention(**attn_kwargs)
+
     def forward(
         self,
         x,
@@ -284,7 +297,13 @@ def forward(
         x = self.res_mp_add(x, res)
 
         if exists(self.attn):
-            x = self.attn(x)
+            if self.factorized_attn:
+                attn_space, attn_time = self.attn
+                x = attn_space(x)
+                x = attn_time(x)
+
+            else:
+                x = self.attn(x)
 
         return x
 
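When `factorized_attn` is set, the block above applies full attention within each frame and then attention along the time axis at each spatial position, instead of a single joint attention over all space-time tokens. A rough back-of-the-envelope sketch of what this does to the sequence length each softmax sees (illustration only, not code from the repo; the feature-map size is hypothetical):

```python
# Token counts per attention call at a hypothetical attention-resolution feature map.
t, h, w = 32, 16, 16

joint_tokens = t * h * w   # 8192 tokens when attending over space and time jointly
space_tokens = h * w       # 256 tokens per frame (time folded into the batch)
time_tokens  = t           # 32 tokens per spatial position (space folded into the batch)

print(joint_tokens, space_tokens, time_tokens)   # 8192 256 32
```

Since attention cost grows quadratically with sequence length, the factorized pair is far cheaper per block at video-scale resolutions, at the cost of never mixing space and time within a single attention step.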
@@ -301,6 +320,7 @@ def __init__(
         attn_dim_head = 64,
         attn_res_mp_add_t = 0.3,
         attn_flash = False,
+        factorize_space_time_attn = False,
         upsample = False,
         upsample_config: Tuple[bool, bool, bool] = (True, True, True)
     ):
@@ -335,15 +355,25 @@ def __init__(
         self.res_mp_add = MPAdd(t = mp_add_t)
 
         self.attn = None
+        self.factorized_attn = factorize_space_time_attn
+
         if has_attn:
-            self.attn = Attention(
+            attn_kwargs = dict(
                 dim = dim_out,
                 heads = max(ceil(dim_out / attn_dim_head), 2),
                 dim_head = attn_dim_head,
                 mp_add_t = attn_res_mp_add_t,
                 flash = attn_flash
             )
 
+            if factorize_space_time_attn:
+                self.attn = nn.ModuleList([
+                    Attention(**attn_kwargs, only_space = True),
+                    Attention(**attn_kwargs, only_time = True),
+                ])
+            else:
+                self.attn = Attention(**attn_kwargs)
+
     def forward(
         self,
         x,
@@ -369,7 +399,13 @@ def forward(
         x = self.res_mp_add(x, res)
 
         if exists(self.attn):
-            x = self.attn(x)
+            if self.factorized_attn:
+                attn_space, attn_time = self.attn
+                x = attn_space(x)
+                x = attn_time(x)
+
+            else:
+                x = self.attn(x)
 
         return x
 
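Both the encoder and decoder merge the residual path and the attention output with `MPAdd` rather than a plain sum; `mp_add_t` and `attn_res_mp_add_t` above are its blend factor. A minimal sketch, assuming `MPAdd` follows the magnitude-preserving sum from Karras et al. (the function name `mp_sum` here is illustrative, not from the file):

```python
import torch
from math import sqrt

# Hedged sketch of a magnitude-preserving sum: lerp two branches by t, then
# rescale so two unit-variance inputs give an (approximately) unit-variance output.
def mp_sum(a, b, t = 0.3):
    return (a * (1. - t) + b * t) / sqrt((1. - t) ** 2 + t ** 2)

x   = torch.randn(2, 64, 8, 16, 16)
res = torch.randn(2, 64, 8, 16, 16)
print(mp_sum(x, res).std())   # stays close to 1.0
```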
@@ -383,9 +419,13 @@ def __init__(
         dim_head = 64,
         num_mem_kv = 4,
         flash = False,
-        mp_add_t = 0.3
+        mp_add_t = 0.3,
+        only_space = False,
+        only_time = False
     ):
         super().__init__()
+        assert (int(only_space) + int(only_time)) <= 1
+
         self.heads = heads
         hidden_dim = dim_head * heads
 
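`only_space` and `only_time` are mutually exclusive, as the new assert enforces; when factorizing, the encoder/decoder blocks above construct one attention of each kind. For reference, those blocks size the heads as `max(ceil(dim_out / attn_dim_head), 2)`, so the inner width `heads * dim_head` equals `dim_out` whenever `dim_out` is a multiple of `attn_dim_head`. A small arithmetic check (illustration only), using the `attn_dim_head = 8` value from the example at the bottom of the file:

```python
from math import ceil

dim_out, attn_dim_head = 64, 8
heads = max(ceil(dim_out / attn_dim_head), 2)   # -> 8
hidden_dim = heads * attn_dim_head              # -> 64, equal to dim_out here
assert (heads, hidden_dim) == (8, 64)
```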
@@ -399,20 +439,41 @@ def __init__(
 
         self.mp_add = MPAdd(t = mp_add_t)
 
+        self.only_space = only_space
+        self.only_time = only_time
+
     def forward(self, x):
-        res, b, c, t, h, w = x, *x.shape
+        res, orig_shape = x, x.shape
+        b, c, t, h, w = orig_shape
+
+        qkv = self.to_qkv(x)
+
+        if self.only_space:
+            qkv = rearrange(qkv, 'b c t x y -> (b t) c x y')
+        elif self.only_time:
+            qkv = rearrange(qkv, 'b c t x y -> (b x y) c t')
+
+        qkv = qkv.chunk(3, dim = 1)
 
-        qkv = self.to_qkv(x).chunk(3, dim = 1)
-        q, k, v = map(lambda t: rearrange(t, 'b (h c) t x y -> b h (t x y) c', h = self.heads), qkv)
+        q, k, v = map(lambda t: rearrange(t, 'b (h c) ... -> b h (...) c', h = self.heads), qkv)
+
+        mk, mv = map(lambda t: repeat(t, 'h n d -> b h n d', b = k.shape[0]), self.mem_kv)
 
-        mk, mv = map(lambda t: repeat(t, 'h n d -> b h n d', b = b), self.mem_kv)
         k, v = map(partial(torch.cat, dim = -2), ((mk, k), (mv, v)))
 
         q, k, v = map(self.pixel_norm, (q, k, v))
 
         out = self.attend(q, k, v)
 
-        out = rearrange(out, 'b h (t x y) d -> b (h d) t x y', t = t, x = h, y = w)
+        out = rearrange(out, 'b h n d -> b (h d) n')
+
+        if self.only_space:
+            out = rearrange(out, '(b t) c n -> b c (t n)', t = t)
+        elif self.only_time:
+            out = rearrange(out, '(b x y) c n -> b c (n x y)', x = h, y = w)
+
+        out = out.reshape(orig_shape)
+
         out = self.to_out(out)
 
         return self.mp_add(out, res)
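The reworked forward folds the non-attended axes into the batch dimension right after `to_qkv` and unfolds them again before `to_out`: for space-only attention the time axis joins the batch so each frame attends independently, and for time-only attention the two spatial axes join the batch so each position attends along time. A standalone sketch of that folding pattern (illustration only, not the repo's `Attention` class: `F.scaled_dot_product_attention` stands in for `Attend`, and the memory key/values, pixel norm and projections are omitted):

```python
import torch
import torch.nn.functional as F
from einops import rearrange

def factorized_attention(x, heads = 4, only_space = False, only_time = False):
    assert int(only_space) + int(only_time) <= 1
    b, c, t, h, w = x.shape
    assert c % heads == 0

    if only_space:
        tokens = rearrange(x, 'b c t x y -> (b t) (x y) c')   # each frame attends over its pixels
    elif only_time:
        tokens = rearrange(x, 'b c t x y -> (b x y) t c')     # each pixel attends over time
    else:
        tokens = rearrange(x, 'b c t x y -> b (t x y) c')     # joint space-time attention

    q = k = v = rearrange(tokens, 'b n (h d) -> b h n d', h = heads)
    out = F.scaled_dot_product_attention(q, k, v)
    out = rearrange(out, 'b h n d -> b n (h d)')

    if only_space:
        out = rearrange(out, '(b t) (x y) c -> b c t x y', t = t, x = h, y = w)
    elif only_time:
        out = rearrange(out, '(b x y) t c -> b c t x y', x = h, y = w)
    else:
        out = rearrange(out, 'b (t x y) c -> b c t x y', t = t, x = h, y = w)

    return out

video_feats = torch.randn(2, 32, 8, 16, 16)
assert factorized_attention(video_feats, only_space = True).shape == video_feats.shape
assert factorized_attention(video_feats, only_time = True).shape == video_feats.shape
```

Note that the module's final `out.reshape(orig_shape)` assumes the attention's inner width `heads * dim_head` matches the block's channel count, which holds under the head sizing shown earlier (i.e. when the channel count is a multiple of `attn_dim_head`).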
@@ -446,7 +507,8 @@ def __init__(
         attn_res_mp_add_t = 0.3,
         resnet_mp_add_t = 0.3,
         dropout = 0.1,
-        self_condition = False
+        self_condition = False,
+        factorize_space_time_attn = False
     ):
         super().__init__()
 
@@ -576,6 +638,7 @@ def __init__(
                 has_attn = curr_image_res in attn_res,
                 upsample = True,
                 upsample_config = down_and_upsample_config,
+                factorize_space_time_attn = factorize_space_time_attn,
                 **block_kwargs
             )
 
@@ -593,6 +656,7 @@ def __init__(
                 downsample = True,
                 downsample_config = down_and_upsample_config,
                 has_attn = has_attn,
+                factorize_space_time_attn = factorize_space_time_attn,
                 **block_kwargs
             )
 
@@ -777,6 +841,7 @@ def forward(self, x):
     ),
     attn_dim_head = 8,
     num_classes = 1000,
+    factorize_space_time_attn = True  # whether to do attention across space and time separately
 )
 
 video = torch.randn(2, 4, 32, 64, 64)