move condition logic to another class, to prepare for alternatives

lucidrains · lucidrains · commit 1e7e9f546833 · 2022-11-29T10:22:32.000-08:00
diff --git a/README.md b/README.md
@@ -4,8 +4,6 @@
 
 Implementation of <a href="https://arxiv.org/abs/2211.00611">MedSegDiff</a> in Pytorch - SOTA medical segmentation out of Baidu using DDPM and enhanced conditioning on the feature level, with filtering of features in fourier space.
 
-I will also add attention and introduce an extended type of cross modulation on the attention matrices, alphafold2 style.
-
 ## Install
 
 ```bash
@@ -47,8 +45,6 @@ pred.shape                              # predicted segmented images - (8, 3, 12
 
 ## Todo
 
-- [ ] add a cross attention variant for generating the attentive map (A)
-- [ ] modulate attention matrices in middle and other self attention layers, wherever full attention is used
 - [ ] some basic training code, with Trainer taking in custom dataset tailored for medical image formats
 
 ## Citations
diff --git a/med_seg_diff_pytorch/med_seg_diff_pytorch.py b/med_seg_diff_pytorch/med_seg_diff_pytorch.py
@@ -196,6 +196,19 @@ def forward(self, x):
         out = rearrange(out, 'b h (x y) d -> b (h d) x y', x = h, y = w)
         return self.to_out(out)
 
+# conditioning class
+
+class FourierConditioning(nn.Module):
+    """Modulate the condition `c` in fourier space with an attentive map built from `x` and `c`; returns the new condition in the dtype of `c`."""
+    def __init__(self, dim):
+        super().__init__()
+        self.norm_input = LayerNorm(dim, bias = True)
+        self.norm_condition = LayerNorm(dim, bias = True)
+
+    def forward(self, x, c):
+        attn_map = self.norm_input(x) * self.norm_condition(c) * c   # attentive map A, eq 3 in paper
+        return ifft2(fft2(c) * attn_map).real.type(c.dtype)          # modulate frequencies of c by A, eq 5 in paper
+
 # model
 
 class Unet(nn.Module):
@@ -207,7 +220,8 @@ def __init__(
         dim_mults=(1, 2, 4, 8),
         channels = 3,
         self_condition = False,
-        resnet_block_groups = 8
+        resnet_block_groups = 8,
+        conditioning_klass = FourierConditioning
     ):
         super().__init__()
 
@@ -242,18 +256,21 @@ def __init__(
 
         num_resolutions = len(in_out)
 
+        self.conditioners = nn.ModuleList([])
+
         # downsampling encoding blocks
 
         self.downs = nn.ModuleList([])
 
         for ind, (dim_in, dim_out) in enumerate(in_out):
             is_last = ind >= (num_resolutions - 1)
 
+            self.conditioners.append(conditioning_klass(dim_in))
+
             self.downs.append(nn.ModuleList([
                 block_klass(dim_in, dim_in, time_emb_dim = time_dim),
                 block_klass(dim_in, dim_in, time_emb_dim = time_dim),
                 Residual(LinearAttention(dim_in)),
-                LayerNorm(dim_in, bias = True),
                 Downsample(dim_in, dim_out) if not is_last else nn.Conv2d(dim_in, dim_out, 3, padding = 1)
             ]))
 
@@ -310,7 +327,7 @@ def forward(
 
         h = []
 
-        for (block1, block2, attn, norm, downsample), (cond_block1, cond_block2, cond_attn, cond_norm, cond_downsample) in zip(self.downs, self.cond_downs):
+        for (block1, block2, attn, downsample), (cond_block1, cond_block2, cond_attn, cond_downsample), conditioner in zip(self.downs, self.cond_downs, self.conditioners):
             x = block1(x, t)
             c = cond_block1(c, t)
 
@@ -322,22 +339,10 @@ def forward(
             x = attn(x)
             c = cond_attn(c)
 
-            # they create an attentive map A by element-wise multiplication of 
-            # then they use it to modulate it to modulate the condition in fourier space (ff-parse)
-            # eq. 3 in the paper
-
-            A = norm(x) * cond_norm(c) * c
-
-            # fc stands for conditioning in fourier space
-
-            fc = fft2(c)
-
-            fc = fc * A     # eq. 5 in paper
-
-            c = ifft2(fc).real
-            c = c.type(dtype)
+            # condition using modulation of fourier frequencies with attentive map
+            # you can test your own conditioners by passing in a different conditioning_klass, if you believe you can best the paper
 
-            # </conditioning>
+            c = conditioner(x, c)
 
             h.append(x)
 
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'med-seg-diff-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.0.2',
+  version = '0.0.3',
   license='MIT',
   description = 'MedSegDiff - SOTA medical image segmentation - Pytorch',
   author = 'Phil Wang',