feat: add option to insert quantizer as diffusion autoencoder bottleneck

flavioschneider · flavioschneider · commit f98c25e3c3c7 · 2022-09-10T11:45:33.000+02:00
diff --git a/README.md b/README.md
@@ -203,6 +203,8 @@ y_long = composer(y, keep_start=True) # [1, 1, 98304]
 - [x] Add trainer with experiments.
 - [x] Add diffusion upsampler.
 - [x] Add ancestral euler sampler `AEulerSampler`.
+- [x] Add diffusion autoencoder.
+- [x] Add autoencoder bottleneck option for quantization.
 
 ## Appreciation
 
diff --git a/audio_diffusion_pytorch/model.py b/audio_diffusion_pytorch/model.py
@@ -1,5 +1,5 @@
 import random
-from typing import Optional, Sequence, Union
+from typing import Any, Optional, Sequence, Tuple, Union
 
 import torch
 from torch import Tensor, nn
@@ -15,7 +15,7 @@
     Schedule,
 )
 from .modules import Encoder1d, ResnetBlock1d, UNet1d
-from .utils import default, prod, to_list
+from .utils import default, exists, prod, to_list
 
 """ Diffusion Classes (generic for 1d data) """
 
@@ -129,6 +129,13 @@ def sample(  # type: ignore
         return super().sample(noise, **{**default_kwargs, **kwargs})  # type: ignore
 
 
+class Bottleneck(nn.Module):
+    """Bottleneck interface (subclass can be provided to DiffusionAutoencoder1d)"""
+
+    def forward(self, x: Tensor) -> Tuple[Tensor, Any]:
+        raise NotImplementedError()  # subclasses return (output tensor, info)
+
+
 class DiffusionAutoencoder1d(Model1d):
     def __init__(
         self,
@@ -144,6 +151,7 @@ def __init__(
         encoder_depth: int,
         encoder_channels: int,
         context_channels: int,
+        bottleneck: Optional[Bottleneck] = None,
         **kwargs
     ):
         super().__init__(
@@ -162,6 +170,7 @@ def __init__(
 
         self.in_channels = in_channels
         self.encoder_factor = patch_size * prod(factors[0:encoder_depth])
+        self.bottleneck = bottleneck
 
         self.encoder = Encoder1d(
             in_channels=in_channels,
@@ -187,9 +196,15 @@ def forward(self, x: Tensor, **kwargs) -> Tensor:
         context = self.to_context(latent)
         return self.diffusion(x, context=[context], **kwargs)
 
-    def encode(self, x: Tensor) -> Tensor:
+    def encode(
+        self, x: Tensor, with_info: bool = False
+    ) -> Union[Tensor, Tuple[Tensor, Any]]:
         x = self.encoder(x)[-1]
         latent = torch.tanh(x)
+        # Apply bottleneck if provided (e.g. quantization module)
+        info = None  # stays None when no bottleneck is configured
+        if exists(self.bottleneck):
+            latent, info = self.bottleneck(latent)
-        return latent
+        return (latent, info) if with_info else latent
 
     def decode(self, latent: Tensor, **kwargs) -> Tensor:
diff --git a/audio_diffusion_pytorch/modules.py b/audio_diffusion_pytorch/modules.py
@@ -703,11 +703,12 @@ def __init__(
         self, in_channels: int, out_channels: int, kernel_sizes: Sequence[int]
     ):
         super().__init__()
+        mid_channels = in_channels * 8
 
         self.block1 = nn.ModuleList(
             Conv1d(
                 in_channels=in_channels,
-                out_channels=out_channels,
+                out_channels=mid_channels,
                 kernel_size=kernel_size,
                 padding=(kernel_size - 1) // 2,
             )
@@ -716,7 +717,7 @@ def __init__(
 
         self.block2 = nn.ModuleList(
             Conv1d(
-                in_channels=in_channels,
+                in_channels=mid_channels,
                 out_channels=out_channels,
                 kernel_size=kernel_size,
                 padding=(kernel_size - 1) // 2,
@@ -725,9 +726,9 @@ def __init__(
         )
 
     def forward(self, x: Tensor) -> Tensor:
-        xs = torch.stack([x] + [conv(x) for conv in self.block1])
+        xs = torch.stack([conv(x) for conv in self.block1])  # conv outputs only
         x = reduce(xs, "n b c t -> b c t", "sum")
-        xs = torch.stack([x] + [conv(x) for conv in self.block2])
+        xs = torch.stack([conv(x) for conv in self.block2])  # conv outputs only
         x = reduce(xs, "n b c t -> b c t", "sum")
         return x
 
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 setup(
     name="audio-diffusion-pytorch",
     packages=find_packages(exclude=[]),
-    version="0.0.27",
+    version="0.0.28",
     license="MIT",
     description="Audio Diffusion - PyTorch",
     long_description_content_type="text/markdown",