Commit bcbb510

feat: add new v-inpainter

1 parent a16e835 commit bcbb510

4 files changed: +96 −1 lines changed

README.md

Lines changed: 31 additions & 0 deletions

@@ -170,6 +170,37 @@ latent = autoencoder.encode(audio) # Encode
 sample = autoencoder.decode(latent, num_steps=10) # Decode by sampling diffusion model conditioning on latent
 ```

+## Other
+
+### Inpainting
+```py
+import torch
+from audio_diffusion_pytorch import UNetV0, VInpainter
+
+# The diffusion UNetV0 (this is an example, the net must be trained to work)
+net = UNetV0(
+    dim=1,
+    in_channels=2, # U-Net: number of input/output (audio) channels
+    channels=[8, 32, 64, 128, 256, 512, 512, 1024, 1024], # U-Net: channels at each layer
+    factors=[1, 4, 4, 4, 2, 2, 2, 2, 2], # U-Net: downsampling and upsampling factors at each layer
+    items=[1, 2, 2, 2, 2, 2, 2, 4, 4], # U-Net: number of repeating items at each layer
+    attentions=[0, 0, 0, 0, 0, 1, 1, 1, 1], # U-Net: attention enabled/disabled at each layer
+    attention_heads=8, # U-Net: number of attention heads per attention block
+    attention_features=64, # U-Net: number of attention features per attention block
+)
+
+# Instantiate inpainter with trained net
+inpainter = VInpainter(net=net)
+
+# Inpaint source
+y = inpainter(
+    source=torch.randn(1, 2, 2**18), # Start source
+    mask=torch.randint(0, 2, (1, 2, 2**18), dtype=torch.bool), # Set to `True` the parts you want to keep
+    num_steps=10, # Number of inpainting steps
+    num_resamples=2, # Number of resampling steps
+    show_progress=True,
+) # [1, 2, 2**18]
+```
+
 ## Appreciation

 * [StabilityAI](https://stability.ai/) for the compute, [Zach Evans](https://github.com/zqevans) and everyone else from [HarmonAI](https://www.harmonai.org/) for the interesting research discussions.
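The mask is a boolean tensor with the same shape as `source`: `True` marks samples to keep, `False` marks spans for the model to regenerate. As a sketch of a more typical mask than the random one above (reusing the `inpainter` from the snippet; the net must still be trained for the output to be meaningful), this keeps the first half of a clip and inpaints the second half:

```py
import torch

length = 2**18
source = torch.randn(1, 2, length)  # stand-in for a real stereo clip

# Keep the first half (True), let the model regenerate the second half (False)
mask = torch.zeros(1, 2, length, dtype=torch.bool)
mask[..., : length // 2] = True

y = inpainter(source=source, mask=mask, num_steps=10, num_resamples=2)  # [1, 2, length]
```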

audio_diffusion_pytorch/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -7,6 +7,7 @@
     Schedule,
     UniformDistribution,
     VDiffusion,
+    VInpainter,
     VSampler,
 )
 from .models import (
@@ -15,4 +16,5 @@
     DiffusionModel,
     DiffusionUpsampler,
     DiffusionVocoder,
+    EncoderBase,
 )
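Both new symbols resolve from the package root after this change; a quick smoke test (assuming the package at this commit is installed):

```py
from audio_diffusion_pytorch import EncoderBase, VInpainter

print(VInpainter.__name__, EncoderBase.__name__)  # VInpainter EncoderBase
```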

audio_diffusion_pytorch/diffusion.py

Lines changed: 62 additions & 0 deletions

@@ -8,6 +8,8 @@
 from torch import Tensor
 from tqdm import tqdm

+from .utils import default
+
 """ Distributions """


@@ -166,6 +168,7 @@ def get_alpha_beta(self, sigmas: Tensor) -> Tuple[Tensor, Tensor]:
         alpha, beta = torch.cos(angle), torch.sin(angle)
         return alpha, beta

+    @torch.no_grad()
     def forward(  # type: ignore
         self, x_noisy: Tensor, num_steps: int, show_progress: bool = False, **kwargs
     ) -> Tensor:
@@ -242,6 +245,7 @@ def sample_start(self, num_items: int, num_steps: int, **kwargs) -> Tensor:
         # Sample start
         return self.sample_loop(current=noise, sigmas=sigmas, **kwargs)

+    @torch.no_grad()
     def forward(
         self,
         num_items: int,
@@ -289,3 +293,61 @@ def forward(
             chunks += [torch.randn(shape, device=self.device)]

         return torch.cat(chunks[:num_chunks], dim=-1)
+
+
+""" Inpainters """
+
+
+class Inpainter(nn.Module):
+    pass
+
+
+class VInpainter(Inpainter):
+
+    diffusion_types = [VDiffusion]
+
+    def __init__(self, net: nn.Module, schedule: Schedule = LinearSchedule()):
+        super().__init__()
+        self.net = net
+        self.schedule = schedule
+
+    def get_alpha_beta(self, sigmas: Tensor) -> Tuple[Tensor, Tensor]:
+        angle = sigmas * pi / 2
+        alpha, beta = torch.cos(angle), torch.sin(angle)
+        return alpha, beta
+
+    @torch.no_grad()
+    def forward(  # type: ignore
+        self,
+        source: Tensor,
+        mask: Tensor,
+        num_steps: int,
+        num_resamples: int,
+        show_progress: bool = False,
+        x_noisy: Optional[Tensor] = None,
+        **kwargs,
+    ) -> Tensor:
+        x_noisy = default(x_noisy, lambda: torch.randn_like(source))
+        b = x_noisy.shape[0]
+        sigmas = self.schedule(num_steps + 1, device=x_noisy.device)
+        sigmas = repeat(sigmas, "i -> i b", b=b)
+        sigmas_batch = extend_dim(sigmas, dim=x_noisy.ndim + 1)
+        alphas, betas = self.get_alpha_beta(sigmas_batch)
+        progress_bar = tqdm(range(num_steps), disable=not show_progress)
+
+        for i in progress_bar:
+            for r in range(num_resamples):
+                v_pred = self.net(x_noisy, sigmas[i], **kwargs)
+                x_pred = alphas[i] * x_noisy - betas[i] * v_pred
+                noise_pred = betas[i] * x_noisy + alphas[i] * v_pred
+                # Renoise to current noise level if resampling
+                j = r == num_resamples - 1
+                x_noisy = alphas[i + j] * x_pred + betas[i + j] * noise_pred
+                s_noisy = alphas[i + j] * source + betas[i + j] * torch.randn_like(
+                    source
+                )
+                x_noisy = s_noisy * mask + x_noisy * ~mask
+
+            progress_bar.set_description(f"Inpainting (noise={sigmas[i+1,0]:.2f})")
+
+        return x_noisy
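Background on the update rule (standard v-objective algebra, stated here for readability rather than taken from the commit): with alpha = cos(pi·sigma/2) and beta = sin(pi·sigma/2), the forward process mixes signal and noise as x_sigma = alpha·x_0 + beta·eps, and the network predicts v = alpha·eps − beta·x_0. Inverting that pair gives exactly the `x_pred` and `noise_pred` lines in the loop:

```latex
x_\sigma = \alpha\,x_0 + \beta\,\epsilon, \qquad v = \alpha\,\epsilon - \beta\,x_0
\quad\Longrightarrow\quad
\hat{x}_0 = \alpha\,x_\sigma - \beta\,\hat{v}, \qquad \hat{\epsilon} = \beta\,x_\sigma + \alpha\,\hat{v}
```

The boolean `j = r == num_resamples - 1` is used as an index offset: on every resampling pass except the last it is 0, so the estimate is renoised back to the current level `sigmas[i]` and denoised again (in the spirit of RePaint's resampling trick, which lets generated and kept regions blend); on the last pass it is 1 and the state advances to `sigmas[i + 1]`. The final masked assignment then re-injects the kept regions of `source`, noised to the matching level, in place of the generated content.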

setup.py

Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@
 setup(
     name="audio-diffusion-pytorch",
     packages=find_packages(exclude=[]),
-    version="0.1.2",
+    version="0.1.3",
     license="MIT",
     description="Audio Diffusion - PyTorch",
     long_description_content_type="text/markdown",
