
Commit 13cddd6

feat: update to sigmoid norm
1 parent 749fb20 commit 13cddd6

File tree

audio_diffusion_pytorch/modules.py
audio_diffusion_pytorch/utils.py
setup.py

3 files changed (+18, -13 lines)

audio_diffusion_pytorch/modules.py

Lines changed: 9 additions & 5 deletions
@@ -810,6 +810,7 @@ def __init__(
         use_skip_scale: bool,
         use_context_time: bool,
         norm: float = 0.0,
+        norm_alpha: float = 20.0,
         out_channels: Optional[int] = None,
         context_features: Optional[int] = None,
         context_channels: Optional[Sequence[int]] = None,
@@ -823,8 +824,9 @@ def __init__(
         use_context_channels = len(context_channels) > 0
         context_mapping_features = None
 
-        self.norm = norm
         self.use_norm = norm > 0.0
+        self.norm = norm
+        self.norm_alpha = norm_alpha
         self.num_layers = num_layers
         self.use_context_time = use_context_time
         self.use_context_features = use_context_features
@@ -1003,7 +1005,7 @@ def forward(
         mapping = self.get_mapping(time, features)
 
         if self.use_norm:
-            x = wave_norm(x, peak=self.norm)
+            x = wave_norm(x, peak=self.norm, alpha=self.norm_alpha)
 
         x = self.to_in(x, mapping)
         skips_list = [x]
@@ -1025,7 +1027,7 @@ def forward(
         x = self.to_out(x, mapping)
 
         if self.use_norm:
-            x = wave_unnorm(x, peak=self.norm)
+            x = wave_unnorm(x, peak=self.norm, alpha=self.norm_alpha)
 
         return x
 
@@ -1129,13 +1131,15 @@ def __init__(
         use_noisy: bool = False,
         bottleneck: Optional[Bottleneck] = None,
         norm: float = 0.0,
+        norm_alpha: float = 20.0,
     ):
         super().__init__()
         num_layers = len(multipliers) - 1
         self.bottleneck = bottleneck
         self.use_noisy = use_noisy
         self.use_norm = norm > 0.0
         self.norm = norm
+        self.norm_alpha = norm_alpha
 
         assert len(factors) >= num_layers and len(num_blocks) >= num_layers
 
@@ -1186,7 +1190,7 @@ def encode(
         self, x: Tensor, with_info: bool = False
     ) -> Union[Tensor, Tuple[Tensor, Any]]:
         if self.use_norm:
-            x = wave_norm(x, peak=self.norm)
+            x = wave_norm(x, peak=self.norm, alpha=self.norm_alpha)
 
         x = self.to_in(x)
         for downsample in self.downsamples:
@@ -1207,7 +1211,7 @@ def decode(self, x: Tensor) -> Tensor:
         x = self.to_out(x)
 
         if self.use_norm:
-            x = wave_unnorm(x, peak=self.norm)
+            x = wave_unnorm(x, peak=self.norm, alpha=self.norm_alpha)
 
         return x
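In both classes touched in modules.py, normalization stays gated by norm > 0.0, and the new norm_alpha is now forwarded to wave_norm / wave_unnorm (updated in utils.py below), so the sigmoid steepness is configurable per model. A minimal standalone sketch of the wrapping pattern this adds; the NormWrapped class and its inner body module are illustrative, not part of the repo:

from torch import Tensor, nn

from audio_diffusion_pytorch.utils import wave_norm, wave_unnorm


class NormWrapped(nn.Module):
    """Illustrative wrapper: squash the waveform before the body, undo it after."""

    def __init__(self, body: nn.Module, norm: float = 0.0, norm_alpha: float = 20.0):
        super().__init__()
        self.body = body
        self.use_norm = norm > 0.0  # same gating as in modules.py
        self.norm = norm
        self.norm_alpha = norm_alpha

    def forward(self, x: Tensor) -> Tensor:
        if self.use_norm:
            x = wave_norm(x, peak=self.norm, alpha=self.norm_alpha)
        x = self.body(x)
        if self.use_norm:
            x = wave_unnorm(x, peak=self.norm, alpha=self.norm_alpha)
        return x

With norm=0.0 (the default) both normalization steps are skipped entirely, matching the existing use_norm gating.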

audio_diffusion_pytorch/utils.py

Lines changed: 8 additions & 7 deletions
@@ -85,15 +85,16 @@ def upsample(waveforms: Tensor, factor: int, **kwargs) -> Tensor:
     return resample(waveforms, factor_in=1, factor_out=factor, **kwargs)
 
 
-def wave_norm(x: Tensor, bits: int = 24, peak: float = 0.5) -> Tensor:
-    mu = 2 ** bits
+def wave_norm(x: Tensor, peak: float = 0.5, alpha: float = 20.0) -> Tensor:
+    x = x.clip(-1, 1)
+    x = torch.sigmoid(alpha * x)
     x = x.clip(-1, 1)
-    x = torch.sign(x) * torch.log1p(mu * torch.abs(x)) / math.log1p(mu)
     return x * peak
 
 
-def wave_unnorm(x: Tensor, bits: int = 24, peak: float = 0.5) -> Tensor:
-    x = (x / peak).clip(-1, 1)
-    mu = 2 ** bits
-    x = torch.sign(x) * (torch.exp(torch.abs(x) * math.log1p(mu)) - 1) / mu
+def wave_unnorm(x: Tensor, peak: float = 0.5, alpha: float = 20.0) -> Tensor:
+    x = x / peak
+    x = x.clip(-1, 1)
+    x = (1.0 / alpha) * torch.log(x / (1 - x))
+    x = x.clip(-1, 1)
     return x
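The sigmoid pair replaces the previous mu-law-style companding: wave_norm clips to [-1, 1], applies sigmoid(alpha * x), and scales by peak, while wave_unnorm divides by peak and applies the scaled logit (1 / alpha) * log(x / (1 - x)), the exact inverse of the sigmoid. A quick round-trip sketch; the values and tolerance are illustrative, and float64 is used because a steep float32 sigmoid saturates to exactly 1.0 as x approaches 1:

import torch

from audio_diffusion_pytorch.utils import wave_norm, wave_unnorm

# Waveform samples in [-1, 1]; float64 keeps the sigmoid away from saturation.
x = torch.linspace(-0.9, 0.9, steps=9, dtype=torch.float64)

y = wave_norm(x, peak=0.5, alpha=20.0)        # squashed into (0, peak)
x_rec = wave_unnorm(y, peak=0.5, alpha=20.0)  # scaled logit undoes the sigmoid

print(torch.allclose(x, x_rec, atol=1e-6))    # True: the pair round-trips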

setup.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 setup(
     name="audio-diffusion-pytorch",
     packages=find_packages(exclude=[]),
-    version="0.0.54",
+    version="0.0.55",
     license="MIT",
     description="Audio Diffusion - PyTorch",
     long_description_content_type="text/markdown",
