feat: add use_complex stft diffusion option

flavioschneider · flavioschneider · commit b63df7d63d93 · 2022-11-07T12:03:52.000+01:00
diff --git a/audio_diffusion_pytorch/modules.py b/audio_diffusion_pytorch/modules.py
@@ -1277,19 +1277,23 @@ def forward(self, texts: List[str]) -> Tensor:
 
 
 class STFT(nn.Module):
+    """Helper for torch stft and istft"""
+
     def __init__(
         self,
         num_fft: int = 1023,
-        hop_length: Optional[int] = None,
+        hop_length: int = 256,  # NOTE(review): was None -> num_fft // 4; confirm intent
         window_length: Optional[int] = None,
         length: Optional[int] = None,
+        use_complex: bool = False,  # True: (real, imag) pairs; False: (magnitude, phase)
     ):
         super().__init__()
         self.num_fft = num_fft
         self.hop_length = default(hop_length, floor(num_fft // 4))
         self.window_length = default(window_length, num_fft)
         self.length = length
         self.register_buffer("window", torch.hann_window(self.window_length))
+        self.use_complex = use_complex
 
     def encode(self, wave: Tensor) -> Tuple[Tensor, Tensor]:
         b = wave.shape[0]
@@ -1302,43 +1306,54 @@ def encode(self, wave: Tensor) -> Tuple[Tensor, Tensor]:
             win_length=self.window_length,
             window=self.window,  # type: ignore
             return_complex=True,
+            normalized=True,
         )
 
-        mag = torch.sqrt(torch.clamp((stft.real ** 2) + (stft.imag ** 2), min=1e-8))
-        mag = rearrange(mag, "(b c) f l -> b c f l", b=b)
+        if self.use_complex:
+            # Returns real and imaginary
+            stft_a, stft_b = stft.real, stft.imag
+        else:
+            # Returns magnitude and phase matrices
+            magnitude, phase = torch.abs(stft), torch.angle(stft)
+            stft_a, stft_b = magnitude, phase
 
-        phase = torch.angle(stft)
-        phase = rearrange(phase, "(b c) f l -> b c f l", b=b)
-        return mag, phase
+        return rearrange_many((stft_a, stft_b), "(b c) f l -> b c f l", b=b)
 
-    def decode(self, magnitude: Tensor, phase: Tensor) -> Tensor:
-        b, l = magnitude.shape[0], magnitude.shape[-1]  # noqa
-        assert magnitude.shape == phase.shape, "magnitude and phase must be same shape"
-        real = rearrange(magnitude * torch.cos(phase), "b c f l -> (b c) f l")
-        imag = rearrange(magnitude * torch.sin(phase), "b c f l -> (b c) f l")
-        stft = torch.stack([real, imag], dim=-1)
+    def decode(self, stft_a: Tensor, stft_b: Tensor) -> Tensor:
+        b, l = stft_a.shape[0], stft_a.shape[-1]  # noqa
         length = closest_power_2(l * self.hop_length)
 
+        stft_a, stft_b = rearrange_many((stft_a, stft_b), "b c f l -> (b c) f l")
+        # use_complex selects the pair layout: (real, imag) or (magnitude, phase)
+        if self.use_complex:
+            real, imag = stft_a, stft_b
+        else:
+            magnitude, phase = stft_a, stft_b
+            real, imag = magnitude * torch.cos(phase), magnitude * torch.sin(phase)
+        # torch.istft requires complex input (real (..., 2) tensors fail on torch>=2.0)
+        stft = torch.complex(real, imag)
+
         wave = torch.istft(
             stft,
             n_fft=self.num_fft,
             hop_length=self.hop_length,
             win_length=self.window_length,
             window=self.window,  # type: ignore
             length=default(self.length, length),
+            normalized=True,  # same flag as the torch.stft call in encode
         )
-        wave = rearrange(wave, "(b c) t -> b c t", b=b)
-        return wave
+
+        return rearrange(wave, "(b c) t -> b c t", b=b)
 
     def encode1d(
         self, wave: Tensor, stacked: bool = True
     ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
-        magnitude, phase = self.encode(wave)
-        magnitude, phase = rearrange_many((magnitude, phase), "b c f l -> b (c f) l")
-        return torch.cat((magnitude, phase), dim=1) if stacked else (magnitude, phase)
+        stft_a, stft_b = self.encode(wave)  # pair layout depends on use_complex
+        stft_a, stft_b = rearrange_many((stft_a, stft_b), "b c f l -> b (c f) l")
+        return torch.cat((stft_a, stft_b), dim=1) if stacked else (stft_a, stft_b)
 
-    def decode1d(self, magnitude_and_phase: Tensor) -> Tensor:
+    def decode1d(self, stft_pair: Tensor) -> Tensor:
         f = self.num_fft // 2 + 1
-        magnitude, phase = magnitude_and_phase.chunk(chunks=2, dim=1)
-        mag, phase = rearrange_many((magnitude, phase), "b (c f) l -> b c f l", f=f)
-        return self.decode(mag, phase)
+        stft_a, stft_b = stft_pair.chunk(chunks=2, dim=1)  # split the stacked halves
+        stft_a, stft_b = rearrange_many((stft_a, stft_b), "b (c f) l -> b c f l", f=f)
+        return self.decode(stft_a, stft_b)
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 setup(
     name="audio-diffusion-pytorch",
     packages=find_packages(exclude=[]),
-    version="0.0.84",
+    version="0.0.85",
     license="MIT",
     description="Audio Diffusion - PyTorch",
     long_description_content_type="text/markdown",