Ipsedo
diff --git a/.gitattributes b/.gitattributes
Lines changed: 2 additions & 0 deletions
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
Lines changed: 3 additions & 5 deletions
diff --git a/music_diffusion/__main__.py b/music_diffusion/__main__.py
Lines changed: 18 additions & 15 deletions
diff --git a/music_diffusion/data/__init__.py b/music_diffusion/data/__init__.py
Lines changed: 8 additions & 1 deletion
diff --git a/music_diffusion/data/audio.py b/music_diffusion/data/audio.py
Lines changed: 101 additions & 12 deletions
diff --git a/music_diffusion/data/constants.py b/music_diffusion/data/constants.py
Lines changed: 2 additions & 0 deletions
diff --git a/music_diffusion/data/transform.py b/music_diffusion/data/transform.py
Lines changed: 5 additions & 0 deletions
diff --git a/music_diffusion/generate.py b/music_diffusion/generate.py
Lines changed: 19 additions & 15 deletions
@@ -0,0 +1,2 @@
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
@@ -19,33 +19,31 @@ repos:
     rev: 22.10.0
     hooks:
       - id: black
-        args: [ --config=pyproject.toml ]
   - repo: https://github.com/hadialqattan/pycln
     rev: v2.1.2
     hooks:
       - id: pycln
-        args: [ --config=pyproject.toml ]
+        args: [ --config, pyproject.toml ]
   - repo: https://github.com/pycqa/isort
     rev: 5.12.0
     hooks:
       - id: isort
         files: "\\.(py)$"
-        args: [ --settings-path=pyproject.toml ]
   - repo: local
     hooks:
       - id: mypy
         name: mypy
         language: system
         entry: mypy
-        args: [ music_diffusion, tests, --config-file=pyproject.toml ]
+        args: [ music_diffusion, tests ]
         types: [ python ]
         pass_filenames: false
         require_serial: true
       - id: pylint
         name: pylint
         language: system
         entry: pylint
-        args: [ music_diffusion, tests, --rcfile, pyproject.toml ]
+        args: [ music_diffusion, tests ]
         types: [ python ]
         pass_filenames: false
         require_serial: true
 
@@ -61,21 +61,19 @@ def main() -> None:
     model_parser = sub_command.add_parser("model")
 
     model_parser.add_argument("--steps", type=int, default=4096)
-    model_parser.add_argument("--beta-1", type=float, default=2.5e-5)
-    model_parser.add_argument("--beta-t", type=float, default=5e-3)
-    model_parser.add_argument("--channels", type=int, default=2)
     model_parser.add_argument(
         "--unet-channels",
         type=_channels,
         default=[
+            (2, 16),
             (16, 32),
-            (32, 48),
-            (48, 64),
-            (64, 80),
+            (32, 64),
+            (64, 128),
+            (128, 256),
+            (256, 512),
         ],
     )
-    model_parser.add_argument("--time-size", type=int, default=48)
-    model_parser.add_argument("--norm-groups", type=int, default=16)
+    model_parser.add_argument("--time-size", type=int, default=16)
     model_parser.add_argument("--cuda", action="store_true")
 
     # Sub command run {train, generate}
@@ -97,6 +95,10 @@ def main() -> None:
     train_parser.add_argument("--save-every", type=int, default=4096)
     train_parser.add_argument("-o", "--output-dir", type=str, required=True)
     train_parser.add_argument("--nb-samples", type=int, default=5)
+    train_parser.add_argument("--denoiser-state-dict", type=str)
+    train_parser.add_argument("--ema-state-dict", type=str)
+    train_parser.add_argument("--noiser-state-dict", type=str)
+    train_parser.add_argument("--optim-state-dict", type=str)
 
     # Generate parser
     generate_parser = model_sub_command.add_parser("generate")
@@ -106,6 +108,8 @@ def main() -> None:
     generate_parser.add_argument("--fast-sample", type=int, required=False)
     generate_parser.add_argument("--frames", type=int, required=True)
     generate_parser.add_argument("--musics", type=int, required=True)
+    generate_parser.add_argument("--ema", action="store_true")
+    generate_parser.add_argument("--magn-scale", type=float, default=1.0)
 
     #######
     # Main
@@ -116,12 +120,8 @@ def main() -> None:
     if args.mode == "model":
         model_options = ModelOptions(
             steps=args.steps,
-            beta_1=args.beta_1,
-            beta_t=args.beta_t,
-            input_channels=args.channels,
             unet_channels=args.unet_channels,
             time_size=args.time_size,
-            norm_groups=args.norm_groups,
             cuda=args.cuda,
         )
 
@@ -137,9 +137,10 @@ def main() -> None:
                 save_every=args.save_every,
                 output_directory=args.output_dir,
                 nb_samples=args.nb_samples,
-                noiser_state_dict=None,
-                denoiser_state_dict=None,
-                optim_state_dict=None,
+                noiser_state_dict=args.noiser_state_dict,
+                denoiser_state_dict=args.denoiser_state_dict,
+                ema_state_dict=args.ema_state_dict,
+                optim_state_dict=args.optim_state_dict,
             )
 
             train(model_options, train_options)
@@ -148,9 +149,11 @@ def main() -> None:
             generate_options = GenerateOptions(
                 fast_sample=args.fast_sample,
                 denoiser_dict_state=args.denoiser_dict_state,
+                ema_denoiser=args.ema,
                 output_dir=args.output_dir,
                 frames=args.frames,
                 musics=args.musics,
+                magn_scale=args.magn_scale,
             )
 
             generate(model_options, generate_options)
 
@@ -6,7 +6,14 @@
     stft_to_magnitude_phase,
     wav_to_stft,
 )
-from .constants import N_FFT, N_VEC, OUTPUT_SIZES, SAMPLE_RATE, STFT_STRIDE
+from .constants import (
+    BIN_SIZE,
+    N_FFT,
+    N_VEC,
+    OUTPUT_SIZES,
+    SAMPLE_RATE,
+    STFT_STRIDE,
+)
 from .datasets import AudioDataset
 from .primitive import simpson, trapezoid
 from .transform import (
 
@@ -49,6 +49,95 @@ def bark_scale(
     return res
 
 
+# copied code from
+# https://github.com/magenta/magenta/blob/main/magenta/models/gansynth/lib/spectral_ops.py
+_MEL_BREAK_FREQUENCY_HERTZ = 700.0
+_MEL_HIGH_FREQUENCY_Q = 1127.0
+
+
+def mel_to_hertz(mel_values: th.Tensor) -> th.Tensor:
+    """Convert mel-scale values to frequencies in Hertz.
+
+    Computes, element-wise,
+    ``_MEL_BREAK_FREQUENCY_HERTZ * (exp(mel / _MEL_HIGH_FREQUENCY_Q) - 1)``.
+
+    :param mel_values: tensor of mel values.
+    :return: tensor with the same shape as ``mel_values``.
+    """
+    growth = th.exp(mel_values / _MEL_HIGH_FREQUENCY_Q)
+    return _MEL_BREAK_FREQUENCY_HERTZ * (growth - 1.0)
+
+
+def hertz_to_mel(frequencies_hertz: th.Tensor) -> th.Tensor:
+    """Convert frequencies in Hertz to mel-scale values.
+
+    Computes, element-wise (natural logarithm),
+    ``_MEL_HIGH_FREQUENCY_Q * log(1 + f / _MEL_BREAK_FREQUENCY_HERTZ)``.
+
+    :param frequencies_hertz: tensor of frequencies.
+    :return: tensor with the same shape as ``frequencies_hertz``.
+    """
+    ratio = frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ
+    return _MEL_HIGH_FREQUENCY_Q * th.log(1.0 + ratio)
+
+
+def linear_to_mel_weight_matrix(
+    num_mel_bins: int = constants.N_FFT // 2,
+    num_spectrogram_bins: int = constants.N_FFT // 2,
+    sample_rate: int = constants.SAMPLE_RATE,
+    lower_edge_hertz: float = 125.0,
+    upper_edge_hertz: float = 3800.0,
+) -> th.Tensor:
+    """Build a triangular filter bank from linear bins to mel bands.
+
+    Band centers are spaced evenly on the mel scale between
+    ``lower_edge_hertz`` and ``upper_edge_hertz``. Bands narrower than
+    1.5 linear bins are widened around their center before the weights
+    are computed.
+
+    :param num_mel_bins: number of mel bands (columns of the result).
+    :param num_spectrogram_bins: number of linear bins (rows of the
+        result), spread evenly over ``[0, sample_rate / 2]``.
+    :param sample_rate: sample rate used to derive the Nyquist frequency.
+    :param lower_edge_hertz: lower edge of the lowest band.
+    :param upper_edge_hertz: upper edge of the highest band.
+    :return: weights of shape ``[num_spectrogram_bins, num_mel_bins]``;
+        the first row (DC bin) is all zeros.
+    """
+
+    # HTK excludes the spectrogram DC bin.
+    bands_to_zero = 1
+    nyquist_hertz = sample_rate / 2.0
+    linear_frequencies = th.linspace(0.0, nyquist_hertz, num_spectrogram_bins)[
+        bands_to_zero:, None
+    ]
+
+    # Compute num_mel_bins triples of (lower_edge, center, upper_edge). The
+    # center of each band is the lower and upper edge of the adjacent bands.
+    # Accordingly, we divide [lower_edge_hertz, upper_edge_hertz] into
+    # num_mel_bins + 2 pieces.
+    band_edges_mel = th.linspace(
+        hertz_to_mel(th.tensor(lower_edge_hertz)).item(),
+        hertz_to_mel(th.tensor(upper_edge_hertz)).item(),
+        num_mel_bins + 2,
+    )
+
+    # NOTE(review): these three slices share storage with band_edges_mel,
+    # so the in-place widening below also shifts the edges/centers of the
+    # neighbouring bands. This mirrors the code this was copied from;
+    # confirm it is intended before cloning the slices.
+    lower_edge_mel = band_edges_mel[0:-2]
+    center_mel = band_edges_mel[1:-1]
+    upper_edge_mel = band_edges_mel[2:]
+
+    # Widen every band that spans less than 1.5 linear bins so that it
+    # still covers at least one spectrogram bin.
+    freq_res = nyquist_hertz / float(num_spectrogram_bins)
+    freq_th = 1.5 * freq_res
+    for i in range(0, num_mel_bins):
+        center_hz = mel_to_hertz(center_mel[i])
+        lower_hz = mel_to_hertz(lower_edge_mel[i])
+        upper_hz = mel_to_hertz(upper_edge_mel[i])
+        if upper_hz - lower_hz < freq_th:
+            rhs = 0.5 * freq_th / (center_hz + _MEL_BREAK_FREQUENCY_HERTZ)
+            dm = _MEL_HIGH_FREQUENCY_Q * th.log(rhs + th.sqrt(1.0 + rhs**2))
+            lower_edge_mel[i] = center_mel[i] - dm
+            upper_edge_mel[i] = center_mel[i] + dm
+
+    lower_edge_hz = mel_to_hertz(lower_edge_mel)[None, :]
+    center_hz = mel_to_hertz(center_mel)[None, :]
+    upper_edge_hz = mel_to_hertz(upper_edge_mel)[None, :]
+
+    # Calculate lower and upper slopes for every spectrogram bin.
+    # The edges were converted back to Hertz above, so the line segments
+    # of each triangle are linear in Hertz here.
+    lower_slopes = (linear_frequencies - lower_edge_hz) / (
+        center_hz - lower_edge_hz
+    )
+    upper_slopes = (upper_edge_hz - linear_frequencies) / (
+        upper_edge_hz - center_hz
+    )
+
+    # Intersect the line segments with each other and zero.
+    mel_weights_matrix = th.maximum(
+        th.tensor(0.0), th.minimum(lower_slopes, upper_slopes)
+    )
+
+    # Re-add the zeroed lower bins we sliced out above.
+    # The pad list is ordered from the LAST dimension backwards: the first
+    # pair applies to the mel axis, the second pair to the frequency axis.
+    # The DC row therefore goes in the second pair (assumes th_f is
+    # torch.nn.functional -- import not visible here).
+    # [freq, mel]
+    mel_weights_matrix = th_f.pad(
+        mel_weights_matrix, [0, 0, bands_to_zero, 0], "constant"
+    )
+    return mel_weights_matrix
+
+
+# end of copied code
+
+
 def wav_to_stft(
     wav_p: str,
     n_per_seg: int = constants.N_FFT,
@@ -130,6 +219,8 @@ def magnitude_phase_to_wav(
     sample_rate: int,
     n_fft: int = constants.N_FFT,
     stft_stride: int = constants.STFT_STRIDE,
+    threshold: float = 1.0 / 2**8,
+    magn_scale: float = 1.0,
 ) -> None:
     assert (
         len(magnitude_phase.size()) == 4
@@ -151,7 +242,9 @@ def magnitude_phase_to_wav(
     phase = magnitude_phase_flattened[1, :, :]
 
     magnitude = (magnitude + 1.0) / 2.0
+    magnitude[magnitude < threshold] = 0.0
     magnitude = bark_scale(magnitude, "unscale")
+    magnitude = magnitude * magn_scale
 
     phase = (phase + 1.0) / 2.0 * 2.0 * th.pi - th.pi
     phase = simpson(th.zeros(phase.size()[0], 1), phase, 1, 1.0)
@@ -191,34 +284,30 @@ def create_dataset(
     elif not isdir(dataset_output_dir):
         raise NotADirectoryError(dataset_output_dir)
 
-    n_per_seg = constants.N_FFT
-    stride = constants.STFT_STRIDE
-
-    nb_vec = constants.N_VEC
-
     idx = 0
 
     for wav_p in tqdm(w_p):
-        complex_values = wav_to_stft(wav_p, n_per_seg=n_per_seg, stride=stride)
+        complex_values = wav_to_stft(
+            wav_p, n_per_seg=constants.N_FFT, stride=constants.STFT_STRIDE
+        )
 
-        if complex_values.size()[1] < nb_vec:
+        if complex_values.size()[1] < constants.N_VEC:
             continue
 
         magnitude, phase = stft_to_magnitude_phase(
-            complex_values, nb_vec=nb_vec
+            complex_values, nb_vec=constants.N_VEC
         )
 
         nb_sample = magnitude.size()[0]
 
         for s_idx in range(nb_sample):
-            s_magnitude = magnitude[s_idx, :, :]
-            s_phase = phase[s_idx, :, :]
-
             magnitude_phase_path = join(
                 dataset_output_dir, f"magn_phase_{idx}.pt"
             )
 
-            magnitude_phase = th.stack([s_magnitude, s_phase], dim=0)
+            magnitude_phase = th.stack(
+                [magnitude[s_idx, :, :], phase[s_idx, :, :]], dim=0
+            )
             magnitude_phase = magnitude_phase.to(th.float)
 
             th.save(magnitude_phase, magnitude_phase_path)
 
@@ -8,3 +8,5 @@
 SAMPLE_RATE: Final[int] = 16000
 
 OUTPUT_SIZES: Final[Tuple[int, int]] = (N_FFT // 2, N_VEC)
+
+BIN_SIZE: Final[float] = 1.0 / 2.0**16.0
@@ -3,6 +3,8 @@
 
 import torch as th
 
+# pylint: disable=too-few-public-methods
+
 
 class ImgTransform(metaclass=ABCMeta):
     @abstractmethod
@@ -55,3 +57,6 @@ def __init__(self, dtype: th.dtype) -> None:
 
     def __call__(self, img_data: th.Tensor) -> th.Tensor:
         return img_data.to(self.__dtype)
+
+
+# pylint: enable=too-few-public-methods
@@ -12,7 +12,6 @@
     STFT_STRIDE,
     magnitude_phase_to_wav,
 )
-from .networks import Denoiser
 from .options import GenerateOptions, ModelOptions
 
 
@@ -27,24 +26,28 @@ def generate(
 
     print("Load model...")
 
-    # pylint: disable=duplicate-code
-    denoiser = Denoiser(
-        model_options.input_channels,
-        model_options.steps,
-        model_options.time_size,
-        model_options.beta_1,
-        model_options.beta_t,
-        model_options.unet_channels,
-        model_options.norm_groups,
-    )
-    # pylint: enable=duplicate-code
+    denoiser = model_options.new_denoiser()
 
     device = "cuda" if model_options.cuda else "cpu"
 
-    denoiser.load_state_dict(
-        th.load(generate_options.denoiser_dict_state, map_location=device)
+    loaded_state_dict = th.load(
+        generate_options.denoiser_dict_state, map_location=device
     )
 
+    ema_prefix = "ema_model."
+
+    state_dict = (
+        {
+            k[len(ema_prefix) :]: p
+            for k, p in loaded_state_dict.items()
+            if k.startswith(ema_prefix)
+        }
+        if generate_options.ema_denoiser
+        else loaded_state_dict
+    )
+
+    denoiser.load_state_dict(state_dict)
+
     denoiser.eval()
 
     if model_options.cuda:
@@ -58,7 +61,7 @@ def generate(
 
         x_t = th.randn(
             generate_options.musics,
-            model_options.input_channels,
+            model_options.unet_channels[0][0],
             height,
             width * generate_options.frames,
             device=device,
@@ -85,4 +88,5 @@ def generate(
                 SAMPLE_RATE,
                 N_FFT,
                 STFT_STRIDE,
+                magn_scale=generate_options.magn_scale,
             )
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+*.pt filter=lfs diff=lfs merge=lfs -text`
	`2`	`+*.wav filter=lfs diff=lfs merge=lfs -text`