# DiffSinger - PyTorch Implementation

PyTorch implementation of [DiffSinger: Diffusion Acoustic Model for Singing Voice Synthesis](https://arxiv.org/abs/2105.02446) (TTS Extension).

<p align="center">
  <img src="img/model_1.png" width="80%">
</p>

<p align="center">
  <img src="img/model_2.png" width="80%">
</p>

# Status (2021.06.03)
- [x] Naive Version of DiffSinger
- [ ] Shallow Diffusion Mechanism: training a boundary predictor by leveraging the pre-trained auxiliary decoder + training the denoiser with `k` as the maximum time step (see the sketch below)
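
For context on the second item: in the shallow mechanism the denoiser is trained only up to a boundary step `k` rather than the full schedule. Below is a minimal, generic sketch of the DDPM forward (noising) process with such a cap; all names (`betas`, `q_sample`) and the schedule values are illustrative assumptions, not this repository's actual diffusion module.

```python
import torch

# Standard DDPM forward (noising) process, truncated at a maximum step k.
T = 1000                                   # full number of diffusion steps (assumed)
betas = torch.linspace(1e-4, 0.06, T)      # noise schedule (assumed linear)
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)

def q_sample(x0, t, noise=None):
    """Sample x_t ~ q(x_t | x_0) = N(sqrt(a_bar_t) x_0, (1 - a_bar_t) I)."""
    if noise is None:
        noise = torch.randn_like(x0)
    a_bar = alphas_cumprod[t].view(-1, *([1] * (x0.dim() - 1)))
    return a_bar.sqrt() * x0 + (1.0 - a_bar).sqrt() * noise

# Shallow training: draw t only from [0, k) instead of [0, T).
k = 100                                    # assumed boundary step
x0 = torch.randn(8, 80, 200)               # e.g. a batch of mel-spectrograms
t = torch.randint(0, k, (x0.size(0),))
x_t = q_sample(x0, t)                      # noisy input fed to the denoiser
```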

# Quickstart

## Dependencies
You can install the Python dependencies with
```
pip3 install -r requirements.txt
```

## Inference

You have to download the [pretrained models]() and put them in ``output/ckpt/LJSpeech/``.

For English single-speaker TTS, run
```
python3 synthesize.py --text "YOUR_DESIRED_TEXT" --restore_step 900000 --mode single -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml
```
The generated utterances will be put in ``output/result/``.

## Batch Inference
Batch inference is also supported; try

```
python3 synthesize.py --source preprocessed_data/LJSpeech/val.txt --restore_step 900000 --mode batch -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml
```
to synthesize all utterances in ``preprocessed_data/LJSpeech/val.txt``.

## Controllability
The pitch/volume/speaking rate of the synthesized utterances can be controlled by specifying the desired pitch/energy/duration ratios.
For example, the following command shortens the predicted durations to 80 % (i.e., faster speech) and scales the energy down to 80 % (i.e., lower volume):

```
python3 synthesize.py --text "YOUR_DESIRED_TEXT" --restore_step 900000 --mode single -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml --duration_control 0.8 --energy_control 0.8
```
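
Internally, such ratios are typically applied as simple multiplicative scaling inside a FastSpeech2-style variance adaptor. Below is a minimal sketch of that idea; the function and variable names (`apply_controls`, `log_duration`, etc.) are illustrative assumptions, not this repository's exact interfaces.

```python
import torch

def apply_controls(log_duration, pitch, energy,
                   d_control=1.0, p_control=1.0, e_control=1.0):
    """Scale predicted prosody by user-given ratios (illustrative sketch)."""
    # Durations are predicted in log scale: exponentiate, scale by the ratio,
    # then round to integer frame counts per phoneme.
    duration = torch.clamp(torch.round(torch.exp(log_duration) * d_control), min=0)
    return duration.long(), pitch * p_control, energy * e_control

# Example: durations scaled to 0.8x (faster speech), energy scaled to 0.8x.
log_d = torch.randn(1, 12)    # per-phoneme log-durations
pitch = torch.randn(1, 12)    # per-phoneme pitch values
energy = torch.randn(1, 12)   # per-phoneme energy values
d, p, e = apply_controls(log_d, pitch, energy, d_control=0.8, e_control=0.8)
```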

# Training

## Datasets

The supported datasets are

- [LJSpeech](https://keithito.com/LJ-Speech-Dataset/): a single-speaker English dataset consisting of 13,100 short audio clips of a female speaker reading passages from 7 non-fiction books, approximately 24 hours in total.
- (more to be added)

## Preprocessing

First, run
```
python3 prepare_align.py config/LJSpeech/preprocess.yaml
```
for some preparations.

As described in the paper, [Montreal Forced Aligner](https://montreal-forced-aligner.readthedocs.io/en/latest/) (MFA) is used to obtain the alignments between the utterances and the phoneme sequences.
Alignments for the LJSpeech dataset are provided [here](https://drive.google.com/drive/folders/1DBRkALpPd6FL9gjHMmMEdHODmkgNIIK4?usp=sharing) from [ming024's FastSpeech2](https://github.com/ming024/FastSpeech2).
You have to unzip the files into ``preprocessed_data/LJSpeech/TextGrid/``.

After that, run the preprocessing script by
```
python3 preprocess.py config/LJSpeech/preprocess.yaml
```

Alternatively, you can align the corpus yourself.
Download the official MFA package and run
```
./montreal-forced-aligner/bin/mfa_align raw_data/LJSpeech/ lexicon/librispeech-lexicon.txt english preprocessed_data/LJSpeech
```
or
```
./montreal-forced-aligner/bin/mfa_train_and_align raw_data/LJSpeech/ lexicon/librispeech-lexicon.txt preprocessed_data/LJSpeech
```

to align the corpus, and then run the preprocessing script:
```
python3 preprocess.py config/LJSpeech/preprocess.yaml
```
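
For reference, the preprocessing step converts the MFA alignments into per-phoneme durations measured in mel frames. Below is a minimal sketch using the `tgt` library (as in ming024's FastSpeech2); the `phones` tier name, the sampling rate / hop size, and the example path are assumptions based on a typical LJSpeech setup and may differ from this repository's actual `preprocess.yaml`.

```python
import tgt  # pip install tgt

SAMPLING_RATE = 22050  # assumed value from config/LJSpeech/preprocess.yaml
HOP_LENGTH = 256       # assumed value from config/LJSpeech/preprocess.yaml

def get_durations(textgrid_path):
    """Convert the phone tier of a TextGrid into per-phoneme frame counts."""
    textgrid = tgt.io.read_textgrid(textgrid_path)
    tier = textgrid.get_tier_by_name("phones")
    phones, durations = [], []
    for interval in tier._objects:  # same access pattern as ming024's preprocessor
        phones.append(interval.text)
        start = int(round(interval.start_time * SAMPLING_RATE / HOP_LENGTH))
        end = int(round(interval.end_time * SAMPLING_RATE / HOP_LENGTH))
        durations.append(end - start)
    return phones, durations

# Illustrative path; adjust to wherever the TextGrid files were unzipped.
phones, durations = get_durations(
    "preprocessed_data/LJSpeech/TextGrid/LJSpeech/LJ001-0001.TextGrid"
)
```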

## Training

Train your model with
```
python3 train.py -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml
```

# TensorBoard

Use
```
tensorboard --logdir output/log/LJSpeech
```

to serve TensorBoard on your localhost.
The loss curves, synthesized mel-spectrograms, and audio samples are shown.
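
If you want to extend the logging, the entries above are the standard `torch.utils.tensorboard` calls. A minimal sketch with purely illustrative tags and dummy data (not the repository's actual tag names or logging code):

```python
import numpy as np
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("output/log/LJSpeech/train")
step = 1000

writer.add_scalar("Loss/mel_loss", 0.42, step)                      # loss curves
mel = np.random.rand(80, 200)                                       # dummy mel-spectrogram
writer.add_image("Spectrogram/synthesized", mel, step, dataformats="HW")
audio = np.random.uniform(-1, 1, 22050).astype(np.float32)          # 1 s of dummy audio
writer.add_audio("Audio/synthesized", audio, step, sample_rate=22050)
writer.close()
```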


# Implementation Issues

1. **Pitch extractor comparison (on LJ001-0006.wav)**

   <p align="center">
    <img src="img/pitch_extractor_comparison.png" width="100%">
   </p>

   **pyworld** is used to extract f0 (fundamental frequency) as pitch information in this implementation. Empirically, however, I found that all three methods were equally acceptable on clean datasets (e.g., LJSpeech), as shown in the figure above. Note that **pysptk** tends to work better on noisy datasets (as described in [STYLER](https://github.com/keonlee9420/STYLER)). A short extraction sketch follows this list.

2. Stack two layers of `FFTBlock` for the lyrics encoder (text encoder).
3. (Naive version) The number of learnable parameters is `34.337M`, which is larger than that of the original paper (`26.744M`). The `diffusion` module accounts for a significant portion of the total parameters.
4. I did not remove the energy prediction of FastSpeech2 since it is not critical to model training or performance (as described in [LightSpeech](https://arxiv.org/abs/2102.04040)). It should be easy to remove without any performance degradation.
5. Use **HiFi-GAN** instead of **Parallel WaveGAN (PWG)** for vocoding.
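
For item 1, here is a minimal sketch of f0 extraction with **pyworld** (the method used in this implementation); the audio path, sampling rate, and frame period are illustrative assumptions rather than values read from this repository's config:

```python
import numpy as np
import pyworld as pw
import librosa

wav_path = "raw_data/LJSpeech/LJSpeech/LJ001-0006.wav"  # illustrative path
wav, sr = librosa.load(wav_path, sr=22050)
wav = wav.astype(np.float64)                            # pyworld expects float64

frame_period = 256 / 22050 * 1000                       # hop size in ms (assumed 256 / 22050)
f0, t = pw.dio(wav, sr, frame_period=frame_period)      # coarse f0 estimation
f0 = pw.stonemask(wav, f0, t, sr)                       # refinement with StoneMask
# f0 == 0 marks unvoiced frames; interpolate or mask them downstream as needed.
```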

# Citation

```
@misc{lee2021diffsinger,
  author = {Lee, Keon},
  title = {DiffSinger},
  year = {2021},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/keonlee9420/DiffSinger}}
}
```

# References
- Authors' codebase
- [ming024's FastSpeech2](https://github.com/ming024/FastSpeech2) (later than the 2021.02.26 version)
- [hojonathanho's diffusion](https://github.com/hojonathanho/diffusion)
- [lmnt-com's diffwave](https://github.com/lmnt-com/diffwave)