diff --git a/audio_samples/MCD-DTW/fastpitch/audios/sample_0.wav b/audio_samples/MCD-DTW/fastpitch/audios/sample_0.wav
new file mode 100644
index 00000000..c42c2ce2
Binary files /dev/null and b/audio_samples/MCD-DTW/fastpitch/audios/sample_0.wav differ
diff --git a/audio_samples/MCD-DTW/fastpitch/mels/mels_0.npy b/audio_samples/MCD-DTW/fastpitch/mels/mels_0.npy
new file mode 100644
index 00000000..6f8c3c67
Binary files /dev/null and b/audio_samples/MCD-DTW/fastpitch/mels/mels_0.npy differ
diff --git a/audio_samples/MCD-DTW/fastpitch/mels/mels_1.npy b/audio_samples/MCD-DTW/fastpitch/mels/mels_1.npy
new file mode 100644
index 00000000..a073bb6e
Binary files /dev/null and b/audio_samples/MCD-DTW/fastpitch/mels/mels_1.npy differ
diff --git a/audio_samples/MCD-DTW/fastpitch/mels/mels_2.npy b/audio_samples/MCD-DTW/fastpitch/mels/mels_2.npy
new file mode 100644
index 00000000..db07dad0
Binary files /dev/null and b/audio_samples/MCD-DTW/fastpitch/mels/mels_2.npy differ
diff --git a/audio_samples/MCD-DTW/fastpitch/mels/mels_3.npy b/audio_samples/MCD-DTW/fastpitch/mels/mels_3.npy
new file mode 100644
index 00000000..cf4a522a
Binary files /dev/null and b/audio_samples/MCD-DTW/fastpitch/mels/mels_3.npy differ
diff --git a/audio_samples/MCD-DTW/fastpitch/mels/mels_4.npy b/audio_samples/MCD-DTW/fastpitch/mels/mels_4.npy
new file mode 100644
index 00000000..af04dc30
Binary files /dev/null and b/audio_samples/MCD-DTW/fastpitch/mels/mels_4.npy differ
diff --git a/audio_samples/MCD-DTW/fastpitch/mels/mels_5.npy b/audio_samples/MCD-DTW/fastpitch/mels/mels_5.npy
new file mode 100644
index 00000000..469c6864
Binary files /dev/null and b/audio_samples/MCD-DTW/fastpitch/mels/mels_5.npy differ
diff --git a/audio_samples/MCD-DTW/fastpitch/mels/mels_6.npy b/audio_samples/MCD-DTW/fastpitch/mels/mels_6.npy
new file mode 100644
index 00000000..09797476
Binary files /dev/null and b/audio_samples/MCD-DTW/fastpitch/mels/mels_6.npy differ
diff --git a/audio_samples/MCD-DTW/fastpitch/mels/mels_7.npy b/audio_samples/MCD-DTW/fastpitch/mels/mels_7.npy
new file mode 100644
index 00000000..81ba18c8
Binary files /dev/null and b/audio_samples/MCD-DTW/fastpitch/mels/mels_7.npy differ
diff --git a/audio_samples/MCD-DTW/fastpitch/mels/mels_8.npy b/audio_samples/MCD-DTW/fastpitch/mels/mels_8.npy
new file mode 100644
index 00000000..eb4ba361
Binary files /dev/null and b/audio_samples/MCD-DTW/fastpitch/mels/mels_8.npy differ
diff --git a/audio_samples/MCD-DTW/fastpitch/mels/mels_9.npy b/audio_samples/MCD-DTW/fastpitch/mels/mels_9.npy
new file mode 100644
index 00000000..941681c9
Binary files /dev/null and b/audio_samples/MCD-DTW/fastpitch/mels/mels_9.npy differ
diff --git a/audio_samples/MCD-DTW/gt/audios/sample_0.wav b/audio_samples/MCD-DTW/gt/audios/sample_0.wav
new file mode 100644
index 00000000..a274be89
Binary files /dev/null and b/audio_samples/MCD-DTW/gt/audios/sample_0.wav differ
diff --git a/audio_samples/MCD-DTW/gt/audios/sample_1.wav b/audio_samples/MCD-DTW/gt/audios/sample_1.wav
new file mode 100644
index 00000000..b1a0ed11
Binary files /dev/null and b/audio_samples/MCD-DTW/gt/audios/sample_1.wav differ
diff --git a/audio_samples/MCD-DTW/gt/audios/sample_2.wav b/audio_samples/MCD-DTW/gt/audios/sample_2.wav
new file mode 100644
index 00000000..3329ddb4
Binary files /dev/null and b/audio_samples/MCD-DTW/gt/audios/sample_2.wav differ
diff --git a/audio_samples/MCD-DTW/gt/audios/sample_3.wav b/audio_samples/MCD-DTW/gt/audios/sample_3.wav
new file mode 100644
index 00000000..ead8a0e3
Binary files /dev/null and b/audio_samples/MCD-DTW/gt/audios/sample_3.wav differ
diff --git a/audio_samples/MCD-DTW/gt/audios/sample_4.wav b/audio_samples/MCD-DTW/gt/audios/sample_4.wav
new file mode 100644
index 00000000..640f708c
Binary files /dev/null and b/audio_samples/MCD-DTW/gt/audios/sample_4.wav differ
diff --git a/audio_samples/MCD-DTW/gt/audios/sample_5.wav b/audio_samples/MCD-DTW/gt/audios/sample_5.wav
new file mode 100644
index 00000000..15cffd54
Binary files /dev/null and b/audio_samples/MCD-DTW/gt/audios/sample_5.wav differ
diff --git a/audio_samples/MCD-DTW/gt/audios/sample_6.wav b/audio_samples/MCD-DTW/gt/audios/sample_6.wav
new file mode 100644
index 00000000..0d33e450
Binary files /dev/null and b/audio_samples/MCD-DTW/gt/audios/sample_6.wav differ
diff --git a/audio_samples/MCD-DTW/gt/audios/sample_7.wav b/audio_samples/MCD-DTW/gt/audios/sample_7.wav
new file mode 100644
index 00000000..a1871dd8
Binary files /dev/null and b/audio_samples/MCD-DTW/gt/audios/sample_7.wav differ
diff --git a/audio_samples/MCD-DTW/gt/audios/sample_8.wav b/audio_samples/MCD-DTW/gt/audios/sample_8.wav
new file mode 100644
index 00000000..b534f1b9
Binary files /dev/null and b/audio_samples/MCD-DTW/gt/audios/sample_8.wav differ
diff --git a/audio_samples/MCD-DTW/gt/audios/sample_9.wav b/audio_samples/MCD-DTW/gt/audios/sample_9.wav
new file mode 100644
index 00000000..01a2e688
Binary files /dev/null and b/audio_samples/MCD-DTW/gt/audios/sample_9.wav differ
diff --git a/audio_samples/MCD-DTW/radtts/mels/mels_0.npy b/audio_samples/MCD-DTW/radtts/mels/mels_0.npy
new file mode 100644
index 00000000..c12c673b
Binary files /dev/null and b/audio_samples/MCD-DTW/radtts/mels/mels_0.npy differ
diff --git a/audio_samples/MCD-DTW/radtts/mels/mels_1.npy b/audio_samples/MCD-DTW/radtts/mels/mels_1.npy
new file mode 100644
index 00000000..715bd3da
Binary files /dev/null and b/audio_samples/MCD-DTW/radtts/mels/mels_1.npy differ
diff --git a/audio_samples/MCD-DTW/radtts/mels/mels_2.npy b/audio_samples/MCD-DTW/radtts/mels/mels_2.npy
new file mode 100644
index 00000000..9d2c7fe5
Binary files /dev/null and b/audio_samples/MCD-DTW/radtts/mels/mels_2.npy differ
diff --git a/audio_samples/MCD-DTW/radtts/mels/mels_3.npy b/audio_samples/MCD-DTW/radtts/mels/mels_3.npy
new file mode 100644
index 00000000..80fb1b07
Binary files /dev/null and b/audio_samples/MCD-DTW/radtts/mels/mels_3.npy differ
diff --git a/audio_samples/MCD-DTW/radtts/mels/mels_4.npy b/audio_samples/MCD-DTW/radtts/mels/mels_4.npy
new file mode 100644
index 00000000..5d19bd93
Binary files /dev/null and b/audio_samples/MCD-DTW/radtts/mels/mels_4.npy differ
diff --git a/audio_samples/MCD-DTW/radtts/mels/mels_5.npy b/audio_samples/MCD-DTW/radtts/mels/mels_5.npy
new file mode 100644
index 00000000..ecc95bcb
Binary files /dev/null and b/audio_samples/MCD-DTW/radtts/mels/mels_5.npy differ
diff --git a/audio_samples/MCD-DTW/radtts/mels/mels_6.npy b/audio_samples/MCD-DTW/radtts/mels/mels_6.npy
new file mode 100644
index 00000000..26c8dae1
Binary files /dev/null and b/audio_samples/MCD-DTW/radtts/mels/mels_6.npy differ
diff --git a/audio_samples/MCD-DTW/radtts/mels/mels_7.npy b/audio_samples/MCD-DTW/radtts/mels/mels_7.npy
new file mode 100644
index 00000000..ce357445
Binary files /dev/null and b/audio_samples/MCD-DTW/radtts/mels/mels_7.npy differ
diff --git a/audio_samples/MCD-DTW/radtts/mels/mels_8.npy b/audio_samples/MCD-DTW/radtts/mels/mels_8.npy
new file mode 100644
index 00000000..07b82303
Binary files /dev/null and b/audio_samples/MCD-DTW/radtts/mels/mels_8.npy differ
diff --git a/audio_samples/MCD-DTW/radtts/mels/mels_9.npy b/audio_samples/MCD-DTW/radtts/mels/mels_9.npy
new file mode 100644
index 00000000..a3070f3f
Binary files /dev/null and b/audio_samples/MCD-DTW/radtts/mels/mels_9.npy differ
diff --git a/imgs/riva-tts-MCD_DTW_final_comparision.jpeg b/imgs/riva-tts-MCD_DTW_final_comparision.jpeg
new file mode 100644
index 00000000..9e67107a
Binary files /dev/null and b/imgs/riva-tts-MCD_DTW_final_comparision.jpeg differ
diff --git a/imgs/riva-tts-MCD_DTW_mels.jpeg b/imgs/riva-tts-MCD_DTW_mels.jpeg
new file mode 100644
index 00000000..ab34200d
Binary files /dev/null and b/imgs/riva-tts-MCD_DTW_mels.jpeg differ
diff --git a/tts-evaluation-MCD-DTW.ipynb b/tts-evaluation-MCD-DTW.ipynb
new file mode 100644
index 00000000..3cb60648
--- /dev/null
+++ b/tts-evaluation-MCD-DTW.ipynb
@@ -0,0 +1,503 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "f0408502",
+ "metadata": {},
+ "source": [
+ "# Calculating MCD DTW (Mel cepstral distortion - Dynamic time warping)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8a749b10",
+ "metadata": {},
+ "source": [
+ "In this tutorial, we are going to learn how to calculate MCD DTW between a synthesized mel spec and a reference mel spec.\n",
+ "\n",
+ "Mel cepstral distortion (MCD) is an objective measure of speech quality. MCD is calculated between TTS-generated mels and ground truth mels. Two mel specs are similar if the MCD value between them is low. The MCD of a mel spec with itself is 0.\n",
+ "\n",
+ "MCD DTW is a modification of MCD that works with non-aligned mel specs by using the dynamic time warping cost matrix. The scale of the metric depends on factors such as the mel extractor and the reduction algorithm (mean of the DTW cost matrix or cost along the minimum-cost DTW path). It is useful for comparing the convergence of models trained on the same training data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "09cff1ab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Install necessary packages into the environment of the running kernel\n",
+ "%pip install librosa numpy matplotlib"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8b9b6e1c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Import libraries\n",
+ "import librosa\n",
+ "import librosa.display\n",
+ "import numpy as np\n",
+ "import math\n",
+ "import matplotlib.pyplot as plt\n",
+ "import os"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "398b01e0",
+ "metadata": {},
+ "source": [
+ "### Define parameters for spectrogram generation.\n",
+ "In this section we define the parameters required to generate our spectrograms. More information about these parameters can be found in the librosa documentation for [stft](https://librosa.org/doc/main/generated/librosa.stft.html), [mels](https://librosa.org/doc/main/generated/librosa.filters.mel.html) and [mfcc](https://librosa.org/doc/main/generated/librosa.feature.mfcc.html) generation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "191012de",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Mel params\n",
+ "n_fft=1024\n",
+ "hop_length=256\n",
+ "win_length=None\n",
+ "window='hann'\n",
+ "n_mels = 80\n",
+ "sr = 22050\n",
+ "\n",
+ "## Mfcc params\n",
+ "n_mfcc=34"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "54830c8c",
+ "metadata": {},
+ "source": [
+ "A little bit about these parameters: \n",
+ " - `n_fft`: Number of fft_components or length of windowed signal after padding in stft. \n",
+ " - `hop_length`: Number of audio samples between adjacent STFT columns. \n",
+ " - `window`: Window to use in stft. \n",
+ " - `win_length`: Length of the window to be used. \n",
+ " - `n_mels`: Number of Mel bands to generate. \n",
+ " - `sr`: Sample rate of the samples. \n",
+ " - `n_mfcc`: Number of MFCCs to generate."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cb9e37c",
+ "metadata": {},
+ "source": [
+ "### Load and Visualize data\n",
+ "Let's apply the algorithm to one pair of synthesized and ground truth mels to understand how it works."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6df31125",
+ "metadata": {},
+ "source": [
+ "Write a function to generate mel spectrograms from audio files. Mel spectrograms are generated using the [librosa mel extractor](https://librosa.org/doc/main/generated/librosa.filters.mel.html)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "042ceb7f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def wav2mel(filename):\n",
+ "    \"\"\"\n",
+ "    Load an audio file at the notebook sample rate `sr` and return its mel spectrogram in dB:\n",
+ "    args: filename: full path of the audio file.\n",
+ "    return: mel spectrogram converted with power_to_db, relative to its peak (ref=np.max)\n",
+ "    \"\"\"\n",
+ "    wav_, _ = librosa.load(filename, sr=sr)\n",
+ "    mels = librosa.feature.melspectrogram(y=wav_, sr=sr, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, n_mels=n_mels)\n",
+ "    mels = librosa.power_to_db(mels, ref=np.max)\n",
+ "    return mels"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8546edb5",
+ "metadata": {},
+ "source": [
+ "Write a function to convert mels to MFCCs. We will use librosa's [mfcc](https://librosa.org/doc/main/generated/librosa.feature.mfcc.html) function."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c1a0663d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def mel2mfcc(mels):\n",
+ "    \"\"\"Return the first `n_mfcc` MFCCs that librosa computes from the mel spectrogram `mels` (passed as `S`).\"\"\"\n",
+ "    return librosa.feature.mfcc(S=mels, n_mfcc=n_mfcc)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6efac554",
+ "metadata": {},
+ "source": [
+ "For this tutorial, we have already generated mels for a [fastpitch](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/tts_en_fastpitch) model and [radtts](https://github.com/NVIDIA/radtts) model."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7aeabe75",
+ "metadata": {},
+ "source": [
+ "Generate mel specs for the ground truth audio, load the mel specs for the generated audio, and generate MFCCs (Mel frequency cepstral coefficients) for both."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d1a71f88",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "## Generate spectrograms\n",
+ "gt_mels = wav2mel(\"audio_samples/MCD-DTW/gt/audios/sample_0.wav\")\n",
+ "synt_mels = np.load(\"audio_samples/MCD-DTW/fastpitch/mels/mels_0.npy\")\n",
+ "\n",
+ "## Generate MFCC\n",
+ "gt_mfcc = mel2mfcc(gt_mels)\n",
+ "synt_mfcc = mel2mfcc(synt_mels)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "faa5df64",
+ "metadata": {},
+ "source": [
+ "## Visualize the sample audios\n",
+ "\n",
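+ "<table>\n",
+ "<tr><th>Ground truth audio</th><th>Synthesized audio</th></tr>\n",
+ "<tr>\n",
+ "<td><audio controls src=\"audio_samples/MCD-DTW/gt/audios/sample_0.wav\"></audio></td>\n",
+ "<td><audio controls src=\"audio_samples/MCD-DTW/fastpitch/audios/sample_0.wav\"></audio></td>\n",
+ "</tr>\n",
+ "</table>\n",
+ "<img src=\"imgs/riva-tts-MCD_DTW_mels.jpeg\">"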
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "93c4c301",
+ "metadata": {},
+ "source": [
+ "### Calculate DTW matrix\n",
+ "\n",
+ "Dynamic time warping (DTW) measures the similarity between two time series that are not in sync. DTW achieves this by finding the optimal path to align two sequences of different lengths.\n",
+ "\n",
+ "It uses dynamic programming to calculate the cost of every alignment path and chooses the path with least accumulated cost. The accumulated Cost matrix D at $x_a$ and $y_b$ is the minimum distance between the points, where $x$ and $y$ are two time series. Formally cost matrix can be defined as:\n",
+ " \n",
+ "$D(a,b) = \\min(D(a-1, b), D(a, b-1), D(a-1, b-1)) + c(x_{a}, y_{b})$  \n",
+ "$D(1, b) = \\sum_{k=1}^{b} c(x_{1}, y_{k})$  \n",
+ "$D(a, 1) = \\sum_{k=1}^{a} c(x_{k}, y_{1})$  \n",
+ " \n",
+ "where ***x*** and ***y*** are the two sequences being aligned (here, the MFCC frames of the two signals) and ***c*** is the log-spectral distance cost function defined below.\n",
+ "\n",
+ "We will use the [DTW function from librosa](https://librosa.org/doc/main/generated/librosa.sequence.dtw.html) on the MFCCs. It returns the DTW accumulated cost matrix and the optimal warping path."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "62eccf95",
+ "metadata": {},
+ "source": [
+ "Cost function for DTW."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8a4adae2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## DTW cost function: Euclidean distance between two frames, scaled by 10 * sqrt(2) / ln(10)\n",
+ "def log_spec_dB_dist(x, y):\n",
+ "    delta = x - y\n",
+ "    euclidean = math.sqrt(np.inner(delta, delta))\n",
+ "    return math.sqrt(2.0) * (10.0 / math.log(10.0)) * euclidean"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0c4c5338",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Frame count of the synthesized mel spec; used below to normalize the DTW cost\n",
+ "frames = synt_mels.shape[1]\n",
+ "dtw_cost, dtw_min_path = librosa.sequence.dtw(gt_mfcc, synt_mfcc, metric=log_spec_dB_dist)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "913b6548",
+ "metadata": {},
+ "source": [
+ "Reduction of DTW matrix can be done by either taking a mean of the entire cost matrix or averaging DTW cost for the minimum cost path per frame. We will use the DTW cost along the min cost path."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6c85ab38",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "path_cost_matrix = dtw_cost[dtw_min_path[:, 0], dtw_min_path[:, 1]]\n",
+ "path_cost = np.sum(path_cost_matrix)\n",
+ "path_length = dtw_min_path.shape[0]\n",
+ "reduced_dtw_cost = path_cost/path_length\n",
+ "\n",
+ "mcd = reduced_dtw_cost/frames\n",
+ "\n",
+ "print(f\"MCD_DTW is: {mcd}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "24801d61",
+ "metadata": {},
+ "source": [
+ "## MCD_DTW use case\n",
+ "\n",
+ "MCD is a very useful metric to compare the convergence of two models. Therefore in this section, we will calculate the average MCD for files from two models. We will first put the above calculations in functions for better readability."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e7bd5fff",
+ "metadata": {},
+ "source": [
+ "Write functions for getting the average cost of dynamic time warping along a path."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e4181d60",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def extract_path_cost(D, wp):\n",
+ "    \"\"\"\n",
+ "    Sum the entries of D (cost matrix) that are visited by wp (warping path).\n",
+ "    :returns: total of D[i, j] over every (i, j) row of wp\n",
+ "    \"\"\"\n",
+ "    rows, cols = wp[:, 0], wp[:, 1]\n",
+ "    return np.sum(D[rows, cols])\n",
+ "\n",
+ "def extract_frame_avg_path_cost(D, wp):\n",
+ "    \"\"\"Average of the cost-matrix entries along wp: path cost divided by the number of path steps.\"\"\"\n",
+ "    n_steps = float(wp.shape[0])\n",
+ "    total_cost = extract_path_cost(D, wp)\n",
+ "    return total_cost / n_steps\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "08ca0f47",
+ "metadata": {},
+ "source": [
+ "Write a function for calculating MCD for a single file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "99800a97",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def cal_mcd(gt_mfcc, synt_mfcc, cost_function, dtw_type='path_cost'):\n",
+ "    \"\"\"Return (MCD DTW, number of synthesized frames) for one ground truth / synthesized MFCC pair.\n",
+ "\n",
+ "    dtw_type='mean' averages the whole DTW cost matrix; any other value averages\n",
+ "    the cost-matrix entries along the minimum-cost warping path.\n",
+ "    \"\"\"\n",
+ "    n_frames = synt_mfcc.shape[1]\n",
+ "    acc_cost, warp_path = librosa.sequence.dtw(gt_mfcc, synt_mfcc, metric=cost_function)\n",
+ "\n",
+ "    if dtw_type == 'mean':\n",
+ "        reduced_cost = np.mean(acc_cost)\n",
+ "    else:\n",
+ "        reduced_cost = extract_frame_avg_path_cost(acc_cost, warp_path)\n",
+ "    return reduced_cost / n_frames, n_frames"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "07bb51df",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def cal_mcd_dir(synt_dir, gt_dir):\n",
+ "    \"\"\"Return the MCD DTW of each (synthesized mel .npy, ground truth audio) pair, paired by sorted file path.\"\"\"\n",
+ "    synt_filepaths = sorted(os.path.join(synt_dir, name) for name in os.listdir(synt_dir) if name.endswith('.npy'))\n",
+ "    # Skip hidden entries (e.g. .ipynb_checkpoints) so they cannot shift the pairing\n",
+ "    gt_filepaths = sorted(os.path.join(gt_dir, name) for name in os.listdir(gt_dir) if not name.startswith('.'))\n",
+ "    # zip() would silently drop unmatched files, so fail loudly on a count mismatch\n",
+ "    if len(synt_filepaths) != len(gt_filepaths):\n",
+ "        raise ValueError(f\"{len(synt_filepaths)} mel files in {synt_dir} but {len(gt_filepaths)} audio files in {gt_dir}\")\n",
+ "\n",
+ "    mcds = []\n",
+ "    for synt_melname, gt_audio in zip(synt_filepaths, gt_filepaths):\n",
+ "        synt_mels = np.load(synt_melname)\n",
+ "        synt_mfcc = mel2mfcc(synt_mels)\n",
+ "\n",
+ "        gt_mels = wav2mel(gt_audio)\n",
+ "        gt_mfcc = mel2mfcc(gt_mels)\n",
+ "\n",
+ "        mcd, _ = cal_mcd(gt_mfcc, synt_mfcc, log_spec_dB_dist)\n",
+ "        mcds.append(mcd)\n",
+ "    return mcds"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fc295283",
+ "metadata": {
+ "scrolled": true
+ },
+ "source": [
+ "### Calculate MCD DTW on synthesized files from each model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "af3b129d",
+ "metadata": {},
+ "source": [
+ "Get mel specs for both models and compare them."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2bc8da46",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "%%capture --no-display\n",
+ "mels_dir_m1 = \"audio_samples/MCD-DTW/fastpitch/mels/\"\n",
+ "mels_dir_m2 = \"audio_samples/MCD-DTW/radtts/mels/\"\n",
+ "mels_dir_gt = \"audio_samples/MCD-DTW/gt/audios/\"\n",
+ "\n",
+ "mcds_m1 = cal_mcd_dir(mels_dir_m1, mels_dir_gt)\n",
+ "mcds_m2 = cal_mcd_dir(mels_dir_m2, mels_dir_gt)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6bd1ef33",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(f\"Average MCD for model 1 is: {sum(mcds_m1)/len(mcds_m1):.2f}\")\n",
+ "print(f\"Average MCD for model 2 is: {sum(mcds_m2)/len(mcds_m2):.2f}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "71e998a9",
+ "metadata": {},
+ "source": [
+ "### Plotting MCD"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7f81c5b5",
+ "metadata": {},
+ "source": [
+ "Plot the MCD DTW values for both models; the model with the lower MCD DTW value is closer to the ground truth."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3f8d71bf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "x_axis = np.arange(1, len(mcds_m1) + 1)  ## 1-based file index; one point per evaluated file\n",
+ "plt.plot(x_axis, mcds_m1, label=\"fastpitch\")\n",
+ "plt.plot(x_axis, mcds_m2, label=\"radTTS\")\n",
+ "\n",
+ "plt.title(\"MCD DTW value for each file\")\n",
+ "plt.xlabel(\"File index\")\n",
+ "plt.ylabel(\"MCD_DTW value\")\n",
+ "plt.grid()\n",
+ "plt.legend()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fd4eb0a6",
+ "metadata": {},
+ "source": [
+ "## Conclusion\n",
+ "<img src=\"imgs/riva-tts-MCD_DTW_final_comparision.jpeg\">\n",
+ "\n",
+ "From the graph above, the MCD value is greater for the radtts mel specs than for the fastpitch mel specs; this is also reflected in the average MCD value for both models. Therefore, we can conclude that fastpitch has better convergence than radtts. However, we cannot evaluate the quality of the audio generated by these models using MCD alone. MCD is a great tool for testing model convergence, but the generated audio may still have pronunciation and quality artifacts. Therefore, MCD evaluation should be followed by MOS (Mean opinion score) and CMOS (Comparative mean opinion score) evaluations."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0aa4d25b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "py38_speech_ml0.6",
+ "language": "python",
+ "name": "py38_speech_ml0.6"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}