38
38
39
39
40
40
import matplotlib .pyplot as plt
41
- import mir_eval
42
41
from IPython .display import Audio
43
42
44
43
######################################################################
48
47
49
48
######################################################################
50
49
# 2.1. Import the packages
51
- # ~~~~~~~~~~~~~~~~~~~~~~~~
52
50
#
53
- # First, we install and import the necessary packages.
54
- #
55
- # ``mir_eval``, ``pesq``, and ``pystoi`` packages are required for
56
- # evaluating the speech enhancement performance.
57
- #
58
-
59
- # When running this example in notebook, install the following packages.
60
- # !pip3 install mir_eval
61
- # !pip3 install pesq
62
- # !pip3 install pystoi
63
51
64
- from pesq import pesq
65
- from pystoi import stoi
66
52
from torchaudio .utils import download_asset
67
53
68
54
######################################################################
@@ -142,8 +128,14 @@ def generate_mixture(waveform_clean, waveform_noise, target_snr):
142
128
waveform_noise *= 10 ** (- (target_snr - current_snr ) / 20 )
143
129
return waveform_clean + waveform_noise
144
130
145
-
131
+ # If you have mir_eval installed, you can use it to evaluate the separation quality of the estimated sources.
132
+ # You can also evaluate the intelligibility of the speech with the Short-Time Objective Intelligibility (STOI) metric
133
+ # available in the `pystoi` package, or the Perceptual Evaluation of Speech Quality (PESQ) metric available in the `pesq` package.
146
134
def evaluate (estimate , reference ):
135
+ from pesq import pesq
136
+ from pystoi import stoi
137
+ import mir_eval
138
+
147
139
si_snr_score = si_snr (estimate , reference )
148
140
(
149
141
sdr ,
@@ -158,7 +150,6 @@ def evaluate(estimate, reference):
158
150
print (f"PESQ score: { pesq_mix } " )
159
151
print (f"STOI score: { stoi_mix } " )
160
152
161
-
162
153
######################################################################
163
154
# 3. Generate Ideal Ratio Masks (IRMs)
164
155
# ------------------------------------
@@ -211,18 +202,9 @@ def evaluate(estimate, reference):
211
202
# 3.2.1. Visualize mixture speech
212
203
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
213
204
#
214
- # We evaluate the quality of the mixture speech or the enhanced speech
215
- # using the following three metrics:
216
- #
217
- # - signal-to-distortion ratio (SDR)
218
- # - scale-invariant signal-to-noise ratio (Si-SNR, or Si-SDR in some papers)
219
- # - Perceptual Evaluation of Speech Quality (PESQ)
220
- #
221
- # We also evaluate the intelligibility of the speech with the Short-Time Objective Intelligibility
222
- # (STOI) metric.
205
+
223
206
224
207
plot_spectrogram (stft_mix [0 ], "Spectrogram of Mixture Speech (dB)" )
225
- evaluate (waveform_mix [0 :1 ], waveform_clean [0 :1 ])
226
208
Audio (waveform_mix [0 ], rate = SAMPLE_RATE )
227
209
228
210
@@ -335,7 +317,6 @@ def get_irms(stft_clean, stft_noise):
335
317
336
318
plot_spectrogram (stft_souden , "Enhanced Spectrogram by SoudenMVDR (dB)" )
337
319
waveform_souden = waveform_souden .reshape (1 , - 1 )
338
- evaluate (waveform_souden , waveform_clean [0 :1 ])
339
320
Audio (waveform_souden , rate = SAMPLE_RATE )
340
321
341
322
@@ -393,7 +374,6 @@ def get_irms(stft_clean, stft_noise):
393
374
394
375
plot_spectrogram (stft_rtf_evd , "Enhanced Spectrogram by RTFMVDR and F.rtf_evd (dB)" )
395
376
waveform_rtf_evd = waveform_rtf_evd .reshape (1 , - 1 )
396
- evaluate (waveform_rtf_evd , waveform_clean [0 :1 ])
397
377
Audio (waveform_rtf_evd , rate = SAMPLE_RATE )
398
378
399
379
@@ -404,5 +384,4 @@ def get_irms(stft_clean, stft_noise):
404
384
405
385
plot_spectrogram (stft_rtf_power , "Enhanced Spectrogram by RTFMVDR and F.rtf_power (dB)" )
406
386
waveform_rtf_power = waveform_rtf_power .reshape (1 , - 1 )
407
- evaluate (waveform_rtf_power , waveform_clean [0 :1 ])
408
387
Audio (waveform_rtf_power , rate = SAMPLE_RATE )
0 commit comments