|
40 | 40 |
|
41 | 41 |
|
42 | 42 | ######################################################################
|
43 |
| -# Applying effects and filtering |
| 43 | +# Loading the data |
44 | 44 | # ------------------------------
|
45 | 45 | #
|
46 |
| -# :py:class:`torchaudio.io.AudioEffector` allows for directly applying |
47 |
| -# filters and codecs to Tensor objects, in a similar way as ``ffmpeg`` |
48 |
| -# command |
49 |
| -# |
50 |
| -# `AudioEffector Usages <./effector_tutorial.html>`__ explains how to use |
51 |
| -# this class, so for the detail, please refer to the tutorial. |
52 |
| -# |
53 | 46 |
|
54 |
| -# Load the data |
55 | 47 | waveform1, sample_rate = torchaudio.load(SAMPLE_WAV, channels_first=False)
|
56 | 48 |
|
57 |
| -# Define effects |
58 |
| -effect = ",".join( |
59 |
| - [ |
60 |
| - "lowpass=frequency=300:poles=1", # apply single-pole lowpass filter |
61 |
| - "atempo=0.8", # reduce the speed |
62 |
| - "aecho=in_gain=0.8:out_gain=0.9:delays=200:decays=0.3|delays=400:decays=0.3" |
63 |
| - # Applying echo gives some dramatic feeling |
64 |
| - ], |
65 |
| -) |
66 |
| - |
67 |
| - |
68 |
| -# Apply effects |
69 |
| -def apply_effect(waveform, sample_rate, effect): |
70 |
| - effector = torchaudio.io.AudioEffector(effect=effect) |
71 |
| - return effector.apply(waveform, sample_rate) |
72 |
| - |
73 |
| - |
74 |
| -waveform2 = apply_effect(waveform1, sample_rate, effect) |
75 |
| - |
76 | 49 | print(waveform1.shape, sample_rate)
|
77 |
| -print(waveform2.shape, sample_rate) |
78 | 50 |
|
79 | 51 | ######################################################################
|
80 |
| -# Note that the number of frames and number of channels are different from |
81 |
| -# those of the original after the effects are applied. Let’s listen to the |
82 |
| -# audio. |
| 52 | +# Let’s listen to the audio. |
83 | 53 | #
|
84 | 54 |
|
85 | 55 |
|
@@ -124,24 +94,11 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
|
124 | 94 |
|
125 | 95 |
|
126 | 96 | ######################################################################
|
127 |
| -# Original |
128 |
| -# ~~~~~~~~ |
129 |
| -# |
130 | 97 |
|
131 | 98 | plot_waveform(waveform1.T, sample_rate, title="Original", xlim=(-0.1, 3.2))
|
132 | 99 | plot_specgram(waveform1.T, sample_rate, title="Original", xlim=(0, 3.04))
|
133 | 100 | Audio(waveform1.T, rate=sample_rate)
|
134 | 101 |
|
135 |
| -###################################################################### |
136 |
| -# Effects applied |
137 |
| -# ~~~~~~~~~~~~~~~ |
138 |
| -# |
139 |
| - |
140 |
| -plot_waveform(waveform2.T, sample_rate, title="Effects Applied", xlim=(-0.1, 3.2)) |
141 |
| -plot_specgram(waveform2.T, sample_rate, title="Effects Applied", xlim=(0, 3.04)) |
142 |
| -Audio(waveform2.T, rate=sample_rate) |
143 |
| - |
144 |
| - |
145 | 102 | ######################################################################
|
146 | 103 | # Simulating room reverberation
|
147 | 104 | # -----------------------------
|
@@ -265,143 +222,3 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
|
265 | 222 | plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
|
266 | 223 | plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]")
|
267 | 224 | Audio(noisy_speech, rate=sample_rate)
|
268 |
| - |
269 |
| - |
270 |
| -###################################################################### |
271 |
| -# Applying codec to Tensor object |
272 |
| -# ------------------------------- |
273 |
| -# |
274 |
| -# :py:class:`torchaudio.io.AudioEffector` can also apply codecs to |
275 |
| -# a Tensor object. |
276 |
| -# |
277 |
| - |
278 |
| -waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH, channels_first=False) |
279 |
| - |
280 |
| - |
281 |
| -def apply_codec(waveform, sample_rate, format, encoder=None): |
282 |
| - encoder = torchaudio.io.AudioEffector(format=format, encoder=encoder) |
283 |
| - return encoder.apply(waveform, sample_rate) |
284 |
| - |
285 |
| - |
286 |
| -###################################################################### |
287 |
| -# Original |
288 |
| -# ~~~~~~~~ |
289 |
| -# |
290 |
| - |
291 |
| -plot_waveform(waveform.T, sample_rate, title="Original") |
292 |
| -plot_specgram(waveform.T, sample_rate, title="Original") |
293 |
| -Audio(waveform.T, rate=sample_rate) |
294 |
| - |
295 |
| -###################################################################### |
296 |
| -# 8 bit mu-law |
297 |
| -# ~~~~~~~~~~~~ |
298 |
| -# |
299 |
| - |
300 |
| -mulaw = apply_codec(waveform, sample_rate, "wav", encoder="pcm_mulaw") |
301 |
| -plot_waveform(mulaw.T, sample_rate, title="8 bit mu-law") |
302 |
| -plot_specgram(mulaw.T, sample_rate, title="8 bit mu-law") |
303 |
| -Audio(mulaw.T, rate=sample_rate) |
304 |
| - |
305 |
| -###################################################################### |
306 |
| -# G.722 |
307 |
| -# ~~~~~ |
308 |
| -# |
309 |
| - |
310 |
| -g722 = apply_codec(waveform, sample_rate, "g722") |
311 |
| -plot_waveform(g722.T, sample_rate, title="G.722") |
312 |
| -plot_specgram(g722.T, sample_rate, title="G.722") |
313 |
| -Audio(g722.T, rate=sample_rate) |
314 |
| - |
315 |
| -###################################################################### |
316 |
| -# Vorbis |
317 |
| -# ~~~~~~ |
318 |
| -# |
319 |
| - |
320 |
| -vorbis = apply_codec(waveform, sample_rate, "ogg", encoder="vorbis") |
321 |
| -plot_waveform(vorbis.T, sample_rate, title="Vorbis") |
322 |
| -plot_specgram(vorbis.T, sample_rate, title="Vorbis") |
323 |
| -Audio(vorbis.T, rate=sample_rate) |
324 |
| - |
325 |
| -###################################################################### |
326 |
| -# Simulating a phone recoding |
327 |
| -# --------------------------- |
328 |
| -# |
329 |
| -# Combining the previous techniques, we can simulate audio that sounds |
330 |
| -# like a person talking over a phone in a echoey room with people talking |
331 |
| -# in the background. |
332 |
| -# |
333 |
| - |
334 |
| -sample_rate = 16000 |
335 |
| -original_speech, sample_rate = torchaudio.load(SAMPLE_SPEECH) |
336 |
| - |
337 |
| -plot_specgram(original_speech, sample_rate, title="Original") |
338 |
| - |
339 |
| -# Apply RIR |
340 |
| -rir_applied = F.fftconvolve(speech, rir) |
341 |
| - |
342 |
| -plot_specgram(rir_applied, sample_rate, title="RIR Applied") |
343 |
| - |
344 |
| -# Add background noise |
345 |
| -# Because the noise is recorded in the actual environment, we consider that |
346 |
| -# the noise contains the acoustic feature of the environment. Therefore, we add |
347 |
| -# the noise after RIR application. |
348 |
| -noise, _ = torchaudio.load(SAMPLE_NOISE) |
349 |
| -noise = noise[:, : rir_applied.shape[1]] |
350 |
| - |
351 |
| -snr_db = torch.tensor([8]) |
352 |
| -bg_added = F.add_noise(rir_applied, noise, snr_db) |
353 |
| - |
354 |
| -plot_specgram(bg_added, sample_rate, title="BG noise added") |
355 |
| - |
356 |
| -# Apply filtering and change sample rate |
357 |
| -effect = ",".join( |
358 |
| - [ |
359 |
| - "lowpass=frequency=4000:poles=1", |
360 |
| - "compand=attacks=0.02:decays=0.05:points=-60/-60|-30/-10|-20/-8|-5/-8|-2/-8:gain=-8:volume=-7:delay=0.05", |
361 |
| - ] |
362 |
| -) |
363 |
| - |
364 |
| -filtered = apply_effect(bg_added.T, sample_rate, effect) |
365 |
| -sample_rate2 = 8000 |
366 |
| - |
367 |
| -plot_specgram(filtered.T, sample_rate2, title="Filtered") |
368 |
| - |
369 |
| -# Apply telephony codec |
370 |
| -codec_applied = apply_codec(filtered, sample_rate2, "g722") |
371 |
| -plot_specgram(codec_applied.T, sample_rate2, title="G.722 Codec Applied") |
372 |
| - |
373 |
| - |
374 |
| -###################################################################### |
375 |
| -# Original speech |
376 |
| -# ~~~~~~~~~~~~~~~ |
377 |
| -# |
378 |
| - |
379 |
| -Audio(original_speech, rate=sample_rate) |
380 |
| - |
381 |
| -###################################################################### |
382 |
| -# RIR applied |
383 |
| -# ~~~~~~~~~~~ |
384 |
| -# |
385 |
| - |
386 |
| -Audio(rir_applied, rate=sample_rate) |
387 |
| - |
388 |
| -###################################################################### |
389 |
| -# Background noise added |
390 |
| -# ~~~~~~~~~~~~~~~~~~~~~~ |
391 |
| -# |
392 |
| - |
393 |
| -Audio(bg_added, rate=sample_rate) |
394 |
| - |
395 |
| -###################################################################### |
396 |
| -# Filtered |
397 |
| -# ~~~~~~~~ |
398 |
| -# |
399 |
| - |
400 |
| -Audio(filtered.T, rate=sample_rate2) |
401 |
| - |
402 |
| -###################################################################### |
403 |
| -# Codec applied |
404 |
| -# ~~~~~~~~~~~~~ |
405 |
| -# |
406 |
| - |
407 |
| -Audio(codec_applied.T, rate=sample_rate2) |
0 commit comments