Skip to content

Commit e973327

Browse files
authored
Merge pull request #670 from realpython/torchaudio
TorchAudio Materials
2 parents 373f6c7 + edfd55a commit e973327

File tree

2 files changed

+149
-0
lines changed

2 files changed

+149
-0
lines changed

torchaudio/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Use TorchAudio to Prepare Audio Data for Deep Learning
2+
3+
This folder provides sample code for the Real Python tutorial [Use TorchAudio to Prepare Audio Data for Deep Learning](https://realpython.com/python-torchaudio/).

torchaudio/speech.py

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
try:
    # copy.replace() only exists in newer Python versions.
    from copy import replace
except ImportError:

    def replace(obj, /, **changes):
        """Return a copy of ``obj`` with the given fields replaced.

        Fallback used when ``copy.replace`` cannot be imported. It delegates
        to the object's ``_replace()`` method, which named tuples provide.
        ``obj`` is positional-only, matching ``copy.replace``, so a field
        that happens to be called ``obj`` can still be passed as a keyword.
        """
        return obj._replace(**changes)
7+
8+
9+
from pathlib import Path
10+
from typing import Callable, NamedTuple, Self
11+
12+
import sounddevice as sd
13+
from IPython.display import Audio
14+
from torch import Tensor, clamp, randn_like
15+
from torch.nn import functional as F
16+
from torch.utils.data import Dataset
17+
from tqdm import tqdm
18+
19+
import torchaudio
20+
from torchaudio import functional as AF
21+
from torchaudio.datasets import SPEECHCOMMANDS
22+
from torchaudio.datasets.speechcommands import FOLDER_IN_ARCHIVE
23+
24+
25+
class SpeechSample(NamedTuple):
26+
waveform: Tensor
27+
sample_rate: int
28+
label: str
29+
speaker_id: str
30+
utterance_number: int
31+
32+
@property
33+
def num_channels(self) -> int:
34+
return self.waveform.size(0)
35+
36+
@property
37+
def num_samples(self) -> int:
38+
return self.waveform.size(1)
39+
40+
@property
41+
def num_seconds(self) -> float:
42+
return self.num_samples / self.sample_rate
43+
44+
def play(self) -> None:
45+
sd.play(
46+
self.waveform.numpy().reshape(-1, self.num_channels),
47+
self.sample_rate,
48+
blocking=True,
49+
)
50+
51+
def play_widget(self) -> Audio:
52+
return Audio(
53+
self.waveform.numpy(), rate=self.sample_rate, autoplay=True
54+
)
55+
56+
def save(self, path: str | Path) -> None:
57+
torchaudio.save(path, self.waveform, self.sample_rate)
58+
59+
def apply(self, transform: Callable[[Tensor], Tensor]) -> Self:
60+
return replace(self, waveform=transform(self.waveform))
61+
62+
def resample(self, sample_rate: int) -> Self:
63+
return replace(
64+
self,
65+
sample_rate=sample_rate,
66+
waveform=AF.resample(
67+
self.waveform,
68+
orig_freq=self.sample_rate,
69+
new_freq=sample_rate,
70+
),
71+
)
72+
73+
def pad_trim(self, seconds: int | float) -> Self:
74+
num_samples = int(self.sample_rate * seconds)
75+
if self.num_samples > num_samples:
76+
return replace(self, waveform=self.waveform[:, :num_samples])
77+
elif self.num_samples < num_samples:
78+
padding_amount = num_samples - self.num_samples
79+
return replace(
80+
self, waveform=F.pad(self.waveform, (0, padding_amount))
81+
)
82+
else:
83+
return self
84+
85+
def with_gaussian_noise(self, level=0.01) -> Self:
86+
noise = randn_like(self.waveform) * level
87+
return replace(self, waveform=clamp(self.waveform + noise, -1.0, 1.0))
88+
89+
90+
class AugmentedSpeechCommands(Dataset):
    """Speech Commands dataset that yields augmented ``SpeechSample`` items.

    Each item is loaded from disk and then, in this order, optionally
    padded or trimmed to a fixed duration, optionally mixed with Gaussian
    noise, and optionally passed through a custom waveform transform.
    """

    def __init__(
        self,
        folder: str | Path | None = None,
        seconds: int | float | None = None,
        noise_level: float = 0.005,
        enable_noise: bool = True,
        transform: Callable[[Tensor], Tensor] | None = None,
    ) -> None:
        """Set up the underlying dataset and the augmentation options.

        Args:
            folder: Dataset folder. When omitted (or empty), the
                ``FOLDER_IN_ARCHIVE`` folder inside the current working
                directory is used.
            seconds: Target duration of every sample, or ``None`` to keep
                the original length.
            noise_level: Noise level passed to
                ``SpeechSample.with_gaussian_noise()``.
            enable_noise: Whether to add Gaussian noise at all.
            transform: Optional callable applied to the waveform last.
        """
        if folder:
            self.folder = Path(folder).resolve()
        else:
            self.folder = Path.cwd() / FOLDER_IN_ARCHIVE
        # SPEECHCOMMANDS takes the root directory and the folder name
        # separately, so the resolved folder path is split in two.
        self._raw_dataset = SPEECHCOMMANDS(
            self.folder.parent, folder_in_archive=self.folder.name
        )
        self._noise = noise_level
        self._enable_noise = enable_noise
        self._transform = transform
        self._seconds = seconds

    def __len__(self) -> int:
        """Return the number of items in the underlying dataset."""
        return len(self._raw_dataset)

    def __getitem__(self, index: int) -> SpeechSample:
        """Load the item at ``index`` and apply the enabled augmentations."""
        # The second metadata field is skipped: the sample rate is taken
        # from the loaded audio file instead.
        relative_path, _, *metadata = self._raw_dataset.get_metadata(index)
        absolute_path = self.folder / relative_path
        waveform, sample_rate = torchaudio.load(absolute_path)
        speech_sample = SpeechSample(waveform, sample_rate, *metadata)

        if self._seconds is not None:
            speech_sample = speech_sample.pad_trim(self._seconds)

        if self._enable_noise:
            speech_sample = speech_sample.with_gaussian_noise(self._noise)

        # Compare against None rather than relying on truthiness: a callable
        # that defines __len__ or __bool__ could otherwise be skipped.
        if self._transform is not None:
            speech_sample = speech_sample.apply(self._transform)

        return speech_sample
130+
131+
132+
def bulk_process(
    dataset: SPEECHCOMMANDS,
    output_dir: str | Path,
    sample_rate: int,
    seconds: int | float,
) -> None:
    """Normalize every sample in ``dataset`` and save it under ``output_dir``.

    Each sample is resampled to ``sample_rate`` when its rate differs,
    padded or trimmed to ``seconds`` seconds, and written to the same
    relative path it has in the dataset, rooted at ``output_dir``. Missing
    parent directories are created. Progress is reported through tqdm.
    """
    # The output root does not change between iterations, so resolve it once.
    output_root = Path(output_dir).resolve()
    for index, sample in tqdm(enumerate(dataset), total=len(dataset)):
        speech_sample = SpeechSample(*sample)
        input_path, *_ = dataset.get_metadata(index)
        output_path = output_root / input_path
        output_path.parent.mkdir(parents=True, exist_ok=True)
        if speech_sample.sample_rate != sample_rate:
            speech_sample = speech_sample.resample(sample_rate)
        speech_sample = speech_sample.pad_trim(seconds)
        speech_sample.save(output_path)

0 commit comments

Comments
 (0)