
Commit a4576a7 (parent: 74135c8)

Use torchcodec in examples and integration tests too


41 files changed: +166, -104 lines

docs/source/index.rst

Lines changed: 2 additions & 2 deletions

@@ -182,7 +182,7 @@ Tutorials

.. customcarditem::
   :header: Loading waveform Tensors from files and saving them
-  :card_description: Learn how to query/load audio files and save waveform tensors to files, using <code>torchaudio.info</code>, <code>torchaudio.load</code> and <code>torchaudio.save</code> functions.
+  :card_description: Learn how to query/load audio files and save waveform tensors to files, using <code>torchaudio.info</code>, <code>torchaudio.utils.load_torchcodec</code> and <code>torchaudio.save</code> functions.
   :image: https://download.pytorch.org/torchaudio/tutorial-assets/thumbnails/audio_io_tutorial.png
   :link: tutorials/audio_io_tutorial.html
   :tags: I/O

@@ -399,7 +399,7 @@ In BibTeX format:
.. code-block:: bibtex

   @misc{hwang2023torchaudio,
-     title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch},
+     title={TorchAudio 2.1: Advancing speech recognition, self-supervised learning, and audio processing components for PyTorch},
      author={Jeff Hwang and Moto Hira and Caroline Chen and Xiaohui Zhang and Zhaoheng Ni and Guangzhi Sun and Pingchuan Ma and Ruizhe Huang and Vineel Pratap and Yuekai Zhang and Anurag Kumar and Chin-Yun Yu and Chuang Zhu and Chunxi Liu and Jacob Kahn and Mirco Ravanelli and Peng Sun and Shinji Watanabe and Yangyang Shi and Yumeng Tao and Robin Scheibler and Samuele Cornell and Sean Kim and Stavros Petridis},
      year={2023},
      eprint={2310.17864},
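
The doc change above mirrors the swap made throughout this commit: call sites that used torchaudio.load now call torchaudio.utils.load_torchcodec. As a rough illustration (not part of the diff), and assuming load_torchcodec returns the same (waveform, sample_rate) pair that torchaudio.load does, a migrated call site looks like this; the file paths are hypothetical:

# Sketch only; assumes load_torchcodec mirrors torchaudio.load's return value.
import torchaudio
from torchaudio.utils import load_torchcodec

path = "example.wav"                                 # hypothetical input file
info = torchaudio.info(path)                         # metadata query is unchanged
waveform, sample_rate = load_torchcodec(path)        # was: torchaudio.load(path)
torchaudio.save("copy.wav", waveform, sample_rate)   # saving is unchanged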

examples/asr/emformer_rnnt/mustc/dataset.py

Lines changed: 2 additions & 1 deletion

@@ -4,6 +4,7 @@
import torch
import torchaudio
import yaml
+from torchaudio.utils import load_torchcodec


FOLDER_IN_ARCHIVE = "en-de"

@@ -39,7 +40,7 @@ def __init__(

    def _get_mustc_item(self, idx):
        file_path, offset, duration = self.wav_list[idx]
-       waveform, sr = torchaudio.load(file_path, frame_offset=offset, num_frames=duration)
+       waveform, sr = load_torchcodec(file_path, frame_offset=offset, num_frames=duration)
        assert sr == SAMPLE_RATE
        transcript = self.trans_list[idx].replace("\n", "")
        return (waveform, transcript)
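
For reference, a minimal sketch of the segment-loading pattern used by this dataset, assuming load_torchcodec accepts the same frame_offset/num_frames keywords that appear in the call site above; the helper name and expected_sr parameter are illustrative:

# Sketch only; mirrors _get_mustc_item's loading step in isolation.
from torchaudio.utils import load_torchcodec

def load_segment(file_path, offset, duration, expected_sr):
    # Read `duration` frames starting `offset` frames into the file.
    waveform, sr = load_torchcodec(file_path, frame_offset=offset, num_frames=duration)
    assert sr == expected_sr  # the dataset checks against SAMPLE_RATE here
    return waveform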

examples/avsr/data_prep/data/data_module.py

Lines changed: 2 additions & 2 deletions

@@ -7,7 +7,7 @@
import torch
import torchaudio
import torchvision
-
+from torchaudio.utils import load_torchcodec

class AVSRDataLoader:
    def __init__(self, modality, detector="retinaface", resize=None):

@@ -39,7 +39,7 @@ def load_data(self, data_filename, transform=True):
        return video

    def load_audio(self, data_filename):
-       waveform, sample_rate = torchaudio.load(data_filename, normalize=True)
+       waveform, sample_rate = load_torchcodec(data_filename, normalize=True)
        return waveform, sample_rate

    def load_video(self, data_filename):

examples/avsr/lrs3.py

Lines changed: 2 additions & 1 deletion

@@ -3,6 +3,7 @@
import torchaudio
import torchvision
from torch.utils.data import Dataset
+from torchaudio.utils import load_torchcodec


def _load_list(args, *filenames):

@@ -31,7 +32,7 @@ def load_audio(path):
    """
    rtype: torch, T x 1
    """
-   waveform, sample_rate = torchaudio.load(path, normalize=True)
+   waveform, sample_rate = load_torchcodec(path, normalize=True)
    return waveform.transpose(1, 0)
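
Both AVSR loaders above pass normalize=True and, in lrs3.py, transpose the result to T x 1. A short sketch of the assumed shape convention (channel-first output from the loader, as with torchaudio.load); this is illustrative only, and the (1, T) mono shape is an assumption:

# Sketch only; assumes a (channels, time) float tensor when normalize=True.
from torchaudio.utils import load_torchcodec

def load_audio_t_by_1(path):
    waveform, sample_rate = load_torchcodec(path, normalize=True)  # assumed shape: (1, T)
    return waveform.transpose(1, 0)                                # shape: (T, 1)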

examples/dnn_beamformer/datamodule.py

Lines changed: 4 additions & 3 deletions

@@ -8,6 +8,7 @@
from torch import Tensor
from torch.utils.data import Dataset
from utils import CollateFnL3DAS22
+from torchaudio.utils import load_torchcodec

_PREFIX = "L3DAS22_Task1_"
_SUBSETS = {

@@ -46,10 +47,10 @@ def __getitem__(self, n: int) -> Tuple[Tensor, Tensor, int, str]:
        noisy_path_B = str(noisy_path_A).replace("_A.wav", "_B.wav")
        clean_path = noisy_path_A.parent.parent / "labels" / noisy_path_A.name.replace("_A.wav", ".wav")
        transcript_path = str(clean_path).replace("wav", "txt")
-       waveform_noisy_A, sample_rate1 = torchaudio.load(noisy_path_A)
-       waveform_noisy_B, sample_rate2 = torchaudio.load(noisy_path_B)
+       waveform_noisy_A, sample_rate1 = load_torchcodec(noisy_path_A)
+       waveform_noisy_B, sample_rate2 = load_torchcodec(noisy_path_B)
        waveform_noisy = torch.cat((waveform_noisy_A, waveform_noisy_B), dim=0)
-       waveform_clean, sample_rate3 = torchaudio.load(clean_path)
+       waveform_clean, sample_rate3 = load_torchcodec(clean_path)
        assert sample_rate1 == _SAMPLE_RATE and sample_rate2 == _SAMPLE_RATE and sample_rate3 == _SAMPLE_RATE
        with open(transcript_path, "r") as f:
            transcript = f.readline()
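
The L3DAS22 item above reads the paired "_A.wav"/"_B.wav" recordings and concatenates them along the channel dimension. A standalone sketch of that step; the function name and the expected_sr parameter are illustrative:

# Sketch only; expected_sr stands in for the dataset's _SAMPLE_RATE constant.
import torch
from torchaudio.utils import load_torchcodec

def load_noisy_pair(noisy_path_A, noisy_path_B, expected_sr):
    waveform_A, sr_A = load_torchcodec(noisy_path_A)
    waveform_B, sr_B = load_torchcodec(noisy_path_B)
    assert sr_A == expected_sr and sr_B == expected_sr
    # Stack the two recordings channel-wise into one multichannel tensor.
    return torch.cat((waveform_A, waveform_B), dim=0)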

examples/hubert/dataset/hubert_dataset.py

Lines changed: 4 additions & 1 deletion

@@ -12,6 +12,9 @@
from torch import Tensor
from torch.utils.data import BatchSampler, Dataset, DistributedSampler

+from torchaudio.utils import load_torchcodec
+
+
sys.path.append("..")
from utils import _get_label2id

@@ -299,7 +302,7 @@ def _load_audio(self, index: int) -> Tensor:
            (Tensor): The corresponding waveform Tensor.
        """
        wav_path = self.f_list[index]
-       waveform, sample_rate = torchaudio.load(wav_path)
+       waveform, sample_rate = load_torchcodec(wav_path)
        assert waveform.shape[1] == self.len_list[index]
        return waveform

examples/hubert/utils/feature_utils.py

Lines changed: 3 additions & 2 deletions

@@ -13,6 +13,7 @@
from torch.nn import Module

from .common_utils import _get_feat_lens_paths
+from torchaudio.utils import load_torchcodec

_LG = logging.getLogger(__name__)
_DEFAULT_DEVICE = torch.device("cpu")

@@ -53,7 +54,7 @@ def extract_feature_mfcc(
    Returns:
        Tensor: The desired feature tensor of the given audio file.
    """
-   waveform, sr = torchaudio.load(path)
+   waveform, sr = load_torchcodec(path)
    assert sr == sample_rate
    feature_extractor = torchaudio.transforms.MFCC(
        sample_rate=sample_rate, n_mfcc=13, melkwargs={"n_fft": 400, "hop_length": 160, "center": False}

@@ -88,7 +89,7 @@ def extract_feature_hubert(
    Returns:
        Tensor: The desired feature tensor of the given audio file.
    """
-   waveform, sr = torchaudio.load(path)
+   waveform, sr = load_torchcodec(path)
    assert sr == sample_rate
    waveform = waveform.to(device)
    with torch.inference_mode():
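
Only the loader changes in the two feature extractors above; the transforms themselves are untouched. A self-contained sketch of the MFCC path using the same MFCC settings that appear in the diff; the default sample_rate here is illustrative:

# Sketch only; reuses the MFCC parameters shown in extract_feature_mfcc.
import torchaudio
from torchaudio.utils import load_torchcodec

def extract_mfcc(path, sample_rate=16000):
    waveform, sr = load_torchcodec(path)
    assert sr == sample_rate
    feature_extractor = torchaudio.transforms.MFCC(
        sample_rate=sample_rate, n_mfcc=13, melkwargs={"n_fft": 400, "hop_length": 160, "center": False}
    )
    return feature_extractor(waveform)  # (channel, n_mfcc, time)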

examples/libtorchaudio/augmentation/create_jittable_pipeline.py

Lines changed: 3 additions & 3 deletions

@@ -7,7 +7,7 @@

import torch
import torchaudio
-
+from torchaudio.utils import load_torchcodec

class Pipeline(torch.nn.Module):
    """Example audio process pipeline.

@@ -17,15 +17,15 @@ class Pipeline(torch.nn.Module):

    def __init__(self, rir_path: str):
        super().__init__()
-       rir, sample_rate = torchaudio.load(rir_path)
+       rir, sample_rate = load_torchcodec(rir_path)
        self.register_buffer("rir", rir)
        self.rir_sample_rate: int = sample_rate

    def forward(self, input_path: str, output_path: str):
        torchaudio.sox_effects.init_sox_effects()

        # 1. load audio
-       waveform, sample_rate = torchaudio.load(input_path)
+       waveform, sample_rate = load_torchcodec(input_path)

        # 2. Add background noise
        alpha = 0.01
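
The pipeline above loads the RIR once in __init__ and stores it with register_buffer, so the tensor follows the module across devices and is included in its state dict. A stripped-down sketch of just that pattern, independent of the rest of the pipeline; whether load_torchcodec itself is TorchScript-scriptable is not established here, which is why loading happens eagerly in __init__:

# Sketch only; isolates the RIR-as-buffer pattern from the example pipeline.
import torch
from torchaudio.utils import load_torchcodec

class RIRHolder(torch.nn.Module):
    def __init__(self, rir_path: str):
        super().__init__()
        rir, sample_rate = load_torchcodec(rir_path)
        self.register_buffer("rir", rir)          # non-trainable, device-tracked tensor
        self.rir_sample_rate: int = sample_rate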

examples/libtorchaudio/speech_recognition/build_pipeline_from_fairseq.py

Lines changed: 2 additions & 1 deletion

@@ -14,6 +14,7 @@
from greedy_decoder import Decoder
from torch.utils.mobile_optimizer import optimize_for_mobile
from torchaudio.models.wav2vec2.utils.import_fairseq import import_fairseq_model
+from torchaudio.utils import load_torchcodec

TORCH_VERSION: Tuple[int, ...] = tuple(int(x) for x in torch.__version__.split(".")[:2])
if TORCH_VERSION >= (1, 10):

@@ -58,7 +59,7 @@ def _parse_args():

class Loader(torch.nn.Module):
    def forward(self, audio_path: str) -> torch.Tensor:
-       waveform, sample_rate = torchaudio.load(audio_path)
+       waveform, sample_rate = load_torchcodec(audio_path)
        if sample_rate != 16000:
            waveform = torchaudio.functional.resample(waveform, float(sample_rate), 16000.0)
        return waveform
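
The Loader above forces every input to 16 kHz before it reaches the wav2vec2 model; the 16000 target and the resample call come straight from the diff. The same guard as a plain function, for illustration only:

# Sketch only; the function name is illustrative.
import torch
import torchaudio
from torchaudio.utils import load_torchcodec

def load_16k(audio_path: str) -> torch.Tensor:
    waveform, sample_rate = load_torchcodec(audio_path)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, float(sample_rate), 16000.0)
    return waveform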

examples/libtorchaudio/speech_recognition/build_pipeline_from_huggingface_transformers.py

Lines changed: 2 additions & 1 deletion

@@ -8,6 +8,7 @@
import torchaudio
from greedy_decoder import Decoder
from torchaudio.models.wav2vec2.utils.import_huggingface import import_huggingface_model
+from torchaudio.utils import load_torchcodec

TORCH_VERSION: Tuple[int, ...] = tuple(int(x) for x in torch.__version__.split(".")[:2])
if TORCH_VERSION >= (1, 10):

@@ -49,7 +50,7 @@ def _parse_args():

class Loader(torch.nn.Module):
    def forward(self, audio_path: str) -> torch.Tensor:
-       waveform, sample_rate = torchaudio.load(audio_path)
+       waveform, sample_rate = load_torchcodec(audio_path)
        if sample_rate != 16000:
            waveform = torchaudio.functional.resample(waveform, float(sample_rate), 16000.0)
        return waveform
