Skip to content

Commit bde6f27

Browse files
author
pytorchbot
committed
2025-03-22 nightly release (93f5d47)
1 parent 72f6fe3 commit bde6f27

File tree

14 files changed

+2253
-16
lines changed

14 files changed

+2253
-16
lines changed

docs/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ sphinx_copybutton
55
sphinx-tabs
66
matplotlib
77
torchvision
8+
ipython
89
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme

docs/source/api_ref_decoders.rst

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ torchcodec.decoders
77
.. currentmodule:: torchcodec.decoders
88

99

10-
For a tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
10+
For a video decoder tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
11+
For an audio decoder tutorial, see: :ref:`sphx_glr_generated_examples_audio_decoding.py`.
1112

1213

1314
.. autosummary::
@@ -16,6 +17,7 @@ For a tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
1617
:template: class.rst
1718

1819
VideoDecoder
20+
AudioDecoder
1921

2022

2123
.. autosummary::
@@ -24,3 +26,4 @@ For a tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
2426
:template: dataclass.rst
2527

2628
VideoStreamMetadata
29+
AudioStreamMetadata

docs/source/api_ref_torchcodec.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@ torchcodec
1414

1515
Frame
1616
FrameBatch
17+
AudioSamples

docs/source/glossary.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ Glossary
44
.. glossary::
55

66
pts
7-
Presentation Time Stamp. The time at which a frame should be played.
7+
Presentation Time Stamp. The time at which a frame or audio sample should be played.
88
In TorchCodec, pts are expressed in seconds.
99

1010
best stream

examples/audio_decoding.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""
8+
========================================
9+
Decoding audio streams with AudioDecoder
10+
========================================
11+
12+
In this example, we'll learn how to decode an audio file using the
13+
:class:`~torchcodec.decoders.AudioDecoder` class.
14+
"""
15+
16+
# %%
17+
# First, a bit of boilerplate: we'll download an audio file from the web and
18+
# define an audio playing utility. You can ignore that part and jump right
19+
# below to :ref:`creating_decoder_audio`.
20+
import requests
21+
from IPython.display import Audio
22+
23+
24+
def play_audio(samples):
25+
return Audio(samples.data, rate=samples.sample_rate)
26+
27+
28+
# Audio source is CC0: https://opengameart.org/content/town-theme-rpg
29+
# Attribution: cynicmusic.com pixelsphere.org
30+
url = "https://opengameart.org/sites/default/files/TownTheme.mp3"
31+
response = requests.get(url, headers={"User-Agent": ""})
32+
if response.status_code != 200:
33+
raise RuntimeError(f"Failed to download video. {response.status_code = }.")
34+
35+
raw_audio_bytes = response.content
36+
37+
# %%
38+
# .. _creating_decoder_audio:
39+
#
40+
# Creating a decoder
41+
# ------------------
42+
#
43+
# We can now create a decoder from the raw (encoded) audio bytes. You can of
44+
# course use a local audio file and pass the path as input. You can also decode
45+
# audio streams from videos!
46+
47+
from torchcodec.decoders import AudioDecoder
48+
49+
decoder = AudioDecoder(raw_audio_bytes)
50+
51+
# %%
52+
# The audio has not yet been decoded by the decoder, but we already have access to
53+
# some metadata via the ``metadata`` attribute which is an
54+
# :class:`~torchcodec.decoders.AudioStreamMetadata` object.
55+
print(decoder.metadata)
56+
57+
# %%
58+
# Decoding samples
59+
# ----------------
60+
#
61+
# To get decoded samples, we just need to call the
62+
# :meth:`~torchcodec.decoders.AudioDecoder.get_samples_played_in_range` method,
63+
# which returns an :class:`~torchcodec.AudioSamples` object:
64+
65+
samples = decoder.get_samples_played_in_range(start_seconds=0)
66+
67+
print(samples)
68+
play_audio(samples)
69+
70+
# %%
71+
# The ``.data`` field is a tensor of shape ``(num_channels, num_samples)`` and
72+
# of float dtype with values in [-1, 1].
73+
#
74+
# The ``.pts_seconds`` field indicates the starting time of the output samples.
75+
# Here it's 0.025 seconds, even though we asked for samples starting from 0. Not
76+
# all streams start exactly at 0! This is not a bug in TorchCodec, this is a
77+
# property of the file that was defined when it was encoded.
78+
#
79+
# We only output the *start* of the samples, not the end or the duration. Those can
80+
# be easily derived from the number of samples and the sample rate:
81+
82+
duration_seconds = samples.data.shape[1] / samples.sample_rate
83+
print(f"Duration = {int(duration_seconds // 60)}m{int(duration_seconds % 60)}s.")
84+
85+
# %%
86+
# Specifying a range
87+
# ------------------
88+
#
89+
# By default,
90+
# :meth:`~torchcodec.decoders.AudioDecoder.get_samples_played_in_range` decodes
91+
# the entire audio stream, but we can specify a custom range:
92+
93+
samples = decoder.get_samples_played_in_range(start_seconds=10, stop_seconds=70)
94+
95+
print(samples)
96+
play_audio(samples)
97+
98+
# %%
99+
# Custom sample rate
100+
# ------------------
101+
#
102+
# We can also decode the samples into a desired sample rate using the
103+
# ``sample_rate`` parameter of :class:`~torchcodec.decoders.AudioDecoder`. The
104+
# output will sound the same, but note that the number of samples greatly
105+
# increased:
106+
107+
decoder = AudioDecoder(raw_audio_bytes, sample_rate=16_000)
108+
samples = decoder.get_samples_played_in_range(start_seconds=0)
109+
110+
print(samples)
111+
play_audio(samples)

src/torchcodec/_frame.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,10 +120,12 @@ def __repr__(self):
120120
class AudioSamples(Iterable):
121121
"""Audio samples with associated metadata."""
122122

123-
# TODO-AUDIO: docs
124123
data: Tensor
124+
"""The sample data (``torch.Tensor`` of float in [-1, 1], shape is ``(num_channels, num_samples)``)."""
125125
pts_seconds: float
126+
"""The :term:`pts` of the first sample, in seconds."""
126127
sample_rate: int
128+
"""The sample rate of the samples, in Hz."""
127129

128130
def __post_init__(self):
129131
# This is called after __init__() when a Frame is created. We can run

src/torchcodec/decoders/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7-
from ._core import VideoStreamMetadata
7+
from ._audio_decoder import AudioDecoder # noqa
8+
from ._core import AudioStreamMetadata, VideoStreamMetadata
89
from ._video_decoder import VideoDecoder # noqa
910

1011
SimpleVideoDecoder = VideoDecoder

src/torchcodec/decoders/_audio_decoder.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,31 @@
1818

1919

2020
class AudioDecoder:
21-
"""TODO-AUDIO docs"""
21+
"""A single-stream audio decoder.
22+
23+
This can be used to decode audio from pure audio files (e.g. mp3, wav,
24+
etc.), or from videos that contain audio streams (e.g. mp4 videos).
25+
26+
Returned samples are float samples normalized in [-1, 1].
27+
28+
Args:
29+
source (str, ``pathlib.Path``, ``torch.Tensor``, or bytes): The source of the audio:
30+
31+
- If ``str``: a local path or a URL to a video or audio file.
32+
- If ``pathlib.Path``: a path to a local video or audio file.
33+
- If ``bytes`` object or ``torch.Tensor``: the raw encoded audio data.
34+
stream_index (int, optional): Specifies which stream in the file to decode samples from.
35+
Note that this index is absolute across all media types. If left unspecified, then
36+
the :term:`best stream` is used.
37+
sample_rate (int, optional): The desired output sample rate of the decoded samples.
38+
By default, the samples are returned in their original sample rate.
39+
40+
Attributes:
41+
metadata (AudioStreamMetadata): Metadata of the audio stream.
42+
stream_index (int): The stream index that this decoder is retrieving samples from. If a
43+
stream index was provided at initialization, this is the same value. If it was left
44+
unspecified, this is the :term:`best stream`.
45+
"""
2246

2347
def __init__(
2448
self,
@@ -46,10 +70,23 @@ def __init__(
4670
sample_rate if sample_rate is not None else self.metadata.sample_rate
4771
)
4872

73+
# TODO-AUDIO: start_seconds should be 0 by default
4974
def get_samples_played_in_range(
5075
self, start_seconds: float, stop_seconds: Optional[float] = None
5176
) -> AudioSamples:
52-
"""TODO-AUDIO docs"""
77+
"""Returns audio samples in the given range.
78+
79+
Samples are in the half open range [start_seconds, stop_seconds).
80+
81+
Args:
82+
start_seconds (float): Time, in seconds, of the start of the
83+
range.
84+
stop_seconds (float, optional): Time, in seconds, of the end of the
85+
range. As a half open range, the end is excluded.
86+
87+
Returns:
88+
AudioSamples: The samples within the specified range.
89+
"""
5390
if stop_seconds is not None and not start_seconds <= stop_seconds:
5491
raise ValueError(
5592
f"Invalid start seconds: {start_seconds}. It must be less than or equal to stop seconds ({stop_seconds})."

src/torchcodec/decoders/_core/VideoDecoder.cpp

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -908,7 +908,7 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
908908
// sample rate, so in theory we know the number of output samples.
909909
std::vector<torch::Tensor> frames;
910910

911-
double firstFramePtsSeconds = std::numeric_limits<double>::max();
911+
std::optional<double> firstFramePtsSeconds = std::nullopt;
912912
auto stopPts = secondsToClosestPts(stopSeconds, streamInfo.timeBase);
913913
auto finished = false;
914914
while (!finished) {
@@ -918,8 +918,9 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
918918
return startPts < avFrame->pts + getDuration(avFrame);
919919
});
920920
auto frameOutput = convertAVFrameToFrameOutput(avFrame);
921-
firstFramePtsSeconds =
922-
std::min(firstFramePtsSeconds, frameOutput.ptsSeconds);
921+
if (!firstFramePtsSeconds.has_value()) {
922+
firstFramePtsSeconds = frameOutput.ptsSeconds;
923+
}
923924
frames.push_back(frameOutput.data);
924925
} catch (const EndOfFileException& e) {
925926
finished = true;
@@ -940,7 +941,13 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
940941
frames.push_back(*lastSamples);
941942
}
942943

943-
return AudioFramesOutput{torch::cat(frames, 1), firstFramePtsSeconds};
944+
TORCH_CHECK(
945+
frames.size() > 0 && firstFramePtsSeconds.has_value(),
946+
"No audio frames were decoded. ",
947+
"This should probably not happen. ",
948+
"Please report an issue on the TorchCodec repo.");
949+
950+
return AudioFramesOutput{torch::cat(frames, 1), *firstFramePtsSeconds};
944951
}
945952

946953
// --------------------------------------------------------------------------
@@ -1481,8 +1488,11 @@ UniqueAVFrame VideoDecoder::convertAudioAVFrameSampleFormatAndSampleRate(
14811488
static_cast<const uint8_t**>(
14821489
const_cast<const uint8_t**>(srcAVFrame->data)),
14831490
srcAVFrame->nb_samples);
1491+
// numConvertedSamples can be 0 if we're downsampling by a great factor and
1492+
// the first frame doesn't contain a lot of samples. It should be handled
1493+
// properly by the caller.
14841494
TORCH_CHECK(
1485-
numConvertedSamples > 0,
1495+
numConvertedSamples >= 0,
14861496
"Error in swr_convert: ",
14871497
getFFMPEGErrorStringFromErrorCode(numConvertedSamples));
14881498

@@ -1509,17 +1519,22 @@ std::optional<torch::Tensor> VideoDecoder::maybeFlushSwrBuffers() {
15091519
return std::nullopt;
15101520
}
15111521

1512-
torch::Tensor lastSamples = torch::empty(
1513-
{getNumChannels(streamInfo.codecContext), numRemainingSamples},
1514-
torch::kFloat32);
1515-
uint8_t* lastSamplesData = static_cast<uint8_t*>(lastSamples.data_ptr());
1522+
auto numChannels = getNumChannels(streamInfo.codecContext);
1523+
torch::Tensor lastSamples =
1524+
torch::empty({numChannels, numRemainingSamples}, torch::kFloat32);
1525+
1526+
std::vector<uint8_t*> outputBuffers(numChannels);
1527+
for (auto i = 0; i < numChannels; i++) {
1528+
outputBuffers[i] = static_cast<uint8_t*>(lastSamples[i].data_ptr());
1529+
}
15161530

15171531
auto actualNumRemainingSamples = swr_convert(
15181532
streamInfo.swrContext.get(),
1519-
&lastSamplesData,
1533+
outputBuffers.data(),
15201534
numRemainingSamples,
15211535
nullptr,
15221536
0);
1537+
15231538
return lastSamples.narrow(
15241539
/*dim=*/1, /*start=*/0, /*length=*/actualNumRemainingSamples);
15251540
}

src/torchcodec/decoders/_core/_metadata.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
# TODO-AUDIO: docs below are mostly for video streams, we should edit them and /
2626
# or make sure they're OK for audio streams as well. Not sure how to best handle
2727
# docs for such class hierarchy.
28+
# TODO very related, none of these common fields in this base class show up in
29+
# the docs right now.
2830
@dataclass
2931
class StreamMetadata:
3032
duration_seconds_from_header: Optional[float]
@@ -162,8 +164,11 @@ class AudioStreamMetadata(StreamMetadata):
162164
"""Metadata of a single audio stream."""
163165

164166
sample_rate: Optional[int]
167+
"""The original sample rate."""
165168
num_channels: Optional[int]
169+
"""The number of channels (1 for mono, 2 for stereo, etc.)"""
166170
sample_format: Optional[str]
171+
"""The original sample format, as described by FFmpeg. E.g. 'fltp', 's32', etc."""
167172

168173
def __repr__(self):
169174
return super().__repr__()

0 commit comments

Comments
 (0)