Skip to content

Commit 1ca8443

Browse files
committed
Merge branch 'main' of github.com:pytorch/torchcodec into file_like
2 parents a093003 + 8e611bb commit 1ca8443

26 files changed

+1163
-107
lines changed

.github/workflows/build_ffmpeg.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ jobs:
3434
contents: read
3535
with:
3636
job-name: Build
37-
upload-artifact: ffmpeg-lgpl
37+
upload-artifact: ffmpeg-lgpl-linux_x86_64-${{ matrix.ffmpeg-version }}
3838
repository: pytorch/torchcodec
3939
script: |
4040
export FFMPEG_VERSION="${{ matrix.ffmpeg-version }}"
@@ -56,7 +56,7 @@ jobs:
5656
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
5757
with:
5858
job-name: Build
59-
upload-artifact: ffmpeg-lgpl
59+
upload-artifact: ffmpeg-lgpl-macos-${{ matrix.ffmpeg-version }}
6060
repository: pytorch/torchcodec
6161
runner: macos-14-xlarge
6262
script: |

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ format you want. Refer to Nvidia's GPU support matrix for more details
152152
the CUDA Toolkit.
153153

154154
2. Install or compile FFmpeg with NVDEC support.
155-
TorchCodec with CUDA should work with FFmpeg versions in [5, 7].
155+
TorchCodec with CUDA should work with FFmpeg versions in [4, 7].
156156

157157
If FFmpeg is not already installed, or you need a more recent version, an
158158
easy way to install it is to use `conda`:

packaging/build_ffmpeg.sh

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ tar -xf ffmpeg.tar.gz --strip-components 1
6161
--enable-avfilter \
6262
--enable-avformat \
6363
--enable-avutil \
64-
--enable-swscale
64+
--enable-swscale \
65+
--enable-swresample
6566

6667
make -j install
6768
ls ${prefix}/*
@@ -78,27 +79,31 @@ if [[ "$(uname)" == Darwin ]]; then
7879
avdevice=libavdevice.58
7980
avfilter=libavfilter.7
8081
swscale=libswscale.5
82+
swresample=libswresample.3
8183
elif [[ ${major_ver} == 5 ]]; then
8284
avutil=libavutil.57
8385
avcodec=libavcodec.59
8486
avformat=libavformat.59
8587
avdevice=libavdevice.59
8688
avfilter=libavfilter.8
8789
swscale=libswscale.6
90+
swresample=libswresample.4
8891
elif [[ ${major_ver} == 6 ]]; then
8992
avutil=libavutil.58
9093
avcodec=libavcodec.60
9194
avformat=libavformat.60
9295
avdevice=libavdevice.60
9396
avfilter=libavfilter.9
9497
swscale=libswscale.7
98+
swresample=libswresample.4
9599
elif [[ ${major_ver} == 7 ]]; then
96100
avutil=libavutil.59
97101
avcodec=libavcodec.61
98102
avformat=libavformat.61
99103
avdevice=libavdevice.61
100104
avfilter=libavfilter.10
101105
swscale=libswscale.8
106+
swresample=libswresample.5
102107
else
103108
printf "Error: unexpected FFmpeg major version: %s\n" ${major_ver}
104109
exit 1;
@@ -120,7 +125,7 @@ if [[ "$(uname)" == Darwin ]]; then
120125
fi
121126

122127
# List the paths to fix
123-
for lib in ${avcodec} ${avdevice} ${avfilter} ${avformat} ${avutil} ${swscale}; do
128+
for lib in ${avcodec} ${avdevice} ${avfilter} ${avformat} ${avutil} ${swscale} ${swresample}; do
124129
${otool} -l ${prefix}/lib/${lib}.dylib | grep -B2 ${prefix}
125130
done
126131

@@ -155,6 +160,13 @@ if [[ "$(uname)" == Darwin ]]; then
155160
${prefix}/lib/${swscale}.dylib
156161
${otool} -l ${prefix}/lib/${swscale}.dylib | grep -B2 ${prefix}
157162

163+
${install_name_tool} \
164+
-change ${prefix}/lib/${avutil}.dylib @rpath/${avutil}.dylib \
165+
-delete_rpath ${prefix}/lib \
166+
-id @rpath/${swresample}.dylib \
167+
${prefix}/lib/${swresample}.dylib
168+
${otool} -l ${prefix}/lib/${swresample}.dylib | grep -B2 ${prefix}
169+
158170
${install_name_tool} \
159171
-change ${prefix}/lib/${avcodec}.dylib @rpath/${avcodec}.dylib \
160172
-change ${prefix}/lib/${avutil}.dylib @rpath/${avutil}.dylib \

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[project]
2-
name = "TorchCodec"
2+
name = "torchcodec"
33
description = "A video decoder for PyTorch"
44
readme = "README.md"
55
requires-python = ">=3.8"

src/torchcodec/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
# Note: usort wants to put Frame and FrameBatch after decoders and samplers,
88
# but that results in circular import.
9-
from ._frame import Frame, FrameBatch # usort:skip # noqa
9+
from ._frame import AudioSamples, Frame, FrameBatch # usort:skip # noqa
1010
from . import decoders, samplers # noqa
1111

1212
try:

src/torchcodec/_frame.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313

1414
def _frame_repr(self):
15-
# Utility to replace Frame and FrameBatch __repr__ method. This prints the
15+
# Utility to replace __repr__ method of dataclasses below. This prints the
1616
# shape of the .data tensor rather than printing the (potentially very long)
1717
# data tensor itself.
1818
s = self.__class__.__name__ + ":\n"
@@ -114,3 +114,28 @@ def __len__(self):
114114

115115
def __repr__(self):
116116
return _frame_repr(self)
117+
118+
119+
@dataclass
120+
class AudioSamples(Iterable):
121+
"""Audio samples with associated metadata."""
122+
123+
# TODO-AUDIO: docs
124+
data: Tensor
125+
pts_seconds: float
126+
sample_rate: int
127+
128+
def __post_init__(self):
129+
# This is called after __init__() when an AudioSamples is created. We can run
130+
# input validation checks here.
131+
if not self.data.ndim == 2:
132+
raise ValueError(f"data must be 2-dimensional, got {self.data.shape = }")
133+
self.pts_seconds = float(self.pts_seconds)
134+
self.sample_rate = int(self.sample_rate)
135+
136+
def __iter__(self) -> Iterator[Union[Tensor, float]]:
137+
for field in dataclasses.fields(self):
138+
yield getattr(self, field.name)
139+
140+
def __repr__(self):
141+
return _frame_repr(self)

src/torchcodec/decoders/_audio_decoder.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from torch import Tensor
1111

12+
from torchcodec import AudioSamples
1213
from torchcodec.decoders import _core as core
1314
from torchcodec.decoders._decoder_utils import (
1415
create_decoder,
@@ -37,3 +38,70 @@ def __init__(
3738
) = get_and_validate_stream_metadata(
3839
decoder=self._decoder, stream_index=stream_index, media_type="audio"
3940
)
41+
assert isinstance(self.metadata, core.AudioStreamMetadata) # mypy
42+
43+
def get_samples_played_in_range(
44+
self, start_seconds: float, stop_seconds: Optional[float] = None
45+
) -> AudioSamples:
46+
"""TODO-AUDIO docs"""
47+
if stop_seconds is not None and not start_seconds <= stop_seconds:
48+
raise ValueError(
49+
f"Invalid start seconds: {start_seconds}. It must be less than or equal to stop seconds ({stop_seconds})."
50+
)
51+
if not self._begin_stream_seconds <= start_seconds < self._end_stream_seconds:
52+
raise ValueError(
53+
f"Invalid start seconds: {start_seconds}. "
54+
f"It must be greater than or equal to {self._begin_stream_seconds} "
55+
f"and less than or equal to {self._end_stream_seconds}."
56+
)
57+
frames, first_pts = core.get_frames_by_pts_in_range_audio(
58+
self._decoder,
59+
start_seconds=start_seconds,
60+
stop_seconds=stop_seconds,
61+
)
62+
first_pts = first_pts.item()
63+
64+
# x = frame boundaries
65+
#
66+
# first_pts last_pts
67+
# v v
68+
# ....x..........x..........x...........x..........x..........x.....
69+
# ^ ^
70+
# start_seconds stop_seconds
71+
#
72+
# We want to return the samples in [start_seconds, stop_seconds). But
73+
# because the core API is based on frames, the `frames` tensor contains
74+
# the samples in [first_pts, last_pts)
75+
# So we do some basic math to figure out the position of the view that
76+
# we'll return.
77+
78+
# TODO: sample_rate is either the original one from metadata, or the
79+
# user-specified one (not implemented yet)
80+
assert isinstance(self.metadata, core.AudioStreamMetadata) # mypy
81+
sample_rate = self.metadata.sample_rate
82+
83+
# TODO: metadata's sample_rate should probably not be Optional
84+
assert sample_rate is not None # mypy.
85+
86+
if first_pts < start_seconds:
87+
offset_beginning = round((start_seconds - first_pts) * sample_rate)
88+
output_pts_seconds = start_seconds
89+
else:
90+
# In normal cases we'll have first_pts <= start_seconds, but in some
91+
# edge cases it's possible to have first_pts > start_seconds,
92+
# typically if the stream's first frame's pts isn't exactly 0.
93+
offset_beginning = 0
94+
output_pts_seconds = first_pts
95+
96+
num_samples = frames.shape[1]
97+
last_pts = first_pts + num_samples / self.metadata.sample_rate
98+
if stop_seconds is not None and stop_seconds < last_pts:
99+
offset_end = num_samples - round((last_pts - stop_seconds) * sample_rate)
100+
else:
101+
offset_end = num_samples
102+
103+
return AudioSamples(
104+
data=frames[:, offset_beginning:offset_end],
105+
pts_seconds=output_pts_seconds,
106+
sample_rate=sample_rate,
107+
)

src/torchcodec/decoders/_core/FFMPEGCommon.cpp

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,15 +60,22 @@ int64_t getDuration(const AVFrame* frame) {
6060
#endif
6161
}
6262

63-
int64_t getNumChannels(const UniqueAVCodecContext& avCodecContext) {
63+
int getNumChannels(const AVFrame* avFrame) {
6464
#if LIBAVFILTER_VERSION_MAJOR > 8 || \
6565
(LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
66-
int numChannels = avCodecContext->ch_layout.nb_channels;
66+
return avFrame->ch_layout.nb_channels;
6767
#else
68-
int numChannels = avCodecContext->channels;
68+
return av_get_channel_layout_nb_channels(avFrame->channel_layout);
6969
#endif
70+
}
7071

71-
return static_cast<int64_t>(numChannels);
72+
int getNumChannels(const UniqueAVCodecContext& avCodecContext) {
73+
#if LIBAVFILTER_VERSION_MAJOR > 8 || \
74+
(LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
75+
return avCodecContext->ch_layout.nb_channels;
76+
#else
77+
return avCodecContext->channels;
78+
#endif
7279
}
7380

7481
AVIOBytesContext::AVIOBytesContext(

src/torchcodec/decoders/_core/FFMPEGCommon.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,8 @@ std::string getFFMPEGErrorStringFromErrorCode(int errorCode);
139139
int64_t getDuration(const UniqueAVFrame& frame);
140140
int64_t getDuration(const AVFrame* frame);
141141

142-
int64_t getNumChannels(const UniqueAVCodecContext& avCodecContext);
142+
int getNumChannels(const AVFrame* avFrame);
143+
int getNumChannels(const UniqueAVCodecContext& avCodecContext);
143144

144145
// Returns true if sws_scale can handle unaligned data.
145146
bool canSwsScaleHandleUnalignedData();

0 commit comments

Comments
 (0)