Skip to content

Commit e8c5704

Browse files
author
pytorchbot
committed
2025-06-25 nightly release (d6ce570)
1 parent ae0777c commit e8c5704

File tree

7 files changed

+203
-12
lines changed

7 files changed

+203
-12
lines changed

src/torchcodec/_core/Metadata.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
extern "C" {
1414
#include <libavcodec/avcodec.h>
1515
#include <libavutil/avutil.h>
16+
#include <libavutil/rational.h>
1617
}
1718

1819
namespace facebook::torchcodec {
@@ -45,6 +46,7 @@ struct StreamMetadata {
4546
// Video-only fields derived from the AVCodecContext.
4647
std::optional<int64_t> width;
4748
std::optional<int64_t> height;
49+
std::optional<AVRational> sampleAspectRatio;
4850

4951
// Audio-only fields
5052
std::optional<int64_t> sampleRate;

src/torchcodec/_core/SingleStreamDecoder.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -454,6 +454,8 @@ void SingleStreamDecoder::addVideoStream(
454454

455455
streamMetadata.width = streamInfo.codecContext->width;
456456
streamMetadata.height = streamInfo.codecContext->height;
457+
streamMetadata.sampleAspectRatio =
458+
streamInfo.codecContext->sample_aspect_ratio;
457459
}
458460

459461
void SingleStreamDecoder::addAudioStream(

src/torchcodec/_core/_metadata.py

Lines changed: 46 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import json
99
import pathlib
1010
from dataclasses import dataclass
11+
from fractions import Fraction
1112
from typing import List, Optional, Union
1213

1314
import torch
@@ -80,22 +81,37 @@ class VideoStreamMetadata(StreamMetadata):
8081
average_fps_from_header: Optional[float]
8182
"""Average fps of the stream, obtained from the header (float or None).
8283
We recommend using the ``average_fps`` attribute instead."""
84+
pixel_aspect_ratio: Optional[Fraction]
85+
"""Pixel Aspect Ratio (PAR), also known as Sample Aspect Ratio
86+
(SAR --- not to be confused with Storage Aspect Ratio, also SAR),
87+
is the ratio between the width and height of each pixel
88+
(``fractions.Fraction`` or None)."""
8389

8490
@property
8591
def duration_seconds(self) -> Optional[float]:
8692
"""Duration of the stream in seconds. We try to calculate the duration
8793
from the actual frames if a :term:`scan` was performed. Otherwise we
88-
fall back to ``duration_seconds_from_header``.
94+
fall back to ``duration_seconds_from_header``. If that value is also None,
95+
we instead calculate the duration from ``num_frames_from_header`` and
96+
``average_fps_from_header``.
8997
"""
9098
if (
91-
self.end_stream_seconds_from_content is None
92-
or self.begin_stream_seconds_from_content is None
99+
self.end_stream_seconds_from_content is not None
100+
and self.begin_stream_seconds_from_content is not None
93101
):
102+
return (
103+
self.end_stream_seconds_from_content
104+
- self.begin_stream_seconds_from_content
105+
)
106+
elif self.duration_seconds_from_header is not None:
94107
return self.duration_seconds_from_header
95-
return (
96-
self.end_stream_seconds_from_content
97-
- self.begin_stream_seconds_from_content
98-
)
108+
elif (
109+
self.num_frames_from_header is not None
110+
and self.average_fps_from_header is not None
111+
):
112+
return self.num_frames_from_header / self.average_fps_from_header
113+
else:
114+
return None
99115

100116
@property
101117
def begin_stream_seconds(self) -> float:
@@ -123,14 +139,22 @@ def end_stream_seconds(self) -> Optional[float]:
123139

124140
@property
125141
def num_frames(self) -> Optional[int]:
126-
"""Number of frames in the stream. This corresponds to
127-
``num_frames_from_content`` if a :term:`scan` was made, otherwise it
128-
corresponds to ``num_frames_from_header``.
142+
"""Number of frames in the stream (int or None).
143+
This corresponds to ``num_frames_from_content`` if a :term:`scan` was made,
144+
otherwise it corresponds to ``num_frames_from_header``. If that value is also
145+
None, the number of frames is calculated from the duration and the average fps.
129146
"""
130147
if self.num_frames_from_content is not None:
131148
return self.num_frames_from_content
132-
else:
149+
elif self.num_frames_from_header is not None:
133150
return self.num_frames_from_header
151+
elif (
152+
self.average_fps_from_header is not None
153+
and self.duration_seconds_from_header is not None
154+
):
155+
return int(self.average_fps_from_header * self.duration_seconds_from_header)
156+
else:
157+
return None
134158

135159
@property
136160
def average_fps(self) -> Optional[float]:
@@ -211,6 +235,16 @@ def best_audio_stream(self) -> AudioStreamMetadata:
211235
return metadata
212236

213237

238+
def _get_optional_par_fraction(stream_dict):
239+
try:
240+
return Fraction(
241+
stream_dict["sampleAspectRatioNum"],
242+
stream_dict["sampleAspectRatioDen"],
243+
)
244+
except KeyError:
245+
return None
246+
247+
214248
# TODO-AUDIO: This is user-facing. Should this just be `get_metadata`, without
215249
# the "container" name in it? Same below.
216250
def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata:
@@ -247,6 +281,7 @@ def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata:
247281
num_frames_from_header=stream_dict.get("numFramesFromHeader"),
248282
num_frames_from_content=stream_dict.get("numFramesFromContent"),
249283
average_fps_from_header=stream_dict.get("averageFpsFromHeader"),
284+
pixel_aspect_ratio=_get_optional_par_fraction(stream_dict),
250285
**common_meta,
251286
)
252287
)

src/torchcodec/_core/custom_ops.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -604,6 +604,12 @@ std::string get_stream_json_metadata(
604604
if (streamMetadata.height.has_value()) {
605605
map["height"] = std::to_string(*streamMetadata.height);
606606
}
607+
if (streamMetadata.sampleAspectRatio.has_value()) {
608+
map["sampleAspectRatioNum"] =
609+
std::to_string((*streamMetadata.sampleAspectRatio).num);
610+
map["sampleAspectRatioDen"] =
611+
std::to_string((*streamMetadata.sampleAspectRatio).den);
612+
}
607613
if (streamMetadata.averageFpsFromHeader.has_value()) {
608614
map["averageFpsFromHeader"] =
609615
std::to_string(*streamMetadata.averageFpsFromHeader);

test/test_decoders.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
import contextlib
88
import gc
9+
import json
10+
from unittest.mock import patch
911

1012
import numpy
1113
import pytest
@@ -738,6 +740,56 @@ def test_get_frames_in_range(self, stream_index, device, seek_mode):
738740
empty_frames.duration_seconds, NASA_VIDEO.empty_duration_seconds
739741
)
740742

743+
@pytest.mark.parametrize("device", cpu_and_cuda())
@pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
@patch("torchcodec._core._metadata._get_stream_json_metadata")
def test_get_frames_with_missing_num_frames_metadata(
    self, mock_get_stream_json_metadata, device, seek_mode
):
    """Check that num_frames is derived when both frame counts are missing.

    The stream metadata getter is patched to return a payload whose
    ``numFramesFromHeader`` and ``numFramesFromContent`` are both null, so
    the decoder has to compute ``num_frames`` from the duration and the
    average fps.
    """
    fake_stream_metadata = dict(
        averageFpsFromHeader=29.97003,
        beginStreamSecondsFromContent=0.0,
        beginStreamSecondsFromHeader=0.0,
        bitRate=128783.0,
        codec="h264",
        durationSecondsFromHeader=13.013,
        endStreamSecondsFromContent=13.013,
        width=480,
        height=270,
        mediaType="video",
        numFramesFromHeader=None,
        numFramesFromContent=None,
    )
    # The patched getter hands back the serialized fake payload.
    mock_get_stream_json_metadata.return_value = json.dumps(fake_stream_metadata)

    decoder = VideoDecoder(
        NASA_VIDEO.path,
        stream_index=3,
        device=device,
        seek_mode=seek_mode,
    )
    metadata = decoder.metadata

    # Both explicit frame counts are absent ...
    assert metadata.num_frames_from_header is None
    assert metadata.num_frames_from_content is None
    # ... while the inputs to the fallback computation are available.
    assert metadata.duration_seconds is not None
    assert metadata.average_fps is not None

    expected_num_frames = int(metadata.duration_seconds * metadata.average_fps)
    assert metadata.num_frames == expected_num_frames
    assert len(decoder) == 390

    # get_frames_in_range relies (in Python) on the num_frames value computed
    # from the mocked metadata; the frame itself is read at the C++ level.
    expected_frames = NASA_VIDEO.get_frame_data_by_range(
        start=9, stop=10, stream_index=3
    ).to(device)
    decoded_frames = decoder.get_frames_in_range(start=9, stop=10)
    assert_frames_equal(expected_frames, decoded_frames.data)
792+
741793
@pytest.mark.parametrize("dimension_order", ["NCHW", "NHWC"])
742794
@pytest.mark.parametrize(
743795
"frame_getter",

test/test_metadata.py

Lines changed: 92 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# LICENSE file in the root directory of this source tree.
66

77
import functools
8+
from fractions import Fraction
89

910
import pytest
1011

@@ -81,6 +82,7 @@ def test_get_metadata(metadata_getter):
8182
assert best_video_stream_metadata.begin_stream_seconds_from_header == 0
8283
assert best_video_stream_metadata.bit_rate == 128783
8384
assert best_video_stream_metadata.average_fps == pytest.approx(29.97, abs=0.001)
85+
assert best_video_stream_metadata.pixel_aspect_ratio is None
8486
assert best_video_stream_metadata.codec == "h264"
8587
assert best_video_stream_metadata.num_frames_from_content == (
8688
390 if with_scan else None
@@ -119,7 +121,7 @@ def test_get_metadata_audio_file(metadata_getter):
119121

120122
@pytest.mark.parametrize(
121123
"num_frames_from_header, num_frames_from_content, expected_num_frames",
122-
[(None, 10, 10), (10, None, 10), (None, None, None)],
124+
[(10, 20, 20), (None, 10, 10), (10, None, 10)],
123125
)
124126
def test_num_frames_fallback(
125127
num_frames_from_header, num_frames_from_content, expected_num_frames
@@ -137,12 +139,100 @@ def test_num_frames_fallback(
137139
width=123,
138140
height=321,
139141
average_fps_from_header=30,
142+
pixel_aspect_ratio=Fraction(1, 1),
140143
stream_index=0,
141144
)
142145

143146
assert metadata.num_frames == expected_num_frames
144147

145148

149+
@pytest.mark.parametrize(
    "average_fps_from_header, duration_seconds_from_header, expected_num_frames",
    [(60, 10, 600), (60, None, None), (None, 10, None), (None, None, None)],
)
def test_calculate_num_frames_using_fps_and_duration(
    average_fps_from_header, duration_seconds_from_header, expected_num_frames
):
    """Check that if num_frames_from_content and num_frames_from_header are
    missing, `.num_frames` is calculated using average_fps_from_header and
    duration_seconds_from_header. If either of those is missing too,
    `.num_frames` is None.
    """
    metadata = VideoStreamMetadata(
        duration_seconds_from_header=duration_seconds_from_header,
        bit_rate=123,
        num_frames_from_header=None,  # None to test calculating num_frames
        num_frames_from_content=None,  # None to test calculating num_frames
        begin_stream_seconds_from_header=0,
        begin_stream_seconds_from_content=0,
        end_stream_seconds_from_content=4,
        codec="whatever",
        width=123,
        height=321,
        average_fps_from_header=average_fps_from_header,
        # pixel_aspect_ratio is declared without a default, so it must be
        # passed explicitly; the value is irrelevant to this test.
        pixel_aspect_ratio=Fraction(1, 1),
        stream_index=0,
    )

    assert metadata.num_frames == expected_num_frames
175+
176+
177+
@pytest.mark.parametrize(
    "duration_seconds_from_header, begin_stream_seconds_from_content, end_stream_seconds_from_content, expected_duration_seconds",
    [(60, 5, 20, 15), (60, 1, None, 60), (60, None, 1, 60), (None, 0, 10, 10)],
)
def test_duration_seconds_fallback(
    duration_seconds_from_header,
    begin_stream_seconds_from_content,
    end_stream_seconds_from_content,
    expected_duration_seconds,
):
    """Check that using begin_stream_seconds_from_content and
    end_stream_seconds_from_content to calculate `.duration_seconds` has
    priority. If either value is missing, duration_seconds_from_header is used.
    """
    metadata = VideoStreamMetadata(
        duration_seconds_from_header=duration_seconds_from_header,
        bit_rate=123,
        num_frames_from_header=5,
        num_frames_from_content=10,
        begin_stream_seconds_from_header=0,
        begin_stream_seconds_from_content=begin_stream_seconds_from_content,
        end_stream_seconds_from_content=end_stream_seconds_from_content,
        codec="whatever",
        width=123,
        height=321,
        average_fps_from_header=5,
        # pixel_aspect_ratio is declared without a default, so it must be
        # passed explicitly; the value is irrelevant to this test.
        pixel_aspect_ratio=Fraction(1, 1),
        stream_index=0,
    )

    assert metadata.duration_seconds == expected_duration_seconds
206+
207+
208+
@pytest.mark.parametrize(
    "num_frames_from_header, average_fps_from_header, expected_duration_seconds",
    [(100, 10, 10), (100, None, None), (None, 10, None), (None, None, None)],
)
def test_calculate_duration_seconds_using_fps_and_num_frames(
    num_frames_from_header, average_fps_from_header, expected_duration_seconds
):
    """Check that duration_seconds is calculated using average_fps_from_header
    and num_frames_from_header if duration_seconds_from_header is missing.
    If either of those is missing too, duration_seconds is None.
    """
    metadata = VideoStreamMetadata(
        duration_seconds_from_header=None,  # None to test calculating duration_seconds
        bit_rate=123,
        num_frames_from_header=num_frames_from_header,
        num_frames_from_content=10,
        begin_stream_seconds_from_header=0,
        begin_stream_seconds_from_content=None,  # None to test calculating duration_seconds
        end_stream_seconds_from_content=None,  # None to test calculating duration_seconds
        codec="whatever",
        width=123,
        height=321,
        average_fps_from_header=average_fps_from_header,
        # pixel_aspect_ratio is declared without a default, so it must be
        # passed explicitly; the value is irrelevant to this test.
        pixel_aspect_ratio=Fraction(1, 1),
        stream_index=0,
    )
    assert metadata.duration_seconds_from_header is None
    assert metadata.duration_seconds == expected_duration_seconds
234+
235+
146236
def test_repr():
147237
# Test for calls to print(), str(), etc. Useful to make sure we don't forget
148238
# to add additional @properties to __repr__
@@ -161,6 +251,7 @@ def test_repr():
161251
num_frames_from_header: 390
162252
num_frames_from_content: 390
163253
average_fps_from_header: 29.97003
254+
pixel_aspect_ratio: 1
164255
duration_seconds: 13.013
165256
begin_stream_seconds: 0.0
166257
end_stream_seconds: 13.013

test/test_samplers.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,9 @@ def restore_metadata():
592592
with restore_metadata():
593593
decoder.metadata.end_stream_seconds_from_content = None
594594
decoder.metadata.duration_seconds_from_header = None
595+
decoder.metadata.num_frames_from_header = (
596+
None # Set to none to prevent fallback calculation
597+
)
595598
with pytest.raises(
596599
ValueError, match="Could not infer stream end from video metadata"
597600
):

0 commit comments

Comments
 (0)