2025-06-25 nightly release (d6ce570)

pytorchbot · pytorchbot · commit e8c57049f4d2 · 2025-06-25T11:36:38.000Z
diff --git a/src/torchcodec/_core/Metadata.h b/src/torchcodec/_core/Metadata.h
@@ -13,6 +13,7 @@
 extern "C" {
 #include <libavcodec/avcodec.h>
 #include <libavutil/avutil.h>
+#include <libavutil/rational.h>
 }
 
 namespace facebook::torchcodec {
@@ -45,6 +46,7 @@ struct StreamMetadata {
   // Video-only fields derived from the AVCodecContext.
   std::optional<int64_t> width;
   std::optional<int64_t> height;
+  std::optional<AVRational> sampleAspectRatio;
 
   // Audio-only fields
   std::optional<int64_t> sampleRate;
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -454,6 +454,8 @@ void SingleStreamDecoder::addVideoStream(
 
   streamMetadata.width = streamInfo.codecContext->width;
   streamMetadata.height = streamInfo.codecContext->height;
+  streamMetadata.sampleAspectRatio =
+      streamInfo.codecContext->sample_aspect_ratio;
 }
 
 void SingleStreamDecoder::addAudioStream(
diff --git a/src/torchcodec/_core/_metadata.py b/src/torchcodec/_core/_metadata.py
@@ -8,6 +8,7 @@
 import json
 import pathlib
 from dataclasses import dataclass
+from fractions import Fraction
 from typing import List, Optional, Union
 
 import torch
@@ -80,22 +81,37 @@ class VideoStreamMetadata(StreamMetadata):
     average_fps_from_header: Optional[float]
     """Averate fps of the stream, obtained from the header (float or None).
     We recommend using the ``average_fps`` attribute instead."""
+    pixel_aspect_ratio: Optional[Fraction]
+    """Pixel Aspect Ratio (PAR), also known as Sample Aspect Ratio
+    (SAR --- not to be confused with Storage Aspect Ratio, also SAR),
+    is the ratio between the width and height of each pixel
+    (``fractions.Fraction`` or None)."""
 
     @property
     def duration_seconds(self) -> Optional[float]:
         """Duration of the stream in seconds. We try to calculate the duration
         from the actual frames if a :term:`scan` was performed. Otherwise we
-        fall back to ``duration_seconds_from_header``.
+        fall back to ``duration_seconds_from_header``. If that value is also None,
+        we instead calculate the duration from ``num_frames_from_header`` and
+        ``average_fps_from_header``.
         """
         if (
-            self.end_stream_seconds_from_content is None
-            or self.begin_stream_seconds_from_content is None
+            self.end_stream_seconds_from_content is not None
+            and self.begin_stream_seconds_from_content is not None
         ):
+            return (
+                self.end_stream_seconds_from_content
+                - self.begin_stream_seconds_from_content
+            )
+        elif self.duration_seconds_from_header is not None:
             return self.duration_seconds_from_header
-        return (
-            self.end_stream_seconds_from_content
-            - self.begin_stream_seconds_from_content
-        )
+        elif (
+            self.num_frames_from_header is not None
+            and self.average_fps_from_header is not None
+        ):
+            return self.num_frames_from_header / self.average_fps_from_header
+        else:
+            return None
 
     @property
     def begin_stream_seconds(self) -> float:
@@ -123,14 +139,22 @@ def end_stream_seconds(self) -> Optional[float]:
 
     @property
     def num_frames(self) -> Optional[int]:
-        """Number of frames in the stream. This corresponds to
-        ``num_frames_from_content`` if a :term:`scan` was made, otherwise it
-        corresponds to ``num_frames_from_header``.
+        """Number of frames in the stream (int or None).
+        This corresponds to ``num_frames_from_content`` if a :term:`scan` was made,
+        otherwise it corresponds to ``num_frames_from_header``. If that value is also
+        None, the number of frames is calculated from the duration and the average fps.
         """
         if self.num_frames_from_content is not None:
             return self.num_frames_from_content
-        else:
+        elif self.num_frames_from_header is not None:
             return self.num_frames_from_header
+        elif (
+            self.average_fps_from_header is not None
+            and self.duration_seconds_from_header is not None
+        ):
+            return int(self.average_fps_from_header * self.duration_seconds_from_header)
+        else:
+            return None
 
     @property
     def average_fps(self) -> Optional[float]:
@@ -211,6 +235,16 @@ def best_audio_stream(self) -> AudioStreamMetadata:
         return metadata
 
 
+def _get_optional_par_fraction(stream_dict):
+    try:
+        return Fraction(
+            stream_dict["sampleAspectRatioNum"],
+            stream_dict["sampleAspectRatioDen"],
+        )
+    except KeyError:
+        return None
+
+
 # TODO-AUDIO: This is user-facing. Should this just be `get_metadata`, without
 # the "container" name in it? Same below.
 def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata:
@@ -247,6 +281,7 @@ def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata:
                     num_frames_from_header=stream_dict.get("numFramesFromHeader"),
                     num_frames_from_content=stream_dict.get("numFramesFromContent"),
                     average_fps_from_header=stream_dict.get("averageFpsFromHeader"),
+                    pixel_aspect_ratio=_get_optional_par_fraction(stream_dict),
                     **common_meta,
                 )
             )
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
@@ -604,6 +604,12 @@ std::string get_stream_json_metadata(
   if (streamMetadata.height.has_value()) {
     map["height"] = std::to_string(*streamMetadata.height);
   }
+  if (streamMetadata.sampleAspectRatio.has_value()) {
+    map["sampleAspectRatioNum"] =
+        std::to_string((*streamMetadata.sampleAspectRatio).num);
+    map["sampleAspectRatioDen"] =
+        std::to_string((*streamMetadata.sampleAspectRatio).den);
+  }
   if (streamMetadata.averageFpsFromHeader.has_value()) {
     map["averageFpsFromHeader"] =
         std::to_string(*streamMetadata.averageFpsFromHeader);
diff --git a/test/test_decoders.py b/test/test_decoders.py
@@ -6,6 +6,8 @@
 
 import contextlib
 import gc
+import json
+from unittest.mock import patch
 
 import numpy
 import pytest
@@ -738,6 +740,56 @@ def test_get_frames_in_range(self, stream_index, device, seek_mode):
             empty_frames.duration_seconds, NASA_VIDEO.empty_duration_seconds
         )
 
+    @pytest.mark.parametrize("device", cpu_and_cuda())
+    @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
+    @patch("torchcodec._core._metadata._get_stream_json_metadata")
+    def test_get_frames_with_missing_num_frames_metadata(
+        self, mock_get_stream_json_metadata, device, seek_mode
+    ):
+        # Create a mock stream_dict to test that initializing VideoDecoder without
+        # num_frames_from_header and num_frames_from_content calculates num_frames
+        # using the average_fps and duration_seconds metadata.
+        mock_stream_dict = {
+            "averageFpsFromHeader": 29.97003,
+            "beginStreamSecondsFromContent": 0.0,
+            "beginStreamSecondsFromHeader": 0.0,
+            "bitRate": 128783.0,
+            "codec": "h264",
+            "durationSecondsFromHeader": 13.013,
+            "endStreamSecondsFromContent": 13.013,
+            "width": 480,
+            "height": 270,
+            "mediaType": "video",
+            "numFramesFromHeader": None,
+            "numFramesFromContent": None,
+        }
+        # Set the return value of the mock to be the mock_stream_dict
+        mock_get_stream_json_metadata.return_value = json.dumps(mock_stream_dict)
+
+        decoder = VideoDecoder(
+            NASA_VIDEO.path,
+            stream_index=3,
+            device=device,
+            seek_mode=seek_mode,
+        )
+
+        assert decoder.metadata.num_frames_from_header is None
+        assert decoder.metadata.num_frames_from_content is None
+        assert decoder.metadata.duration_seconds is not None
+        assert decoder.metadata.average_fps is not None
+        assert decoder.metadata.num_frames == int(
+            decoder.metadata.duration_seconds * decoder.metadata.average_fps
+        )
+        assert len(decoder) == 390
+
+        # Test get_frames_in_range Python logic which uses the num_frames metadata mocked earlier.
+        # The frame is read at the C++ level.
+        ref_frames9 = NASA_VIDEO.get_frame_data_by_range(
+            start=9, stop=10, stream_index=3
+        ).to(device)
+        frames9 = decoder.get_frames_in_range(start=9, stop=10)
+        assert_frames_equal(ref_frames9, frames9.data)
+
     @pytest.mark.parametrize("dimension_order", ["NCHW", "NHWC"])
     @pytest.mark.parametrize(
         "frame_getter",
diff --git a/test/test_metadata.py b/test/test_metadata.py
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import functools
+from fractions import Fraction
 
 import pytest
 
@@ -81,6 +82,7 @@ def test_get_metadata(metadata_getter):
     assert best_video_stream_metadata.begin_stream_seconds_from_header == 0
     assert best_video_stream_metadata.bit_rate == 128783
     assert best_video_stream_metadata.average_fps == pytest.approx(29.97, abs=0.001)
+    assert best_video_stream_metadata.pixel_aspect_ratio is None
     assert best_video_stream_metadata.codec == "h264"
     assert best_video_stream_metadata.num_frames_from_content == (
         390 if with_scan else None
@@ -119,7 +121,7 @@ def test_get_metadata_audio_file(metadata_getter):
 
 @pytest.mark.parametrize(
     "num_frames_from_header, num_frames_from_content, expected_num_frames",
-    [(None, 10, 10), (10, None, 10), (None, None, None)],
+    [(10, 20, 20), (None, 10, 10), (10, None, 10)],
 )
 def test_num_frames_fallback(
     num_frames_from_header, num_frames_from_content, expected_num_frames
@@ -137,12 +139,100 @@ def test_num_frames_fallback(
         width=123,
         height=321,
         average_fps_from_header=30,
+        pixel_aspect_ratio=Fraction(1, 1),
         stream_index=0,
     )
 
     assert metadata.num_frames == expected_num_frames
 
 
+@pytest.mark.parametrize(
+    "average_fps_from_header, duration_seconds_from_header, expected_num_frames",
+    [(60, 10, 600), (60, None, None), (None, 10, None), (None, None, None)],
+)
+def test_calculate_num_frames_using_fps_and_duration(
+    average_fps_from_header, duration_seconds_from_header, expected_num_frames
+):
+    """Check that if num_frames_from_content and num_frames_from_header are missing,
+    `.num_frames` is calculated using average_fps_from_header and duration_seconds_from_header
+    """
+    metadata = VideoStreamMetadata(
+        duration_seconds_from_header=duration_seconds_from_header,
+        bit_rate=123,
+        num_frames_from_header=None,  # None to test calculating num_frames
+        num_frames_from_content=None,  # None to test calculating num_frames
+        begin_stream_seconds_from_header=0,
+        begin_stream_seconds_from_content=0,
+        end_stream_seconds_from_content=4,
+        codec="whatever",
+        width=123,
+        height=321,
+        average_fps_from_header=average_fps_from_header,
+        stream_index=0,
+    )
+
+    assert metadata.num_frames == expected_num_frames
+
+
+@pytest.mark.parametrize(
+    "duration_seconds_from_header, begin_stream_seconds_from_content, end_stream_seconds_from_content, expected_duration_seconds",
+    [(60, 5, 20, 15), (60, 1, None, 60), (60, None, 1, 60), (None, 0, 10, 10)],
+)
+def test_duration_seconds_fallback(
+    duration_seconds_from_header,
+    begin_stream_seconds_from_content,
+    end_stream_seconds_from_content,
+    expected_duration_seconds,
+):
+    """Check that using begin_stream_seconds_from_content and end_stream_seconds_from_content to calculate `.duration_seconds`
+    has priority. If either value is missing, duration_seconds_from_header is used.
+    """
+    metadata = VideoStreamMetadata(
+        duration_seconds_from_header=duration_seconds_from_header,
+        bit_rate=123,
+        num_frames_from_header=5,
+        num_frames_from_content=10,
+        begin_stream_seconds_from_header=0,
+        begin_stream_seconds_from_content=begin_stream_seconds_from_content,
+        end_stream_seconds_from_content=end_stream_seconds_from_content,
+        codec="whatever",
+        width=123,
+        height=321,
+        average_fps_from_header=5,
+        stream_index=0,
+    )
+
+    assert metadata.duration_seconds == expected_duration_seconds
+
+
+@pytest.mark.parametrize(
+    "num_frames_from_header, average_fps_from_header, expected_duration_seconds",
+    [(100, 10, 10), (100, None, None), (None, 10, None), (None, None, None)],
+)
+def test_calculate_duration_seconds_using_fps_and_num_frames(
+    num_frames_from_header, average_fps_from_header, expected_duration_seconds
+):
+    """Check that duration_seconds is calculated using average_fps_from_header and num_frames_from_header
+    if duration_seconds_from_header is missing.
+    """
+    metadata = VideoStreamMetadata(
+        duration_seconds_from_header=None,  # None to test calculating duration_seconds
+        bit_rate=123,
+        num_frames_from_header=num_frames_from_header,
+        num_frames_from_content=10,
+        begin_stream_seconds_from_header=0,
+        begin_stream_seconds_from_content=None,  # None to test calculating duration_seconds
+        end_stream_seconds_from_content=None,  # None to test calculating duration_seconds
+        codec="whatever",
+        width=123,
+        height=321,
+        average_fps_from_header=average_fps_from_header,
+        stream_index=0,
+    )
+    assert metadata.duration_seconds_from_header is None
+    assert metadata.duration_seconds == expected_duration_seconds
+
+
 def test_repr():
     # Test for calls to print(), str(), etc. Useful to make sure we don't forget
     # to add additional @properties to __repr__
@@ -161,6 +251,7 @@ def test_repr():
   num_frames_from_header: 390
   num_frames_from_content: 390
   average_fps_from_header: 29.97003
+  pixel_aspect_ratio: 1
   duration_seconds: 13.013
   begin_stream_seconds: 0.0
   end_stream_seconds: 13.013
diff --git a/test/test_samplers.py b/test/test_samplers.py
@@ -592,6 +592,9 @@ def restore_metadata():
     with restore_metadata():
         decoder.metadata.end_stream_seconds_from_content = None
         decoder.metadata.duration_seconds_from_header = None
+        decoder.metadata.num_frames_from_header = (
+            None  # Set to None to prevent fallback calculation
+        )
         with pytest.raises(
             ValueError, match="Could not infer stream end from video metadata"
         ):

Original file line number	Diff line number	Diff line change
`@@ -454,6 +454,8 @@ void SingleStreamDecoder::addVideoStream(`
`454`	`454`
`455`	`455`	`streamMetadata.width = streamInfo.codecContext->width;`
`456`	`456`	`streamMetadata.height = streamInfo.codecContext->height;`
	`457`	`+ streamMetadata.sampleAspectRatio =`
	`458`	`+ streamInfo.codecContext->sample_aspect_ratio;`
`457`	`459`	`}`
`458`	`460`
`459`	`461`	`void SingleStreamDecoder::addAudioStream(`