VideoStreamMetadata.sample_aspect_ratio: new metadata field (#733)

carandraug · carandraug · commit c51a341a96bf · 2025-06-22T17:00:25.000+01:00
Add a sample_aspect_ratio field to VideoStreamMetadata that exposes the
stream's sample (pixel) aspect ratio.  Exposing this value is the
minimum needed to support videos with non-square pixels.
diff --git a/src/torchcodec/_core/Metadata.h b/src/torchcodec/_core/Metadata.h
@@ -8,6 +8,7 @@
 
 #include <optional>
 #include <string>
+#include <utility>
 #include <vector>
 
 extern "C" {
@@ -45,6 +46,7 @@ struct StreamMetadata {
   // Video-only fields derived from the AVCodecContext.
   std::optional<int64_t> width;
   std::optional<int64_t> height;
+  std::optional<std::pair<int, int>> sampleAspectRatio;
 
   // Audio-only fields
   std::optional<int64_t> sampleRate;
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -459,6 +459,9 @@ void SingleStreamDecoder::addVideoStream(
 
   streamMetadata.width = streamInfo.codecContext->width;
   streamMetadata.height = streamInfo.codecContext->height;
+  streamMetadata.sampleAspectRatio = {
+      streamInfo.codecContext->sample_aspect_ratio.num,
+      streamInfo.codecContext->sample_aspect_ratio.den};
 }
 
 void SingleStreamDecoder::addAudioStream(
diff --git a/src/torchcodec/_core/_metadata.py b/src/torchcodec/_core/_metadata.py
@@ -80,6 +80,12 @@ class VideoStreamMetadata(StreamMetadata):
     average_fps_from_header: Optional[float]
     """Averate fps of the stream, obtained from the header (float or None).
     We recommend using the ``average_fps`` attribute instead."""
+    sample_aspect_ratio: Optional[tuple[int, int]]
+    """Sample Aspect Ratio (SAR), also known as Pixel Aspect Ratio
+    (PAR), is the ratio of a pixel's width to its height.  This is a
+    tuple of two ints: the first element is the numerator and the
+    second is the denominator; it is None when the ratio is not
+    available.  Not to be confused with Storage Aspect Ratio (also SAR)."""
 
     @property
     def duration_seconds(self) -> Optional[float]:
@@ -211,6 +217,16 @@ def best_audio_stream(self) -> AudioStreamMetadata:
         return metadata
 
 
+def _get_optional_sar_tuple(stream_dict):
+    try:
+        return (
+            stream_dict["sampleAspectRatioNum"],
+            stream_dict["sampleAspectRatioDen"],
+        )
+    except KeyError:
+        return None
+
+
 # TODO-AUDIO: This is user-facing. Should this just be `get_metadata`, without
 # the "container" name in it? Same below.
 def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata:
@@ -247,6 +263,9 @@ def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata:
                     num_frames_from_header=stream_dict.get("numFramesFromHeader"),
                     num_frames_from_content=stream_dict.get("numFramesFromContent"),
                     average_fps_from_header=stream_dict.get("averageFpsFromHeader"),
+                    # sample_aspect_ratio is a tuple.  Return None,
+                    # and not (None, None), if missing.
+                    sample_aspect_ratio=_get_optional_sar_tuple(stream_dict),
                     **common_meta,
                 )
             )
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
@@ -601,6 +601,12 @@ std::string get_stream_json_metadata(
   if (streamMetadata.height.has_value()) {
     map["height"] = std::to_string(*streamMetadata.height);
   }
+  if (streamMetadata.sampleAspectRatio.has_value()) {
+    map["sampleAspectRatioNum"] =
+        std::to_string((*streamMetadata.sampleAspectRatio).first);
+    map["sampleAspectRatioDen"] =
+        std::to_string((*streamMetadata.sampleAspectRatio).second);
+  }
   if (streamMetadata.averageFpsFromHeader.has_value()) {
     map["averageFpsFromHeader"] =
         std::to_string(*streamMetadata.averageFpsFromHeader);
diff --git a/test/test_metadata.py b/test/test_metadata.py
@@ -81,6 +81,7 @@ def test_get_metadata(metadata_getter):
     assert best_video_stream_metadata.begin_stream_seconds_from_header == 0
     assert best_video_stream_metadata.bit_rate == 128783
     assert best_video_stream_metadata.average_fps == pytest.approx(29.97, abs=0.001)
+    assert best_video_stream_metadata.sample_aspect_ratio is None
     assert best_video_stream_metadata.codec == "h264"
     assert best_video_stream_metadata.num_frames_from_content == (
         390 if with_scan else None
@@ -137,6 +138,7 @@ def test_num_frames_fallback(
         width=123,
         height=321,
         average_fps_from_header=30,
+        sample_aspect_ratio=(1, 1),
         stream_index=0,
     )
 
@@ -161,6 +163,7 @@ def test_repr():
   num_frames_from_header: 390
   num_frames_from_content: 390
   average_fps_from_header: 29.97003
+  sample_aspect_ratio: (1, 1)
   duration_seconds: 13.013
   begin_stream_seconds: 0.0
   end_stream_seconds: 13.013

Original file line number	Diff line number	Diff line change
`@@ -459,6 +459,9 @@ void SingleStreamDecoder::addVideoStream(`
`459`	`459`
`460`	`460`	`streamMetadata.width = streamInfo.codecContext->width;`
`461`	`461`	`streamMetadata.height = streamInfo.codecContext->height;`
	`462`	`+ streamMetadata.sampleAspectRatio = {`
	`463`	`+ streamInfo.codecContext->sample_aspect_ratio.num,`
	`464`	`+ streamInfo.codecContext->sample_aspect_ratio.den};`
`462`	`465`	`}`
`463`	`466`
`464`	`467`	`void SingleStreamDecoder::addAudioStream(`