Add sample_format to audio metadata

NicolasHug · NicolasHug · commit 58e527772bd9 · 2025-03-13T17:23:37.000Z
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -170,6 +170,16 @@ void VideoDecoder::initializeDecoder() {
       }
       containerMetadata_.numVideoStreams++;
     } else if (avStream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
+      AVSampleFormat format =
+          static_cast<AVSampleFormat>(avStream->codecpar->format);
+      // av_get_sample_fmt_name() returns nullptr when the sample format is
+      // not recognized. Constructing a std::string from nullptr is undefined
+      // behavior, so only set the field when a name is available; otherwise
+      // the optional is left empty.
+      const char* rawSampleFormat = av_get_sample_fmt_name(format);
+      if (rawSampleFormat != nullptr) {
+        streamMetadata.sampleFormat = std::string(rawSampleFormat);
+      }
       containerMetadata_.numAudioStreams++;
     }
 
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -81,6 +81,7 @@ class VideoDecoder {
     // Audio-only fields
     std::optional<int64_t> sampleRate;
     std::optional<int64_t> numChannels;
+    std::optional<std::string> sampleFormat;
   };
 
   struct ContainerMetadata {
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
@@ -495,12 +495,15 @@ std::string get_stream_json_metadata(
   if (streamMetadata.numChannels.has_value()) {
     map["numChannels"] = std::to_string(*streamMetadata.numChannels);
   }
+  if (streamMetadata.sampleFormat.has_value()) {
+    map["sampleFormat"] = quoteValue(streamMetadata.sampleFormat.value());
+  }
   if (streamMetadata.mediaType == AVMEDIA_TYPE_VIDEO) {
-    map["mediaType"] = "\"video\"";
+    map["mediaType"] = quoteValue("video");
   } else if (streamMetadata.mediaType == AVMEDIA_TYPE_AUDIO) {
-    map["mediaType"] = "\"audio\"";
+    map["mediaType"] = quoteValue("audio");
   } else {
-    map["mediaType"] = "\"other\"";
+    map["mediaType"] = quoteValue("other");
   }
   return mapToJson(map);
 }
diff --git a/src/torchcodec/decoders/_core/_metadata.py b/src/torchcodec/decoders/_core/_metadata.py
@@ -161,9 +161,9 @@ def __repr__(self):
 class AudioStreamMetadata(StreamMetadata):
     """Metadata of a single audio stream."""
 
-    # TODO-AUDIO Add sample format field
     sample_rate: Optional[int]
     num_channels: Optional[int]
+    sample_format: Optional[str]
 
     def __repr__(self):
         return super().__repr__()
@@ -240,6 +240,7 @@ def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata:
                 AudioStreamMetadata(
                     sample_rate=stream_dict.get("sampleRate"),
                     num_channels=stream_dict.get("numChannels"),
+                    sample_format=stream_dict.get("sampleFormat"),
                     **common_meta,
                 )
             )
diff --git a/test/decoders/test_decoders.py b/test/decoders/test_decoders.py
@@ -955,6 +955,7 @@ def test_metadata(self, asset):
         )
         assert decoder.metadata.sample_rate == asset.sample_rate
         assert decoder.metadata.num_channels == asset.num_channels
+        assert decoder.metadata.sample_format == asset.sample_format
 
     @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))
     def test_error(self, asset):
diff --git a/test/decoders/test_metadata.py b/test/decoders/test_metadata.py
@@ -90,6 +90,7 @@ def test_get_metadata(metadata_getter):
     )
     assert best_audio_stream_metadata.bit_rate == 128837
     assert best_audio_stream_metadata.codec == "aac"
+    assert best_audio_stream_metadata.sample_format == "fltp"
 
 
 @pytest.mark.parametrize(
@@ -109,6 +110,7 @@ def test_get_metadata_audio_file(metadata_getter):
     )
     assert best_audio_stream_metadata.bit_rate == 64000
     assert best_audio_stream_metadata.codec == "mp3"
+    assert best_audio_stream_metadata.sample_format == "fltp"
 
 
 @pytest.mark.parametrize(
diff --git a/test/utils.py b/test/utils.py
@@ -109,6 +109,7 @@ class TestAudioStreamInfo:
     num_channels: int
     duration_seconds: float
     num_frames: int
+    sample_format: str
 
 
 @dataclass
@@ -404,14 +405,22 @@ def duration_seconds(self) -> float:
     def num_frames(self) -> int:
         return self.stream_infos[self.default_stream_index].num_frames
 
+    @property
+    def sample_format(self) -> str:
+        return self.stream_infos[self.default_stream_index].sample_format
+
 
 NASA_AUDIO_MP3 = TestAudio(
     filename="nasa_13013.mp4.audio.mp3",
     default_stream_index=0,
     frames={},  # Automatically loaded from json file
     stream_infos={
         0: TestAudioStreamInfo(
-            sample_rate=8_000, num_channels=2, duration_seconds=13.248, num_frames=183
+            sample_rate=8_000,
+            num_channels=2,
+            duration_seconds=13.248,
+            num_frames=183,
+            sample_format="fltp",
         )
     },
 )
@@ -422,7 +431,11 @@ def num_frames(self) -> int:
     frames={},  # Automatically loaded from json file
     stream_infos={
         4: TestAudioStreamInfo(
-            sample_rate=16_000, num_channels=2, duration_seconds=13.056, num_frames=204
+            sample_rate=16_000,
+            num_channels=2,
+            duration_seconds=13.056,
+            num_frames=204,
+            sample_format="fltp",
         )
     },
 )

Original file line number	Diff line number	Diff line change
`@@ -170,6 +170,9 @@ void VideoDecoder::initializeDecoder() {`
`170`	`170`	`}`
`171`	`171`	`containerMetadata_.numVideoStreams++;`
`172`	`172`	`} else if (avStream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {`
	`173`	`+ AVSampleFormat format =`
	`174`	`+ static_cast<AVSampleFormat>(avStream->codecpar->format);`
	`175`	`+ streamMetadata.sampleFormat = av_get_sample_fmt_name(format);`
`173`	`176`	`containerMetadata_.numAudioStreams++;`
`174`	`177`	`}`
`175`	`178`
Original file line number	Diff line number	Diff line change
`@@ -495,12 +495,15 @@ std::string get_stream_json_metadata(`
`495`	`495`	`if (streamMetadata.numChannels.has_value()) {`
`496`	`496`	`map["numChannels"] = std::to_string(*streamMetadata.numChannels);`
`497`	`497`	`}`
	`498`	`+ if (streamMetadata.sampleFormat.has_value()) {`
	`499`	`+ map["sampleFormat"] = quoteValue(streamMetadata.sampleFormat.value());`
	`500`	`+ }`
`498`	`501`	`if (streamMetadata.mediaType == AVMEDIA_TYPE_VIDEO) {`
`499`		`- map["mediaType"] = "\"video\"";`
	`502`	`+ map["mediaType"] = quoteValue("video");`
`500`	`503`	`} else if (streamMetadata.mediaType == AVMEDIA_TYPE_AUDIO) {`
`501`		`- map["mediaType"] = "\"audio\"";`
	`504`	`+ map["mediaType"] = quoteValue("audio");`
`502`	`505`	`} else {`
`503`		`- map["mediaType"] = "\"other\"";`
	`506`	`+ map["mediaType"] = quoteValue("other");`
`504`	`507`	`}`
`505`	`508`	`return mapToJson(map);`
`506`	`509`	`}`
Original file line number	Diff line number	Diff line change
`@@ -955,6 +955,7 @@ def test_metadata(self, asset):`
`955`	`955`	`)`
`956`	`956`	`assert decoder.metadata.sample_rate == asset.sample_rate`
`957`	`957`	`assert decoder.metadata.num_channels == asset.num_channels`
	`958`	`+ assert decoder.metadata.sample_format == asset.sample_format`
`958`	`959`
`959`	`960`	`@pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3))`
`960`	`961`	`def test_error(self, asset):`
Original file line number	Diff line number	Diff line change
`@@ -90,6 +90,7 @@ def test_get_metadata(metadata_getter):`
`90`	`90`	`)`
`91`	`91`	`assert best_audio_stream_metadata.bit_rate == 128837`
`92`	`92`	`assert best_audio_stream_metadata.codec == "aac"`
	`93`	`+ assert best_audio_stream_metadata.sample_format == "fltp"`
`93`	`94`
`94`	`95`
`95`	`96`	`@pytest.mark.parametrize(`
`@@ -109,6 +110,7 @@ def test_get_metadata_audio_file(metadata_getter):`
`109`	`110`	`)`
`110`	`111`	`assert best_audio_stream_metadata.bit_rate == 64000`
`111`	`112`	`assert best_audio_stream_metadata.codec == "mp3"`
	`113`	`+ assert best_audio_stream_metadata.sample_format == "fltp"`
`112`	`114`
`113`	`115`
`114`	`116`	`@pytest.mark.parametrize(`