@@ -74,6 +74,7 @@ class VideoDecoder {
// Scans the contents of the video file and updates the metadata of the video
// to the accurate values obtained from that scan.
// NOTE(review): the name indicates the index is updated as well; the
// definition is not visible here -- confirm against the implementation.
void scanFileAndUpdateMetadataAndIndex();
77+
7778 struct StreamMetadata {
7879 // Common (video and audio) fields derived from the AVStream.
7980 int streamIndex;
@@ -103,6 +104,7 @@ class VideoDecoder {
103104 std::optional<int64_t > width;
104105 std::optional<int64_t > height;
105106 };
107+
106108 struct ContainerMetadata {
107109 std::vector<StreamMetadata> allStreamMetadata;
108110 int numAudioStreams = 0 ;
@@ -117,6 +119,7 @@ class VideoDecoder {
117119 // If set, this is the index to the default video stream.
118120 std::optional<int > bestVideoStreamIndex;
119121 };
122+
120123 // Returns the metadata for the container.
121124 ContainerMetadata getContainerMetadata () const ;
122125
@@ -130,8 +133,10 @@ class VideoDecoder {
130133 // Use the libswscale library for color conversion.
131134 SWSCALE
132135 };
136+
133137 struct VideoStreamOptions {
134138 VideoStreamOptions () {}
139+
135140 explicit VideoStreamOptions (const std::string& optionsString);
136141 // Number of threads we pass to FFMPEG for decoding.
137142 // 0 means FFMPEG will choose the number of threads automatically to fully
@@ -149,7 +154,9 @@ class VideoDecoder {
149154 // By default we use CPU for decoding for both C++ and python users.
150155 torch::Device device = torch::kCPU ;
151156 };
157+
// Options for an audio stream. The struct currently declares no fields.
struct AudioStreamOptions {};
159+
153160 void addVideoStreamDecoder (
154161 int streamIndex,
155162 const VideoStreamOptions& videoStreamOptions = VideoStreamOptions());
@@ -202,6 +209,7 @@ class VideoDecoder {
202209 explicit EndOfFileException (const std::string& msg)
203210 : std::runtime_error(msg) {}
204211 };
212+
205213 // Decodes the frame where the current cursor position is. It also advances
206214 // the cursor to the next frame.
207215 FrameOutput getNextFrameNoDemux ();
@@ -261,6 +269,7 @@ class VideoDecoder {
261269 int streamIndex,
262270 double startSeconds,
263271 double stopSeconds);
272+
// --------------------------------------------------------------------------
// DECODER PERFORMANCE STATISTICS API
// --------------------------------------------------------------------------
@@ -275,6 +284,7 @@ class VideoDecoder {
275284 int64_t numFramesReceivedByDecoder = 0 ;
276285 int64_t numFlushes = 0 ;
277286 };
287+
278288 DecodeStats getDecodeStats () const ;
// Resets the decode statistics.
// NOTE(review): presumably restores the counters to their initial values;
// the definition is not visible here -- confirm.
void resetDecodeStats();
280290
@@ -290,11 +300,13 @@ class VideoDecoder {
290300 // done during pts -> index conversions.)
291301 int64_t nextPts = INT64_MAX;
292302 };
303+
293304 struct FilterState {
294305 UniqueAVFilterGraph filterGraph;
295306 AVFilterContext* sourceContext = nullptr ;
296307 AVFilterContext* sinkContext = nullptr ;
297308 };
309+
298310 struct DecodedFrameContext {
299311 int decodedWidth;
300312 int decodedHeight;
@@ -304,6 +316,7 @@ class VideoDecoder {
304316 bool operator ==(const DecodedFrameContext&);
305317 bool operator !=(const DecodedFrameContext&);
306318 };
319+
307320 // Stores information for each stream.
308321 struct StreamInfo {
309322 int streamIndex = -1 ;
@@ -327,6 +340,7 @@ class VideoDecoder {
327340 DecodedFrameContext prevFrameContext;
328341 UniqueSwsContext swsContext;
329342 };
343+
330344 // Returns the key frame index of the presentation timestamp using FFMPEG's
331345 // index. Note that this index may be truncated for some files.
332346 int getKeyFrameIndexForPtsUsingEncoderIndex (AVStream* stream, int64_t pts)
@@ -484,6 +498,7 @@ class VideoDecoder {
// Height/width pair describing the dimensions of a frame.
struct FrameDims {
  int height;
  int width;

  // Note the argument order: height first, then width.
  FrameDims(int h, int w) : height(h), width(w) {}
};
489504
0 commit comments