@@ -74,6 +74,7 @@ class VideoDecoder {
7474 // Updates the metadata of the video to accurate values obtained by scanning
7575 // the contents of the video file.
7676 void scanFileAndUpdateMetadataAndIndex ();
77+
7778 struct StreamMetadata {
7879 // Common (video and audio) fields derived from the AVStream.
7980 int streamIndex;
@@ -103,6 +104,7 @@ class VideoDecoder {
103104 std::optional<int64_t > width;
104105 std::optional<int64_t > height;
105106 };
107+
106108 struct ContainerMetadata {
107109 std::vector<StreamMetadata> allStreamMetadata;
108110 int numAudioStreams = 0 ;
@@ -117,6 +119,7 @@ class VideoDecoder {
117119 // If set, this is the index to the default video stream.
118120 std::optional<int > bestVideoStreamIndex;
119121 };
122+
120123 // Returns the metadata for the container.
121124 ContainerMetadata getContainerMetadata () const ;
122125
@@ -130,8 +133,10 @@ class VideoDecoder {
130133 // Use the libswscale library for color conversion.
131134 SWSCALE
132135 };
136+
133137 struct VideoStreamOptions {
134138 VideoStreamOptions () {}
139+
135140 explicit VideoStreamOptions (const std::string& optionsString);
136141 // Number of threads we pass to FFMPEG for decoding.
137142 // 0 means FFMPEG will choose the number of threads automatically to fully
@@ -149,7 +154,9 @@ class VideoDecoder {
149154 // By default we use CPU for decoding for both C++ and python users.
150155 torch::Device device = torch::kCPU ;
151156 };
157+
// Options for audio streams. Intentionally empty: no audio-specific options
// are defined in this struct.
struct AudioStreamOptions {};
159+
153160 void addVideoStreamDecoder (
154161 int streamIndex,
155162 const VideoStreamOptions& videoStreamOptions = VideoStreamOptions());
@@ -164,6 +171,7 @@ class VideoDecoder {
164171 // Calling getNextFrameNoDemuxInternal() will return the first frame at
165172 // or after this position.
166173 void setCursorPtsInSeconds (double seconds);
174+
167175 // This structure ensures we always keep the streamIndex and AVFrame together
168176 // Note that AVFrame itself doesn't retain the streamIndex.
169177 struct RawDecodedOutput {
@@ -172,6 +180,7 @@ class VideoDecoder {
172180 // The stream index of the decoded frame.
173181 int streamIndex;
174182 };
183+
175184 struct DecodedOutput {
176185 // The actual decoded output as a Tensor.
177186 torch::Tensor frame;
@@ -183,11 +192,13 @@ class VideoDecoder {
183192 // The duration of the decoded frame in seconds.
184193 double durationSeconds;
185194 };
195+
// Exception type named for the end-of-file condition. It derives from
// std::runtime_error, so it can be caught either by its own type or as a
// std::runtime_error / std::exception. Where it is thrown is not visible in
// this excerpt.
class EndOfFileException : public std::runtime_error {
 public:
  // Forwards `msg` unchanged to std::runtime_error; it is later available
  // through what(). `explicit` prevents implicit conversion from a string.
  explicit EndOfFileException(const std::string& msg)
      : std::runtime_error(msg) {}
};
201+
191202 // Decodes the frame where the current cursor position is. It also advances
192203 // the cursor to the next frame.
193204 DecodedOutput getNextFrameNoDemux ();
@@ -207,6 +218,7 @@ class VideoDecoder {
207218 int streamIndex,
208219 int64_t frameIndex,
209220 std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt );
221+
210222 struct BatchDecodedOutput {
211223 torch::Tensor frames;
212224 torch::Tensor ptsSeconds;
@@ -257,6 +269,7 @@ class VideoDecoder {
257269 int streamIndex,
258270 double startSeconds,
259271 double stopSeconds);
272+
260273 // --------------------------------------------------------------------------
261274 // DECODER PERFORMANCE STATISTICS API
262275 // --------------------------------------------------------------------------
@@ -271,6 +284,7 @@ class VideoDecoder {
271284 int64_t numFramesReceivedByDecoder = 0 ;
272285 int64_t numFlushes = 0 ;
273286 };
287+
274288 DecodeStats getDecodeStats () const ;
275289 void resetDecodeStats ();
276290
@@ -286,11 +300,13 @@ class VideoDecoder {
286300 // done during pts -> index conversions.)
287301 int64_t nextPts = INT64_MAX;
288302 };
303+
289304 struct FilterState {
290305 UniqueAVFilterGraph filterGraph;
291306 AVFilterContext* sourceContext = nullptr ;
292307 AVFilterContext* sinkContext = nullptr ;
293308 };
309+
294310 struct DecodedFrameContext {
295311 int decodedWidth;
296312 int decodedHeight;
@@ -300,6 +316,7 @@ class VideoDecoder {
300316 bool operator ==(const DecodedFrameContext&);
301317 bool operator !=(const DecodedFrameContext&);
302318 };
319+
303320 // Stores information for each stream.
304321 struct StreamInfo {
305322 int streamIndex = -1 ;
@@ -323,6 +340,7 @@ class VideoDecoder {
323340 DecodedFrameContext prevFrameContext;
324341 UniqueSwsContext swsContext;
325342 };
343+
326344 // Returns the key frame index of the presentation timestamp using FFMPEG's
327345 // index. Note that this index may be truncated for some files.
328346 int getKeyFrameIndexForPtsUsingEncoderIndex (AVStream* stream, int64_t pts)
@@ -480,6 +498,7 @@ class VideoDecoder {
// Pair of frame dimensions.
struct FrameDims {
  int height;
  int width;

  // Note the argument order: height first, then width.
  FrameDims(int h, int w) : height(h), width(w) {}
};
485504
0 commit comments