@@ -275,10 +275,9 @@ class VideoDecoder {
   void resetDecodeStats();
 
  private:
-  explicit VideoDecoder(const std::string& videoFilePath, SeekMode seekMode);
-  explicit VideoDecoder(const void* buffer, size_t length, SeekMode seekMode);
-  torch::Tensor maybePermuteHWC2CHW(int streamIndex, torch::Tensor& hwcTensor);
-
+  // --------------------------------------------------------------------------
+  // STREAMINFO AND ASSOCIATED STRUCTS
+  // --------------------------------------------------------------------------
   struct FrameInfo {
     int64_t pts = 0;
     // The value of this default is important: the last frame's nextPts will be
@@ -305,73 +304,117 @@ class VideoDecoder {
     bool operator!=(const DecodedFrameContext&);
   };
 
-  // Stores information for each stream.
   struct StreamInfo {
     int streamIndex = -1;
     AVStream* stream = nullptr;
     AVRational timeBase = {};
     UniqueAVCodecContext codecContext;
-    // The current position of the cursor in the stream.
+
+    // The FrameInfo indices we built when scanFileAndUpdateMetadataAndIndex was
+    // called.
+    std::vector<FrameInfo> keyFrames;
+    std::vector<FrameInfo> allFrames;
+
+    // The current position of the cursor in the stream, and associated frame
+    // duration.
     int64_t currentPts = 0;
     int64_t currentDuration = 0;
     // The desired position of the cursor in the stream. We send frames >=
     // this pts to the user when they request a frame.
-    // We update this field if the user requested a seek.
+    // We update this field if the user requested a seek. This typically
+    // corresponds to the decoder's desiredPts_ attribute.
    int64_t discardFramesBeforePts = INT64_MIN;
     VideoStreamOptions videoStreamOptions;
-    // The filter state associated with this stream (for video streams). The
-    // actual graph will be nullptr for inactive streams.
-    FilterGraphContext filterGraphContext;
+
+    // Color-conversion fields. Only one of FilterGraphContext and
+    // UniqueSwsContext should be non-null.
     ColorConversionLibrary colorConversionLibrary = FILTERGRAPH;
-    std::vector<FrameInfo> keyFrames;
-    std::vector<FrameInfo> allFrames;
-    DecodedFrameContext prevFrameContext;
+    FilterGraphContext filterGraphContext;
     UniqueSwsContext swsContext;
+
+    // Used to know whether a new FilterGraphContext or UniqueSwsContext should
+    // be created before decoding a new frame.
+    DecodedFrameContext prevFrameContext;
   };
 
-  // Returns the key frame index of the presentation timestamp using FFMPEG's
-  // index. Note that this index may be truncated for some files.
-  int getKeyFrameIndexForPtsUsingEncoderIndex(AVStream* stream, int64_t pts)
-      const;
-  // Returns the key frame index of the presentation timestamp using our index.
-  // We build this index by scanning the file in buildKeyFrameIndex().
-  int getKeyFrameIndexForPtsUsingScannedIndex(
-      const std::vector<VideoDecoder::FrameInfo>& keyFrames,
-      int64_t pts) const;
-  int getKeyFrameIndexForPts(const StreamInfo& stream, int64_t pts) const;
+  // --------------------------------------------------------------------------
+  // CONSTRUCTORS AND INITIALIZERS
+  // --------------------------------------------------------------------------
+  // Don't use these; use the static methods to create a decoder object.
+
+  explicit VideoDecoder(const std::string& videoFilePath, SeekMode seekMode);
+  explicit VideoDecoder(const void* buffer, size_t length, SeekMode seekMode);
+  void initializeDecoder();
+  void updateMetadataWithCodecContext(
+      int streamIndex,
+      AVCodecContext* codecContext);
+
+  // --------------------------------------------------------------------------
+  // DECODING APIS AND RELATED UTILS
+  // --------------------------------------------------------------------------
+
   bool canWeAvoidSeekingForStream(
       const StreamInfo& stream,
       int64_t currentPts,
       int64_t targetPts) const;
-  // Returns the "best" stream index for a given media type. The "best" is
-  // determined by various heuristics in FFMPEG.
-  // See
-  // https://ffmpeg.org/doxygen/trunk/group__lavf__decoding.html#ga757780d38f482deb4d809c6c521fbcc2
-  // for more details about the heuristics.
-  int getBestStreamIndex(AVMediaType mediaType);
-  void initializeDecoder();
-  void validateUserProvidedStreamIndex(int streamIndex);
-  void validateScannedAllStreams(const std::string& msg);
-  void validateFrameIndex(
-      const StreamMetadata& streamMetadata,
-      int64_t frameIndex);
 
-  // Creates and initializes a filter graph for a stream. The filter graph can
-  // do rescaling and color conversion.
+  void maybeSeekToBeforeDesiredPts();
+
+  AVFrameStream getAVFrameUsingFilterFunction(
+      std::function<bool(int, AVFrame*)>);
+
+  FrameOutput getNextFrameNoDemuxInternal(
+      std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
+
+  torch::Tensor maybePermuteHWC2CHW(int streamIndex, torch::Tensor& hwcTensor);
+
+  FrameOutput convertAVFrameToFrameOutput(
+      AVFrameStream& avFrameStream,
+      std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
+
+  void convertAVFrameToFrameOutputOnCPU(
+      AVFrameStream& avFrameStream,
+      FrameOutput& frameOutput,
+      std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
+
+  torch::Tensor convertAVFrameToTensorUsingFilterGraph(
+      int streamIndex,
+      const AVFrame* avFrame);
+
+  int convertAVFrameToTensorUsingSwsScale(
+      int streamIndex,
+      const AVFrame* avFrame,
+      torch::Tensor& outputTensor);
+
+  // --------------------------------------------------------------------------
+  // COLOR CONVERSION LIBRARIES HANDLERS CREATION
+  // --------------------------------------------------------------------------
+
   void createFilterGraph(
       StreamInfo& streamInfo,
       int expectedOutputHeight,
       int expectedOutputWidth);
 
-  int64_t getNumFrames(const StreamMetadata& streamMetadata);
+  void createSwsContext(
+      StreamInfo& streamInfo,
+      const DecodedFrameContext& frameContext,
+      const enum AVColorSpace colorspace);
 
-  int64_t getPts(
-      const StreamInfo& streamInfo,
-      const StreamMetadata& streamMetadata,
-      int64_t frameIndex);
+  // --------------------------------------------------------------------------
+  // PTS <-> INDEX CONVERSIONS
+  // --------------------------------------------------------------------------
 
-  double getMinSeconds(const StreamMetadata& streamMetadata);
-  double getMaxSeconds(const StreamMetadata& streamMetadata);
+  int getKeyFrameIndexForPts(const StreamInfo& stream, int64_t pts) const;
+
+  // Returns the key frame index of the presentation timestamp using our index.
+  // We build this index by scanning the file in
+  // scanFileAndUpdateMetadataAndIndex
+  int getKeyFrameIndexForPtsUsingScannedIndex(
+      const std::vector<VideoDecoder::FrameInfo>& keyFrames,
+      int64_t pts) const;
+  // Returns the key frame index of pts from FFmpeg's index, which may be truncated.
+  int getKeyFrameIndexForPtsUsingEncoderIndex(AVStream* stream, int64_t pts)
+      const;
 
   int64_t secondsToIndexLowerBound(
       double seconds,
@@ -383,39 +426,39 @@
       const StreamInfo& streamInfo,
       const StreamMetadata& streamMetadata);
 
-  void createSwsContext(
-      StreamInfo& streamInfo,
-      const DecodedFrameContext& frameContext,
-      const enum AVColorSpace colorspace);
+  int64_t getPts(
+      const StreamInfo& streamInfo,
+      const StreamMetadata& streamMetadata,
+      int64_t frameIndex);
 
-  void maybeSeekToBeforeDesiredPts();
+  // --------------------------------------------------------------------------
+  // STREAM AND METADATA APIS
+  // --------------------------------------------------------------------------
 
-  AVFrameStream getAVFrameUsingFilterFunction(
-      std::function<bool(int, AVFrame*)>);
-  // Once we create a decoder can update the metadata with the codec context.
-  // For example, for video streams, we can add the height and width of the
-  // decoded stream.
-  void updateMetadataWithCodecContext(
-      int streamIndex,
-      AVCodecContext* codecContext);
-  void populateVideoMetadataFromStreamIndex(int streamIndex);
-  torch::Tensor convertAVFrameToTensorUsingFilterGraph(
-      int streamIndex,
-      const AVFrame* avFrame);
-  int convertAVFrameToTensorUsingSwsScale(
-      int streamIndex,
-      const AVFrame* avFrame,
-      torch::Tensor& outputTensor);
-  FrameOutput convertAVFrameToFrameOutput(
-      AVFrameStream& avFrameStream,
-      std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
-  void convertAVFrameToFrameOutputOnCPU(
-      AVFrameStream& avFrameStream,
-      FrameOutput& frameOutput,
-      std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
+  // Returns the "best" stream index for a given media type. The "best" is
+  // determined by various heuristics in FFMPEG.
+  // See
+  // https://ffmpeg.org/doxygen/trunk/group__lavf__decoding.html#ga757780d38f482deb4d809c6c521fbcc2
+  // for more details about the heuristics.
+  int getBestStreamIndex(AVMediaType mediaType);
 
-  FrameOutput getNextFrameNoDemuxInternal(
-      std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
+  int64_t getNumFrames(const StreamMetadata& streamMetadata);
+  double getMinSeconds(const StreamMetadata& streamMetadata);
+  double getMaxSeconds(const StreamMetadata& streamMetadata);
+
+  // --------------------------------------------------------------------------
+  // VALIDATION UTILS
+  // --------------------------------------------------------------------------
+
+  void validateUserProvidedStreamIndex(int streamIndex);
+  void validateScannedAllStreams(const std::string& msg);
+  void validateFrameIndex(
+      const StreamMetadata& streamMetadata,
+      int64_t frameIndex);
+
+  // --------------------------------------------------------------------------
+  // ATTRIBUTES
+  // --------------------------------------------------------------------------
 
   SeekMode seekMode_;
   ContainerMetadata containerMetadata_;
@@ -427,7 +470,6 @@
   // Set when the user wants to seek and stores the desired pts that the user
   // wants to seek to.
   std::optional<double> desiredPtsSeconds_;
-
   // Stores various internal decoding stats.
   DecodeStats decodeStats_;
   // Stores the AVIOContext for the input buffer.
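
Note on the reorganized StreamInfo struct: prevFrameContext, together with DecodedFrameContext::operator!=, is what lets the decoder keep its swscale context or filter graph alive across frames and rebuild it only when the decoded frame's properties change. The standalone sketch below illustrates that caching pattern; the types, fields, and function names here are simplified stand-ins for illustration, not the decoder's actual members.

#include <cstdio>

// Simplified stand-in for the decoder's DecodedFrameContext: just enough
// fields to detect when conversion state must be rebuilt.
struct DecodedFrameContext {
  int width = 0;
  int height = 0;
  int pixelFormat = 0;
  bool operator!=(const DecodedFrameContext& other) const {
    return width != other.width || height != other.height ||
        pixelFormat != other.pixelFormat;
  }
};

// Simplified stand-in for StreamInfo: only the caching-related members.
struct StreamInfo {
  bool hasColorConversionState = false;
  DecodedFrameContext prevFrameContext;
};

// Re-create the (expensive) color-conversion state only when the incoming
// frame's properties differ from the previous frame's.
void maybeRecreateColorConversionState(
    StreamInfo& streamInfo,
    const DecodedFrameContext& current) {
  if (!streamInfo.hasColorConversionState ||
      current != streamInfo.prevFrameContext) {
    std::printf(
        "(re)creating conversion state for %dx%d\n",
        current.width,
        current.height);
    streamInfo.hasColorConversionState = true;
    streamInfo.prevFrameContext = current;
  }
}

int main() {
  StreamInfo streamInfo;
  maybeRecreateColorConversionState(streamInfo, {1920, 1080, 0});  // creates state
  maybeRecreateColorConversionState(streamInfo, {1920, 1080, 0});  // cached, no-op
  maybeRecreateColorConversionState(streamInfo, {1280, 720, 0});   // re-creates state
  return 0;
}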
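
Note on getBestStreamIndex: the FFmpeg documentation linked in the comment describes av_find_best_stream(), which applies those heuristics. Below is a minimal usage sketch, assuming an already-opened AVFormatContext; it is illustrative only and not taken from this decoder's implementation.

extern "C" {
#include <libavformat/avformat.h>
}
#include <cstdio>

// Ask FFmpeg to pick the "best" video stream of an already-opened input.
// Returns the chosen stream index, or a negative AVERROR code if none fits.
int findBestVideoStreamIndex(AVFormatContext* formatContext) {
  int streamIndex = av_find_best_stream(
      formatContext,
      AVMEDIA_TYPE_VIDEO,
      /*wanted_stream_nb=*/-1,
      /*related_stream=*/-1,
      /*decoder_ret=*/nullptr,
      /*flags=*/0);
  if (streamIndex < 0) {
    std::fprintf(stderr, "No suitable video stream found\n");
  }
  return streamIndex;
}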