Skip to content

Commit be1fb02

Browse files
committed
Reorganize private part of header
1 parent e233f06 commit be1fb02

File tree

1 file changed

+119
-75
lines changed

1 file changed

+119
-75
lines changed

src/torchcodec/decoders/_core/VideoDecoder.h

Lines changed: 119 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -275,10 +275,9 @@ class VideoDecoder {
275275
void resetDecodeStats();
276276

277277
private:
278-
explicit VideoDecoder(const std::string& videoFilePath, SeekMode seekMode);
279-
explicit VideoDecoder(const void* buffer, size_t length, SeekMode seekMode);
280-
torch::Tensor maybePermuteHWC2CHW(int streamIndex, torch::Tensor& hwcTensor);
281-
278+
// --------------------------------------------------------------------------
279+
// STREAMINFO AND ASSOCIATED STRUCTS
280+
// --------------------------------------------------------------------------
282281
struct FrameInfo {
283282
int64_t pts = 0;
284283
// The value of this default is important: the last frame's nextPts will be
@@ -305,73 +304,117 @@ class VideoDecoder {
305304
bool operator!=(const DecodedFrameContext&);
306305
};
307306

308-
// Stores information for each stream.
309307
struct StreamInfo {
310308
int streamIndex = -1;
311309
AVStream* stream = nullptr;
312310
AVRational timeBase = {};
313311
UniqueAVCodecContext codecContext;
314-
// The current position of the cursor in the stream.
312+
313+
// The FrameInfo indices we built when scanFileAndUpdateMetadataAndIndex was
314+
// called.
315+
std::vector<FrameInfo> keyFrames;
316+
std::vector<FrameInfo> allFrames;
317+
318+
// The current position of the cursor in the stream, and associated frame
319+
// duration.
315320
int64_t currentPts = 0;
316321
int64_t currentDuration = 0;
317322
// The desired position of the cursor in the stream. We send frames >=
318323
// this pts to the user when they request a frame.
319-
// We update this field if the user requested a seek.
324+
// We update this field if the user requested a seek. This typically
325+
// corresponds to the decoder's desiredPts_ attribute.
320326
int64_t discardFramesBeforePts = INT64_MIN;
321327
VideoStreamOptions videoStreamOptions;
322-
// The filter state associated with this stream (for video streams). The
323-
// actual graph will be nullptr for inactive streams.
324-
FilterGraphContext filterGraphContext;
328+
329+
// Color-conversion fields. Only one of FilterGraphContext and
330+
// UniqueSwsContext should be non-null.
325331
ColorConversionLibrary colorConversionLibrary = FILTERGRAPH;
326-
std::vector<FrameInfo> keyFrames;
327-
std::vector<FrameInfo> allFrames;
328-
DecodedFrameContext prevFrameContext;
332+
FilterGraphContext filterGraphContext;
329333
UniqueSwsContext swsContext;
334+
335+
// Used to know whether a new FilterGraphContext or UniqueSwsContext should
336+
// be created before decoding a new frame.
337+
DecodedFrameContext prevFrameContext;
330338
};
331339

332-
// Returns the key frame index of the presentation timestamp using FFMPEG's
333-
// index. Note that this index may be truncated for some files.
334-
int getKeyFrameIndexForPtsUsingEncoderIndex(AVStream* stream, int64_t pts)
335-
const;
336-
// Returns the key frame index of the presentation timestamp using our index.
337-
// We build this index by scanning the file in buildKeyFrameIndex().
338-
int getKeyFrameIndexForPtsUsingScannedIndex(
339-
const std::vector<VideoDecoder::FrameInfo>& keyFrames,
340-
int64_t pts) const;
341-
int getKeyFrameIndexForPts(const StreamInfo& stream, int64_t pts) const;
340+
// --------------------------------------------------------------------------
341+
// CONSTRUCTORS AND INITIALIZERS
342+
// --------------------------------------------------------------------------
343+
// Don't use these; use the static methods to create a decoder object.
344+
345+
explicit VideoDecoder(const std::string& videoFilePath, SeekMode seekMode);
346+
explicit VideoDecoder(const void* buffer, size_t length, SeekMode seekMode);
347+
void initializeDecoder();
348+
void updateMetadataWithCodecContext(
349+
int streamIndex,
350+
AVCodecContext* codecContext);
351+
352+
// --------------------------------------------------------------------------
353+
// DECODING APIS AND RELATED UTILS
354+
// --------------------------------------------------------------------------
355+
342356
bool canWeAvoidSeekingForStream(
343357
const StreamInfo& stream,
344358
int64_t currentPts,
345359
int64_t targetPts) const;
346-
// Returns the "best" stream index for a given media type. The "best" is
347-
// determined by various heuristics in FFMPEG.
348-
// See
349-
// https://ffmpeg.org/doxygen/trunk/group__lavf__decoding.html#ga757780d38f482deb4d809c6c521fbcc2
350-
// for more details about the heuristics.
351-
int getBestStreamIndex(AVMediaType mediaType);
352-
void initializeDecoder();
353-
void validateUserProvidedStreamIndex(int streamIndex);
354-
void validateScannedAllStreams(const std::string& msg);
355-
void validateFrameIndex(
356-
const StreamMetadata& streamMetadata,
357-
int64_t frameIndex);
358360

359-
// Creates and initializes a filter graph for a stream. The filter graph can
360-
// do rescaling and color conversion.
361+
void maybeSeekToBeforeDesiredPts();
362+
363+
AVFrameStream getAVFrameUsingFilterFunction(
364+
std::function<bool(int, AVFrame*)>);
365+
366+
FrameOutput getNextFrameNoDemuxInternal(
367+
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
368+
369+
torch::Tensor maybePermuteHWC2CHW(int streamIndex, torch::Tensor& hwcTensor);
370+
371+
FrameOutput convertAVFrameToFrameOutput(
372+
AVFrameStream& avFrameStream,
373+
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
374+
375+
void convertAVFrameToFrameOutputOnCPU(
376+
AVFrameStream& avFrameStream,
377+
FrameOutput& frameOutput,
378+
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
379+
380+
torch::Tensor convertAVFrameToTensorUsingFilterGraph(
381+
int streamIndex,
382+
const AVFrame* avFrame);
383+
384+
int convertAVFrameToTensorUsingSwsScale(
385+
int streamIndex,
386+
const AVFrame* avFrame,
387+
torch::Tensor& outputTensor);
388+
389+
// --------------------------------------------------------------------------
390+
// COLOR CONVERSION LIBRARIES HANDLERS CREATION
391+
// --------------------------------------------------------------------------
392+
361393
void createFilterGraph(
362394
StreamInfo& streamInfo,
363395
int expectedOutputHeight,
364396
int expectedOutputWidth);
365397

366-
int64_t getNumFrames(const StreamMetadata& streamMetadata);
398+
void createSwsContext(
399+
StreamInfo& streamInfo,
400+
const DecodedFrameContext& frameContext,
401+
const enum AVColorSpace colorspace);
367402

368-
int64_t getPts(
369-
const StreamInfo& streamInfo,
370-
const StreamMetadata& streamMetadata,
371-
int64_t frameIndex);
403+
// --------------------------------------------------------------------------
404+
// PTS <-> INDEX CONVERSIONS
405+
// --------------------------------------------------------------------------
372406

373-
double getMinSeconds(const StreamMetadata& streamMetadata);
374-
double getMaxSeconds(const StreamMetadata& streamMetadata);
407+
int getKeyFrameIndexForPts(const StreamInfo& stream, int64_t pts) const;
408+
409+
// Returns the key frame index of the presentation timestamp using our index.
410+
// We build this index by scanning the file in
411+
// scanFileAndUpdateMetadataAndIndex
412+
int getKeyFrameIndexForPtsUsingScannedIndex(
413+
const std::vector<VideoDecoder::FrameInfo>& keyFrames,
414+
int64_t pts) const;
415+
// Return key frame index, from FFmpeg. Potentially less accurate
416+
int getKeyFrameIndexForPtsUsingEncoderIndex(AVStream* stream, int64_t pts)
417+
const;
375418

376419
int64_t secondsToIndexLowerBound(
377420
double seconds,
@@ -383,39 +426,41 @@ class VideoDecoder {
383426
const StreamInfo& streamInfo,
384427
const StreamMetadata& streamMetadata);
385428

386-
void createSwsContext(
387-
StreamInfo& streamInfo,
388-
const DecodedFrameContext& frameContext,
389-
const enum AVColorSpace colorspace);
429+
int64_t getPts(
430+
const StreamInfo& streamInfo,
431+
const StreamMetadata& streamMetadata,
432+
int64_t frameIndex);
390433

391-
void maybeSeekToBeforeDesiredPts();
434+
// --------------------------------------------------------------------------
435+
// STREAM AND METADATA APIS
436+
// --------------------------------------------------------------------------
392437

393-
AVFrameStream getAVFrameUsingFilterFunction(
394-
std::function<bool(int, AVFrame*)>);
395-
// Once we create a decoder can update the metadata with the codec context.
396-
// For example, for video streams, we can add the height and width of the
397-
// decoded stream.
398-
void updateMetadataWithCodecContext(
399-
int streamIndex,
400-
AVCodecContext* codecContext);
401-
void populateVideoMetadataFromStreamIndex(int streamIndex);
402-
torch::Tensor convertAVFrameToTensorUsingFilterGraph(
403-
int streamIndex,
404-
const AVFrame* avFrame);
405-
int convertAVFrameToTensorUsingSwsScale(
406-
int streamIndex,
407-
const AVFrame* avFrame,
408-
torch::Tensor& outputTensor);
409-
FrameOutput convertAVFrameToFrameOutput(
410-
AVFrameStream& avFrameStream,
411-
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
412-
void convertAVFrameToFrameOutputOnCPU(
413-
AVFrameStream& avFrameStream,
414-
FrameOutput& frameOutput,
415-
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
438+
// Returns the "best" stream index for a given media type. The "best" is
439+
// determined by various heuristics in FFMPEG.
440+
// See
441+
// https://ffmpeg.org/doxygen/trunk/group__lavf__decoding.html#ga757780d38f482deb4d809c6c521fbcc2
442+
// for more details about the heuristics.
443+
// Returns the key frame index of the presentation timestamp using FFMPEG's
444+
// index. Note that this index may be truncated for some files.
445+
int getBestStreamIndex(AVMediaType mediaType);
416446

417-
FrameOutput getNextFrameNoDemuxInternal(
418-
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
447+
int64_t getNumFrames(const StreamMetadata& streamMetadata);
448+
double getMinSeconds(const StreamMetadata& streamMetadata);
449+
double getMaxSeconds(const StreamMetadata& streamMetadata);
450+
451+
// --------------------------------------------------------------------------
452+
// VALIDATION UTILS
453+
// --------------------------------------------------------------------------
454+
455+
void validateUserProvidedStreamIndex(int streamIndex);
456+
void validateScannedAllStreams(const std::string& msg);
457+
void validateFrameIndex(
458+
const StreamMetadata& streamMetadata,
459+
int64_t frameIndex);
460+
461+
// --------------------------------------------------------------------------
462+
// ATTRIBUTES
463+
// --------------------------------------------------------------------------
419464

420465
SeekMode seekMode_;
421466
ContainerMetadata containerMetadata_;
@@ -427,7 +472,6 @@ class VideoDecoder {
427472
// Set when the user wants to seek and stores the desired pts that the user
428473
// wants to seek to.
429474
std::optional<double> desiredPtsSeconds_;
430-
431475
// Stores various internal decoding stats.
432476
DecodeStats decodeStats_;
433477
// Stores the AVIOContext for the input buffer.

0 commit comments

Comments
 (0)