
Commit af13ac5

Use AVFrame info for height and width in GPU APIs (#347)
1 parent c2bea4b · commit af13ac5

File tree

5 files changed: +10 −17 lines


src/torchcodec/decoders/_core/CPUOnlyDevice.cpp

Lines changed: 0 additions & 1 deletion
@@ -17,7 +17,6 @@ namespace facebook::torchcodec {
 void convertAVFrameToDecodedOutputOnCuda(
     const torch::Device& device,
     const VideoDecoder::VideoStreamDecoderOptions& options,
-    const VideoDecoder::StreamMetadata& metadata,
     VideoDecoder::RawDecodedOutput& rawOutput,
     VideoDecoder::DecodedOutput& output,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {

src/torchcodec/decoders/_core/CudaDevice.cpp

Lines changed: 4 additions & 8 deletions
@@ -187,7 +187,6 @@ void initializeContextOnCuda(
 void convertAVFrameToDecodedOutputOnCuda(
     const torch::Device& device,
     const VideoDecoder::VideoStreamDecoderOptions& options,
-    const VideoDecoder::StreamMetadata& metadata,
     VideoDecoder::RawDecodedOutput& rawOutput,
     VideoDecoder::DecodedOutput& output,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
@@ -197,11 +196,9 @@ void convertAVFrameToDecodedOutputOnCuda(
       src->format == AV_PIX_FMT_CUDA,
       "Expected format to be AV_PIX_FMT_CUDA, got " +
           std::string(av_get_pix_fmt_name((AVPixelFormat)src->format)));
-  auto frameDims = getHeightAndWidthFromOptionsOrMetadata(options, metadata);
+  auto frameDims = getHeightAndWidthFromOptionsOrAVFrame(options, *src);
   int height = frameDims.height;
   int width = frameDims.width;
-  NppiSize oSizeROI = {width, height};
-  Npp8u* input[2] = {src->data[0], src->data[1]};
   torch::Tensor& dst = output.frame;
   if (preAllocatedOutputTensor.has_value()) {
     dst = preAllocatedOutputTensor.value();
@@ -222,11 +219,10 @@ void convertAVFrameToDecodedOutputOnCuda(
   // Use the user-requested GPU for running the NPP kernel.
   c10::cuda::CUDAGuard deviceGuard(device);

-  auto start = std::chrono::high_resolution_clock::now();
+  NppiSize oSizeROI = {width, height};
+  Npp8u* input[2] = {src->data[0], src->data[1]};

-  // TODO height and width info of output tensor comes from the metadata, which
-  // may not be accurate. How do we make sure we won't corrupt memory if the
-  // allocated tensor is too short/large?
+  auto start = std::chrono::high_resolution_clock::now();
   NppStatus status = nppiNV12ToRGB_8u_P2C3R(
       input,
       src->linesize[0],
src/torchcodec/decoders/_core/DeviceInterface.h

Lines changed: 0 additions & 1 deletion
@@ -35,7 +35,6 @@ void initializeContextOnCuda(
 void convertAVFrameToDecodedOutputOnCuda(
     const torch::Device& device,
     const VideoDecoder::VideoStreamDecoderOptions& options,
-    const VideoDecoder::StreamMetadata& metadata,
     VideoDecoder::RawDecodedOutput& rawOutput,
     VideoDecoder::DecodedOutput& output,
     std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);

src/torchcodec/decoders/_core/VideoDecoder.cpp

Lines changed: 0 additions & 1 deletion
@@ -863,7 +863,6 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
     convertAVFrameToDecodedOutputOnCuda(
         streamInfo.options.device,
         streamInfo.options,
-        containerMetadata_.streams[streamIndex],
         rawOutput,
         output,
         preAllocatedOutputTensor);

src/torchcodec/decoders/_core/VideoDecoder.h

Lines changed: 6 additions & 6 deletions
@@ -440,16 +440,16 @@ class VideoDecoder {
 // AVFrame *before* it is resized. In theory, i.e. if there are no bugs within
 // our code or within FFmpeg code, this should be exactly the same as
 // getHeightAndWidthFromResizedAVFrame(). This is used by single-frame
-// decoding APIs, on CPU, with swscale.
+// decoding APIs, on CPU with swscale, and on GPU.
 // - getHeightAndWidthFromOptionsOrMetadata(). This is the height and width from
 //   the user-specified options if they exist, or the height and width from the
 //   stream metadata, which itself got its value from the CodecContext, when the
-//   stream was added. This is used by batch decoding APIs, or by GPU-APIs (both
-//   batch and single-frames).
+//   stream was added. This is used by batch decoding APIs, for both GPU and
+//   CPU.
 //
-// The source of truth for height and width really is the (resized) AVFrame:
-// it's the decoded output from FFmpeg. The info from the metadata (i.e. from
-// the CodecContext) may not be as accurate. However, the AVFrame is only
+// The source of truth for height and width really is the (resized) AVFrame: it
+// comes from the decoded output of FFmpeg. The info from the metadata (i.e.
+// from the CodecContext) may not be as accurate. However, the AVFrame is only
 // available late in the call stack, when the frame is decoded, while the
 // CodecContext is available early when a stream is added. This is why we use
 // the CodecContext for pre-allocating batched output tensors (we could
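To make the trade-off described in this comment concrete, here is a hedged sketch of the metadata-based counterpart, reusing the illustrative Options and FrameDims types from the sketch above; the StreamMetadata fields are assumptions for illustration.

// Sketch only: the metadata-based variant described above. StreamMetadata's
// fields are illustrative assumptions; reuses Options/FrameDims from the
// earlier sketch.
struct StreamMetadata {
  int height; // populated from the CodecContext when the stream is added
  int width;
};

// Usable early, before any frame is decoded, which is why it can drive the
// pre-allocation of batched output tensors, but it may disagree with the
// frames FFmpeg eventually produces.
FrameDims getHeightAndWidthFromOptionsOrMetadata(
    const Options& options,
    const StreamMetadata& metadata) {
  return FrameDims{
      options.height.value_or(metadata.height),
      options.width.value_or(metadata.width)};
}

After this commit, the GPU single-frame path no longer receives StreamMetadata at all and relies on the AVFrame-based variant instead, as the diffs above show.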
