Skip to content

Commit 25f3002

Browse files
committed
Merge branch 'main' of github.com:pytorch/torchcodec into proper_resize_test
2 parents 1541ab8 + d63504c commit 25f3002

23 files changed

+863
-200
lines changed

packaging/build_ffmpeg.bat

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
:: Copyright (c) Meta Platforms, Inc. and affiliates.
2+
:: All rights reserved.
3+
::
4+
:: This source code is licensed under the BSD-style license found in the
5+
:: LICENSE file in the root directory of this source tree.
6+
17
:: Taken from torchaudio
28
@echo off
39

packaging/build_ffmpeg.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
#!/usr/bin/env bash
2+
# Copyright (c) Meta Platforms, Inc. and affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
27

38
# This is taken and adapated from torchaudio, only keeping the parts relevant to
49
# linux.

packaging/check_glibcxx.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
17
"""
28
The goal of this script is to ensure that the .so files we ship do not contain
39
symbol versions from libstdc++ that are too recent. This is a very manual way of

packaging/helpers.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
#!/usr/bin/env bash
2+
# Copyright (c) Meta Platforms, Inc. and affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
27

38
_list_wheel_files() {
49
unzip -l "$1" | awk '{print $4}'

packaging/post_build_script.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
#!/bin/bash
2+
# Copyright (c) Meta Platforms, Inc. and affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
27

38
set -ex
49

packaging/pre_build_script.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
#!/bin/bash
2+
# Copyright (c) Meta Platforms, Inc. and affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
27

38
set -ex
49

packaging/vc_env_helper.bat

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
:: Copyright (c) Meta Platforms, Inc. and affiliates.
2+
:: All rights reserved.
3+
::
4+
:: This source code is licensed under the BSD-style license found in the
5+
:: LICENSE file in the root directory of this source tree.
6+
17
:: Taken from torchaudio
28
@echo on
39

src/torchcodec/_core/BetaCudaDeviceInterface.cpp

Lines changed: 163 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
#include "src/torchcodec/_core/FFMPEGCommon.h"
1616
#include "src/torchcodec/_core/NVDECCache.h"
1717

18-
// #include <cuda_runtime.h> // For cudaStreamSynchronize
18+
#include "src/torchcodec/_core/NVCUVIDRuntimeLoader.h"
1919
#include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
2020
#include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"
2121

@@ -155,6 +155,7 @@ std::optional<cudaVideoCodec> validateCodecSupport(AVCodecID codecId) {
155155
bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) {
156156
// Return true iff the input video stream is supported by our NVDEC
157157
// implementation.
158+
158159
auto codecType = validateCodecSupport(codecContext->codec_id);
159160
if (!codecType.has_value()) {
160161
return false;
@@ -212,6 +213,12 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) {
212213
return true;
213214
}
214215

216+
// Callback for freeing CUDA memory associated with AVFrame see where it's used
217+
// for more details.
218+
void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) {
219+
cudaFree(opaque);
220+
}
221+
215222
} // namespace
216223

217224
BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
@@ -222,6 +229,8 @@ BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
222229

223230
initializeCudaContextWithPytorch(device_);
224231
nppCtx_ = getNppStreamContext(device_);
232+
233+
nvcuvidAvailable_ = loadNVCUVIDLibrary();
225234
}
226235

227236
BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
@@ -249,7 +258,7 @@ void BetaCudaDeviceInterface::initialize(
249258
const AVStream* avStream,
250259
const UniqueDecodingAVFormatContext& avFormatCtx,
251260
[[maybe_unused]] const SharedAVCodecContext& codecContext) {
252-
if (!nativeNVDECSupport(codecContext)) {
261+
if (!nvcuvidAvailable_ || !nativeNVDECSupport(codecContext)) {
253262
cpuFallback_ = createDeviceInterface(torch::kCPU);
254263
TORCH_CHECK(
255264
cpuFallback_ != nullptr, "Failed to create CPU device interface");
@@ -665,43 +674,176 @@ void BetaCudaDeviceInterface::flush() {
665674
std::swap(readyFrames_, emptyQueue);
666675
}
667676

677+
UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
678+
UniqueAVFrame& cpuFrame) {
679+
// This is called in the context of the CPU fallback: the frame was decoded on
680+
// the CPU, and in this function we convert that frame into NV12 format and
681+
// send it to the GPU.
682+
// We do that in 2 steps:
683+
// - First we convert the input CPU frame into an intermediate NV12 CPU frame
684+
// using sws_scale.
685+
// - Then we allocate GPU memory and copy the NV12 CPU frame to the GPU. This
686+
// is what we return
687+
688+
TORCH_CHECK(cpuFrame != nullptr, "CPU frame cannot be null");
689+
690+
int width = cpuFrame->width;
691+
int height = cpuFrame->height;
692+
693+
// intermediate NV12 CPU frame. It's not on the GPU yet.
694+
UniqueAVFrame nv12CpuFrame(av_frame_alloc());
695+
TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame");
696+
697+
nv12CpuFrame->format = AV_PIX_FMT_NV12;
698+
nv12CpuFrame->width = width;
699+
nv12CpuFrame->height = height;
700+
701+
int ret = av_frame_get_buffer(nv12CpuFrame.get(), 0);
702+
TORCH_CHECK(
703+
ret >= 0,
704+
"Failed to allocate NV12 CPU frame buffer: ",
705+
getFFMPEGErrorStringFromErrorCode(ret));
706+
707+
SwsFrameContext swsFrameContext(
708+
width,
709+
height,
710+
static_cast<AVPixelFormat>(cpuFrame->format),
711+
width,
712+
height);
713+
714+
if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
715+
swsContext_ = createSwsContext(
716+
swsFrameContext, cpuFrame->colorspace, AV_PIX_FMT_NV12, SWS_BILINEAR);
717+
prevSwsFrameContext_ = swsFrameContext;
718+
}
719+
720+
int convertedHeight = sws_scale(
721+
swsContext_.get(),
722+
cpuFrame->data,
723+
cpuFrame->linesize,
724+
0,
725+
height,
726+
nv12CpuFrame->data,
727+
nv12CpuFrame->linesize);
728+
TORCH_CHECK(
729+
convertedHeight == height, "sws_scale failed for CPU->NV12 conversion");
730+
731+
int ySize = width * height;
732+
TORCH_CHECK(
733+
ySize % 2 == 0,
734+
"Y plane size must be even. Please report on TorchCodec repo.");
735+
int uvSize = ySize / 2; // NV12: UV plane is half the size of Y plane
736+
size_t totalSize = static_cast<size_t>(ySize + uvSize);
737+
738+
uint8_t* cudaBuffer = nullptr;
739+
cudaError_t err =
740+
cudaMalloc(reinterpret_cast<void**>(&cudaBuffer), totalSize);
741+
TORCH_CHECK(
742+
err == cudaSuccess,
743+
"Failed to allocate CUDA memory: ",
744+
cudaGetErrorString(err));
745+
746+
UniqueAVFrame gpuFrame(av_frame_alloc());
747+
TORCH_CHECK(gpuFrame != nullptr, "Failed to allocate GPU AVFrame");
748+
749+
gpuFrame->format = AV_PIX_FMT_CUDA;
750+
gpuFrame->width = width;
751+
gpuFrame->height = height;
752+
gpuFrame->data[0] = cudaBuffer;
753+
gpuFrame->data[1] = cudaBuffer + ySize;
754+
gpuFrame->linesize[0] = width;
755+
gpuFrame->linesize[1] = width;
756+
757+
// Note that we use cudaMemcpy2D here instead of cudaMemcpy because the
758+
// linesizes (strides) may be different than the widths for the input CPU
759+
// frame. That's precisely what cudaMemcpy2D is for.
760+
err = cudaMemcpy2D(
761+
gpuFrame->data[0],
762+
gpuFrame->linesize[0],
763+
nv12CpuFrame->data[0],
764+
nv12CpuFrame->linesize[0],
765+
width,
766+
height,
767+
cudaMemcpyHostToDevice);
768+
TORCH_CHECK(
769+
err == cudaSuccess,
770+
"Failed to copy Y plane to GPU: ",
771+
cudaGetErrorString(err));
772+
773+
TORCH_CHECK(
774+
height % 2 == 0,
775+
"height must be even. Please report on TorchCodec repo.");
776+
err = cudaMemcpy2D(
777+
gpuFrame->data[1],
778+
gpuFrame->linesize[1],
779+
nv12CpuFrame->data[1],
780+
nv12CpuFrame->linesize[1],
781+
width,
782+
height / 2,
783+
cudaMemcpyHostToDevice);
784+
TORCH_CHECK(
785+
err == cudaSuccess,
786+
"Failed to copy UV plane to GPU: ",
787+
cudaGetErrorString(err));
788+
789+
ret = av_frame_copy_props(gpuFrame.get(), cpuFrame.get());
790+
TORCH_CHECK(
791+
ret >= 0,
792+
"Failed to copy frame properties: ",
793+
getFFMPEGErrorStringFromErrorCode(ret));
794+
795+
// We're almost done, but we need to make sure the CUDA memory is freed
796+
// properly. Usually, AVFrame data is freed when av_frame_free() is called
797+
// (upon UniqueAVFrame destruction), but since we allocated the CUDA memory
798+
// ourselves, FFmpeg doesn't know how to free it. The recommended way to deal
799+
// with this is to associate the opaque_ref field of the AVFrame with a `free`
800+
// callback that will then be called by av_frame_free().
801+
gpuFrame->opaque_ref = av_buffer_create(
802+
nullptr, // data - we don't need any
803+
0, // data size
804+
cudaBufferFreeCallback, // callback triggered by av_frame_free()
805+
cudaBuffer, // parameter to callback
806+
0); // flags
807+
TORCH_CHECK(
808+
gpuFrame->opaque_ref != nullptr,
809+
"Failed to create GPU memory cleanup reference");
810+
811+
return gpuFrame;
812+
}
813+
668814
void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
669815
UniqueAVFrame& avFrame,
670816
FrameOutput& frameOutput,
671817
std::optional<torch::Tensor> preAllocatedOutputTensor) {
672-
if (cpuFallback_) {
673-
// CPU decoded frame - need to do CPU color conversion then transfer to GPU
674-
FrameOutput cpuFrameOutput;
675-
cpuFallback_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);
676-
677-
// Transfer CPU frame to GPU
678-
if (preAllocatedOutputTensor.has_value()) {
679-
preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data);
680-
frameOutput.data = preAllocatedOutputTensor.value();
681-
} else {
682-
frameOutput.data = cpuFrameOutput.data.to(device_);
683-
}
684-
return;
685-
}
818+
UniqueAVFrame gpuFrame =
819+
cpuFallback_ ? transferCpuFrameToGpuNV12(avFrame) : std::move(avFrame);
686820

687821
// TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA
688822
// ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24().
689823
TORCH_CHECK(
690-
avFrame->format == AV_PIX_FMT_CUDA,
824+
gpuFrame->format == AV_PIX_FMT_CUDA,
691825
"Expected CUDA format frame from BETA CUDA interface");
692826

693-
validatePreAllocatedTensorShape(preAllocatedOutputTensor, avFrame);
827+
validatePreAllocatedTensorShape(preAllocatedOutputTensor, gpuFrame);
694828

695829
at::cuda::CUDAStream nvdecStream =
696830
at::cuda::getCurrentCUDAStream(device_.index());
697831

698832
frameOutput.data = convertNV12FrameToRGB(
699-
avFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
833+
gpuFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
700834
}
701835

702836
std::string BetaCudaDeviceInterface::getDetails() {
703-
return std::string("Beta CUDA Device Interface. Using ") +
704-
(cpuFallback_ ? "CPU fallback." : "NVDEC.");
837+
std::string details = "Beta CUDA Device Interface.";
838+
if (cpuFallback_) {
839+
details += " Using CPU fallback.";
840+
if (!nvcuvidAvailable_) {
841+
details += " NVCUVID not available!";
842+
}
843+
} else {
844+
details += " Using NVDEC.";
845+
}
846+
return details;
705847
}
706848

707849
} // namespace facebook::torchcodec

src/torchcodec/_core/BetaCudaDeviceInterface.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {
8181
unsigned int pitch,
8282
const CUVIDPARSERDISPINFO& dispInfo);
8383

84+
UniqueAVFrame transferCpuFrameToGpuNV12(UniqueAVFrame& cpuFrame);
85+
8486
CUvideoparser videoParser_ = nullptr;
8587
UniqueCUvideodecoder decoder_;
8688
CUVIDEOFORMAT videoFormat_ = {};
@@ -98,6 +100,9 @@ class BetaCudaDeviceInterface : public DeviceInterface {
98100
UniqueNppContext nppCtx_;
99101

100102
std::unique_ptr<DeviceInterface> cpuFallback_;
103+
bool nvcuvidAvailable_ = false;
104+
UniqueSwsContext swsContext_;
105+
SwsFrameContext prevSwsFrameContext_;
101106
};
102107

103108
} // namespace facebook::torchcodec

src/torchcodec/_core/CMakeLists.txt

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ function(make_torchcodec_libraries
9999
)
100100

101101
if(ENABLE_CUDA)
102-
list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp)
102+
list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp)
103103
endif()
104104

105105
set(core_library_dependencies
@@ -108,27 +108,9 @@ function(make_torchcodec_libraries
108108
)
109109

110110
if(ENABLE_CUDA)
111-
# Try to find NVCUVID. Try the normal way first. This should work locally.
112-
find_library(NVCUVID_LIBRARY NAMES nvcuvid)
113-
# If not found, try with version suffix, or hardcoded path. Appears
114-
# to be necessary on the CI.
115-
if(NOT NVCUVID_LIBRARY)
116-
find_library(NVCUVID_LIBRARY NAMES nvcuvid.1 PATHS /usr/lib64 /usr/lib)
117-
endif()
118-
if(NOT NVCUVID_LIBRARY)
119-
set(NVCUVID_LIBRARY "/usr/lib64/libnvcuvid.so.1")
120-
endif()
121-
122-
if(NVCUVID_LIBRARY)
123-
message(STATUS "Found NVCUVID: ${NVCUVID_LIBRARY}")
124-
else()
125-
message(FATAL_ERROR "Could not find NVCUVID library")
126-
endif()
127-
128111
list(APPEND core_library_dependencies
129112
${CUDA_nppi_LIBRARY}
130113
${CUDA_nppicc_LIBRARY}
131-
${NVCUVID_LIBRARY}
132114
)
133115
endif()
134116

0 commit comments

Comments
 (0)