Skip to content

Commit 51b0a74

Browse files
authored
Merge branch 'meta-pytorch:main' into fallback-container-duration
2 parents 0539566 + b35005d commit 51b0a74

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

63 files changed

+1852
-1201
lines changed

CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
cmake_minimum_required(VERSION 3.18)
22
project(TorchCodec)
33

4+
# Define LINUX platform variable globally
5+
if (UNIX AND NOT APPLE)
6+
set(LINUX TRUE)
7+
else()
8+
set(LINUX FALSE)
9+
endif()
10+
411
add_subdirectory(src/torchcodec/_core)
512

613

src/torchcodec/__init__.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,24 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7+
from pathlib import Path
8+
79
# Note: usort wants to put Frame and FrameBatch after decoders and samplers,
810
# but that results in circular import.
911
from ._frame import AudioSamples, Frame, FrameBatch # usort:skip # noqa
10-
from . import decoders, samplers # noqa
12+
from . import decoders, encoders, samplers # noqa
1113

1214
try:
1315
# Note that version.py is generated during install.
1416
from .version import __version__ # noqa: F401
1517
except Exception:
1618
pass
19+
20+
# cmake_prefix_path is needed for downstream cmake-based builds that use
21+
# torchcodec as a dependency to tell cmake where torchcodec is installed and where to find its
22+
# CMake configuration files.
23+
# Pytorch itself has a similar mechanism which we use in our setup.py!
24+
cmake_prefix_path = Path(__file__).parent / "share" / "cmake"
25+
# Similarly, these are exposed for downstream builds that use torchcodec as a
26+
# dependency.
27+
from ._core import core_library_path, ffmpeg_major_version # usort:skip

src/torchcodec/_core/AVIOContextHolder.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
// This source code is licensed under the BSD-style license found in the
55
// LICENSE file in the root directory of this source tree.
66

7-
#include "src/torchcodec/_core/AVIOContextHolder.h"
7+
#include "AVIOContextHolder.h"
88
#include <torch/types.h>
99

1010
namespace facebook::torchcodec {

src/torchcodec/_core/AVIOContextHolder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
#pragma once
88

9-
#include "src/torchcodec/_core/FFMPEGCommon.h"
9+
#include "FFMPEGCommon.h"
1010

1111
namespace facebook::torchcodec {
1212

src/torchcodec/_core/AVIOFileLikeContext.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
// This source code is licensed under the BSD-style license found in the
55
// LICENSE file in the root directory of this source tree.
66

7-
#include "src/torchcodec/_core/AVIOFileLikeContext.h"
7+
#include "AVIOFileLikeContext.h"
88
#include <torch/types.h>
99

1010
namespace facebook::torchcodec {

src/torchcodec/_core/AVIOFileLikeContext.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
#include <pybind11/pybind11.h>
1010
#include <pybind11/stl.h>
1111

12-
#include "src/torchcodec/_core/AVIOContextHolder.h"
12+
#include "AVIOContextHolder.h"
1313

1414
namespace py = pybind11;
1515

src/torchcodec/_core/AVIOTensorContext.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
// This source code is licensed under the BSD-style license found in the
55
// LICENSE file in the root directory of this source tree.
66

7-
#include "src/torchcodec/_core/AVIOTensorContext.h"
7+
#include "AVIOTensorContext.h"
88
#include <torch/types.h>
99

1010
namespace facebook::torchcodec {

src/torchcodec/_core/AVIOTensorContext.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#pragma once
88

99
#include <torch/types.h>
10-
#include "src/torchcodec/_core/AVIOContextHolder.h"
10+
#include "AVIOContextHolder.h"
1111

1212
namespace facebook::torchcodec {
1313

src/torchcodec/_core/BetaCudaDeviceInterface.cpp

Lines changed: 155 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,15 @@
99
#include <mutex>
1010
#include <vector>
1111

12-
#include "src/torchcodec/_core/BetaCudaDeviceInterface.h"
12+
#include "BetaCudaDeviceInterface.h"
1313

14-
#include "src/torchcodec/_core/DeviceInterface.h"
15-
#include "src/torchcodec/_core/FFMPEGCommon.h"
16-
#include "src/torchcodec/_core/NVDECCache.h"
14+
#include "DeviceInterface.h"
15+
#include "FFMPEGCommon.h"
16+
#include "NVDECCache.h"
1717

18-
#include "src/torchcodec/_core/NVCUVIDRuntimeLoader.h"
19-
#include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
20-
#include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"
18+
#include "NVCUVIDRuntimeLoader.h"
19+
#include "nvcuvid_include/cuviddec.h"
20+
#include "nvcuvid_include/nvcuvid.h"
2121

2222
extern "C" {
2323
#include <libavutil/hwcontext_cuda.h>
@@ -213,6 +213,12 @@ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) {
213213
return true;
214214
}
215215

216+
// Callback for freeing CUDA memory associated with AVFrame see where it's used
217+
// for more details.
218+
void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) {
219+
cudaFree(opaque);
220+
}
221+
216222
} // namespace
217223

218224
BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
@@ -668,38 +674,163 @@ void BetaCudaDeviceInterface::flush() {
668674
std::swap(readyFrames_, emptyQueue);
669675
}
670676

677+
UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
    UniqueAVFrame& cpuFrame) {
  // This is called in the context of the CPU fallback: the frame was decoded on
  // the CPU, and in this function we convert that frame into NV12 format and
  // send it to the GPU.
  // We do that in 3 steps:
  // - First we convert the input CPU frame into an intermediate NV12 CPU frame
  //   using sws_scale.
  // - Then we allocate the output AVFrame and the GPU memory, and immediately
  //   tie the lifetime of that GPU memory to the output AVFrame, so that it is
  //   released on every exit path (including a failing TORCH_CHECK below).
  // - Finally we copy the NV12 CPU frame to the GPU. This is what we return.
  //
  // The returned frame has format AV_PIX_FMT_CUDA, with data[0] pointing to the
  // Y plane and data[1] to the interleaved UV plane. Both planes live in a
  // single tightly packed CUDA allocation (linesize == width).

  TORCH_CHECK(cpuFrame != nullptr, "CPU frame cannot be null");

  const int width = cpuFrame->width;
  const int height = cpuFrame->height;

  // Intermediate NV12 CPU frame. It's not on the GPU yet.
  UniqueAVFrame nv12CpuFrame(av_frame_alloc());
  TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame");

  nv12CpuFrame->format = AV_PIX_FMT_NV12;
  nv12CpuFrame->width = width;
  nv12CpuFrame->height = height;

  int ret = av_frame_get_buffer(nv12CpuFrame.get(), 0);
  TORCH_CHECK(
      ret >= 0,
      "Failed to allocate NV12 CPU frame buffer: ",
      getFFMPEGErrorStringFromErrorCode(ret));

  SwsFrameContext swsFrameContext(
      width,
      height,
      static_cast<AVPixelFormat>(cpuFrame->format),
      width,
      height);

  // The sws context is cached across calls and only rebuilt when the input
  // geometry / pixel format changes.
  if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
    swsContext_ = createSwsContext(
        swsFrameContext, cpuFrame->colorspace, AV_PIX_FMT_NV12, SWS_BILINEAR);
    prevSwsFrameContext_ = swsFrameContext;
  }

  int convertedHeight = sws_scale(
      swsContext_.get(),
      cpuFrame->data,
      cpuFrame->linesize,
      0,
      height,
      nv12CpuFrame->data,
      nv12CpuFrame->linesize);
  TORCH_CHECK(
      convertedHeight == height, "sws_scale failed for CPU->NV12 conversion");

  // Plane sizes are computed in size_t so that neither width * height nor the
  // total can overflow int. Dimensions are validated before any GPU memory is
  // allocated.
  // NOTE(review): an odd width with an even height passes these checks, but
  // the UV rows copied below are only `width` bytes wide - confirm whether odd
  // widths can reach this path.
  const size_t ySize =
      static_cast<size_t>(width) * static_cast<size_t>(height);
  TORCH_CHECK(
      ySize % 2 == 0,
      "Y plane size must be even. Please report on TorchCodec repo.");
  TORCH_CHECK(
      height % 2 == 0,
      "height must be even. Please report on TorchCodec repo.");
  const size_t uvSize = ySize / 2; // NV12: UV plane is half the size of Y plane
  const size_t totalSize = ySize + uvSize;

  UniqueAVFrame gpuFrame(av_frame_alloc());
  TORCH_CHECK(gpuFrame != nullptr, "Failed to allocate GPU AVFrame");

  gpuFrame->format = AV_PIX_FMT_CUDA;
  gpuFrame->width = width;
  gpuFrame->height = height;

  // Frame properties must be copied BEFORE we attach our own opaque_ref:
  // av_frame_copy_props() replaces the destination's opaque_ref with the
  // source's, which would release the CUDA buffer we attach below.
  ret = av_frame_copy_props(gpuFrame.get(), cpuFrame.get());
  TORCH_CHECK(
      ret >= 0,
      "Failed to copy frame properties: ",
      getFFMPEGErrorStringFromErrorCode(ret));
  // Drop any opaque_ref inherited from the CPU frame so that overwriting it
  // below does not leak a reference. This is a no-op when it is null.
  av_buffer_unref(&gpuFrame->opaque_ref);

  uint8_t* cudaBuffer = nullptr;
  cudaError_t err =
      cudaMalloc(reinterpret_cast<void**>(&cudaBuffer), totalSize);
  TORCH_CHECK(
      err == cudaSuccess,
      "Failed to allocate CUDA memory: ",
      cudaGetErrorString(err));

  // Usually, AVFrame data is freed when av_frame_free() is called (upon
  // UniqueAVFrame destruction), but since we allocated the CUDA memory
  // ourselves, FFmpeg doesn't know how to free it. The recommended way to deal
  // with this is to associate the opaque_ref field of the AVFrame with a `free`
  // callback that will then be called by av_frame_free(). We do this right
  // after the allocation so that the memory cannot leak if anything below
  // fails.
  gpuFrame->opaque_ref = av_buffer_create(
      nullptr, // data - we don't need any
      0, // data size
      cudaBufferFreeCallback, // callback triggered by av_frame_free()
      cudaBuffer, // parameter to callback
      0); // flags
  if (gpuFrame->opaque_ref == nullptr) {
    // Ownership was not transferred, so free the allocation ourselves before
    // reporting the failure.
    cudaFree(cudaBuffer);
  }
  TORCH_CHECK(
      gpuFrame->opaque_ref != nullptr,
      "Failed to create GPU memory cleanup reference");

  gpuFrame->data[0] = cudaBuffer;
  gpuFrame->data[1] = cudaBuffer + ySize;
  gpuFrame->linesize[0] = width;
  gpuFrame->linesize[1] = width;

  // Note that we use cudaMemcpy2D here instead of cudaMemcpy because the
  // linesizes (strides) may be different than the widths for the input CPU
  // frame. That's precisely what cudaMemcpy2D is for.
  err = cudaMemcpy2D(
      gpuFrame->data[0],
      gpuFrame->linesize[0],
      nv12CpuFrame->data[0],
      nv12CpuFrame->linesize[0],
      width,
      height,
      cudaMemcpyHostToDevice);
  TORCH_CHECK(
      err == cudaSuccess,
      "Failed to copy Y plane to GPU: ",
      cudaGetErrorString(err));

  // The UV plane has half as many rows as the Y plane.
  err = cudaMemcpy2D(
      gpuFrame->data[1],
      gpuFrame->linesize[1],
      nv12CpuFrame->data[1],
      nv12CpuFrame->linesize[1],
      width,
      height / 2,
      cudaMemcpyHostToDevice);
  TORCH_CHECK(
      err == cudaSuccess,
      "Failed to copy UV plane to GPU: ",
      cudaGetErrorString(err));

  return gpuFrame;
}
813+
671814
void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
672815
UniqueAVFrame& avFrame,
673816
FrameOutput& frameOutput,
674817
std::optional<torch::Tensor> preAllocatedOutputTensor) {
675-
if (cpuFallback_) {
676-
// CPU decoded frame - need to do CPU color conversion then transfer to GPU
677-
FrameOutput cpuFrameOutput;
678-
cpuFallback_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);
679-
680-
// Transfer CPU frame to GPU
681-
if (preAllocatedOutputTensor.has_value()) {
682-
preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data);
683-
frameOutput.data = preAllocatedOutputTensor.value();
684-
} else {
685-
frameOutput.data = cpuFrameOutput.data.to(device_);
686-
}
687-
return;
688-
}
818+
UniqueAVFrame gpuFrame =
819+
cpuFallback_ ? transferCpuFrameToGpuNV12(avFrame) : std::move(avFrame);
689820

690821
// TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA
691822
// ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24().
692823
TORCH_CHECK(
693-
avFrame->format == AV_PIX_FMT_CUDA,
824+
gpuFrame->format == AV_PIX_FMT_CUDA,
694825
"Expected CUDA format frame from BETA CUDA interface");
695826

696-
validatePreAllocatedTensorShape(preAllocatedOutputTensor, avFrame);
827+
validatePreAllocatedTensorShape(preAllocatedOutputTensor, gpuFrame);
697828

698829
at::cuda::CUDAStream nvdecStream =
699830
at::cuda::getCurrentCUDAStream(device_.index());
700831

701832
frameOutput.data = convertNV12FrameToRGB(
702-
avFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
833+
gpuFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
703834
}
704835

705836
std::string BetaCudaDeviceInterface::getDetails() {

src/torchcodec/_core/BetaCudaDeviceInterface.h

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,11 @@
1515

1616
#pragma once
1717

18-
#include "src/torchcodec/_core/CUDACommon.h"
19-
#include "src/torchcodec/_core/Cache.h"
20-
#include "src/torchcodec/_core/DeviceInterface.h"
21-
#include "src/torchcodec/_core/FFMPEGCommon.h"
22-
#include "src/torchcodec/_core/NVDECCache.h"
18+
#include "CUDACommon.h"
19+
#include "Cache.h"
20+
#include "DeviceInterface.h"
21+
#include "FFMPEGCommon.h"
22+
#include "NVDECCache.h"
2323

2424
#include <map>
2525
#include <memory>
@@ -28,8 +28,8 @@
2828
#include <unordered_map>
2929
#include <vector>
3030

31-
#include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
32-
#include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"
31+
#include "nvcuvid_include/cuviddec.h"
32+
#include "nvcuvid_include/nvcuvid.h"
3333

3434
namespace facebook::torchcodec {
3535

@@ -46,8 +46,7 @@ class BetaCudaDeviceInterface : public DeviceInterface {
4646
void convertAVFrameToFrameOutput(
4747
UniqueAVFrame& avFrame,
4848
FrameOutput& frameOutput,
49-
std::optional<torch::Tensor> preAllocatedOutputTensor =
50-
std::nullopt) override;
49+
std::optional<torch::Tensor> preAllocatedOutputTensor) override;
5150

5251
int sendPacket(ReferenceAVPacket& packet) override;
5352
int sendEOFPacket() override;
@@ -81,6 +80,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {
8180
unsigned int pitch,
8281
const CUVIDPARSERDISPINFO& dispInfo);
8382

83+
UniqueAVFrame transferCpuFrameToGpuNV12(UniqueAVFrame& cpuFrame);
84+
8485
CUvideoparser videoParser_ = nullptr;
8586
UniqueCUvideodecoder decoder_;
8687
CUVIDEOFORMAT videoFormat_ = {};
@@ -99,6 +100,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {
99100

100101
std::unique_ptr<DeviceInterface> cpuFallback_;
101102
bool nvcuvidAvailable_ = false;
103+
UniqueSwsContext swsContext_;
104+
SwsFrameContext prevSwsFrameContext_;
102105
};
103106

104107
} // namespace facebook::torchcodec

0 commit comments

Comments
 (0)