Skip to content

Commit 22e88d6

Browse files
committed
WIP
1 parent d63504c commit 22e88d6

File tree

10 files changed

+186
-245
lines changed

10 files changed

+186
-245
lines changed

src/torchcodec/_core/BetaCudaDeviceInterface.cpp

Lines changed: 2 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -674,149 +674,13 @@ void BetaCudaDeviceInterface::flush() {
674674
std::swap(readyFrames_, emptyQueue);
675675
}
676676

677-
UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
678-
UniqueAVFrame& cpuFrame) {
679-
// This is called in the context of the CPU fallback: the frame was decoded on
680-
// the CPU, and in this function we convert that frame into NV12 format and
681-
// send it to the GPU.
682-
// We do that in 2 steps:
683-
// - First we convert the input CPU frame into an intermediate NV12 CPU frame
684-
// using sws_scale.
685-
// - Then we allocate GPU memory and copy the NV12 CPU frame to the GPU. This
686-
// is what we return
687-
688-
TORCH_CHECK(cpuFrame != nullptr, "CPU frame cannot be null");
689-
690-
int width = cpuFrame->width;
691-
int height = cpuFrame->height;
692-
693-
// intermediate NV12 CPU frame. It's not on the GPU yet.
694-
UniqueAVFrame nv12CpuFrame(av_frame_alloc());
695-
TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame");
696-
697-
nv12CpuFrame->format = AV_PIX_FMT_NV12;
698-
nv12CpuFrame->width = width;
699-
nv12CpuFrame->height = height;
700-
701-
int ret = av_frame_get_buffer(nv12CpuFrame.get(), 0);
702-
TORCH_CHECK(
703-
ret >= 0,
704-
"Failed to allocate NV12 CPU frame buffer: ",
705-
getFFMPEGErrorStringFromErrorCode(ret));
706-
707-
SwsFrameContext swsFrameContext(
708-
width,
709-
height,
710-
static_cast<AVPixelFormat>(cpuFrame->format),
711-
width,
712-
height);
713-
714-
if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
715-
swsContext_ = createSwsContext(
716-
swsFrameContext, cpuFrame->colorspace, AV_PIX_FMT_NV12, SWS_BILINEAR);
717-
prevSwsFrameContext_ = swsFrameContext;
718-
}
719-
720-
int convertedHeight = sws_scale(
721-
swsContext_.get(),
722-
cpuFrame->data,
723-
cpuFrame->linesize,
724-
0,
725-
height,
726-
nv12CpuFrame->data,
727-
nv12CpuFrame->linesize);
728-
TORCH_CHECK(
729-
convertedHeight == height, "sws_scale failed for CPU->NV12 conversion");
730-
731-
int ySize = width * height;
732-
TORCH_CHECK(
733-
ySize % 2 == 0,
734-
"Y plane size must be even. Please report on TorchCodec repo.");
735-
int uvSize = ySize / 2; // NV12: UV plane is half the size of Y plane
736-
size_t totalSize = static_cast<size_t>(ySize + uvSize);
737-
738-
uint8_t* cudaBuffer = nullptr;
739-
cudaError_t err =
740-
cudaMalloc(reinterpret_cast<void**>(&cudaBuffer), totalSize);
741-
TORCH_CHECK(
742-
err == cudaSuccess,
743-
"Failed to allocate CUDA memory: ",
744-
cudaGetErrorString(err));
745-
746-
UniqueAVFrame gpuFrame(av_frame_alloc());
747-
TORCH_CHECK(gpuFrame != nullptr, "Failed to allocate GPU AVFrame");
748-
749-
gpuFrame->format = AV_PIX_FMT_CUDA;
750-
gpuFrame->width = width;
751-
gpuFrame->height = height;
752-
gpuFrame->data[0] = cudaBuffer;
753-
gpuFrame->data[1] = cudaBuffer + ySize;
754-
gpuFrame->linesize[0] = width;
755-
gpuFrame->linesize[1] = width;
756-
757-
// Note that we use cudaMemcpy2D here instead of cudaMemcpy because the
758-
// linesizes (strides) may be different than the widths for the input CPU
759-
// frame. That's precisely what cudaMemcpy2D is for.
760-
err = cudaMemcpy2D(
761-
gpuFrame->data[0],
762-
gpuFrame->linesize[0],
763-
nv12CpuFrame->data[0],
764-
nv12CpuFrame->linesize[0],
765-
width,
766-
height,
767-
cudaMemcpyHostToDevice);
768-
TORCH_CHECK(
769-
err == cudaSuccess,
770-
"Failed to copy Y plane to GPU: ",
771-
cudaGetErrorString(err));
772-
773-
TORCH_CHECK(
774-
height % 2 == 0,
775-
"height must be even. Please report on TorchCodec repo.");
776-
err = cudaMemcpy2D(
777-
gpuFrame->data[1],
778-
gpuFrame->linesize[1],
779-
nv12CpuFrame->data[1],
780-
nv12CpuFrame->linesize[1],
781-
width,
782-
height / 2,
783-
cudaMemcpyHostToDevice);
784-
TORCH_CHECK(
785-
err == cudaSuccess,
786-
"Failed to copy UV plane to GPU: ",
787-
cudaGetErrorString(err));
788-
789-
ret = av_frame_copy_props(gpuFrame.get(), cpuFrame.get());
790-
TORCH_CHECK(
791-
ret >= 0,
792-
"Failed to copy frame properties: ",
793-
getFFMPEGErrorStringFromErrorCode(ret));
794-
795-
// We're almost done, but we need to make sure the CUDA memory is freed
796-
// properly. Usually, AVFrame data is freed when av_frame_free() is called
797-
// (upon UniqueAVFrame destruction), but since we allocated the CUDA memory
798-
// ourselves, FFmpeg doesn't know how to free it. The recommended way to deal
799-
// with this is to associate the opaque_ref field of the AVFrame with a `free`
800-
// callback that will then be called by av_frame_free().
801-
gpuFrame->opaque_ref = av_buffer_create(
802-
nullptr, // data - we don't need any
803-
0, // data size
804-
cudaBufferFreeCallback, // callback triggered by av_frame_free()
805-
cudaBuffer, // parameter to callback
806-
0); // flags
807-
TORCH_CHECK(
808-
gpuFrame->opaque_ref != nullptr,
809-
"Failed to create GPU memory cleanup reference");
810-
811-
return gpuFrame;
812-
}
813-
814677
void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
815678
UniqueAVFrame& avFrame,
816679
FrameOutput& frameOutput,
817680
std::optional<torch::Tensor> preAllocatedOutputTensor) {
818681
UniqueAVFrame gpuFrame =
819-
cpuFallback_ ? transferCpuFrameToGpuNV12(avFrame) : std::move(avFrame);
682+
cpuFallback_ ? transferCpuFrameToGpuNV12(avFrame, swsCtx_, device_)
683+
: std::move(avFrame);
820684

821685
// TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA
822686
// ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24().

src/torchcodec/_core/BetaCudaDeviceInterface.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "src/torchcodec/_core/DeviceInterface.h"
2121
#include "src/torchcodec/_core/FFMPEGCommon.h"
2222
#include "src/torchcodec/_core/NVDECCache.h"
23+
#include "src/torchcodec/_core/SwsContext.h"
2324

2425
#include <map>
2526
#include <memory>
@@ -81,8 +82,6 @@ class BetaCudaDeviceInterface : public DeviceInterface {
8182
unsigned int pitch,
8283
const CUVIDPARSERDISPINFO& dispInfo);
8384

84-
UniqueAVFrame transferCpuFrameToGpuNV12(UniqueAVFrame& cpuFrame);
85-
8685
CUvideoparser videoParser_ = nullptr;
8786
UniqueCUvideodecoder decoder_;
8887
CUVIDEOFORMAT videoFormat_ = {};
@@ -101,8 +100,10 @@ class BetaCudaDeviceInterface : public DeviceInterface {
101100

102101
std::unique_ptr<DeviceInterface> cpuFallback_;
103102
bool nvcuvidAvailable_ = false;
104-
UniqueSwsContext swsContext_;
105-
SwsFrameContext prevSwsFrameContext_;
103+
104+
// Swscale context cache for GPU transfer during CPU fallback.
105+
// Used to convert CPU frames to NV12 before transferring to GPU.
106+
SwsScaler swsCtx_;
106107
};
107108

108109
} // namespace facebook::torchcodec

src/torchcodec/_core/CUDACommon.cpp

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,4 +327,137 @@ int getDeviceIndex(const torch::Device& device) {
327327
return deviceIndex;
328328
}
329329

330+
// Callback for freeing CUDA memory associated with AVFrame
331+
void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) {
332+
cudaFree(opaque);
333+
}
334+
335+
UniqueAVFrame transferCpuFrameToGpuNV12(
336+
UniqueAVFrame& cpuFrame,
337+
SwsScaler& swsCtx,
338+
[[maybe_unused]] const torch::Device& device) {
339+
// This function converts a CPU frame to NV12 format and transfers it to GPU.
340+
// We do that in 2 steps:
341+
// - First we convert the input CPU frame into an intermediate NV12 CPU frame
342+
// using sws_scale.
343+
// - Then we allocate GPU memory and copy the NV12 CPU frame to the GPU. This
344+
// is what we return.
345+
346+
TORCH_CHECK(cpuFrame != nullptr, "CPU frame cannot be null");
347+
348+
int width = cpuFrame->width;
349+
int height = cpuFrame->height;
350+
351+
// Intermediate NV12 CPU frame. It's not on the GPU yet.
352+
UniqueAVFrame nv12CpuFrame(av_frame_alloc());
353+
TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame");
354+
355+
nv12CpuFrame->format = AV_PIX_FMT_NV12;
356+
nv12CpuFrame->width = width;
357+
nv12CpuFrame->height = height;
358+
359+
int ret = av_frame_get_buffer(nv12CpuFrame.get(), 0);
360+
TORCH_CHECK(
361+
ret >= 0,
362+
"Failed to allocate NV12 CPU frame buffer: ",
363+
getFFMPEGErrorStringFromErrorCode(ret));
364+
365+
FrameDims outputDims(height, width);
366+
auto swsContext = swsCtx.getOrCreateContext(
367+
cpuFrame, outputDims, cpuFrame->colorspace, AV_PIX_FMT_NV12, SWS_BILINEAR);
368+
369+
int convertedHeight = sws_scale(
370+
swsContext.get(),
371+
cpuFrame->data,
372+
cpuFrame->linesize,
373+
0,
374+
height,
375+
nv12CpuFrame->data,
376+
nv12CpuFrame->linesize);
377+
TORCH_CHECK(
378+
convertedHeight == height, "sws_scale failed for CPU->NV12 conversion");
379+
380+
int ySize = width * height;
381+
TORCH_CHECK(
382+
ySize % 2 == 0,
383+
"Y plane size must be even. Please report on TorchCodec repo.");
384+
int uvSize = ySize / 2; // NV12: UV plane is half the size of Y plane
385+
size_t totalSize = static_cast<size_t>(ySize + uvSize);
386+
387+
uint8_t* cudaBuffer = nullptr;
388+
cudaError_t err =
389+
cudaMalloc(reinterpret_cast<void**>(&cudaBuffer), totalSize);
390+
TORCH_CHECK(
391+
err == cudaSuccess,
392+
"Failed to allocate CUDA memory: ",
393+
cudaGetErrorString(err));
394+
395+
UniqueAVFrame gpuFrame(av_frame_alloc());
396+
TORCH_CHECK(gpuFrame != nullptr, "Failed to allocate GPU AVFrame");
397+
398+
gpuFrame->format = AV_PIX_FMT_CUDA;
399+
gpuFrame->width = width;
400+
gpuFrame->height = height;
401+
gpuFrame->data[0] = cudaBuffer;
402+
gpuFrame->data[1] = cudaBuffer + ySize;
403+
gpuFrame->linesize[0] = width;
404+
gpuFrame->linesize[1] = width;
405+
406+
// Note that we use cudaMemcpy2D here instead of cudaMemcpy because the
407+
// linesizes (strides) may be different than the widths for the input CPU
408+
// frame. That's precisely what cudaMemcpy2D is for.
409+
err = cudaMemcpy2D(
410+
gpuFrame->data[0],
411+
gpuFrame->linesize[0],
412+
nv12CpuFrame->data[0],
413+
nv12CpuFrame->linesize[0],
414+
width,
415+
height,
416+
cudaMemcpyHostToDevice);
417+
TORCH_CHECK(
418+
err == cudaSuccess,
419+
"Failed to copy Y plane to GPU: ",
420+
cudaGetErrorString(err));
421+
422+
TORCH_CHECK(
423+
height % 2 == 0,
424+
"height must be even. Please report on TorchCodec repo.");
425+
err = cudaMemcpy2D(
426+
gpuFrame->data[1],
427+
gpuFrame->linesize[1],
428+
nv12CpuFrame->data[1],
429+
nv12CpuFrame->linesize[1],
430+
width,
431+
height / 2,
432+
cudaMemcpyHostToDevice);
433+
TORCH_CHECK(
434+
err == cudaSuccess,
435+
"Failed to copy UV plane to GPU: ",
436+
cudaGetErrorString(err));
437+
438+
ret = av_frame_copy_props(gpuFrame.get(), cpuFrame.get());
439+
TORCH_CHECK(
440+
ret >= 0,
441+
"Failed to copy frame properties: ",
442+
getFFMPEGErrorStringFromErrorCode(ret));
443+
444+
// We're almost done, but we need to make sure the CUDA memory is freed
445+
// properly. Usually, AVFrame data is freed when av_frame_free() is called
446+
// (upon UniqueAVFrame destruction), but since we allocated the CUDA memory
447+
// ourselves, FFmpeg doesn't know how to free it. The recommended way to deal
448+
// with this is to associate the opaque_ref field of the AVFrame with a `free`
449+
// callback that will then be called by av_frame_free().
450+
gpuFrame->opaque_ref = av_buffer_create(
451+
nullptr, // data - we don't need any
452+
0, // data size
453+
cudaBufferFreeCallback, // callback triggered by av_frame_free()
454+
cudaBuffer, // parameter to callback
455+
0); // flags
456+
TORCH_CHECK(
457+
gpuFrame->opaque_ref != nullptr,
458+
"Failed to create GPU memory cleanup reference");
459+
460+
return gpuFrame;
461+
}
462+
330463
} // namespace facebook::torchcodec

src/torchcodec/_core/CUDACommon.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
#include "src/torchcodec/_core/FFMPEGCommon.h"
1515
#include "src/torchcodec/_core/Frame.h"
16+
#include "src/torchcodec/_core/SwsContext.h"
1617

1718
extern "C" {
1819
#include <libavutil/hwcontext_cuda.h>
@@ -48,4 +49,11 @@ void validatePreAllocatedTensorShape(
4849

4950
int getDeviceIndex(const torch::Device& device);
5051

52+
// Convert CPU frame to NV12 and transfer to GPU for GPU-accelerated color
53+
// conversion. Used during CPU fallback to move color conversion to GPU.
54+
UniqueAVFrame transferCpuFrameToGpuNV12(
55+
UniqueAVFrame& cpuFrame,
56+
SwsScaler& swsCtx,
57+
const torch::Device& device);
58+
5159
} // namespace facebook::torchcodec

src/torchcodec/_core/CpuDeviceInterface.cpp

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -215,35 +215,17 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale(
215215
const UniqueAVFrame& avFrame,
216216
torch::Tensor& outputTensor,
217217
const FrameDims& outputDims) {
218-
enum AVPixelFormat frameFormat =
219-
static_cast<enum AVPixelFormat>(avFrame->format);
220-
221-
// We need to compare the current frame context with our previous frame
222-
// context. If they are different, then we need to re-create our colorspace
223-
// conversion objects. We create our colorspace conversion objects late so
224-
// that we don't have to depend on the unreliable metadata in the header.
225-
// And we sometimes re-create them because it's possible for frame
226-
// resolution to change mid-stream. Finally, we want to reuse the colorspace
227-
// conversion objects as much as possible for performance reasons.
228-
SwsFrameContext swsFrameContext(
229-
avFrame->width,
230-
avFrame->height,
231-
frameFormat,
232-
outputDims.width,
233-
outputDims.height);
234-
235-
if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
236-
swsContext_ = createSwsContext(
237-
swsFrameContext, avFrame->colorspace, AV_PIX_FMT_RGB24, swsFlags_);
238-
prevSwsFrameContext_ = swsFrameContext;
239-
}
218+
// Get or create swscale context. The SwsContext class manages caching
219+
// and recreation logic internally based on frame properties.
220+
auto swsContext = swsCtx_.getOrCreateContext(
221+
avFrame, outputDims, avFrame->colorspace, AV_PIX_FMT_RGB24, swsFlags_);
240222

241223
uint8_t* pointers[4] = {
242224
outputTensor.data_ptr<uint8_t>(), nullptr, nullptr, nullptr};
243225
int expectedOutputWidth = outputTensor.sizes()[1];
244226
int linesizes[4] = {expectedOutputWidth * 3, 0, 0, 0};
245227
int resultHeight = sws_scale(
246-
swsContext_.get(),
228+
swsContext.get(),
247229
avFrame->data,
248230
avFrame->linesize,
249231
0,

0 commit comments

Comments
 (0)