Skip to content

Commit 7e5e6d4

Browse files
committed
move more encoding to gpuEncoder.cpp, reduce diff
1 parent bf78468 commit 7e5e6d4

File tree

13 files changed

+90
-132
lines changed

13 files changed

+90
-132
lines changed

src/torchcodec/_core/CUDACommon.cpp

Lines changed: 0 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -156,21 +156,6 @@ const Npp32f bt709FullRangeColorTwist[3][4] = {
156156
{1.0f, -0.187324273f, -0.468124273f, -128.0f},
157157
{1.0f, 1.8556f, 0.0f, -128.0f}};
158158

159-
// RGB to NV12 color conversion matrices (inverse of YUV to RGB)
160-
// Note: NPP's ColorTwist function apparently expects "limited range"
161-
// coefficient format even when producing full range output. All matrices below
162-
// use the limited range coefficient format (Y with +16 offset) for NPP
163-
// compatibility.
164-
165-
// BT.601 limited range (matches FFmpeg default behavior)
166-
const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
167-
// Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
168-
{0.257f, 0.504f, 0.098f, 16.0f},
169-
// U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
170-
{-0.148f, -0.291f, 0.439f, 128.0f},
171-
// V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
172-
{0.439f, -0.368f, -0.071f, 128.0f}};
173-
174159
torch::Tensor convertNV12FrameToRGB(
175160
UniqueAVFrame& avFrame,
176161
const torch::Device& device,
@@ -261,68 +246,6 @@ torch::Tensor convertNV12FrameToRGB(
261246
return dst;
262247
}
263248

264-
void convertRGBTensorToNV12Frame(
265-
const torch::Tensor& rgbTensor,
266-
UniqueAVFrame& nv12Frame,
267-
const torch::Device& device,
268-
const UniqueNppContext& nppCtx,
269-
at::cuda::CUDAStream inputStream) {
270-
TORCH_CHECK(rgbTensor.is_cuda(), "RGB tensor must be on CUDA device");
271-
TORCH_CHECK(
272-
rgbTensor.dim() == 3 && rgbTensor.size(0) == 3,
273-
"Expected 3D RGB tensor in CHW format, got shape: ",
274-
rgbTensor.sizes());
275-
TORCH_CHECK(
276-
nv12Frame != nullptr && nv12Frame->data[0] != nullptr,
277-
"nv12Frame must be pre-allocated with CUDA memory");
278-
279-
// Convert CHW to HWC for NPP processing
280-
int height = static_cast<int>(rgbTensor.size(1));
281-
int width = static_cast<int>(rgbTensor.size(2));
282-
torch::Tensor hwcFrame = rgbTensor.permute({1, 2, 0}).contiguous();
283-
284-
// Set up stream synchronization - make NPP stream wait for input tensor
285-
// operations
286-
at::cuda::CUDAStream nppStream =
287-
at::cuda::getCurrentCUDAStream(device.index());
288-
at::cuda::CUDAEvent inputDoneEvent;
289-
inputDoneEvent.record(inputStream);
290-
inputDoneEvent.block(nppStream);
291-
292-
// Setup NPP context
293-
nppCtx->hStream = nppStream.stream();
294-
cudaError_t cudaErr =
295-
cudaStreamGetFlags(nppCtx->hStream, &nppCtx->nStreamFlags);
296-
TORCH_CHECK(
297-
cudaErr == cudaSuccess,
298-
"cudaStreamGetFlags failed: ",
299-
cudaGetErrorString(cudaErr));
300-
301-
// Always use FFmpeg's default behavior: BT.601 limited range
302-
NppiSize oSizeROI = {width, height};
303-
304-
NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
305-
static_cast<const Npp8u*>(hwcFrame.data_ptr()),
306-
hwcFrame.stride(0) * hwcFrame.element_size(),
307-
nv12Frame->data,
308-
nv12Frame->linesize,
309-
oSizeROI,
310-
defaultLimitedRangeRgbToNv12,
311-
*nppCtx);
312-
313-
TORCH_CHECK(
314-
status == NPP_SUCCESS,
315-
"Failed to convert RGB to NV12: NPP error code ",
316-
status);
317-
318-
// Validate CUDA operations completed successfully
319-
cudaError_t memCheck = cudaGetLastError();
320-
TORCH_CHECK(
321-
memCheck == cudaSuccess,
322-
"CUDA error detected: ",
323-
cudaGetErrorString(memCheck));
324-
}
325-
326249
UniqueNppContext getNppStreamContext(const torch::Device& device) {
327250
int deviceIndex = getDeviceIndex(device);
328251

src/torchcodec/_core/CUDACommon.h

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,6 @@ torch::Tensor convertNV12FrameToRGB(
3737
at::cuda::CUDAStream nvdecStream,
3838
std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
3939

40-
void convertRGBTensorToNV12Frame(
41-
const torch::Tensor& rgbTensor,
42-
UniqueAVFrame& nv12Frame,
43-
const torch::Device& device,
44-
const UniqueNppContext& nppCtx,
45-
at::cuda::CUDAStream inputStream);
46-
4740
UniqueNppContext getNppStreamContext(const torch::Device& device);
4841
void returnNppStreamContextToCache(
4942
const torch::Device& device,

src/torchcodec/_core/CpuDeviceInterface.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ class CpuDeviceInterface : public DeviceInterface {
1818

1919
virtual ~CpuDeviceInterface() {}
2020

21+
std::optional<const AVCodec*> findCodec(
22+
[[maybe_unused]] const AVCodecID& codecId) override {
23+
return std::nullopt;
24+
}
25+
2126
virtual void initialize(
2227
const AVStream* avStream,
2328
const UniqueDecodingAVFormatContext& avFormatCtx,

src/torchcodec/_core/CudaDeviceInterface.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
335335
// we have to do this because of an FFmpeg bug where hardware decoding is not
336336
// appropriately set, so we just go off and find the matching codec for the CUDA
337337
// device
338-
std::optional<const AVCodec*> CudaDeviceInterface::findDecoder(
338+
std::optional<const AVCodec*> CudaDeviceInterface::findCodec(
339339
const AVCodecID& codecId) {
340340
void* i = nullptr;
341341
const AVCodec* codec = nullptr;

src/torchcodec/_core/CudaDeviceInterface.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ class CudaDeviceInterface : public DeviceInterface {
1818

1919
virtual ~CudaDeviceInterface();
2020

21-
std::optional<const AVCodec*> findDecoder(const AVCodecID& codecId) override;
21+
std::optional<const AVCodec*> findCodec(const AVCodecID& codecId) override;
2222

2323
void initialize(
2424
const AVStream* avStream,

src/torchcodec/_core/DeviceInterface.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ class DeviceInterface {
4646
return device_;
4747
};
4848

49-
virtual std::optional<const AVCodec*> findDecoder(
49+
virtual std::optional<const AVCodec*> findCodec(
5050
[[maybe_unused]] const AVCodecID& codecId) {
5151
return std::nullopt;
5252
};

src/torchcodec/_core/Encoder.cpp

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -894,8 +894,7 @@ void VideoEncoder::encode() {
894894
avFrame = gpuEncoder_->convertTensorToAVFrame(
895895
currFrame, outPixelFormat_, i, avCodecContext_.get());
896896
} else {
897-
// Use direct CPU conversion for CPU devices
898-
avFrame = convertCpuTensorToAVFrame(currFrame, i);
897+
avFrame = convertTensorToAVFrame(currFrame, i);
899898
}
900899
encodeFrame(autoAVPacket, avFrame);
901900
}
@@ -909,24 +908,25 @@ void VideoEncoder::encode() {
909908
getFFMPEGErrorStringFromErrorCode(status));
910909
}
911910

912-
UniqueAVFrame VideoEncoder::convertCpuTensorToAVFrame(
913-
const torch::Tensor& tensor,
911+
UniqueAVFrame VideoEncoder::convertTensorToAVFrame(
912+
const torch::Tensor& frame,
914913
int frameIndex) {
915-
TORCH_CHECK(tensor.is_cpu(), "CPU encoder requires CPU tensors");
914+
TORCH_CHECK(frame.is_cpu(), "CPU encoder requires CPU tensors");
916915
TORCH_CHECK(
917-
tensor.dim() == 3 && tensor.size(0) == 3,
916+
frame.dim() == 3 && frame.size(0) == 3,
918917
"Expected 3D RGB tensor (CHW format), got shape: ",
919-
tensor.sizes());
918+
frame.sizes());
920919

921-
inHeight_ = static_cast<int>(tensor.sizes()[1]);
922-
inWidth_ = static_cast<int>(tensor.sizes()[2]);
920+
// TODO-VideoEncoder: Confirm these are all already set in initializeEncoder, then remove the commented-out code below.
921+
// inHeight_ = static_cast<int>(tensor.sizes()[1]);
922+
// inWidth_ = static_cast<int>(tensor.sizes()[2]);
923923

924-
// For now, reuse input dimensions as output dimensions
925-
outWidth_ = inWidth_;
926-
outHeight_ = inHeight_;
924+
// // For now, reuse input dimensions as output dimensions
925+
// outWidth_ = inWidth_;
926+
// outHeight_ = inHeight_;
927927

928-
// Input format is RGB planar (AV_PIX_FMT_GBRP after channel reordering)
929-
inPixelFormat_ = AV_PIX_FMT_GBRP;
928+
// // Input format is RGB planar (AV_PIX_FMT_GBRP after channel reordering)
929+
// inPixelFormat_ = AV_PIX_FMT_GBRP;
930930

931931
// Initialize and cache scaling context if it does not exist
932932
if (!swsContext_) {
@@ -965,15 +965,15 @@ UniqueAVFrame VideoEncoder::convertCpuTensorToAVFrame(
965965
inputFrame->width = inWidth_;
966966
inputFrame->height = inHeight_;
967967

968-
uint8_t* tensorData = static_cast<uint8_t*>(tensor.data_ptr());
968+
uint8_t* tensorData = static_cast<uint8_t*>(frame.data_ptr());
969969

970970
// TODO-VideoEncoder: Reorder tensor if in NHWC format
971971
int channelSize = inHeight_ * inWidth_;
972972
// Reorder RGB -> GBR for AV_PIX_FMT_GBRP format
973973
// TODO-VideoEncoder: Determine if FFmpeg supports planar RGB input format
974-
inputFrame->data[0] = tensorData + channelSize; // G channel
975-
inputFrame->data[1] = tensorData + (2 * channelSize); // B channel
976-
inputFrame->data[2] = tensorData; // R channel
974+
inputFrame->data[0] = tensorData + channelSize;
975+
inputFrame->data[1] = tensorData + (2 * channelSize);
976+
inputFrame->data[2] = tensorData;
977977

978978
inputFrame->linesize[0] = inWidth_;
979979
inputFrame->linesize[1] = inWidth_;
@@ -988,7 +988,6 @@ UniqueAVFrame VideoEncoder::convertCpuTensorToAVFrame(
988988
avFrame->data,
989989
avFrame->linesize);
990990
TORCH_CHECK(status == outHeight_, "sws_scale failed");
991-
992991
return avFrame;
993992
}
994993

src/torchcodec/_core/Encoder.h

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -162,14 +162,12 @@ class VideoEncoder {
162162

163163
private:
164164
void initializeEncoder(const VideoStreamOptions& videoStreamOptions);
165+
UniqueAVFrame convertTensorToAVFrame(
166+
const torch::Tensor& frame,
167+
int frameIndex);
165168
void encodeFrame(AutoAVPacket& autoAVPacket, const UniqueAVFrame& avFrame);
166169
void flushBuffers();
167170

168-
// CPU tensor-to-frame conversion for CPU encoding
169-
UniqueAVFrame convertCpuTensorToAVFrame(
170-
const torch::Tensor& tensor,
171-
int frameIndex);
172-
173171
UniqueEncodingAVFormatContext avFormatContext_;
174172
UniqueAVCodecContext avCodecContext_;
175173
AVStream* avStream_ = nullptr;
@@ -187,7 +185,6 @@ class VideoEncoder {
187185
AVPixelFormat outPixelFormat_ = AV_PIX_FMT_NONE;
188186

189187
std::unique_ptr<AVIOContextHolder> avioContextHolder_;
190-
std::unique_ptr<DeviceInterface> deviceInterface_;
191188
std::unique_ptr<GpuEncoder> gpuEncoder_;
192189

193190
bool encodeWasCalled_ = false;

src/torchcodec/_core/FFMPEGCommon.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ AVPacket* ReferenceAVPacket::operator->() {
4040

4141
AVCodecOnlyUseForCallingAVFindBestStream
4242
makeAVCodecOnlyUseForCallingAVFindBestStream(const AVCodec* codec) {
43-
#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100) // FFmpeg < 5.0.3
43+
#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100)
4444
return const_cast<AVCodec*>(codec);
4545
#else
4646
return codec;

src/torchcodec/_core/GpuEncoder.cpp

Lines changed: 59 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,20 @@ UniqueAVBufferRef createHardwareDeviceContext(const torch::Device& device) {
6969
return UniqueAVBufferRef(hardwareDeviceCtxRaw);
7070
}
7171

72+
// RGB to NV12 color conversion matrices (inverse of YUV to RGB)
73+
// Note: NPP's ColorTwist function apparently expects "limited range"
74+
// coefficient format even when producing full range output. All matrices below
75+
// use the limited range coefficient format (Y with +16 offset) for NPP
76+
// compatibility.
77+
78+
// BT.601 limited range (matches FFmpeg default behavior)
79+
const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
80+
// Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
81+
{0.257f, 0.504f, 0.098f, 16.0f},
82+
// U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
83+
{-0.148f, -0.291f, 0.439f, 128.0f},
84+
// V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
85+
{0.439f, -0.368f, -0.071f, 128.0f}};
7286
} // anonymous namespace
7387

7488
GpuEncoder::GpuEncoder(const torch::Device& device) : device_(device) {
@@ -155,14 +169,6 @@ UniqueAVFrame GpuEncoder::convertTensorToAVFrame(
155169
tensor.dim() == 3 && tensor.size(0) == 3,
156170
"Expected 3D RGB tensor (CHW format), got shape: ",
157171
tensor.sizes());
158-
159-
return convertRGBTensorToNV12Frame(tensor, frameIndex, codecContext);
160-
}
161-
162-
UniqueAVFrame GpuEncoder::convertRGBTensorToNV12Frame(
163-
const torch::Tensor& tensor,
164-
int frameIndex,
165-
AVCodecContext* codecContext) {
166172
UniqueAVFrame avFrame(av_frame_alloc());
167173
TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");
168174

@@ -178,13 +184,55 @@ UniqueAVFrame GpuEncoder::convertRGBTensorToNV12Frame(
178184
"Failed to allocate hardware frame: ",
179185
getFFMPEGErrorStringFromErrorCode(ret));
180186

187+
// Validate that avFrame was properly allocated with CUDA memory
188+
TORCH_CHECK(
189+
avFrame != nullptr && avFrame->data[0] != nullptr,
190+
"avFrame must be pre-allocated with CUDA memory");
191+
192+
// Convert CHW to HWC for NPP processing
193+
int height = static_cast<int>(tensor.size(1));
194+
int width = static_cast<int>(tensor.size(2));
195+
torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous();
196+
197+
// Get current CUDA stream for NPP operations
181198
at::cuda::CUDAStream currentStream =
182199
at::cuda::getCurrentCUDAStream(device_.index());
183200

184-
facebook::torchcodec::convertRGBTensorToNV12Frame(
185-
tensor, avFrame, device_, nppCtx_, currentStream);
201+
// Setup NPP context with current stream
202+
nppCtx_->hStream = currentStream.stream();
203+
cudaError_t cudaErr =
204+
cudaStreamGetFlags(nppCtx_->hStream, &nppCtx_->nStreamFlags);
205+
TORCH_CHECK(
206+
cudaErr == cudaSuccess,
207+
"cudaStreamGetFlags failed: ",
208+
cudaGetErrorString(cudaErr));
209+
210+
// Always use FFmpeg's default behavior: BT.601 limited range
211+
NppiSize oSizeROI = {width, height};
212+
213+
NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
214+
static_cast<const Npp8u*>(hwcFrame.data_ptr()),
215+
hwcFrame.stride(0) * hwcFrame.element_size(),
216+
avFrame->data,
217+
avFrame->linesize,
218+
oSizeROI,
219+
defaultLimitedRangeRgbToNv12,
220+
*nppCtx_);
221+
222+
TORCH_CHECK(
223+
status == NPP_SUCCESS,
224+
"Failed to convert RGB to NV12: NPP error code ",
225+
status);
226+
227+
// Validate CUDA operations completed successfully
228+
cudaError_t memCheck = cudaGetLastError();
229+
TORCH_CHECK(
230+
memCheck == cudaSuccess,
231+
"CUDA error detected: ",
232+
cudaGetErrorString(memCheck));
186233

187-
// Set color properties to FFmpeg defaults
234+
// TODO-VideoEncoder: Enable configuration of color properties, similar to
235+
// FFmpeg. For now, set color properties to FFmpeg defaults.
188236
avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
189237
avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range
190238

0 commit comments

Comments
 (0)