@@ -69,6 +69,20 @@ UniqueAVBufferRef createHardwareDeviceContext(const torch::Device& device) {
6969 return UniqueAVBufferRef (hardwareDeviceCtxRaw);
7070}
7171
72+ // RGB to NV12 color conversion matrices (inverse of YUV to RGB)
73+ // Note: NPP's ColorTwist function apparently expects "limited range"
74+ // coefficient format even when producing full range output. All matrices below
75+ // use the limited range coefficient format (Y with +16 offset) for NPP
76+ // compatibility.
77+
78+ // BT.601 limited range (matches FFmpeg default behavior). Each row is {R coefficient, G coefficient, B coefficient, constant offset}.
79+ const Npp32f defaultLimitedRangeRgbToNv12[3 ][4 ] = {
80+ // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B); 0.859 ~= 219/255, so the row sums to 0.257 + 0.504 + 0.098 = 0.859
81+ {0 .257f , 0 .504f , 0 .098f , 16 .0f },
82+ // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients; chroma centered at 128)
83+ {-0 .148f , -0 .291f , 0 .439f , 128 .0f },
84+ // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients; chroma centered at 128)
85+ {0 .439f , -0 .368f , -0 .071f , 128 .0f }};
7286} // anonymous namespace
7387
7488GpuEncoder::GpuEncoder (const torch::Device& device) : device_(device) {
@@ -155,14 +169,6 @@ UniqueAVFrame GpuEncoder::convertTensorToAVFrame(
155169 tensor.dim () == 3 && tensor.size (0 ) == 3 ,
156170 " Expected 3D RGB tensor (CHW format), got shape: " ,
157171 tensor.sizes ());
158-
159- return convertRGBTensorToNV12Frame (tensor, frameIndex, codecContext);
160- }
161-
162- UniqueAVFrame GpuEncoder::convertRGBTensorToNV12Frame (
163- const torch::Tensor& tensor,
164- int frameIndex,
165- AVCodecContext* codecContext) {
166172 UniqueAVFrame avFrame (av_frame_alloc ());
167173 TORCH_CHECK (avFrame != nullptr , " Failed to allocate AVFrame" );
168174
@@ -178,13 +184,55 @@ UniqueAVFrame GpuEncoder::convertRGBTensorToNV12Frame(
178184 " Failed to allocate hardware frame: " ,
179185 getFFMPEGErrorStringFromErrorCode (ret));
180186
187+ // Validate that avFrame was properly allocated with CUDA memory
188+ TORCH_CHECK (
189+ avFrame != nullptr && avFrame->data [0 ] != nullptr ,
190+ " avFrame must be pre-allocated with CUDA memory" );
191+
192+ // Convert CHW to HWC for NPP processing
193+ int height = static_cast <int >(tensor.size (1 ));
194+ int width = static_cast <int >(tensor.size (2 ));
195+ torch::Tensor hwcFrame = tensor.permute ({1 , 2 , 0 }).contiguous ();
196+
197+ // Get current CUDA stream for NPP operations
181198 at::cuda::CUDAStream currentStream =
182199 at::cuda::getCurrentCUDAStream (device_.index ());
183200
184- facebook::torchcodec::convertRGBTensorToNV12Frame (
185- tensor, avFrame, device_, nppCtx_, currentStream);
201+ // Setup NPP context with current stream
202+ nppCtx_->hStream = currentStream.stream ();
203+ cudaError_t cudaErr =
204+ cudaStreamGetFlags (nppCtx_->hStream , &nppCtx_->nStreamFlags );
205+ TORCH_CHECK (
206+ cudaErr == cudaSuccess,
207+ " cudaStreamGetFlags failed: " ,
208+ cudaGetErrorString (cudaErr));
209+
210+ // Always use FFmpeg's default behavior: BT.601 limited range
211+ NppiSize oSizeROI = {width, height};
212+
213+ NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx (
214+ static_cast <const Npp8u*>(hwcFrame.data_ptr ()),
215+ hwcFrame.stride (0 ) * hwcFrame.element_size (),
216+ avFrame->data ,
217+ avFrame->linesize ,
218+ oSizeROI,
219+ defaultLimitedRangeRgbToNv12,
220+ *nppCtx_);
221+
222+ TORCH_CHECK (
223+ status == NPP_SUCCESS,
224+ " Failed to convert RGB to NV12: NPP error code " ,
225+ status);
226+
227+ // Check for a pending CUDA error; cudaGetLastError does not synchronize the stream
228+ cudaError_t memCheck = cudaGetLastError ();
229+ TORCH_CHECK (
230+ memCheck == cudaSuccess,
231+ " CUDA error detected: " ,
232+ cudaGetErrorString (memCheck));
186233
187- // Set color properties to FFmpeg defaults
234+ // TODO-VideoEncoder: Enable configuration of color properties, similar to
235+ // FFmpeg. Until then, set color properties to FFmpeg defaults.
188236 avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
189237 avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range
190238
0 commit comments