diff --git a/modules/cudacodec/CMakeLists.txt b/modules/cudacodec/CMakeLists.txt index 8df41f00a96..a2dd450423f 100644 --- a/modules/cudacodec/CMakeLists.txt +++ b/modules/cudacodec/CMakeLists.txt @@ -38,9 +38,6 @@ if(HAVE_NVCUVID OR HAVE_NVCUVENC) endif() if(HAVE_NVCUVID) list(APPEND extra_libs ${CUDA_nvcuvid_LIBRARY}) - if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE) - list(APPEND extra_libs CUDA::nppicc${CUDA_LIB_EXT}) - endif() endif() if(HAVE_NVCUVENC) if(WIN32) diff --git a/modules/cudacodec/include/opencv2/cudacodec.hpp b/modules/cudacodec/include/opencv2/cudacodec.hpp index ca0ce204447..a0c039189e9 100644 --- a/modules/cudacodec/include/opencv2/cudacodec.hpp +++ b/modules/cudacodec/include/opencv2/cudacodec.hpp @@ -93,19 +93,19 @@ enum Codec /** @brief ColorFormat for the frame returned by VideoReader::nextFrame() and VideoReader::retrieve() or used to initialize a VideoWriter. */ -enum class ColorFormat { +enum ColorFormat { UNDEFINED = 0, - BGRA = 1, //!< OpenCV color format, can be used with both VideoReader and VideoWriter. - BGR = 2, //!< OpenCV color format, can be used with both VideoReader and VideoWriter. - GRAY = 3, //!< OpenCV color format, can be used with both VideoReader and VideoWriter. - NV_NV12 = 4, //!< Nvidia color format - equivalent to YUV - Semi-Planar YUV [Y plane followed by interleaved UV plane], can be used with both VideoReader and VideoWriter. - - RGB = 5, //!< OpenCV color format, can only be used with VideoWriter. - RGBA = 6, //!< OpenCV color format, can only be used with VideoWriter. - NV_YV12 = 8, //!< Nvidia Buffer Format - Planar YUV [Y plane followed by V and U planes], use with VideoReader, can only be used with VideoWriter. - NV_IYUV = 9, //!< Nvidia Buffer Format - Planar YUV [Y plane followed by U and V planes], use with VideoReader, can only be used with VideoWriter. - NV_YUV444 = 10, //!< Nvidia Buffer Format - Planar YUV [Y plane followed by U and V planes], use with VideoReader, can only be used with VideoWriter. 
- NV_AYUV = 11, //!< Nvidia Buffer Format - 8 bit Packed A8Y8U8V8. This is a word-ordered format where a pixel is represented by a 32-bit word with V in the lowest 8 bits, U in the next 8 bits, Y in the 8 bits after that and A in the highest 8 bits, can only be used with VideoWriter. + BGRA = 1, //!< OpenCV color format. VideoReader and VideoWriter. + BGR = 2, //!< OpenCV color format. VideoReader and VideoWriter. + GRAY = 3, //!< OpenCV color format. VideoReader and VideoWriter. + RGB = 5, //!< OpenCV color format. VideoReader and VideoWriter. + RGBA = 6, //!< OpenCV color format. VideoReader and VideoWriter. + NV_YUV_SURFACE_FORMAT = 7, //!< Nvidia YUV Surface Format output by the Nvidia decoder, see @ref SurfaceFormat. VideoReader only. + NV_NV12 = 4, //!< Nvidia Buffer Format - Semi-Planar YUV [Y plane followed by interleaved UV plane]. VideoWriter only. @deprecated Deprecated for use with VideoReader, use @ref NV_YUV_SURFACE_FORMAT instead. + NV_YV12 = 8, //!< Nvidia Buffer Format - Planar YUV [Y plane followed by V and U planes]. VideoWriter only. + NV_IYUV = 9, //!< Nvidia Buffer Format - Planar YUV [Y plane followed by U and V planes]. VideoWriter only. + NV_YUV444 = 10, //!< Nvidia Buffer Format - Planar YUV [Y plane followed by U and V planes]. VideoWriter only. + NV_AYUV = 11, //!< Nvidia Buffer Format - 8 bit Packed A8Y8U8V8. This is a word-ordered format where a pixel is represented by a 32-bit word with V in the lowest 8 bits, U in the next 8 bits, Y in the 8 bits after that and A in the highest 8 bits. VideoWriter only. #ifndef CV_DOXYGEN PROP_NOT_SUPPORTED #endif @@ -298,16 +298,41 @@ enum ChromaFormat NumFormats }; -/** @brief Deinterlacing mode used by decoder. -* @param Weave Weave both fields (no deinterlacing). For progressive content and for content that doesn't need deinterlacing. -* @param Bob Drop one field. -* @param Adaptive Adaptive deinterlacing needs more video memory than other deinterlacing modes. 
-* */ +/** @brief Deinterlacing mode used by decoder. */ enum DeinterlaceMode { - Weave = 0, - Bob = 1, - Adaptive = 2 + Weave = 0, //!< Weave both fields (no deinterlacing). For progressive content and for content that doesn't need deinterlacing. + Bob = 1, //!< Drop one field. + Adaptive = 2 //!< Adaptive deinterlacing needs more video memory than other deinterlacing modes. +}; + +/** @brief Video Signal Description Color Primaries of the VideoReader source (section E.2.1 VUI parameters semantics of H265 spec file) */ +enum class ColorSpaceStandard { + BT709 = 1, //!< ITU-R BT.709 standard for high-definition television. + Unspecified = 2, //!< Unspecified color space standard. + Reserved = 3, //!< Reserved for future use. + FCC = 4, //!< FCC color space standard. + BT470 = 5, //!< ITU-R BT.470, used for older analog television systems. + BT601 = 6, //!< ITU-R BT.601, used for standard definition television. + SMPTE240M = 7, //!< SMPTE 240M, used for early HDTV systems. + YCgCo = 8, //!< YCgCo color space, used in some video compression algorithms. + BT2020 = 9, //!< ITU-R BT.2020, used for ultra-high-definition television. + BT2020C = 10 //!< ITU-R BT.2020 Constant Luminance, used for ultra-high-definition television. +}; + +/** @brief Video surface formats output by the decoder */ +enum SurfaceFormat { + SF_NV12 = 0, //!< Semi-Planar YUV [Y plane followed by interleaved UV plane] + SF_P016 = 1, //!< 16 bit Semi-Planar YUV [Y plane followed by interleaved UV plane]. Can be used for 10 bit (6LSB bits 0), 12 bit (4LSB bits 0) + SF_YUV444 = 2, //!< Planar YUV [Y plane followed by U and V planes] + SF_YUV444_16Bit = 3 //!< 16 bit Planar YUV [Y plane followed by U and V planes]. Can be used for 10 bit (6LSB bits 0), 12 bit (4LSB bits 0) +}; + +/** @brief Bit depth of the frame returned by VideoReader::nextFrame() and VideoReader::retrieve() */ +enum BitDepth { + EIGHT = 0, //!< 8 bit depth. + SIXTEEN = 1, //!< 16 bit depth. 
+ UNCHANGED = 2 //!< Use source bit depth. }; /** @brief Utility function demonstrating how to map the luma histogram when FormatInfo::videoFullRangeFlag == false @@ -316,7 +341,7 @@ enum DeinterlaceMode @note - This function demonstrates how to map the luma histogram back so that it is equivalent to the result obtained from cuda::calcHist() - if the returned frame was colorFormat::GRAY. + if the returned frame was ColorFormat::GRAY. */ CV_EXPORTS_W void MapHist(const cuda::GpuMat& hist, CV_OUT Mat& histFull); @@ -325,10 +350,11 @@ CV_EXPORTS_W void MapHist(const cuda::GpuMat& hist, CV_OUT Mat& histFull); struct CV_EXPORTS_W_SIMPLE FormatInfo { CV_WRAP FormatInfo() : nBitDepthMinus8(-1), ulWidth(0), ulHeight(0), width(0), height(0), ulMaxWidth(0), ulMaxHeight(0), valid(false), - fps(0), ulNumDecodeSurfaces(0), videoFullRangeFlag(false), enableHistogram(false), nCounterBitDepth(0), nMaxHistogramBins(0){}; + fps(0), ulNumDecodeSurfaces(0), videoFullRangeFlag(false), colorSpaceStandard(ColorSpaceStandard::BT601), enableHistogram(false), nCounterBitDepth(0), nMaxHistogramBins(0){}; CV_PROP_RW Codec codec; CV_PROP_RW ChromaFormat chromaFormat; + CV_PROP_RW SurfaceFormat surfaceFormat; //!< Surface format of the decoded frame. CV_PROP_RW int nBitDepthMinus8; CV_PROP_RW int nBitDepthChromaMinus8; CV_PROP_RW int ulWidth;//!< Coded sequence width in pixels. @@ -345,12 +371,36 @@ struct CV_EXPORTS_W_SIMPLE FormatInfo CV_PROP_RW cv::Size targetSz;//!< Post-processed size of the output frame. CV_PROP_RW cv::Rect srcRoi;//!< Region of interest decoded from video source. CV_PROP_RW cv::Rect targetRoi;//!< Region of interest in the output frame containing the decoded frame. - CV_PROP_RW bool videoFullRangeFlag;//!< Output value indicating if the black level, luma and chroma of the source are represented using the full or limited range (AKA TV or "analogue" range) of values as defined in Annex E of the ITU-T Specification. Internally the conversion from NV12 to BGR obeys ITU 709. 
+ CV_PROP_RW bool videoFullRangeFlag;//!< Output value indicating if the black level, luma and chroma of the source are represented using the full or limited range (AKA TV or "analogue" range) of values as defined in Annex E of the ITU-T Specification. + CV_PROP_RW ColorSpaceStandard colorSpaceStandard; //!< Video Signal Description Color Primaries of the VideoReader source (section E.2.1 VUI parameters semantics of H265 spec file) CV_PROP_RW bool enableHistogram;//!< Flag requesting histogram output if supported. Exception will be thrown when requested but not supported. CV_PROP_RW int nCounterBitDepth;//!< Bit depth of histogram bins if histogram output is requested and supported. CV_PROP_RW int nMaxHistogramBins;//!< Max number of histogram bins if histogram output is requested and supported. }; +/** @brief Class for converting the raw YUV Surface output from VideoReader if output color format is set to ColorFormat::NV_YUV_SURFACE_FORMAT (VideoReader::set(ColorFormat::NV_YUV_SURFACE_FORMAT)) to the requested @ref ColorFormat. + */ +class CV_EXPORTS_W NVSurfaceToColorConverter { +public: + /** @brief Performs the conversion from the raw YUV Surface output from VideoReader to the requested color format. Use this function when you want to convert the raw YUV Surface output from VideoReader to more than one color format or you want both the raw Surface output in addition to a color frame. + * @param yuv The raw YUV Surface output from VideoReader, see @ref SurfaceFormat. + * @param color The converted frame. + * @param surfaceFormat The surface format of the input YUV data. + * @param outputFormat The requested output color format. + * @param bitDepth The requested bit depth of the output frame. + * @param planar Request separate planes for each color plane. 
+ * @param videoFullRangeFlag Indicates if the black level, luma and chroma of the source are represented using the full or limited range (AKA TV or "analogue" range) of values as defined in Annex E of the ITU-T Specification. + * @param stream Stream for the asynchronous version. + */ + virtual bool convert(InputArray yuv, OutputArray color, const SurfaceFormat surfaceFormat, const ColorFormat outputFormat, const BitDepth bitDepth = BitDepth::UNCHANGED, const bool planar = false, const bool videoFullRangeFlag = false, cuda::Stream& stream = cuda::Stream::Null()) = 0; +}; + +/** @brief Creates an NVSurfaceToColorConverter. +* @param colorSpace The requested @ref ColorSpaceStandard for the converter. +* @param videoFullRangeFlag Indicates if the black level, luma and chroma of the source are represented using the full or limited range (AKA TV or "analogue" range) of values as defined in Annex E of the ITU-T Specification. + */ +CV_EXPORTS_W Ptr<NVSurfaceToColorConverter> createNVSurfaceToColorConverter(const ColorSpaceStandard colorSpace, const bool videoFullRangeFlag = false); + /** @brief cv::cudacodec::VideoReader generic properties identifier. */ enum class VideoReaderProps { @@ -360,9 +410,11 @@ enum class VideoReaderProps { PROP_NUMBER_OF_RAW_PACKAGES_SINCE_LAST_GRAB = 3, //!< Number of raw packages recieved since the last call to grab(). PROP_RAW_MODE = 4, //!< Status of raw mode. PROP_LRF_HAS_KEY_FRAME = 5, //!< FFmpeg source only - Indicates whether the Last Raw Frame (LRF), output from VideoReader::retrieve() when VideoReader is initialized in raw mode, contains encoded data for a key frame. - PROP_COLOR_FORMAT = 6, //!< Set the ColorFormat of the decoded frame. This can be changed before every call to nextFrame() and retrieve(). + PROP_COLOR_FORMAT = 6, //!< ColorFormat of the decoded frame. This can be changed before every call to nextFrame() and retrieve(). PROP_UDP_SOURCE = 7, //!< Status of VideoReaderInitParams::udpSource initialization. 
PROP_ALLOW_FRAME_DROP = 8, //!< Status of VideoReaderInitParams::allowFrameDrop initialization. + PROP_BIT_DEPTH = 9, //!< Bit depth of the decoded frame. This can be changed before every call to nextFrame() and retrieve(). + PROP_PLANAR = 10, //!< Planar when true, packed when false. This can be changed before every call to nextFrame() and retrieve(). #ifndef CV_DOXYGEN PROP_NOT_SUPPORTED #endif @@ -481,9 +533,11 @@ class CV_EXPORTS_W VideoReader /** @brief Set the desired ColorFormat for the frame returned by nextFrame()/retrieve(). @param colorFormat Value of the ColorFormat. + @param bitDepth Requested bit depth of the frame. + @param planar Set to true for planar and false for packed color format. @return `true` unless the colorFormat is not supported. */ - CV_WRAP virtual bool set(const ColorFormat colorFormat) = 0; + CV_WRAP virtual bool set(const ColorFormat colorFormat, const BitDepth bitDepth = BitDepth::UNCHANGED, const bool planar = false) = 0; /** @brief Returns the specified VideoReader property diff --git a/modules/cudacodec/src/cuda/ColorSpace.cu b/modules/cudacodec/src/cuda/ColorSpace.cu new file mode 100644 index 00000000000..137805af392 --- /dev/null +++ b/modules/cudacodec/src/cuda/ColorSpace.cu @@ -0,0 +1,762 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include "ColorSpace.h" +#include +#include + +namespace cv { namespace cuda { namespace device { + +__constant__ float matYuv2Color[3][3]; + +void inline GetConstants(int iMatrix, float& wr, float& wb, int& black, int& white, int& uvWhite, int& max, bool fullRange = false) { + if (fullRange) { + black = 0; white = 255; uvWhite = 255; + } + else { + black = 16; white = 235; uvWhite = 240; + } + max = 255; + + switch (static_cast(iMatrix)) + { + case cv::cudacodec::ColorSpaceStandard::BT709: + default: + wr = 0.2126f; wb = 0.0722f; + break; + + case cv::cudacodec::ColorSpaceStandard::FCC: + wr = 0.30f; wb = 0.11f; + break; + + case cv::cudacodec::ColorSpaceStandard::BT470: + case cv::cudacodec::ColorSpaceStandard::BT601: + wr = 0.2990f; wb = 0.1140f; + break; + + case cv::cudacodec::ColorSpaceStandard::SMPTE240M: + wr = 0.212f; wb = 0.087f; + break; + + case cv::cudacodec::ColorSpaceStandard::BT2020: + case cv::cudacodec::ColorSpaceStandard::BT2020C: + wr = 0.2627f; wb = 0.0593f; + // 10-bit only + black = 64 << 6; white = 940 << 6; + max = (1 << 16) - 1; + break; + } +} + +void SetMatYuv2Rgb(int iMatrix, bool fullRange = false) { + float wr, wb; + int black, white, max, uvWhite; + GetConstants(iMatrix, wr, wb, black, white, uvWhite, max, fullRange); + float mat[3][3] = { + 1.0f, 0.0f, (1.0f - wr) / 0.5f, + 1.0f, -wb * (1.0f - wb) / 0.5f / (1 - wb - wr), -wr * (1 - wr) / 0.5f / (1 - wb - wr), + 1.0f, (1.0f - wb) / 0.5f, 0.0f, + }; + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + if (j == 0) + mat[i][j] = (float)(1.0 * max / (white - black) * mat[i][j]); + else + mat[i][j] = (float)(1.0 * max / (uvWhite - black) * mat[i][j]); + } + } + cudaMemcpyToSymbol(matYuv2Color, mat, sizeof(mat)); +} + +template +__device__ static T Clamp(T x, T lower, T upper) { + return x < lower ? lower : (x > upper ? upper : x); +} + +template +__device__ inline Gray YToGrayForPixel(YuvUnit y, bool videoFullRangeFlag) { + const int low = videoFullRangeFlag ? 
0 : 1 << (sizeof(YuvUnit) * 8 - 4); + float fy = (int)y - low; + const float maxf = (1 << sizeof(YuvUnit) * 8) - 1.0f; + + YuvUnit g = (YuvUnit)Clamp(matYuv2Color[0][0] * fy, 0.0f, maxf); + const int nShift = abs((int)sizeof(YuvUnit) - (int)sizeof(Gray)) * 8; + Gray gray{}; + if (sizeof(YuvUnit) >= sizeof(Gray)) + gray = g >> nShift; + else + gray = g << nShift; + return gray; +} + +template +__device__ inline Color YuvToColorForPixel(YuvUnit y, YuvUnit u, YuvUnit v, bool videoFullRangeFlag) { + const int + low = videoFullRangeFlag ? 0 : 1 << (sizeof(YuvUnit) * 8 - 4), + mid = 1 << (sizeof(YuvUnit) * 8 - 1); + float fy = (int)y - low, fu = (int)u - mid, fv = (int)v - mid; + const float maxf = (1 << sizeof(YuvUnit) * 8) - 1.0f; + YuvUnit + r = (YuvUnit)Clamp(matYuv2Color[0][0] * fy + matYuv2Color[0][1] * fu + matYuv2Color[0][2] * fv, 0.0f, maxf), + g = (YuvUnit)Clamp(matYuv2Color[1][0] * fy + matYuv2Color[1][1] * fu + matYuv2Color[1][2] * fv, 0.0f, maxf), + b = (YuvUnit)Clamp(matYuv2Color[2][0] * fy + matYuv2Color[2][1] * fu + matYuv2Color[2][2] * fv, 0.0f, maxf); + + Color color{}; + const int nShift = abs((int)sizeof(YuvUnit) - (int)sizeof(color.c.r)) * 8; + if (sizeof(YuvUnit) >= sizeof(color.c.r)) { + color.c.r = r >> nShift; + color.c.g = g >> nShift; + color.c.b = b >> nShift; + } + else { + color.c.r = r << nShift; + color.c.g = g << nShift; + color.c.b = b << nShift; + } + return color; +} + +template +__device__ inline Color YuvToColoraForPixel(YuvUnit y, YuvUnit u, YuvUnit v, bool videoFullRangeFlag) { + Color color = YuvToColorForPixel(y, u, v, videoFullRangeFlag); + const float maxf = (1 << sizeof(color.c.r) * 8) - 1.0f; + color.c.a = maxf; + return color; +} + +template +__global__ static void YToGrayKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y); + if (x + 1 >= nWidth || y >= 
nHeight) { + return; + } + + uint8_t* pSrc = pYuv + x * sizeof(Yuvx2) / 2 + y * nYuvPitch; + uint8_t* pDst = pGray + x * sizeof(Gray) + y * nGrayPitch; + + Yuvx2 l0 = *(Yuvx2*)pSrc; + *(Grayx2*)pDst = Grayx2{ + YToGrayForPixel(l0.x, videoFullRangeFlag), + YToGrayForPixel(l0.y, videoFullRangeFlag), + }; +} + +template +__global__ static void YuvToColorKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2; + if (x + 1 >= nWidth || y + 1 >= nHeight) { + return; + } + + uint8_t* pSrc = pYuv + x * sizeof(Yuvx2) / 2 + y * nYuvPitch; + uint8_t* pDst = pColor + x * sizeof(Color) + y * nColorPitch; + + Yuvx2 l0 = *(Yuvx2*)pSrc; + Yuvx2 l1 = *(Yuvx2*)(pSrc + nYuvPitch); + Yuvx2 ch = *(Yuvx2*)(pSrc + (nHeight - y / 2) * nYuvPitch); + + union ColorOutx2 { + Colorx2 d; + Color Color[2]; + }; + ColorOutx2 l1Out; + l1Out.Color[0] = YuvToColorForPixel(l0.x, ch.x, ch.y, videoFullRangeFlag); + l1Out.Color[1] = YuvToColorForPixel(l0.y, ch.x, ch.y, videoFullRangeFlag); + *(Colorx2*)pDst = l1Out.d; + ColorOutx2 l2Out; + l2Out.Color[0] = YuvToColorForPixel(l1.x, ch.x, ch.y, videoFullRangeFlag); + l2Out.Color[1] = YuvToColorForPixel(l1.y, ch.x, ch.y, videoFullRangeFlag); + *(Colorx2*)(pDst + nColorPitch) = l2Out.d; +} + +template +__global__ static void YuvToColoraKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2; + if (x + 1 >= nWidth || y + 1 >= nHeight) { + return; + } + + uint8_t* pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch; + uint8_t* pDst = pColor + x * sizeof(Color) + y * nColorPitch; + + YuvUnitx2 l0 = *(YuvUnitx2*)pSrc; + YuvUnitx2 l1 = *(YuvUnitx2*)(pSrc + nYuvPitch); + YuvUnitx2 ch = *(YuvUnitx2*)(pSrc + 
(nHeight - y / 2) * nYuvPitch); + + *(ColorIntx2*)pDst = ColorIntx2{ + YuvToColoraForPixel(l0.x, ch.x, ch.y, videoFullRangeFlag).d, + YuvToColoraForPixel(l0.y, ch.x, ch.y, videoFullRangeFlag).d, + }; + *(ColorIntx2*)(pDst + nColorPitch) = ColorIntx2{ + YuvToColoraForPixel(l1.x, ch.x, ch.y, videoFullRangeFlag).d, + YuvToColoraForPixel(l1.y, ch.x, ch.y, videoFullRangeFlag).d, + }; +} + +template +__global__ static void Yuv444ToColorKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y); + if (x + 1 >= nWidth || y >= nHeight) { + return; + } + + uint8_t* pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch; + uint8_t* pDst = pColor + x * sizeof(Color) + y * nColorPitch; + + YuvUnitx2 l0 = *(YuvUnitx2*)pSrc; + YuvUnitx2 ch1 = *(YuvUnitx2*)(pSrc + (nHeight * nYuvPitch)); + YuvUnitx2 ch2 = *(YuvUnitx2*)(pSrc + (2 * nHeight * nYuvPitch)); + + union ColorOutx2 { + Colorx2 d; + Color Color[2]; + }; + ColorOutx2 out; + out.Color[0] = YuvToColorForPixel(l0.x, ch1.x, ch2.x, videoFullRangeFlag); + out.Color[1] = YuvToColorForPixel(l0.y, ch1.y, ch2.y, videoFullRangeFlag); + *(Colorx2*)pDst = out.d; +} + +template +__global__ static void Yuv444ToColoraKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y); + if (x + 1 >= nWidth || y >= nHeight) { + return; + } + + uint8_t* pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch; + uint8_t* pDst = pColor + x * sizeof(Color) + y * nColorPitch; + + YuvUnitx2 l0 = *(YuvUnitx2*)pSrc; + YuvUnitx2 ch1 = *(YuvUnitx2*)(pSrc + (nHeight * nYuvPitch)); + YuvUnitx2 ch2 = *(YuvUnitx2*)(pSrc + (2 * nHeight * nYuvPitch)); + + *(ColorIntx2*)pDst = ColorIntx2{ + YuvToColoraForPixel(l0.x, ch1.x, ch2.x, 
videoFullRangeFlag).d, + YuvToColoraForPixel(l0.y, ch1.y, ch2.y, videoFullRangeFlag).d, + }; +} + +template +__global__ static void YuvToColorPlanarKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColorp, int nColorpPitch, int nWidth, int nHeight, bool videoFullRangeFlag) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2; + if (x + 1 >= nWidth || y + 1 >= nHeight) { + return; + } + + uint8_t* pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch; + + YuvUnitx2 l0 = *(YuvUnitx2*)pSrc; + YuvUnitx2 l1 = *(YuvUnitx2*)(pSrc + nYuvPitch); + YuvUnitx2 ch = *(YuvUnitx2*)(pSrc + (nHeight - y / 2) * nYuvPitch); + + Color color0 = YuvToColorForPixel(l0.x, ch.x, ch.y, videoFullRangeFlag), + color1 = YuvToColorForPixel(l0.y, ch.x, ch.y, videoFullRangeFlag), + color2 = YuvToColorForPixel(l1.x, ch.x, ch.y, videoFullRangeFlag), + color3 = YuvToColorForPixel(l1.y, ch.x, ch.y, videoFullRangeFlag); + + uint8_t* pDst = pColorp + x * sizeof(ColorUnitx2) / 2 + y * nColorpPitch; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.x, color1.v.x }; + *(ColorUnitx2*)(pDst + nColorpPitch) = ColorUnitx2{ color2.v.x, color3.v.x }; + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.y, color1.v.y }; + *(ColorUnitx2*)(pDst + nColorpPitch) = ColorUnitx2{ color2.v.y, color3.v.y }; + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.z, color1.v.z }; + *(ColorUnitx2*)(pDst + nColorpPitch) = ColorUnitx2{ color2.v.z, color3.v.z }; +} + +template +__global__ static void YuvToColoraPlanarKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColorp, int nColorpPitch, int nWidth, int nHeight, bool videoFullRangeFlag) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y) * 2; + if (x + 1 >= nWidth || y + 1 >= nHeight) { + return; + } + + uint8_t* pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch; + + YuvUnitx2 l0 = *(YuvUnitx2*)pSrc; + YuvUnitx2 
l1 = *(YuvUnitx2*)(pSrc + nYuvPitch); + YuvUnitx2 ch = *(YuvUnitx2*)(pSrc + (nHeight - y / 2) * nYuvPitch); + + Color color0 = YuvToColoraForPixel(l0.x, ch.x, ch.y, videoFullRangeFlag), + color1 = YuvToColoraForPixel(l0.y, ch.x, ch.y, videoFullRangeFlag), + color2 = YuvToColoraForPixel(l1.x, ch.x, ch.y, videoFullRangeFlag), + color3 = YuvToColoraForPixel(l1.y, ch.x, ch.y, videoFullRangeFlag); + + uint8_t* pDst = pColorp + x * sizeof(ColorUnitx2) / 2 + y * nColorpPitch; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.x, color1.v.x }; + *(ColorUnitx2*)(pDst + nColorpPitch) = ColorUnitx2{ color2.v.x, color3.v.x }; + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.y, color1.v.y }; + *(ColorUnitx2*)(pDst + nColorpPitch) = ColorUnitx2{ color2.v.y, color3.v.y }; + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.z, color1.v.z }; + *(ColorUnitx2*)(pDst + nColorpPitch) = ColorUnitx2{ color2.v.z, color3.v.z }; + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.w, color1.v.w }; + *(ColorUnitx2*)(pDst + nColorpPitch) = ColorUnitx2{ color2.v.w, color3.v.w }; +} + +template +__global__ static void Yuv444ToColorPlanarKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColorp, int nColorpPitch, int nWidth, int nHeight, bool videoFullRangeFlag) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y); + if (x + 1 >= nWidth || y >= nHeight) { + return; + } + + uint8_t* pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch; + + YuvUnitx2 l0 = *(YuvUnitx2*)pSrc; + YuvUnitx2 ch1 = *(YuvUnitx2*)(pSrc + (nHeight * nYuvPitch)); + YuvUnitx2 ch2 = *(YuvUnitx2*)(pSrc + (2 * nHeight * nYuvPitch)); + + Color color0 = YuvToColorForPixel(l0.x, ch1.x, ch2.x, videoFullRangeFlag), + color1 = YuvToColorForPixel(l0.y, ch1.y, ch2.y, videoFullRangeFlag); + + + uint8_t* pDst = pColorp + x * sizeof(ColorUnitx2) / 2 + y * nColorpPitch; + *(ColorUnitx2*)pDst = ColorUnitx2{ 
color0.v.x, color1.v.x }; + + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.y, color1.v.y }; + + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.z, color1.v.z }; +} + +template +__global__ static void Yuv444ToColoraPlanarKernel(uint8_t* pYuv, int nYuvPitch, uint8_t* pColorp, int nColorpPitch, int nWidth, int nHeight, bool videoFullRangeFlag) { + int x = (threadIdx.x + blockIdx.x * blockDim.x) * 2; + int y = (threadIdx.y + blockIdx.y * blockDim.y); + if (x + 1 >= nWidth || y >= nHeight) { + return; + } + + uint8_t* pSrc = pYuv + x * sizeof(YuvUnitx2) / 2 + y * nYuvPitch; + + YuvUnitx2 l0 = *(YuvUnitx2*)pSrc; + YuvUnitx2 ch1 = *(YuvUnitx2*)(pSrc + (nHeight * nYuvPitch)); + YuvUnitx2 ch2 = *(YuvUnitx2*)(pSrc + (2 * nHeight * nYuvPitch)); + + Color color0 = YuvToColoraForPixel(l0.x, ch1.x, ch2.x, videoFullRangeFlag), + color1 = YuvToColoraForPixel(l0.y, ch1.y, ch2.y, videoFullRangeFlag); + + + uint8_t* pDst = pColorp + x * sizeof(ColorUnitx2) / 2 + y * nColorpPitch; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.x, color1.v.x }; + + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.y, color1.v.y }; + + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.z, color1.v.z }; + + pDst += nColorpPitch * nHeight; + *(ColorUnitx2*)pDst = ColorUnitx2{ color0.v.w, color1.v.w }; +} + +#define BLOCKSIZE_X 32 +#define BLOCKSIZE_Y 8 + +void Y8ToGray8(uint8_t* dpY8, int nY8Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YToGrayKernel + <<>> + (dpY8, nY8Pitch, dpGray, nGrayPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +void Y8ToGray16(uint8_t* dpY8, int nY8Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YToGrayKernel + <<>> + (dpY8, nY8Pitch, dpGray, 
nGrayPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +void Y16ToGray8(uint8_t* dpY16, int nY16Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YToGrayKernel + <<>> + (dpY16, nY16Pitch, dpGray, nGrayPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +void Y16ToGray16(uint8_t* dpY16, int nY16Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YToGrayKernel + <<>> + (dpY16, nY16Pitch, dpGray, nGrayPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void Nv12ToColor24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColorKernel + <<>> + (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void Nv12ToColor32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColoraKernel + <<>> + (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void Nv12ToColor48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColorKernel + <<>> + (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void Nv12ToColor64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool 
videoFullRangeFlag, const cudaStream_t stream) { + YuvToColoraKernel + <<>> + (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444ToColor24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColorKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444ToColor32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColoraKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444ToColor48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColorKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444ToColor64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColoraKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void P016ToColor24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColorKernel + <<>> + (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + 
cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void P016ToColor32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColoraKernel + <<>> + (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void P016ToColor48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColorKernel + <<>> + (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void P016ToColor64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColoraKernel + <<>> + (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444P16ToColor24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColorKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444P16ToColor32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColoraKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444P16ToColor48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, 
const cudaStream_t stream) { + Yuv444ToColorKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444P16ToColor64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColoraKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void Nv12ToColorPlanar24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColorPlanarKernel + <<>> + (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void Nv12ToColorPlanar32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColoraPlanarKernel + <<>> + (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void Nv12ToColorPlanar48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColorPlanarKernel + <<>> + (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void Nv12ToColorPlanar64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColoraPlanarKernel + <<>> + (dpNv12, nNv12Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + 
cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void P016ToColorPlanar24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColorPlanarKernel + <<>> + (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void P016ToColorPlanar32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColoraPlanarKernel + <<>> + (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void P016ToColorPlanar48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColorPlanarKernel + <<>> + (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void P016ToColorPlanar64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + YuvToColoraPlanarKernel + <<>> + (dpP016, nP016Pitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444ToColorPlanar24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColorPlanarKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444ToColorPlanar32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int 
nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColoraPlanarKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444ToColorPlanar48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColorPlanarKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444ToColorPlanar64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColoraPlanarKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444P16ToColorPlanar24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColorPlanarKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444P16ToColorPlanar32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColoraPlanarKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444P16ToColorPlanar48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColorPlanarKernel + <<>> + 
(dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template +void YUV444P16ToColorPlanar64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream) { + Yuv444ToColoraPlanarKernel + <<>> + (dpYUV444, nPitch, dpColor, nColorPitch, nWidth, nHeight, videoFullRangeFlag); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); +} + +template void Nv12ToColor24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void Nv12ToColorPlanar24(uint8_t* dpNv12, int nNv12Pitch, 
uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void P016ToColor24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int 
nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void P016ToColorPlanar24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int 
nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void YUV444ToColor24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void YUV444ToColorPlanar24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t 
stream); +template void YUV444ToColorPlanar32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void YUV444P16ToColor24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template 
void YUV444P16ToColor48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void YUV444P16ToColorPlanar24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar24(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar32(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar48(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar64(uint8_t* dpYUV444, int nPitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); 
+}}} diff --git a/modules/cudacodec/src/cuda/ColorSpace.h b/modules/cudacodec/src/cuda/ColorSpace.h new file mode 100644 index 00000000000..d730aa37fd1 --- /dev/null +++ b/modules/cudacodec/src/cuda/ColorSpace.h @@ -0,0 +1,69 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#pragma once +#include +#include + +namespace cv { namespace cuda { namespace device { +union BGR24 { + uchar3 v; + struct { + uint8_t b, g, r; + } c; +}; + +union RGB24 { + uchar3 v; + struct { + uint8_t r, g, b; + } c; +}; + +union BGRA32 { + uint32_t d; + uchar4 v; + struct { + uint8_t b, g, r, a; + } c; +}; + +union RGBA32 { + uint32_t d; + uchar4 v; + struct { + uint8_t r, g, b, a; + } c; +}; + +union BGR48 { + ushort3 v; + struct { + uint16_t b, g, r; + } c; +}; + +union RGB48 { + ushort3 v; + struct { + uint16_t r, g, b; + } c; +}; + +union BGRA64 { + uint64_t d; + ushort4 v; + struct { + uint16_t b, g, r, a; + } c; +}; + +union RGBA64 { + uint64_t d; + ushort4 v; + struct { + uint16_t r, g, b, a; + } c; +}; +}}} diff --git a/modules/cudacodec/src/cuda/nv12_to_rgb.cu b/modules/cudacodec/src/cuda/nv12_to_rgb.cu deleted file mode 100644 index a9031e0ec9e..00000000000 --- a/modules/cudacodec/src/cuda/nv12_to_rgb.cu +++ /dev/null @@ -1,190 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. 
-// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ - -/* - * NV12ToARGB color space conversion CUDA kernel - * - * This sample uses CUDA to perform a simple NV12 (YUV 4:2:0 planar) - * source and converts to output in ARGB format - */ - -#include "opencv2/opencv_modules.hpp" - -#ifndef HAVE_OPENCV_CUDEV - -#error "opencv_cudev is required" - -#else - -#include "opencv2/cudev/common.hpp" - -using namespace cv; -using namespace cv::cudev; - -void nv12ToBgra(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, cudaStream_t stream); - -namespace -{ - __constant__ float constHueColorSpaceMat[9] = {1.1644f, 0.0f, 1.596f, 1.1644f, -0.3918f, -0.813f, 1.1644f, 2.0172f, 0.0f}; - - template - __device__ static void YUV2RGB(const uint* yuvi, float* red, float* green, float* blue) - { - float luma, chromaCb, chromaCr; - if (fullRange) { - luma = (float)(((int)yuvi[0] * 219.0f / 255.0f)); - chromaCb = (float)(((int)yuvi[1] - 512.0f) * 224.0f / 255.0f); - chromaCr = (float)(((int)yuvi[2] - 512.0f) * 224.0f / 255.0f); - } - else { - luma = (float)((int)yuvi[0] - 64.0f); - chromaCb = (float)((int)yuvi[1] - 512.0f); - chromaCr = (float)((int)yuvi[2] - 512.0f); - } - - // Convert YUV To RGB with hue adjustment - *red = (luma * constHueColorSpaceMat[0]) + - (chromaCb * constHueColorSpaceMat[1]) + - (chromaCr * constHueColorSpaceMat[2]); - - *green = (luma * constHueColorSpaceMat[3]) + - (chromaCb * constHueColorSpaceMat[4]) + - (chromaCr * constHueColorSpaceMat[5]); - - *blue = (luma * constHueColorSpaceMat[6]) + - (chromaCb * constHueColorSpaceMat[7]) + - (chromaCr * constHueColorSpaceMat[8]); - } - - __device__ static uint RGBA_pack_10bit(float red, float green, float blue, uint alpha) - { - uint ARGBpixel = 0; - - // Clamp final 10 bit results - red = ::fmin(::fmax(red, 0.0f), 1023.f); - green = ::fmin(::fmax(green, 0.0f), 1023.f); - blue = ::fmin(::fmax(blue, 0.0f), 1023.f); - - // Convert to 8 bit unsigned integers per color component - ARGBpixel = (((uint)blue >> 2) | - (((uint)green >> 2) << 8) | - 
(((uint)red >> 2) << 16) | - (uint)alpha); - - return ARGBpixel; - } - - // CUDA kernel for outputting the final ARGB output from NV12 - - #define COLOR_COMPONENT_BIT_SIZE 10 - #define COLOR_COMPONENT_MASK 0x3FF - - template - __global__ void NV12_to_BGRA(const uchar* srcImage, size_t nSourcePitch, - uint* dstImage, size_t nDestPitch, - uint width, uint height) - { - // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread - const int x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1); - const int y = blockIdx.y * blockDim.y + threadIdx.y; - - if (x >= width || y >= height) - return; - - // Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way. - // if we move to texture we could read 4 luminance values - - uint yuv101010Pel[2]; - - yuv101010Pel[0] = (srcImage[y * nSourcePitch + x ]) << 2; - yuv101010Pel[1] = (srcImage[y * nSourcePitch + x + 1]) << 2; - - const size_t chromaOffset = nSourcePitch * height; - - const int y_chroma = y >> 1; - - yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x ] << ( COLOR_COMPONENT_BIT_SIZE + 2)); - yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); - - yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x ] << ( COLOR_COMPONENT_BIT_SIZE + 2)); - yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); - - // this steps performs the color conversion - uint yuvi[6]; - float red[2], green[2], blue[2]; - - yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK ); - yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK); - yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK); - - yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK ); - yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE) & 
COLOR_COMPONENT_MASK); - yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK); - - // YUV to RGB Transformation conversion - YUV2RGB(&yuvi[0], &red[0], &green[0], &blue[0]); - YUV2RGB(&yuvi[3], &red[1], &green[1], &blue[1]); - - // Clamp the results to RGBA - - const size_t dstImagePitch = nDestPitch >> 2; - - dstImage[y * dstImagePitch + x ] = RGBA_pack_10bit(red[0], green[0], blue[0], ((uint)0xff << 24)); - dstImage[y * dstImagePitch + x + 1 ] = RGBA_pack_10bit(red[1], green[1], blue[1], ((uint)0xff << 24)); - } -} - -void nv12ToBgra(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, const bool videoFullRangeFlag, cudaStream_t stream) -{ - outFrame.create(height, width, CV_8UC4); - dim3 block(32, 8); - dim3 grid(divUp(width, 2 * block.x), divUp(height, block.y)); - if (videoFullRangeFlag) - NV12_to_BGRA <<>> (decodedFrame.ptr(), decodedFrame.step, outFrame.ptr(), outFrame.step, width, height); - else - NV12_to_BGRA <<>> (decodedFrame.ptr(), decodedFrame.step, outFrame.ptr(), outFrame.step, width, height); - CV_CUDEV_SAFE_CALL(cudaGetLastError()); - if (stream == 0) - CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize()); -} - -#endif diff --git a/modules/cudacodec/src/cuda/rgb_to_yv12.cu b/modules/cudacodec/src/cuda/rgb_to_yv12.cu deleted file mode 100644 index ed0e0df9ba8..00000000000 --- a/modules/cudacodec/src/cuda/rgb_to_yv12.cu +++ /dev/null @@ -1,167 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. 
-// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ - -#include "opencv2/opencv_modules.hpp" - -#ifndef HAVE_OPENCV_CUDEV - -#error "opencv_cudev is required" - -#else - -#include "opencv2/cudev/ptr2d/glob.hpp" - -using namespace cv::cudev; - -void RGB_to_YV12(const GpuMat& src, GpuMat& dst); - -namespace -{ - __device__ __forceinline__ void rgb_to_y(const uchar b, const uchar g, const uchar r, uchar& y) - { - y = static_cast(((int)(30 * r) + (int)(59 * g) + (int)(11 * b)) / 100); - } - - __device__ __forceinline__ void rgb_to_yuv(const uchar b, const uchar g, const uchar r, uchar& y, uchar& u, uchar& v) - { - rgb_to_y(b, g, r, y); - u = static_cast(((int)(-17 * r) - (int)(33 * g) + (int)(50 * b) + 12800) / 100); - v = static_cast(((int)(50 * r) - (int)(42 * g) - (int)(8 * b) + 12800) / 100); - } - - __global__ void Gray_to_YV12(const GlobPtrSz src, GlobPtr dst) - { - const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2; - const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2; - - if (x + 1 >= src.cols || y + 1 >= src.rows) - return; - - // get pointers to the data - const size_t planeSize = src.rows * dst.step; - GlobPtr y_plane = globPtr(dst.data, dst.step); - GlobPtr u_plane = globPtr(y_plane.data + planeSize, dst.step / 2); - GlobPtr v_plane = globPtr(u_plane.data + (planeSize / 4), dst.step / 2); - - uchar pix; - uchar y_val, u_val, v_val; - - pix = src(y, x); - rgb_to_y(pix, pix, pix, y_val); - y_plane(y, x) = y_val; - - pix = src(y, x + 1); - rgb_to_y(pix, pix, pix, y_val); - y_plane(y, x + 1) = y_val; - - pix = src(y + 1, x); - rgb_to_y(pix, pix, pix, y_val); - y_plane(y + 1, x) = y_val; - - pix = src(y + 1, x + 1); - rgb_to_yuv(pix, pix, pix, y_val, u_val, v_val); - y_plane(y + 1, x + 1) = y_val; - u_plane(y / 2, x / 2) = u_val; - v_plane(y / 2, x / 2) = v_val; - } - - template - __global__ void RGB_to_YV12(const GlobPtrSz src, GlobPtr dst) - { - const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2; - const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2; - - if (x + 1 >= 
src.cols || y + 1 >= src.rows) - return; - - // get pointers to the data - const size_t planeSize = src.rows * dst.step; - GlobPtr y_plane = globPtr(dst.data, dst.step); - GlobPtr u_plane = globPtr(y_plane.data + planeSize, dst.step / 2); - GlobPtr v_plane = globPtr(u_plane.data + (planeSize / 4), dst.step / 2); - - T pix; - uchar y_val, u_val, v_val; - - pix = src(y, x); - rgb_to_y(pix.z, pix.y, pix.x, y_val); - y_plane(y, x) = y_val; - - pix = src(y, x + 1); - rgb_to_y(pix.z, pix.y, pix.x, y_val); - y_plane(y, x + 1) = y_val; - - pix = src(y + 1, x); - rgb_to_y(pix.z, pix.y, pix.x, y_val); - y_plane(y + 1, x) = y_val; - - pix = src(y + 1, x + 1); - rgb_to_yuv(pix.z, pix.y, pix.x, y_val, u_val, v_val); - y_plane(y + 1, x + 1) = y_val; - u_plane(y / 2, x / 2) = u_val; - v_plane(y / 2, x / 2) = v_val; - } -} - -void RGB_to_YV12(const GpuMat& src, GpuMat& dst) -{ - const dim3 block(32, 8); - const dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2)); - - switch (src.channels()) - { - case 1: - Gray_to_YV12<<>>(globPtr(src), globPtr(dst)); - break; - case 3: - RGB_to_YV12<<>>(globPtr(src), globPtr(dst)); - break; - case 4: - RGB_to_YV12<<>>(globPtr(src), globPtr(dst)); - break; - } - - CV_CUDEV_SAFE_CALL( cudaGetLastError() ); - CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); -} - -#endif diff --git a/modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp b/modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp new file mode 100644 index 00000000000..e22549e2296 --- /dev/null +++ b/modules/cudacodec/src/nvidia_surface_format_to_color_converter.cpp @@ -0,0 +1,205 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include "precomp.hpp" + +using namespace cv; +using namespace cv::cuda; +using namespace cv::cudacodec; + +#if !defined (HAVE_CUDA) +Ptr cv::cudacodec::createNVSurfaceToColorConverter(const ColorSpaceStandard, const bool){ throw_no_cuda(); } +#else +#include "cuda/ColorSpace.h" +namespace cv { namespace cuda { namespace device { +template void Nv12ToColor24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColor64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void Nv12ToColorPlanar24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar24(uint8_t* dpNv12, int nNv12Pitch, uint8_t* 
dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar32(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar48(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void Nv12ToColorPlanar64(uint8_t* dpNv12, int nNv12Pitch, uint8_t* dpBgrp, int nBgrpPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void P016ToColor24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, 
bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColor64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void P016ToColorPlanar24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar24(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar32(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar48(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void P016ToColorPlanar64(uint8_t* dpP016, int nP016Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t
stream); + +template void YUV444ToColor24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColor64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void YUV444ToColorPlanar24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const 
cudaStream_t stream); +template void YUV444ToColorPlanar32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444ToColorPlanar64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void YUV444P16ToColor24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, 
int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColor64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +template void YUV444P16ToColorPlanar24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar24(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar32(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar48(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +template void YUV444P16ToColorPlanar64(uint8_t* dpYuv444, int nYuv444Pitch, uint8_t* dpColor, int nColorPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +void Y8ToGray8(uint8_t* dpY8, int 
nY8Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +void Y8ToGray16(uint8_t* dpY8, int nY8Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +void Y16ToGray8(uint8_t* dpY16, int nY16Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); +void Y16ToGray16(uint8_t* dpY16, int nY16Pitch, uint8_t* dpGray, int nGrayPitch, int nWidth, int nHeight, bool videoFullRangeFlag, const cudaStream_t stream); + +void SetMatYuv2Rgb(int iMatrix, bool); +}}} + +using namespace cuda::device; +class NVSurfaceToColorConverterImpl : public NVSurfaceToColorConverter { +public: + NVSurfaceToColorConverterImpl(ColorSpaceStandard colorSpace, bool fullColorRange = false) { + SetMatYuv2Rgb(static_cast(colorSpace), fullColorRange); + } + + int OutputColorFormatIdx(const cudacodec::ColorFormat format) { + switch (format) { + case cudacodec::ColorFormat::BGR: return 0; + case cudacodec::ColorFormat::RGB: return 1; + case cudacodec::ColorFormat::BGRA: return 2; + case cudacodec::ColorFormat::RGBA: return 3; + case cudacodec::ColorFormat::GRAY: return 4; + default: return -1; + } + } + + int NumChannels(const cudacodec::ColorFormat format) { + switch (format) { + case cudacodec::ColorFormat::BGR: + case cudacodec::ColorFormat::RGB: return 3; + case cudacodec::ColorFormat::BGRA: + case cudacodec::ColorFormat::RGBA: return 4; + case cudacodec::ColorFormat::GRAY: return 1; + default: return -1; + } + } + + BitDepth GetBitDepthOut(const BitDepth bitDepth, const int nBitsIn) { + switch (bitDepth) { + case BitDepth::EIGHT: + case BitDepth::SIXTEEN: + return bitDepth; + case BitDepth::UNCHANGED: + default: + if (nBitsIn == CV_8U) + return BitDepth::EIGHT; + else + return BitDepth::SIXTEEN; + } + } + + bool convert(const InputArray yuv, const OutputArray out, const SurfaceFormat surfaceFormat, const 
ColorFormat outputFormat, const BitDepth bitDepth, const bool planar, const bool videoFullRangeFlag, cuda::Stream& stream) { + CV_Assert(outputFormat == ColorFormat::BGR || outputFormat == ColorFormat::BGRA || outputFormat == ColorFormat::RGB || outputFormat == ColorFormat::RGBA || outputFormat == ColorFormat::GRAY); + CV_Assert(yuv.depth() == CV_8U || yuv.depth() == CV_16U); + const bool yuv420 = surfaceFormat == SurfaceFormat::SF_NV12 || surfaceFormat == SurfaceFormat::SF_P016; + CV_Assert(yuv.cols() % 2 == 0); + + typedef void (*func_t)(uint8_t* yuv, int yuvPitch, uint8_t* color, int colorPitch, int width, int height, bool videoFullRangeFlag, cudaStream_t stream); + static const func_t funcs[4][5][2][2] = + { + { + {{{Nv12ToColor24},{Nv12ToColorPlanar24}},{{Nv12ToColor48},{Nv12ToColorPlanar48}}}, + {{{Nv12ToColor24},{Nv12ToColorPlanar24}},{{Nv12ToColor48},{Nv12ToColorPlanar48}}}, + {{{Nv12ToColor32},{Nv12ToColorPlanar32}},{{Nv12ToColor64},{Nv12ToColorPlanar64}}}, + {{{Nv12ToColor32},{Nv12ToColorPlanar32}},{{Nv12ToColor64},{Nv12ToColorPlanar64}}}, + {{{Y8ToGray8},{Y8ToGray8}},{{Y8ToGray16},{Y8ToGray16}}} + }, + { + {{{P016ToColor24},{P016ToColorPlanar24}},{{P016ToColor48},{P016ToColorPlanar48}}}, + {{{P016ToColor24},{P016ToColorPlanar24}},{{P016ToColor48},{P016ToColorPlanar48}}}, + {{{P016ToColor32},{P016ToColorPlanar32}},{{P016ToColor64},{P016ToColorPlanar64}}}, + {{{P016ToColor32},{P016ToColorPlanar32}},{{P016ToColor64},{P016ToColorPlanar64}}}, + {{{Y16ToGray8},{Y16ToGray8}},{{Y16ToGray16},{Y16ToGray16}}} + }, + { + {{{YUV444ToColor24},{YUV444ToColorPlanar24}},{{YUV444ToColor48},{YUV444ToColorPlanar48}}}, + {{{YUV444ToColor24},{YUV444ToColorPlanar24}},{{YUV444ToColor48},{YUV444ToColorPlanar48}}}, + {{{YUV444ToColor32},{YUV444ToColorPlanar32}},{{YUV444ToColor64},{YUV444ToColorPlanar64}}}, + {{{YUV444ToColor32},{YUV444ToColorPlanar32}},{{YUV444ToColor64},{YUV444ToColorPlanar64}}}, + {{{Y8ToGray8},{Y8ToGray8}},{{Y8ToGray16},{Y8ToGray16}}} + }, + { + 
{{{YUV444P16ToColor24},{YUV444P16ToColorPlanar24}},{{YUV444P16ToColor48},{YUV444P16ToColorPlanar48}}}, + {{{YUV444P16ToColor24},{YUV444P16ToColorPlanar24}},{{YUV444P16ToColor48},{YUV444P16ToColorPlanar48}}}, + {{{YUV444P16ToColor32},{YUV444P16ToColorPlanar32}},{{YUV444P16ToColor64},{YUV444P16ToColorPlanar64}}}, + {{{YUV444P16ToColor32},{YUV444P16ToColorPlanar32}},{{YUV444P16ToColor64},{YUV444P16ToColorPlanar64}}}, + {{{Y16ToGray8},{Y16ToGray8}},{{Y16ToGray16},{Y16ToGray16}}} + } + }; + + GpuMat yuv_ = getInputMat(yuv, stream); + CV_Assert(yuv_.step <= static_cast(std::numeric_limits::max())); + + const int nRows = static_cast(yuv.rows() / (yuv420 ? 1.5f : 3.0f)); + CV_Assert(!yuv420 || nRows % 2 == 0); + const int nChannels = NumChannels(outputFormat); + const int nRowsOut = nRows * (planar ? nChannels : 1); + const BitDepth bitDepth_ = GetBitDepthOut(bitDepth, yuv.depth()); + const int typeOut = CV_MAKE_TYPE(bitDepth_ == BitDepth::EIGHT ? CV_8U : CV_16U, planar ? 1 : nChannels); + GpuMat out_ = getOutputMat(out, nRowsOut, yuv.cols(), typeOut, stream); + + const int iOutputFormat = OutputColorFormatIdx(outputFormat); + const func_t func = funcs[static_cast(surfaceFormat)][iOutputFormat][static_cast(bitDepth_)][planar]; + if (!func) + CV_Error(Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + CV_Assert(out_.step <= static_cast(std::numeric_limits::max())); + func((uint8_t*)yuv_.ptr(0), static_cast(yuv_.step), (uint8_t*)out_.ptr(0), static_cast(out_.step), out_.cols, nRows, videoFullRangeFlag, StreamAccessor::getStream(stream)); + return true; + } + + +}; + +Ptr cv::cudacodec::createNVSurfaceToColorConverter(const ColorSpaceStandard colorSpace, const bool videoFullRangeFlag) { + return makePtr(colorSpace, videoFullRangeFlag); +} +#endif diff --git a/modules/cudacodec/src/precomp.hpp b/modules/cudacodec/src/precomp.hpp index 004cf85c88d..99a788a0128 100644 --- a/modules/cudacodec/src/precomp.hpp +++ 
b/modules/cudacodec/src/precomp.hpp @@ -82,7 +82,6 @@ #include "frame_queue.hpp" #include "video_decoder.hpp" #include "video_parser.hpp" - #include #endif #if defined(HAVE_NVCUVENC) #include diff --git a/modules/cudacodec/src/video_decoder.cpp b/modules/cudacodec/src/video_decoder.cpp index 10008d9b033..e156e25a705 100644 --- a/modules/cudacodec/src/video_decoder.cpp +++ b/modules/cudacodec/src/video_decoder.cpp @@ -45,6 +45,7 @@ #ifdef HAVE_NVCUVID +#if (CUDART_VERSION < 9000) static const char* GetVideoChromaFormatString(cudaVideoChromaFormat eChromaFormat) { static struct { cudaVideoChromaFormat eChromaFormat; @@ -61,6 +62,7 @@ static const char* GetVideoChromaFormatString(cudaVideoChromaFormat eChromaForma } return "Unknown"; } +#endif void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat) { @@ -68,16 +70,30 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat) AutoLock autoLock(mtx_); videoFormat_ = videoFormat; } - const cudaVideoCodec _codec = static_cast(videoFormat.codec); - const cudaVideoChromaFormat _chromaFormat = static_cast(videoFormat.chromaFormat); + const cudaVideoCodec _codec = static_cast(videoFormat_.codec); + const cudaVideoChromaFormat _chromaFormat = static_cast(videoFormat_.chromaFormat); + + cudaVideoSurfaceFormat surfaceFormat = cudaVideoSurfaceFormat_NV12; +#if (CUDART_VERSION < 9000) if (videoFormat.nBitDepthMinus8 > 0) { - std::ostringstream warning; - warning << "NV12 (8 bit luma, 4 bit chroma) is currently the only supported decoder output format. Video input is " << videoFormat.nBitDepthMinus8 + 8 << " bit " \ + std::ostringstream warning; + warning << "NV12 (8 bit luma, 4 bit chroma) is currently the only supported decoder output format. Video input is " << videoFormat.nBitDepthMinus8 + 8 << " bit " \ << std::string(GetVideoChromaFormatString(_chromaFormat)) << ". 
Truncating luma to 8 bits"; if (videoFormat.chromaFormat != YUV420) warning << " and chroma to 4 bits"; CV_LOG_WARNING(NULL, warning.str()); } +#else + if (_chromaFormat == cudaVideoChromaFormat_420 || _chromaFormat == cudaVideoChromaFormat_Monochrome) + surfaceFormat = videoFormat_.nBitDepthMinus8 ? cudaVideoSurfaceFormat_P016 : cudaVideoSurfaceFormat_NV12; + else if (_chromaFormat == cudaVideoChromaFormat_444) + surfaceFormat = videoFormat_.nBitDepthMinus8 ? cudaVideoSurfaceFormat_YUV444_16Bit : cudaVideoSurfaceFormat_YUV444; + else if (_chromaFormat == cudaVideoChromaFormat_422) { + surfaceFormat = videoFormat_.nBitDepthMinus8 ? cudaVideoSurfaceFormat_P016 : cudaVideoSurfaceFormat_NV12; + CV_LOG_WARNING(NULL, "YUV 4:2:2 is not currently supported, falling back to YUV 4:2:0."); + } +#endif + const cudaVideoCreateFlags videoCreateFlags = (_codec == cudaVideoCodec_JPEG || _codec == cudaVideoCodec_MPEG2) ? cudaVideoCreate_PreferCUDA : cudaVideoCreate_PreferCUVID; @@ -123,9 +139,25 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat) cuSafeCall(cuCtxPushCurrent(ctx_)); cuSafeCall(cuvidGetDecoderCaps(&decodeCaps)); cuSafeCall(cuCtxPopCurrent(NULL)); - if (!(decodeCaps.bIsSupported && (decodeCaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12)))) { - CV_Error(Error::StsUnsupportedFormat, "Video source is not supported by hardware video decoder refer to Nvidia's GPU Support Matrix to confirm your GPU supports hardware decoding of the video source's codec."); + + if (!decodeCaps.bIsSupported) { + CV_Error(Error::StsUnsupportedFormat, "Video codec is not supported by this GPU hardware video decoder refer to Nvidia's GPU Support Matrix to confirm your GPU supports hardware decoding of the video source's codec."); + } + + if (!(decodeCaps.nOutputFormatMask & (1 << surfaceFormat))) + { + if (decodeCaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12)) + surfaceFormat = cudaVideoSurfaceFormat_NV12; + else if (decodeCaps.nOutputFormatMask & (1 <<
cudaVideoSurfaceFormat_P016)) + surfaceFormat = cudaVideoSurfaceFormat_P016; + else if (decodeCaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444)) + surfaceFormat = cudaVideoSurfaceFormat_YUV444; + else if (decodeCaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444_16Bit)) + surfaceFormat = cudaVideoSurfaceFormat_YUV444_16Bit; + else + CV_Error(Error::StsUnsupportedFormat, "No supported output format found"); } + videoFormat_.surfaceFormat = static_cast(surfaceFormat); if (videoFormat.enableHistogram) { if (!decodeCaps.bIsHistogramSupported) { @@ -168,7 +200,7 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat) createInfo_.ulHeight = videoFormat.ulHeight; createInfo_.ulNumDecodeSurfaces = videoFormat.ulNumDecodeSurfaces; createInfo_.ChromaFormat = _chromaFormat; - createInfo_.OutputFormat = cudaVideoSurfaceFormat_NV12; + createInfo_.OutputFormat = surfaceFormat; createInfo_.DeinterlaceMode = static_cast(videoFormat.deinterlaceMode); createInfo_.ulTargetWidth = videoFormat.width; createInfo_.ulTargetHeight = videoFormat.height; diff --git a/modules/cudacodec/src/video_decoder.hpp b/modules/cudacodec/src/video_decoder.hpp index bea15369011..f77d288051b 100644 --- a/modules/cudacodec/src/video_decoder.hpp +++ b/modules/cudacodec/src/video_decoder.hpp @@ -103,7 +103,9 @@ class VideoDecoder cuSafeCall( cuvidMapVideoFrame(decoder_, picIdx, &ptr, &pitch, &videoProcParams) ); - return cuda::GpuMat(targetHeight() * 3 / 2, targetWidth(), CV_8UC1, (void*) ptr, pitch); + const int height = (videoFormat_.surfaceFormat == cudaVideoSurfaceFormat_NV12 || videoFormat_.surfaceFormat == cudaVideoSurfaceFormat_P016) ? targetHeight() * 3 / 2 : targetHeight() * 3; + const int type = (videoFormat_.surfaceFormat == cudaVideoSurfaceFormat_NV12 || videoFormat_.surfaceFormat == cudaVideoSurfaceFormat_YUV444) ? 
CV_8U : CV_16U; + return cuda::GpuMat(height, targetWidth(), type, (void*) ptr, pitch); } void unmapFrame(cuda::GpuMat& frame) diff --git a/modules/cudacodec/src/video_parser.cpp b/modules/cudacodec/src/video_parser.cpp index 1aba16d585e..597845f01dd 100644 --- a/modules/cudacodec/src/video_parser.cpp +++ b/modules/cudacodec/src/video_parser.cpp @@ -119,6 +119,7 @@ int CUDAAPI cv::cudacodec::detail::VideoParser::HandleVideoSequence(void* userDa FormatInfo newFormat; newFormat.videoFullRangeFlag = format->video_signal_description.video_full_range_flag; + newFormat.colorSpaceStandard = static_cast(format->video_signal_description.matrix_coefficients); newFormat.codec = static_cast(format->codec); newFormat.chromaFormat = static_cast(format->chroma_format); newFormat.nBitDepthMinus8 = format->bit_depth_luma_minus8; diff --git a/modules/cudacodec/src/video_reader.cpp b/modules/cudacodec/src/video_reader.cpp index 5bf9aac91ed..28bbc113163 100644 --- a/modules/cudacodec/src/video_reader.cpp +++ b/modules/cudacodec/src/video_reader.cpp @@ -55,51 +55,6 @@ void cv::cudacodec::MapHist(const GpuMat&, Mat&) { throw_no_cuda(); } #else // HAVE_NVCUVID void nv12ToBgra(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, const bool videoFullRangeFlag, cudaStream_t stream); -bool ValidColorFormat(const ColorFormat colorFormat); - -void cvtFromNv12(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int height, const ColorFormat colorFormat, const bool videoFullRangeFlag, - Stream stream) -{ - CV_Assert(decodedFrame.cols == width && decodedFrame.rows == height * 1.5f); - if (colorFormat == ColorFormat::BGRA) { - nv12ToBgra(decodedFrame, outFrame, width, height, videoFullRangeFlag, StreamAccessor::getStream(stream)); - } - else if (colorFormat == ColorFormat::BGR) { - outFrame.create(height, width, CV_8UC3); - Npp8u* pSrc[2] = { decodedFrame.data, &decodedFrame.data[decodedFrame.step * height] }; - NppiSize oSizeROI = { width,height }; - 
cv::cuda::NppStreamHandler h(stream); -#if USE_NPP_STREAM_CTX - if (videoFullRangeFlag) - nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, h)); - else { -#if (CUDART_VERSION < 11000) - nppSafeCall(nppiNV12ToBGR_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, h)); -#else - nppSafeCall(nppiNV12ToBGR_709CSC_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, h)); -#endif - } -#else - if (videoFullRangeFlag) - nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI)); - else { - nppSafeCall(nppiNV12ToBGR_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI)); - } -#endif - } - else if (colorFormat == ColorFormat::GRAY) { - outFrame.create(height, width, CV_8UC1); - if(videoFullRangeFlag) - cudaSafeCall(cudaMemcpy2DAsync(outFrame.ptr(), outFrame.step, decodedFrame.ptr(), decodedFrame.step, width, height, cudaMemcpyDeviceToDevice, StreamAccessor::getStream(stream))); - else { - cv::cuda::subtract(decodedFrame(Rect(0,0,width,height)), 16, outFrame, noArray(), CV_8U, stream); - cv::cuda::multiply(outFrame, 255.0f / 219.0f, outFrame, 1.0, CV_8U, stream); - } - } - else if (colorFormat == ColorFormat::NV_NV12) { - decodedFrame.copyTo(outFrame, stream); - } -} using namespace cv::cudacodec::detail; @@ -124,7 +79,7 @@ namespace bool set(const VideoReaderProps propertyId, const double propertyVal) CV_OVERRIDE; - bool set(const ColorFormat colorFormat_) CV_OVERRIDE; + bool set(const ColorFormat colorFormat, const BitDepth bitDepth = BitDepth::UNCHANGED, const bool planar = false) CV_OVERRIDE; bool get(const VideoReaderProps propertyId, double& propertyVal) const CV_OVERRIDE; bool getVideoReaderProps(const VideoReaderProps propertyId, double& propertyValOut, double propertyValIn) const CV_OVERRIDE; @@ -137,6 +92,7 @@ namespace void releaseFrameInfo(const std::pair& frameInfo); bool 
internalGrab(GpuMat & frame, GpuMat & histogram, Stream & stream); void waitForDecoderInit(); + void cvtFromYuv(const GpuMat& decodedFrame, GpuMat& outFrame, const SurfaceFormat surfaceFormat, const bool videoFullRangeFlag, Stream& stream); Ptr videoSource_; @@ -152,7 +108,10 @@ namespace static const int decodedFrameIdx = 0; static const int extraDataIdx = 1; static const int rawPacketsBaseIdx = 2; + Ptr yuvConverter = 0; ColorFormat colorFormat = ColorFormat::BGRA; + BitDepth bitDepth = BitDepth::UNCHANGED; + bool planar = false; static const String errorMsg; int iFrame = 0; }; @@ -191,9 +150,17 @@ namespace videoSource_->setVideoParser(videoParser_); videoSource_->start(); waitForDecoderInit(); + FormatInfo format = videoDecoder_->format(); + if (format.colorSpaceStandard == ColorSpaceStandard::Unspecified) { + if (format.width > 1280 || format.height > 720) + format.colorSpaceStandard = ColorSpaceStandard::BT709; + else + format.colorSpaceStandard = ColorSpaceStandard::BT601; + } + yuvConverter = createNVSurfaceToColorConverter(format.colorSpaceStandard, format.videoFullRangeFlag); for(iFrame = videoSource_->getFirstFrameIdx(); iFrame < firstFrameIdx; iFrame++) CV_Assert(skipFrame()); - videoSource_->updateFormat(videoDecoder_->format()); + videoSource_->updateFormat(format); } VideoReaderImpl::~VideoReaderImpl() @@ -287,14 +254,13 @@ namespace // map decoded video frame to CUDA surface GpuMat decodedFrame = videoDecoder_->mapFrame(frameInfo.first.picture_index, frameInfo.second); - cvtFromNv12(decodedFrame, frame, videoDecoder_->targetWidth(), videoDecoder_->targetHeight(), colorFormat, videoDecoder_->format().videoFullRangeFlag, stream); - if (fmt.enableHistogram) { const size_t histogramSz = 4 * fmt.nMaxHistogramBins; histogram.create(1, fmt.nMaxHistogramBins, CV_32S); cuSafeCall(cuMemcpyDtoDAsync((CUdeviceptr)(histogram.data), cuHistogramPtr, histogramSz, StreamAccessor::getStream(stream))); } + cvtFromYuv(decodedFrame, frame, 
videoDecoder_->format().surfaceFormat, videoDecoder_->format().videoFullRangeFlag, stream); // unmap video frame // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding) videoDecoder_->unmapFrame(decodedFrame); @@ -350,23 +316,21 @@ } bool ValidColorFormat(const ColorFormat colorFormat) { - if (colorFormat == ColorFormat::BGRA || colorFormat == ColorFormat::BGR || colorFormat == ColorFormat::GRAY || colorFormat == ColorFormat::NV_NV12) + if (colorFormat == ColorFormat::BGRA || colorFormat == ColorFormat::BGR || colorFormat == ColorFormat::RGB || colorFormat == ColorFormat::RGBA || colorFormat == ColorFormat::GRAY || colorFormat == ColorFormat::NV_YUV_SURFACE_FORMAT || colorFormat == ColorFormat::NV_YUV444) return true; return false; } - bool VideoReaderImpl::set(const ColorFormat colorFormat_) { - if (!ValidColorFormat(colorFormat_)) return false; - if (colorFormat_ == ColorFormat::BGR) { -#if (CUDART_VERSION < 9020) - CV_LOG_DEBUG(NULL, "ColorFormat::BGR is not supported until CUDA 9.2, use default ColorFormat::BGRA."); - return false; -#elif (CUDART_VERSION < 11000) - if (!videoDecoder_->format().videoFullRangeFlag) - CV_LOG_INFO(NULL, "Color reproduction may be inaccurate due CUDA version <= 11.0, for better results upgrade CUDA runtime or try ColorFormat::BGRA."); -#endif + bool VideoReaderImpl::set(const ColorFormat colorFormat_, const BitDepth bitDepth_, const bool planar_) { + ColorFormat tmpFormat = colorFormat_; + if (tmpFormat == ColorFormat::NV_NV12) { + CV_LOG_WARNING(NULL, "ColorFormat::NV_NV12 is deprecated, forcing ColorFormat::NV_YUV_SURFACE_FORMAT instead."); + tmpFormat = ColorFormat::NV_YUV_SURFACE_FORMAT; } - colorFormat = colorFormat_; + if (!ValidColorFormat(tmpFormat)) return false; + colorFormat = tmpFormat; + bitDepth = bitDepth_; + planar = planar_; return true; } @@ -410,6 +374,12 @@ case VideoReaderProps::PROP_COLOR_FORMAT: propertyVal = static_cast(colorFormat); return true; + 
case VideoReaderProps::PROP_BIT_DEPTH: + propertyVal = static_cast(bitDepth); + return true; + case VideoReaderProps::PROP_PLANAR: + propertyVal = static_cast(planar); + return true; default: break; } @@ -443,6 +413,15 @@ namespace return false; return true; } + + void VideoReaderImpl::cvtFromYuv(const GpuMat& decodedFrame, GpuMat& outFrame, const SurfaceFormat surfaceFormat, const bool videoFullRangeFlag, Stream& stream) + { + if (colorFormat == ColorFormat::NV_YUV_SURFACE_FORMAT) { + decodedFrame.copyTo(outFrame, stream); + return; + } + yuvConverter->convert(decodedFrame, outFrame, surfaceFormat, colorFormat, bitDepth, planar, videoFullRangeFlag, stream); + } } Ptr cv::cudacodec::createVideoReader(const String& filename, const std::vector& sourceParams, const VideoReaderInitParams params) diff --git a/modules/cudacodec/test/test_video.cpp b/modules/cudacodec/test/test_video.cpp index 003fbb7358e..1158a3f6201 100644 --- a/modules/cudacodec/test/test_video.cpp +++ b/modules/cudacodec/test/test_video.cpp @@ -45,6 +45,10 @@ namespace opencv_test { namespace { #if defined(HAVE_NVCUVID) || defined(HAVE_NVCUVENC) +CV_ENUM(ColorFormats, cudacodec::ColorFormat::BGR, cudacodec::ColorFormat::BGRA, cudacodec::ColorFormat::RGB, cudacodec::ColorFormat::RGBA, cudacodec::ColorFormat::GRAY) +CV_ENUM(SurfaceFormats, cudacodec::SurfaceFormat::SF_NV12, cudacodec::SurfaceFormat::SF_P016, cudacodec::SurfaceFormat::SF_YUV444, cudacodec::SurfaceFormat::SF_YUV444_16Bit) +CV_ENUM(BitDepths, cudacodec::BitDepth::UNCHANGED, cudacodec::BitDepth::EIGHT, cudacodec::BitDepth::SIXTEEN) + struct SetDevice : testing::TestWithParam { cv::cuda::DeviceInfo devInfo; @@ -76,7 +80,19 @@ PARAM_TEST_CASE(Video, cv::cuda::DeviceInfo, std::string) }; typedef tuple color_conversion_params_t; -PARAM_TEST_CASE(ColorConversion, cv::cuda::DeviceInfo, cv::cudacodec::ColorFormat, color_conversion_params_t) +PARAM_TEST_CASE(ColorConversionLumaChromaRange, cv::cuda::DeviceInfo, color_conversion_params_t) +{ +}; + 
+PARAM_TEST_CASE(ColorConversionFormat, cv::cuda::DeviceInfo, ColorFormats) +{ +}; + +struct ColorConversionPlanar : SetDevice +{ +}; + +PARAM_TEST_CASE(ColorConversionBitdepth, cv::cuda::DeviceInfo, BitDepths) { }; @@ -117,6 +133,10 @@ struct Seek : SetDevice { }; +PARAM_TEST_CASE(YuvConverter, cv::cuda::DeviceInfo, SurfaceFormats, ColorFormats, BitDepths, bool, bool) +{ +}; + #if defined(HAVE_NVCUVID) ////////////////////////////////////////////////////// // VideoReader @@ -300,38 +320,35 @@ CUDA_TEST_P(Video, Reader) {cudacodec::ColorFormat::GRAY,1}, {cudacodec::ColorFormat::BGR,3}, {cudacodec::ColorFormat::BGRA,4}, - {cudacodec::ColorFormat::NV_NV12,1} + {cudacodec::ColorFormat::NV_YUV_SURFACE_FORMAT,1} }; std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + relativeFilePath; cv::Ptr reader = cv::cudacodec::createVideoReader(inputFile); - ASSERT_FALSE(reader->set(cudacodec::ColorFormat::RGB)); cv::cudacodec::FormatInfo fmt = reader->format(); cv::cuda::GpuMat frame; for (int i = 0; i < 10; i++) { - // request a different colour format for each frame const std::pair< cudacodec::ColorFormat, int>& formatToChannels = formatsToChannels[i % formatsToChannels.size()]; ASSERT_TRUE(reader->set(formatToChannels.first)); double colorFormat; ASSERT_TRUE(reader->get(cudacodec::VideoReaderProps::PROP_COLOR_FORMAT, colorFormat) && static_cast(colorFormat) == formatToChannels.first); ASSERT_TRUE(reader->nextFrame(frame)); - const int height = formatToChannels.first == cudacodec::ColorFormat::NV_NV12 ? static_cast(1.5 * fmt.height) : fmt.height; + const int height = formatToChannels.first == cudacodec::ColorFormat::NV_YUV_SURFACE_FORMAT ? 
static_cast(1.5 * fmt.height) : fmt.height; ASSERT_TRUE(frame.cols == fmt.width && frame.rows == height); ASSERT_FALSE(frame.empty()); ASSERT_TRUE(frame.channels() == formatToChannels.second); } } -CUDA_TEST_P(ColorConversion, Reader) +CUDA_TEST_P(ColorConversionLumaChromaRange, Reader) { cv::cuda::setDevice(GET_PARAM(0).deviceID()); - const cv::cudacodec::ColorFormat colorFormat = GET_PARAM(1); - const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + get<0>(GET_PARAM(2)); - const bool videoFullRangeFlag = get<1>(GET_PARAM(2)); + const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + get<0>(GET_PARAM(1)); + const bool videoFullRangeFlag = get<1>(GET_PARAM(1)); cv::Ptr reader = cv::cudacodec::createVideoReader(inputFile); cv::cudacodec::FormatInfo fmt = reader->format(); - reader->set(colorFormat); + reader->set(cudacodec::ColorFormat::BGR); cv::VideoCapture cap(inputFile); cv::cuda::GpuMat frame; @@ -343,11 +360,120 @@ CUDA_TEST_P(ColorConversion, Reader) cap.read(frameHost); fmt = reader->format(); ASSERT_TRUE(fmt.videoFullRangeFlag == videoFullRangeFlag); - if (colorFormat == cv::cudacodec::ColorFormat::BGRA) - cv::cvtColor(frameHost, frameHostGs, COLOR_BGR2BGRA); - else + frameHostGs = frameHost; + EXPECT_MAT_NEAR(frameHostGs, frameFromDevice, 2); + } +} + +CUDA_TEST_P(ColorConversionFormat, Reader) +{ + const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../highgui/video/big_buck_bunny.h264"; + cv::cuda::setDevice(GET_PARAM(0).deviceID()); + const cudacodec::ColorFormat colorFormat = static_cast(static_cast(GET_PARAM(1))); + cv::Ptr reader = cv::cudacodec::createVideoReader(inputFile); + double colorFormatGetVal; + ASSERT_TRUE(reader->get(cudacodec::VideoReaderProps::PROP_COLOR_FORMAT, colorFormatGetVal)); + ASSERT_EQ(cudacodec::ColorFormat::BGRA, static_cast(colorFormatGetVal)); + reader->set(colorFormat); + 
ASSERT_TRUE(reader->get(cudacodec::VideoReaderProps::PROP_COLOR_FORMAT, colorFormatGetVal)); + ASSERT_EQ(colorFormat, static_cast(colorFormatGetVal)); + cv::VideoCapture cap(inputFile); + + int maxDiff = 2; + cv::cuda::GpuMat frame; + Mat frameHost, frameHostGs, frameFromDevice, unused; + for (int i = 0; i < 10; i++) + { + reader->nextFrame(frame); + frame.download(frameFromDevice); + cap.read(frameHost); + switch (colorFormat) + { + case cudacodec::ColorFormat::BGRA: + cv::cvtColor(frameHost, frameHostGs, cv::COLOR_BGR2BGRA); + break; + case cudacodec::ColorFormat::RGB: + cv::cvtColor(frameHost, frameHostGs, cv::COLOR_BGR2RGB); + break; + case cudacodec::ColorFormat::RGBA: + cv::cvtColor(frameHost, frameHostGs, cv::COLOR_BGR2RGBA); + break; + case cudacodec::ColorFormat::GRAY: + cv::cvtColor(frameHost, frameHostGs, cv::COLOR_BGR2GRAY); + // Increased error because of different conversion pipelines. i.e. frameFromDevice (NV12 -> GRAY) and frameHostGs (NV12 -> BGR -> GRAY). Due to 420 subsampling NV12 -> BGR can increase the luminance of neighbouring pixels if they are significantly different to each other meaning the subsequent conversion BGR -> GRAY will be different to the direct NV12 -> GRAY conversion. 
+ maxDiff = 15; + break; + default: frameHostGs = frameHost; - EXPECT_MAT_NEAR(frameHostGs, frameFromDevice, 2.0); + } + EXPECT_MAT_NEAR(frameHostGs, frameFromDevice, maxDiff); + } +} + +CUDA_TEST_P(ColorConversionPlanar, Reader) +{ + const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../highgui/video/big_buck_bunny.h264"; + cv::Ptr reader = cv::cudacodec::createVideoReader(inputFile); + double planarGetVal; + ASSERT_TRUE(reader->get(cudacodec::VideoReaderProps::PROP_PLANAR, planarGetVal)); + ASSERT_FALSE(static_cast(planarGetVal)); + reader->set(cudacodec::ColorFormat::BGR, cudacodec::BitDepth::UNCHANGED, true); + ASSERT_TRUE(reader->get(cudacodec::VideoReaderProps::PROP_PLANAR, planarGetVal)); + ASSERT_TRUE(static_cast(planarGetVal)); + cv::VideoCapture cap(inputFile); + + cv::cuda::GpuMat frame; + Mat frameHost, frameHostGs, frameFromDevice; + for (int i = 0; i < 10; i++) + { + reader->nextFrame(frame); + frame.download(frameFromDevice); + cap.read(frameHost); + Mat bgrSplit[3]; + cv::split(frameHost, bgrSplit); + if(i == 0) + frameHostGs = Mat(frameHost.rows * 3, frameHost.cols, CV_8U); + bgrSplit[0].copyTo(frameHostGs(Rect(0, 0, frameHost.cols, frameHost.rows))); + bgrSplit[1].copyTo(frameHostGs(Rect(0, frameHost.rows, frameHost.cols, frameHost.rows))); + bgrSplit[2].copyTo(frameHostGs(Rect(0, 2 * frameHost.rows, frameHost.cols, frameHost.rows))); + EXPECT_MAT_NEAR(frameHostGs, frameFromDevice, 2); + } +} + +CUDA_TEST_P(ColorConversionBitdepth, Reader) +{ + const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../highgui/video/big_buck_bunny.h264"; + cv::cuda::setDevice(GET_PARAM(0).deviceID()); + const cudacodec::BitDepth bitDepth = static_cast(static_cast(GET_PARAM(1))); + cv::Ptr reader = cv::cudacodec::createVideoReader(inputFile); + double bitDepthGetVal; + ASSERT_TRUE(reader->get(cudacodec::VideoReaderProps::PROP_BIT_DEPTH, bitDepthGetVal)); + ASSERT_EQ(cudacodec::BitDepth::UNCHANGED, 
static_cast(bitDepthGetVal)); + reader->set(cudacodec::ColorFormat::BGR, bitDepth); + ASSERT_TRUE(reader->get(cudacodec::VideoReaderProps::PROP_BIT_DEPTH, bitDepthGetVal)); + ASSERT_EQ(bitDepth, static_cast(bitDepthGetVal)); + cv::VideoCapture cap(inputFile); + + int maxDiff = 2; + cv::cuda::GpuMat frame; + Mat frameHost, frameHostGs, frameFromDevice; + for (int i = 0; i < 10; i++) + { + reader->nextFrame(frame); + frame.download(frameFromDevice); + cap.read(frameHost); + switch (bitDepth) + { + case cudacodec::BitDepth::EIGHT: + default: + frameHostGs = frameHost; + break; + case cudacodec::BitDepth::SIXTEEN: + frameHost.convertTo(frameHostGs, CV_16U); + frameHostGs *= pow(2, 8); + maxDiff = 512; + } + EXPECT_MAT_NEAR(frameHostGs, frameFromDevice, maxDiff); } } @@ -637,24 +763,259 @@ CUDA_TEST_P(Seek, Reader) ASSERT_EQ(iFrame, static_cast(firstFrameIdx+1)); } + +void inline GetConstants(float& wr, float& wb, int& black, int& white, int& uvWhite, int& max, bool fullRange = false) { + if (fullRange) { + black = 0; white = 255; uvWhite = 255; + } + else { + black = 16; white = 235; uvWhite = 240; + } + max = 255; + wr = 0.2990f; wb = 0.1140f; +} + +std::array, 3> getYuv2RgbMatrix(const bool fullRange = false) { + float wr, wb; + int black, white, uvWhite, max; + GetConstants(wr, wb, black, white, uvWhite, max, fullRange); + std::array, 3> mat = { { + {1.0f, 0.0f, (1.0f - wr) / 0.5f}, + {1.0f, -wb * (1.0f - wb) / 0.5f / (1 - wb - wr), -wr * (1 - wr) / 0.5f / (1 - wb - wr)}, + {1.0f, (1.0f - wb) / 0.5f, 0.0f}, + } }; + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + if (j == 0) + mat[i][j] = (float)(1.0 * max / (white - black) * mat[i][j]); + else + mat[i][j] = (float)(1.0 * max / (uvWhite - black) * mat[i][j]); + } + } + return mat; +} + +std::array, 3> getRgb2YuvMatrix(const bool fullRange = false) { + float wr, wb; + int black, white, max, uvWhite; + GetConstants(wr, wb, black, white, uvWhite, max, fullRange); + std::array, 3> mat = { { + {wr, 1.0f - 
wb - wr, wb}, + {-0.5f * wr / (1.0f - wb), -0.5f * (1 - wb - wr) / (1.0f - wb), 0.5f}, + {0.5f, -0.5f * (1.0f - wb - wr) / (1.0f - wr), -0.5f * wb / (1.0f - wr)}, + } }; + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + if (j == 0) + mat[i][j] = (float)(1.0 * (white - black) / max * mat[i][j]); + else + mat[i][j] = (float)(1.0 * (uvWhite - black) / max * mat[i][j]); + } + } + return mat; +} + +void generateGray(Mat bgr, Mat& y, Mat& grayFromY, const bool fullRange) { + Mat yuvI420; + cv::cvtColor(bgr, yuvI420, COLOR_BGR2YUV_I420); + yuvI420(Rect(0, 0, bgr.cols, bgr.rows)).copyTo(y); + if (fullRange) { + y -= 16; + y *= 255.0 / 219.0; + } + y.copyTo(grayFromY); + if (!fullRange) { + grayFromY -= 16; + grayFromY *= 255.0 / 219.0; + } +} + +void generateNv12(Mat bgr, Mat& nv12Interleaved, Mat& bgrFromYuv, const bool fullRange) { + Mat yuvI420; + cv::cvtColor(bgr, yuvI420, COLOR_BGR2YUV_I420); + cv::cvtColor(yuvI420, bgrFromYuv, COLOR_YUV2BGR_I420); + + Mat uv = yuvI420(Rect(0, bgr.rows, bgr.cols, bgr.rows / 2)); + Mat u0 = uv(Rect(0, 0, uv.cols / 2, uv.rows / 2)); + Mat u1 = uv(Rect(uv.cols / 2, 0, uv.cols / 2, uv.rows / 2)); + Mat v0 = uv(Rect(0, uv.rows / 2, uv.cols / 2, uv.rows / 2)); + Mat v1 = uv(Rect(uv.cols / 2, uv.rows / 2, uv.cols / 2, uv.rows / 2)); + + Mat u(uv.rows, uv.cols / 2, CV_8U); + Mat ur0(u0.rows, u0.cols, CV_8U, u.data, u0.cols * 2); + Mat ur1(u0.rows, u0.cols, CV_8U, u.data + u0.cols, u0.cols * 2); + u0.copyTo(ur0); + u1.copyTo(ur1); + + Mat v(uv.rows, uv.cols / 2, CV_8U); + Mat vr0(v0.rows, v0.cols, CV_8U, v.data, v0.cols * 2); + Mat vr1(v0.rows, v0.cols, CV_8U, v.data + v0.cols, v0.cols * 2); + v0.copyTo(vr0); + v1.copyTo(vr1); + + Mat uv2Channel; + Mat uvArray[2] = { u,v }; + cv::merge(uvArray, 2, uv2Channel); + + Mat y = yuvI420(Rect(0, 0, bgr.cols, bgr.rows)); + Mat uvInterleaved(uv2Channel.rows, uv2Channel.cols * 2, CV_8U, uv2Channel.data, uv2Channel.step[0]); + + if (fullRange) { + Mat y32F; + y = (y - 16) * 255.0 / 219.0; 
+ uvInterleaved = (uvInterleaved - 128) * 255.0 / 224.0 + 128; + } + + nv12Interleaved = Mat(yuvI420.size(), CV_8UC1); + y.copyTo(nv12Interleaved(Rect(0, 0, bgr.cols, bgr.rows))); + uvInterleaved.copyTo(nv12Interleaved(Rect(0, bgr.rows, uvInterleaved.cols, uvInterleaved.rows))); +} + +void generateYuv444(Mat bgr, Mat& yuv444, Mat& bgrFromYuv, const bool fullRange) { + std::array, 3> matrix = getRgb2YuvMatrix(fullRange); + const int yAdj = fullRange ? 0 : 16, uvAdj = 128; + Mat bgr32F; + bgr.convertTo(bgr32F, CV_32F); + Mat bgrSplit32F[3]; + cv::split(bgr32F, bgrSplit32F); + Mat yuv32 = Mat(bgr.rows * 3, bgr.cols, CV_32F); + Mat Y = matrix[0][0] * bgrSplit32F[2] + matrix[0][1] * bgrSplit32F[1] + matrix[0][2] * bgrSplit32F[0] + yAdj; + Y.copyTo(yuv32(Rect(0, 0, bgr.cols, bgr.rows))); + Mat U = matrix[1][0] * bgrSplit32F[2] + matrix[1][1] * bgrSplit32F[1] + matrix[1][2] * bgrSplit32F[0] + uvAdj; + U.copyTo(yuv32(Rect(0, bgr.rows, bgr.cols, bgr.rows))); + Mat V = matrix[2][0] * bgrSplit32F[2] + matrix[2][1] * bgrSplit32F[1] + matrix[2][2] * bgrSplit32F[0] + uvAdj; + V.copyTo(yuv32(Rect(0, 2 * bgr.rows, bgr.cols, bgr.rows))); + yuv32.convertTo(yuv444, CV_8UC1); + + Mat y8 = yuv444(Rect(0, 0, bgr.cols, bgr.rows)); + Mat u8 = yuv444(Rect(0, bgr.rows, bgr.cols, bgr.rows)); + Mat v8 = yuv444(Rect(0, 2 * bgr.rows, bgr.cols, bgr.rows)); + y8.convertTo(Y, CV_32F); + u8.convertTo(U, CV_32F); + v8.convertTo(V, CV_32F); + + if (!fullRange) Y -= 16; + U -= 128; + V -= 128; + matrix = getYuv2RgbMatrix(fullRange); + Mat bgrFromYuvSplit32F[3]; + bgrFromYuvSplit32F[0] = matrix[2][0] * Y + matrix[2][1] * U; + bgrFromYuvSplit32F[1] = matrix[1][0] * Y + matrix[1][1] * U + matrix[1][2] * V; + bgrFromYuvSplit32F[2] = matrix[0][0] * Y + matrix[0][2] * V; + Mat bgrFromYuv32F; + cv::merge(bgrFromYuvSplit32F, 3, bgrFromYuv32F); + bgrFromYuv32F.convertTo(bgrFromYuv, CV_8UC3); +} + +void generateTestImages(Mat bgrIn, Mat& testImg, Mat& out, const cudacodec::SurfaceFormat inputFormat, const 
cudacodec::ColorFormat outputFormat, const cudacodec::BitDepth outputBitDepth = cudacodec::BitDepth::EIGHT, bool planar = false, const bool fullRange = false) { + Mat imgOutFromYuv, imgOut8; + Mat yuv8; + + switch (inputFormat) { + case cudacodec::SurfaceFormat::SF_NV12: + case cudacodec::SurfaceFormat::SF_P016: + if (outputFormat == cudacodec::ColorFormat::GRAY) { + yuv8 = Mat(static_cast(bgrIn.rows * 1.5), bgrIn.cols, CV_8U); + Mat y = yuv8(Rect(0, 0, bgrIn.cols, bgrIn.rows)); + generateGray(bgrIn, y, imgOutFromYuv, fullRange); + } + else + generateNv12(bgrIn, yuv8, imgOutFromYuv, fullRange); + break; + case cudacodec::SurfaceFormat::SF_YUV444: + case cudacodec::SurfaceFormat::SF_YUV444_16Bit: + if (outputFormat == cudacodec::ColorFormat::GRAY) { + yuv8 = Mat(bgrIn.rows * 3, bgrIn.cols, CV_8U); + Mat y = yuv8(Rect(0, 0, bgrIn.cols, bgrIn.rows)); + generateGray(bgrIn, y, imgOutFromYuv, fullRange); + } + else + generateYuv444(bgrIn, yuv8, imgOutFromYuv, fullRange); + break; + } + + if (inputFormat == cudacodec::SurfaceFormat::SF_P016 || inputFormat == cudacodec::SurfaceFormat::SF_YUV444_16Bit) { + yuv8.convertTo(testImg, CV_16U); + testImg *= pow(2, 8); + } + else + yuv8.copyTo(testImg); + + switch (outputFormat) { + case cudacodec::ColorFormat::BGR: + imgOut8 = imgOutFromYuv; + break; + case cudacodec::ColorFormat::BGRA: { + cv::cvtColor(imgOutFromYuv, imgOut8, COLOR_BGR2BGRA); + break; + } + case cudacodec::ColorFormat::RGB: { + cv::cvtColor(imgOutFromYuv, imgOut8, COLOR_BGR2RGB); + break; + } + case cudacodec::ColorFormat::RGBA: { + cv::cvtColor(imgOutFromYuv, imgOut8, COLOR_BGR2RGBA); + break; + } + case cudacodec::ColorFormat::GRAY: { + imgOut8 = imgOutFromYuv; + break; + } + } + + Mat imgOutBitDepthOut; + if (outputBitDepth == cudacodec::BitDepth::SIXTEEN) { + imgOut8.convertTo(imgOutBitDepthOut, CV_16U); + imgOutBitDepthOut *= pow(2, 8); + } + else + imgOutBitDepthOut = imgOut8; + + if (planar && outputFormat != cudacodec::ColorFormat::GRAY) { + Mat* 
bgrSplit = new Mat[imgOutBitDepthOut.channels()]; + cv::split(imgOutBitDepthOut, bgrSplit); + const int type = CV_MAKE_TYPE(CV_MAT_DEPTH(imgOutBitDepthOut.flags), 1); + out = Mat(imgOutBitDepthOut.rows * imgOutBitDepthOut.channels(), imgOutBitDepthOut.cols, type); + for (int i = 0; i < imgOut8.channels(); i++) + bgrSplit[i].copyTo(out(Rect(0, i * imgOut8.rows, imgOut8.cols, imgOut8.rows))); + delete[] bgrSplit; + } + else + imgOutBitDepthOut.copyTo(out); +} + +CUDA_TEST_P(YuvConverter, Reader) +{ + cv::cuda::setDevice(GET_PARAM(0).deviceID()); + const cudacodec::SurfaceFormat surfaceFormat = static_cast(static_cast(GET_PARAM(1))); + const cudacodec::ColorFormat outputFormat = static_cast(static_cast(GET_PARAM(2))); + const cudacodec::BitDepth bitDepth = static_cast(static_cast(GET_PARAM(3))); + const bool planar = GET_PARAM(4); + const bool fullRange = GET_PARAM(5); + std::string imgPath = std::string(cvtest::TS::ptr()->get_data_path()) + "../python/images/baboon.jpg"; + Ptr yuvConverter = cudacodec::createNVSurfaceToColorConverter(cv::cudacodec::ColorSpaceStandard::BT601, fullRange); + Mat bgr = imread(imgPath), bgrHost; + Mat nv12Interleaved, bgrFromYuv; + generateTestImages(bgr, nv12Interleaved, bgrFromYuv, surfaceFormat, outputFormat, bitDepth, planar, fullRange); + GpuMat nv12Device(nv12Interleaved), bgrDevice(bgrFromYuv.size(), bgrFromYuv.type()); + yuvConverter->convert(nv12Device, bgrDevice, surfaceFormat, outputFormat, bitDepth, planar, fullRange); + bgrDevice.download(bgrHost); + EXPECT_MAT_NEAR(bgrFromYuv, bgrHost, bitDepth == cudacodec::BitDepth::EIGHT ? 
2 :512); +} + #endif // HAVE_NVCUVID #if defined(HAVE_NVCUVID) && defined(HAVE_NVCUVENC) -struct TransCode : testing::TestWithParam + +struct H264ToH265 : SetDevice { - cv::cuda::DeviceInfo devInfo; - virtual void SetUp() - { - devInfo = GetParam(); - cv::cuda::setDevice(devInfo.deviceID()); - } }; #if defined(WIN32) // remove when FFmpeg wrapper includes PR25874 #define WIN32_WAIT_FOR_FFMPEG_WRAPPER_UPDATE #endif -CUDA_TEST_P(TransCode, H264ToH265) +CUDA_TEST_P(H264ToH265, Transcode) { const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../highgui/video/big_buck_bunny.h264"; constexpr cv::cudacodec::ColorFormat colorFormat = cv::cudacodec::ColorFormat::NV_NV12; @@ -667,14 +1028,13 @@ CUDA_TEST_P(TransCode, H264ToH265) { cv::Ptr reader = cv::cudacodec::createVideoReader(inputFile); cv::cudacodec::FormatInfo fmt = reader->format(); - reader->set(cudacodec::ColorFormat::NV_NV12); + reader->set(cudacodec::ColorFormat::NV_YUV_SURFACE_FORMAT); cv::Ptr writer; cv::cuda::GpuMat frame; cv::cuda::Stream stream; for (int i = 0; i < nFrames; ++i) { ASSERT_TRUE(reader->nextFrame(frame, stream)); ASSERT_FALSE(frame.empty()); - Mat tst; frame.download(tst); if (writer.empty()) { frameSz = Size(fmt.width, fmt.height); writer = cv::cudacodec::createVideoWriter(outputFile, frameSz, codec, fps, colorFormat, 0, stream); @@ -703,7 +1063,7 @@ CUDA_TEST_P(TransCode, H264ToH265) ASSERT_EQ(0, remove(outputFile.c_str())); } -INSTANTIATE_TEST_CASE_P(CUDA_Codec, TransCode, ALL_DEVICES); +INSTANTIATE_TEST_CASE_P(CUDA_Codec, H264ToH265, ALL_DEVICES); #endif @@ -920,12 +1280,16 @@ const color_conversion_params_t color_conversion_params[] = color_conversion_params_t("highgui/video/big_buck_bunny_full_color_range.h264", true), }; -#define VIDEO_COLOR_OUTPUTS cv::cudacodec::ColorFormat::BGRA, cv::cudacodec::ColorFormat::BGRA -INSTANTIATE_TEST_CASE_P(CUDA_Codec, ColorConversion, testing::Combine( +INSTANTIATE_TEST_CASE_P(CUDA_Codec, ColorConversionLumaChromaRange, 
testing::Combine( ALL_DEVICES, - testing::Values(VIDEO_COLOR_OUTPUTS), testing::ValuesIn(color_conversion_params))); +INSTANTIATE_TEST_CASE_P(CUDA_Codec, ColorConversionFormat, testing::Combine(ALL_DEVICES, ColorFormats::all())); + +INSTANTIATE_TEST_CASE_P(CUDA_Codec, ColorConversionPlanar, ALL_DEVICES); + +INSTANTIATE_TEST_CASE_P(CUDA_Codec, ColorConversionBitdepth, testing::Combine(ALL_DEVICES, BitDepths::all())); + INSTANTIATE_TEST_CASE_P(CUDA_Codec, ReconfigureDecoderWithScaling, ALL_DEVICES); #define N_DECODE_SURFACES testing::Values(0, 10) @@ -939,7 +1303,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_Codec, VideoReadRaw, testing::Combine( const histogram_params_t histogram_params[] = { histogram_params_t("highgui/video/big_buck_bunny.mp4", false), - histogram_params_t("highgui/video/big_buck_bunny.h264", true), + histogram_params_t("highgui/video/big_buck_bunny.h264", false), histogram_params_t("highgui/video/big_buck_bunny_full_color_range.h264", true), }; @@ -975,5 +1339,9 @@ INSTANTIATE_TEST_CASE_P(CUDA_Codec, CheckInitParams, testing::Combine( INSTANTIATE_TEST_CASE_P(CUDA_Codec, Seek, ALL_DEVICES); +#define BIT_DEPTHS testing::Values(BitDepths(cudacodec::BitDepth::EIGHT), BitDepths(cudacodec::BitDepth::SIXTEEN)) +INSTANTIATE_TEST_CASE_P(CUDA_Codec, YuvConverter, testing::Combine( + ALL_DEVICES, SurfaceFormats::all(), ColorFormats::all(), BIT_DEPTHS, testing::Bool(), testing::Bool())); + #endif // HAVE_NVCUVID || HAVE_NVCUVENC }} // namespace