NVIDIA · farbod-nv · Apr 1, 2026 · Mar 25, 2026 · Mar 25, 2026 · Mar 31, 2026
diff --git a/examples/camera_streamer/camera_config.py b/examples/camera_streamer/camera_config.py
@@ -10,6 +10,13 @@
 import warnings
 
 VALID_CAMERA_TYPES = {"zed", "oakd", "v4l2", "video_file"}
+VALID_COLOR_RANGES = {"auto", "full", "limited"}
+
+# Default color range per camera type.  Used when color_range is "auto".
+# OAK-D VPU encoder outputs full-range BT.601 NV12; others use limited-range.
+_DEFAULT_COLOR_RANGE: dict[str, str] = {
+    "oakd": "full",
+}
 
 
 @dataclass
@@ -70,6 +77,9 @@ class CameraConfig:
     video_dir: str | None = None
     video_basename: str | None = None
 
+    color_range: str = "auto"
+    """NV12->RGB color range: 'auto' (per-camera-type default), 'full', or 'limited'."""
+
     _KNOWN_KEYS = {
         "type",
         "stereo",
@@ -84,13 +94,25 @@ class CameraConfig:
         "device",
         "video_dir",
         "video_basename",
+        "color_range",
     }
 
     def __post_init__(self):
         if self.camera_type not in VALID_CAMERA_TYPES:
             raise ValueError(
                 f"Camera '{self.name}': unknown camera_type '{self.camera_type}' (valid: {VALID_CAMERA_TYPES})"
             )
+        if self.color_range not in VALID_COLOR_RANGES:
+            raise ValueError(
+                f"Camera '{self.name}': unknown color_range '{self.color_range}' (valid: {VALID_COLOR_RANGES})"
+            )
+
+    @property
+    def is_full_range(self) -> bool:
+        """Resolved color range: True for full-range NV12, False for limited-range."""
+        if self.color_range == "auto":
+            return _DEFAULT_COLOR_RANGE.get(self.camera_type, "limited") == "full"
+        return self.color_range == "full"
 
     @classmethod
     def from_dict(cls, name: str, data: dict) -> "CameraConfig":
@@ -148,6 +170,7 @@ def from_dict(cls, name: str, data: dict) -> "CameraConfig":
             device=data.get("device"),
             video_dir=data.get("video_dir"),
             video_basename=data.get("video_basename"),
+            color_range=data.get("color_range", "auto"),
         )
 
 

diff --git a/examples/camera_streamer/operators/nv_stream_decoder/CMakeLists.txt b/examples/camera_streamer/operators/nv_stream_decoder/CMakeLists.txt
@@ -50,6 +50,7 @@ endif()
 # C++ library.
 add_library(nv_stream_decoder SHARED
   nv_stream_decoder_op.cpp
+  nv12_to_rgb.cu
   ${NVC_SDK_DIR}/NvDecoder/NvDecoder.cpp
   ${NVC_SDK_DIR}/Logger.cpp
 )

diff --git a/examples/camera_streamer/operators/nv_stream_decoder/nv12_to_rgb.cu b/examples/camera_streamer/operators/nv_stream_decoder/nv12_to_rgb.cu
@@ -0,0 +1,63 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "nv12_to_rgb.cuh"
+
+namespace isaac_teleop::cam_streamer
+{
+
+// Full-range BT.601 NV12 -> RGB (ITU-T T.871 / JFIF).
+// Coefficients derived from BT.601 luma weights (Kr=0.299, Kb=0.114).
+// See: https://www.itu.int/rec/T-REC-T.871
+__global__ void nv12_to_rgb_fullrange_kernel(const uint8_t* __restrict__ y_plane,
+                                             const uint8_t* __restrict__ uv_plane,
+                                             int y_pitch,
+                                             uint8_t* __restrict__ dst,
+                                             int dst_pitch,
+                                             int width,
+                                             int height)
+{
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x >= width || y >= height)
+        return;
+
+    const float Y  = static_cast<float>(y_plane[y * y_pitch + x]);
+    const int uv_x = (x & ~1);
+    const int uv_y = y >> 1;
+    const int uv_offset = uv_y * y_pitch + uv_x;
+
+    const float Cb = static_cast<float>(uv_plane[uv_offset])     - 128.0f;
+    const float Cr = static_cast<float>(uv_plane[uv_offset + 1]) - 128.0f;
+
+    const float R = Y + 1.402f   * Cr;
+    const float G = Y - 0.34414f * Cb - 0.71414f * Cr;
+    const float B = Y + 1.772f   * Cb;
+
+    const int dst_offset = y * dst_pitch + x * 3;
+    dst[dst_offset]     = static_cast<uint8_t>(fminf(fmaxf(R, 0.0f), 255.0f));
+    dst[dst_offset + 1] = static_cast<uint8_t>(fminf(fmaxf(G, 0.0f), 255.0f));
+    dst[dst_offset + 2] = static_cast<uint8_t>(fminf(fmaxf(B, 0.0f), 255.0f));
+}
+
+void nv12_to_rgb_fullrange_bt601(const uint8_t* y_plane,
+                                 const uint8_t* uv_plane,
+                                 int y_pitch,
+                                 uint8_t* dst,
+                                 int dst_pitch,
+                                 int width,
+                                 int height,
+                                 cudaStream_t stream)
+{
+    dim3 block(16, 16);
+    dim3 grid((width + block.x - 1) / block.x,
+              (height + block.y - 1) / block.y);
+
+    nv12_to_rgb_fullrange_kernel<<<grid, block, 0, stream>>>(
+        y_plane, uv_plane, y_pitch, dst, dst_pitch, width, height);
+}
+
+} // namespace isaac_teleop::cam_streamer
diff --git a/examples/camera_streamer/operators/nv_stream_decoder/nv12_to_rgb.cuh b/examples/camera_streamer/operators/nv_stream_decoder/nv12_to_rgb.cuh
@@ -0,0 +1,36 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef NV12_TO_RGB_CUH
+#define NV12_TO_RGB_CUH
+
+#include <cuda_runtime.h>
+#include <cstdint>
+
+namespace isaac_teleop::cam_streamer
+{
+
+/**
+ * Full-range BT.601 NV12 -> packed RGB conversion.
+ * Coefficients from ITU-T T.871 (https://www.itu.int/rec/T-REC-T.871).
+ *
+ * NPP's NV12-to-RGB functions don't cover this combination: 709CSC is
+ * BT.709 limited-range, 709HDTV is BT.709 full-range, and the plain
+ * nppiNV12ToRGB_8u_P2C3R uses BT.601 but its range is unspecified in
+ * the docs.  This single-pass kernel fills the gap for cameras like
+ * OAK-D whose VPU encoder outputs full-range BT.601 NV12.
+ */
+void nv12_to_rgb_fullrange_bt601(const uint8_t* y_plane,
+                                 const uint8_t* uv_plane,
+                                 int y_pitch,
+                                 uint8_t* dst,
+                                 int dst_pitch,
+                                 int width,
+                                 int height,
+                                 cudaStream_t stream = 0);
+
+} // namespace isaac_teleop::cam_streamer
+
+#endif /* NV12_TO_RGB_CUH */
diff --git a/examples/camera_streamer/operators/nv_stream_decoder/nv_stream_decoder_op.cpp b/examples/camera_streamer/operators/nv_stream_decoder/nv_stream_decoder_op.cpp
@@ -12,6 +12,7 @@
 #include "holoscan/core/execution_context.hpp"
 #include "holoscan/core/gxf/entity.hpp"
 #include "holoscan/core/io_context.hpp"
+#include "nv12_to_rgb.cuh"
 
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -39,6 +40,11 @@ void NvStreamDecoderOp::setup(holoscan::OperatorSpec& spec)
     spec.param(cuda_device_ordinal_, "cuda_device_ordinal", "CUDA Device", "CUDA device ordinal", 0);
     spec.param(allocator_, "allocator", "Allocator", "Output buffer allocator");
     spec.param(verbose_, "verbose", "Verbose", "Enable verbose logging", false);
+    spec.param(force_full_range_, "force_full_range", "Force Full Range",
+               "Force full-range NV12 to RGB conversion. Set true for encoders that "
+               "produce full-range YUV (e.g. OAK-D VPU). When false, auto-detects from "
+               "the H.264 bitstream VUI parameters.",
+               false);
 
     cuda_stream_handler_.define_params(spec);
 }
@@ -52,6 +58,9 @@ void NvStreamDecoderOp::initialize()
     cuda_check(cuDevicePrimaryCtxRetain(&cu_context_, cu_device_));
 
     // Initialize NPP stream context manually.
+    // Push the target device context so the CUDA runtime API calls below
+    // query the correct GPU (matters on multi-GPU systems).
+    cuda_check(cuCtxPushCurrent(cu_context_));
     {
         npp_ctx_.hStream = 0; // Default (NULL) stream.
 
@@ -72,6 +81,7 @@ void NvStreamDecoderOp::initialize()
         npp_ctx_.nMaxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
         npp_ctx_.nSharedMemPerBlock = deviceProperties.sharedMemPerBlock;
     }
+    cuda_check(cuCtxPopCurrent(nullptr));
 
     if (verbose_.get())
     {
@@ -172,6 +182,20 @@ void NvStreamDecoderOp::compute(holoscan::InputContext& op_input,
         }
     }
 
+    // Detect full-range vs limited-range once after first successful decode.
+    // Auto-detection reads video_full_range_flag from the H.264 VUI parameters
+    // (ITU-T H.264 Section E.2.1).  Many embedded encoders (e.g. OAK-D VPU)
+    // don't set this flag, so force_full_range overrides when needed.
+    if (!range_detected_)
+    {
+        range_detected_ = true;
+        auto fmt = decoder_->GetVideoFormatInfo();
+        int bitstream_flag = fmt.video_signal_description.video_full_range_flag;
+        use_full_range_ = force_full_range_.get() || (bitstream_flag != 0);
+        HOLOSCAN_LOG_INFO("NV12->RGB color range: {} (force_full_range={}, bitstream flag={})",
+                          use_full_range_ ? "full" : "limited", force_full_range_.get(), bitstream_flag);
+    }
+
     auto allocator = nvidia::gxf::Handle<nvidia::gxf::Allocator>::Create(context.context(), allocator_->gxf_cid());
     auto output = nvidia::gxf::Entity::New(context.context());
     if (!output)
@@ -180,7 +204,6 @@ void NvStreamDecoderOp::compute(holoscan::InputContext& op_input,
         throw std::runtime_error("Failed to create output entity");
     }
 
-    // Output RGB tensor [height, width, 3] in HWC format
     auto out_tensor = output.value().add<nvidia::gxf::Tensor>("");
     if (!out_tensor)
     {
@@ -196,16 +219,40 @@ void NvStreamDecoderOp::compute(holoscan::InputContext& op_input,
 
     auto dst = static_cast<uint8_t*>(out_tensor.value()->pointer());
 
-    const Npp8u* pSrc[2] = { pFrame, pFrame + lumaSize };
-    NppiSize roi = { width, height };
-
-    NppStatus status = nppiNV12ToRGB_8u_P2C3R_Ctx(pSrc, pitch, dst, width * 3, roi, npp_ctx_);
-    if (status != NPP_SUCCESS)
+    // Push the decoder's CUDA context so the conversion runs on the correct
+    // GPU.  pFrame and dst reside on cu_device_; without this, multi-GPU
+    // setups would target the wrong device after the decode context pop above.
+    cuda_check(cuCtxPushCurrent(cu_context_));
+    if (use_full_range_)
     {
-        HOLOSCAN_LOG_ERROR("NPP NV12->RGB failed: {}", static_cast<int>(status));
-        decoder_->UnlockFrame(&pFrame);
-        return;
+        // BT.601 full-range (ITU-T T.871).  NPP has no NV12 variant for this
+        // combination so we use a single-pass CUDA kernel.  See nv12_to_rgb.cu.
+        nv12_to_rgb_fullrange_bt601(pFrame, pFrame + lumaSize, pitch, dst, width * 3, width, height, npp_ctx_.hStream);
+        cudaError_t cuda_status = cudaGetLastError();
+        if (cuda_status != cudaSuccess)
+        {
+            HOLOSCAN_LOG_ERROR("CUDA NV12->RGB kernel failed: {}", cudaGetErrorString(cuda_status));
+            decoder_->UnlockFrame(&pFrame);
+            cuda_check(cuCtxPopCurrent(nullptr));
+            return;
+        }
+    }
+    else
+    {
+        // BT.709 limited-range (16-235).  NPP docs: "use CSC version for
+        // limited range color" (as opposed to the 709HDTV full-range variant).
+        const Npp8u* pSrc[2] = { pFrame, pFrame + lumaSize };
+        NppiSize roi = { width, height };
+        NppStatus status = nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx(pSrc, pitch, dst, width * 3, roi, npp_ctx_);
+        if (status != NPP_SUCCESS)
+        {
+            HOLOSCAN_LOG_ERROR("NPP NV12->RGB failed: {}", static_cast<int>(status));
+            decoder_->UnlockFrame(&pFrame);
+            cuda_check(cuCtxPopCurrent(nullptr));
+            return;
+        }
     }
+    cuda_check(cuCtxPopCurrent(nullptr));
 
     decoder_->UnlockFrame(&pFrame);
 

diff --git a/examples/camera_streamer/operators/nv_stream_decoder/nv_stream_decoder_op.hpp b/examples/camera_streamer/operators/nv_stream_decoder/nv_stream_decoder_op.hpp
@@ -52,6 +52,7 @@ class NvStreamDecoderOp : public holoscan::Operator
     holoscan::Parameter<int> cuda_device_ordinal_;
     holoscan::Parameter<std::shared_ptr<holoscan::Allocator>> allocator_;
     holoscan::Parameter<bool> verbose_;
+    holoscan::Parameter<bool> force_full_range_;
 
     holoscan::CudaStreamHandler cuda_stream_handler_;
 
@@ -64,6 +65,10 @@ class NvStreamDecoderOp : public holoscan::Operator
     std::unique_ptr<NvDecoder> decoder_;
     bool decoder_initialized_ = false;
 
+    // Color range detection (resolved after first decoded frame)
+    bool use_full_range_ = false;
+    bool range_detected_ = false;
+
     // Stats
     uint64_t frame_count_ = 0;
     int video_width_ = 0;

diff --git a/examples/camera_streamer/operators/nv_stream_decoder/nv_stream_decoder_op_py.cpp b/examples/camera_streamer/operators/nv_stream_decoder/nv_stream_decoder_op_py.cpp
@@ -49,10 +49,11 @@ class PyNvStreamDecoderOp : public NvStreamDecoderOp
                         int cuda_device_ordinal,
                         std::shared_ptr<holoscan::Allocator> allocator,
                         bool verbose,
+                        bool force_full_range,
                         const std::string& name = "nv_stream_decoder")
-        : NvStreamDecoderOp(holoscan::ArgList{ holoscan::Arg{ "cuda_device_ordinal", cuda_device_ordinal },
-                                               holoscan::Arg{ "allocator", allocator },
-                                               holoscan::Arg{ "verbose", verbose } })
+        : NvStreamDecoderOp(holoscan::ArgList{
+              holoscan::Arg{ "cuda_device_ordinal", cuda_device_ordinal }, holoscan::Arg{ "allocator", allocator },
+              holoscan::Arg{ "verbose", verbose }, holoscan::Arg{ "force_full_range", force_full_range } })
     {
         add_positional_condition_and_resource_args(this, args);
         name_ = name;
@@ -83,13 +84,17 @@ allocator : Allocator
     Output buffer allocator.
 verbose : bool
     Enable verbose logging (default: False).
+force_full_range : bool
+    Force full-range NV12 to RGB conversion. Set True for encoders that
+    produce full-range YUV (e.g. OAK-D VPU). When False, auto-detects
+    from the H.264 bitstream VUI parameters (default: False).
 name : str
     Operator name (default: "nv_stream_decoder").
 )doc")
-        .def(py::init<holoscan::Fragment*, const py::args&, int, std::shared_ptr<holoscan::Allocator>, bool,
+        .def(py::init<holoscan::Fragment*, const py::args&, int, std::shared_ptr<holoscan::Allocator>, bool, bool,
                       const std::string&>(),
              "fragment"_a, "cuda_device_ordinal"_a = 0, "allocator"_a, "verbose"_a = false,
-             "name"_a = "nv_stream_decoder"s)
+             "force_full_range"_a = false, "name"_a = "nv_stream_decoder"s)
         .def("initialize", &NvStreamDecoderOp::initialize)
         .def("setup", &NvStreamDecoderOp::setup, "spec"_a);
 }

diff --git a/examples/camera_streamer/teleop_camera_subgraph.py b/examples/camera_streamer/teleop_camera_subgraph.py
@@ -491,6 +491,7 @@ def _compose_rtp_sources(
                     cuda_device_ordinal=cuda_device,
                     allocator=allocator,
                     verbose=verbose,
+                    force_full_range=cam_cfg.is_full_range,
                 )
 
                 if self._config.display_mode == DisplayMode.MONITOR: