Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions examples/camera_streamer/camera_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@
import warnings

VALID_CAMERA_TYPES = {"zed", "oakd", "v4l2", "video_file"}
VALID_COLOR_RANGES = {"auto", "full", "limited"}

# Default color range per camera type. Used when color_range is "auto".
# OAK-D VPU encoder outputs full-range BT.601 NV12; others use limited-range.
_DEFAULT_COLOR_RANGE: dict[str, str] = {
"oakd": "full",
}


@dataclass
Expand Down Expand Up @@ -70,6 +77,9 @@ class CameraConfig:
video_dir: str | None = None
video_basename: str | None = None

color_range: str = "auto"
"""NV12->RGB color range: 'auto' (per-camera-type default), 'full', or 'limited'."""

_KNOWN_KEYS = {
"type",
"stereo",
Expand All @@ -84,13 +94,25 @@ class CameraConfig:
"device",
"video_dir",
"video_basename",
"color_range",
}

def __post_init__(self):
if self.camera_type not in VALID_CAMERA_TYPES:
raise ValueError(
f"Camera '{self.name}': unknown camera_type '{self.camera_type}' (valid: {VALID_CAMERA_TYPES})"
)
if self.color_range not in VALID_COLOR_RANGES:
raise ValueError(
f"Camera '{self.name}': unknown color_range '{self.color_range}' (valid: {VALID_COLOR_RANGES})"
)

@property
def is_full_range(self) -> bool:
"""Resolved color range: True for full-range NV12, False for limited-range."""
if self.color_range == "auto":
return _DEFAULT_COLOR_RANGE.get(self.camera_type, "limited") == "full"
return self.color_range == "full"

@classmethod
def from_dict(cls, name: str, data: dict) -> "CameraConfig":
Expand Down Expand Up @@ -148,6 +170,7 @@ def from_dict(cls, name: str, data: dict) -> "CameraConfig":
device=data.get("device"),
video_dir=data.get("video_dir"),
video_basename=data.get("video_basename"),
color_range=data.get("color_range", "auto"),
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ endif()
# C++ library.
add_library(nv_stream_decoder SHARED
nv_stream_decoder_op.cpp
nv12_to_rgb.cu
${NVC_SDK_DIR}/NvDecoder/NvDecoder.cpp
${NVC_SDK_DIR}/Logger.cpp
)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/

#include "nv12_to_rgb.cuh"

namespace isaac_teleop::cam_streamer
{

// Full-range BT.601 NV12 -> RGB (ITU-T T.871 / JFIF).
// Coefficients derived from BT.601 luma weights (Kr=0.299, Kb=0.114).
// See: https://www.itu.int/rec/T-REC-T.871
__global__ void nv12_to_rgb_fullrange_kernel(const uint8_t* __restrict__ y_plane,
const uint8_t* __restrict__ uv_plane,
int y_pitch,
uint8_t* __restrict__ dst,
int dst_pitch,
int width,
int height)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

if (x >= width || y >= height)
return;

const float Y = static_cast<float>(y_plane[y * y_pitch + x]);
const int uv_x = (x & ~1);
const int uv_y = y >> 1;
const int uv_offset = uv_y * y_pitch + uv_x;

const float Cb = static_cast<float>(uv_plane[uv_offset]) - 128.0f;
const float Cr = static_cast<float>(uv_plane[uv_offset + 1]) - 128.0f;

const float R = Y + 1.402f * Cr;
const float G = Y - 0.34414f * Cb - 0.71414f * Cr;
const float B = Y + 1.772f * Cb;

const int dst_offset = y * dst_pitch + x * 3;
dst[dst_offset] = static_cast<uint8_t>(fminf(fmaxf(R, 0.0f), 255.0f));
dst[dst_offset + 1] = static_cast<uint8_t>(fminf(fmaxf(G, 0.0f), 255.0f));
dst[dst_offset + 2] = static_cast<uint8_t>(fminf(fmaxf(B, 0.0f), 255.0f));
}

void nv12_to_rgb_fullrange_bt601(const uint8_t* y_plane,
const uint8_t* uv_plane,
int y_pitch,
uint8_t* dst,
int dst_pitch,
int width,
int height,
cudaStream_t stream)
{
dim3 block(16, 16);
dim3 grid((width + block.x - 1) / block.x,
(height + block.y - 1) / block.y);

nv12_to_rgb_fullrange_kernel<<<grid, block, 0, stream>>>(
y_plane, uv_plane, y_pitch, dst, dst_pitch, width, height);
}

} // namespace isaac_teleop::cam_streamer
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/

#ifndef NV12_TO_RGB_CUH
#define NV12_TO_RGB_CUH

#include <cuda_runtime.h>
#include <cstdint>

namespace isaac_teleop::cam_streamer
{

/**
* Full-range BT.601 NV12 -> packed RGB conversion.
* Coefficients from ITU-T T.871 (https://www.itu.int/rec/T-REC-T.871).
*
* NPP's NV12-to-RGB functions don't cover this combination: 709CSC is
* BT.709 limited-range, 709HDTV is BT.709 full-range, and the plain
* nppiNV12ToRGB_8u_P2C3R uses BT.601 but its range is unspecified in
* the docs. This single-pass kernel fills the gap for cameras like
* OAK-D whose VPU encoder outputs full-range BT.601 NV12.
*/
void nv12_to_rgb_fullrange_bt601(const uint8_t* y_plane,
const uint8_t* uv_plane,
int y_pitch,
uint8_t* dst,
int dst_pitch,
int width,
int height,
cudaStream_t stream = 0);

} // namespace isaac_teleop::cam_streamer

#endif /* NV12_TO_RGB_CUH */
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "holoscan/core/execution_context.hpp"
#include "holoscan/core/gxf/entity.hpp"
#include "holoscan/core/io_context.hpp"
#include "nv12_to_rgb.cuh"

#include <cuda.h>
#include <cuda_runtime.h>
Expand Down Expand Up @@ -39,6 +40,11 @@ void NvStreamDecoderOp::setup(holoscan::OperatorSpec& spec)
spec.param(cuda_device_ordinal_, "cuda_device_ordinal", "CUDA Device", "CUDA device ordinal", 0);
spec.param(allocator_, "allocator", "Allocator", "Output buffer allocator");
spec.param(verbose_, "verbose", "Verbose", "Enable verbose logging", false);
spec.param(force_full_range_, "force_full_range", "Force Full Range",
"Force full-range NV12 to RGB conversion. Set true for encoders that "
"produce full-range YUV (e.g. OAK-D VPU). When false, auto-detects from "
"the H.264 bitstream VUI parameters.",
false);

cuda_stream_handler_.define_params(spec);
}
Expand All @@ -52,6 +58,9 @@ void NvStreamDecoderOp::initialize()
cuda_check(cuDevicePrimaryCtxRetain(&cu_context_, cu_device_));

// Initialize NPP stream context manually.
// Push the target device context so the CUDA runtime API calls below
// query the correct GPU (matters on multi-GPU systems).
cuda_check(cuCtxPushCurrent(cu_context_));
{
npp_ctx_.hStream = 0; // Default (NULL) stream.

Expand All @@ -72,6 +81,7 @@ void NvStreamDecoderOp::initialize()
npp_ctx_.nMaxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
npp_ctx_.nSharedMemPerBlock = deviceProperties.sharedMemPerBlock;
}
cuda_check(cuCtxPopCurrent(nullptr));

if (verbose_.get())
{
Expand Down Expand Up @@ -172,6 +182,20 @@ void NvStreamDecoderOp::compute(holoscan::InputContext& op_input,
}
}

// Detect full-range vs limited-range once after first successful decode.
// Auto-detection reads video_full_range_flag from the H.264 VUI parameters
// (ITU-T H.264 Section E.2.1). Many embedded encoders (e.g. OAK-D VPU)
// don't set this flag, so force_full_range overrides when needed.
if (!range_detected_)
{
range_detected_ = true;
auto fmt = decoder_->GetVideoFormatInfo();
int bitstream_flag = fmt.video_signal_description.video_full_range_flag;
use_full_range_ = force_full_range_.get() || (bitstream_flag != 0);
HOLOSCAN_LOG_INFO("NV12->RGB color range: {} (force_full_range={}, bitstream flag={})",
use_full_range_ ? "full" : "limited", force_full_range_.get(), bitstream_flag);
}

auto allocator = nvidia::gxf::Handle<nvidia::gxf::Allocator>::Create(context.context(), allocator_->gxf_cid());
auto output = nvidia::gxf::Entity::New(context.context());
if (!output)
Expand All @@ -180,7 +204,6 @@ void NvStreamDecoderOp::compute(holoscan::InputContext& op_input,
throw std::runtime_error("Failed to create output entity");
}

// Output RGB tensor [height, width, 3] in HWC format
auto out_tensor = output.value().add<nvidia::gxf::Tensor>("");
if (!out_tensor)
{
Expand All @@ -196,16 +219,40 @@ void NvStreamDecoderOp::compute(holoscan::InputContext& op_input,

auto dst = static_cast<uint8_t*>(out_tensor.value()->pointer());

const Npp8u* pSrc[2] = { pFrame, pFrame + lumaSize };
NppiSize roi = { width, height };

NppStatus status = nppiNV12ToRGB_8u_P2C3R_Ctx(pSrc, pitch, dst, width * 3, roi, npp_ctx_);
if (status != NPP_SUCCESS)
// Push the decoder's CUDA context so the conversion runs on the correct
// GPU. pFrame and dst reside on cu_device_; without this, multi-GPU
// setups would target the wrong device after the decode context pop above.
cuda_check(cuCtxPushCurrent(cu_context_));
if (use_full_range_)
{
HOLOSCAN_LOG_ERROR("NPP NV12->RGB failed: {}", static_cast<int>(status));
decoder_->UnlockFrame(&pFrame);
return;
// BT.601 full-range (ITU-T T.871). NPP has no NV12 variant for this
// combination so we use a single-pass CUDA kernel. See nv12_to_rgb.cu.
nv12_to_rgb_fullrange_bt601(pFrame, pFrame + lumaSize, pitch, dst, width * 3, width, height, npp_ctx_.hStream);
cudaError_t cuda_status = cudaGetLastError();
if (cuda_status != cudaSuccess)
{
HOLOSCAN_LOG_ERROR("CUDA NV12->RGB kernel failed: {}", cudaGetErrorString(cuda_status));
decoder_->UnlockFrame(&pFrame);
cuda_check(cuCtxPopCurrent(nullptr));
return;
}
}
else
{
// BT.709 limited-range (16-235). NPP docs: "use CSC version for
// limited range color" (as opposed to the 709HDTV full-range variant).
const Npp8u* pSrc[2] = { pFrame, pFrame + lumaSize };
NppiSize roi = { width, height };
NppStatus status = nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx(pSrc, pitch, dst, width * 3, roi, npp_ctx_);
if (status != NPP_SUCCESS)
{
HOLOSCAN_LOG_ERROR("NPP NV12->RGB failed: {}", static_cast<int>(status));
decoder_->UnlockFrame(&pFrame);
cuda_check(cuCtxPopCurrent(nullptr));
return;
}
}
cuda_check(cuCtxPopCurrent(nullptr));

decoder_->UnlockFrame(&pFrame);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ class NvStreamDecoderOp : public holoscan::Operator
holoscan::Parameter<int> cuda_device_ordinal_;
holoscan::Parameter<std::shared_ptr<holoscan::Allocator>> allocator_;
holoscan::Parameter<bool> verbose_;
holoscan::Parameter<bool> force_full_range_;

holoscan::CudaStreamHandler cuda_stream_handler_;

Expand All @@ -64,6 +65,10 @@ class NvStreamDecoderOp : public holoscan::Operator
std::unique_ptr<NvDecoder> decoder_;
bool decoder_initialized_ = false;

// Color range detection (resolved after first decoded frame)
bool use_full_range_ = false;
bool range_detected_ = false;

// Stats
uint64_t frame_count_ = 0;
int video_width_ = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,11 @@ class PyNvStreamDecoderOp : public NvStreamDecoderOp
int cuda_device_ordinal,
std::shared_ptr<holoscan::Allocator> allocator,
bool verbose,
bool force_full_range,
const std::string& name = "nv_stream_decoder")
: NvStreamDecoderOp(holoscan::ArgList{ holoscan::Arg{ "cuda_device_ordinal", cuda_device_ordinal },
holoscan::Arg{ "allocator", allocator },
holoscan::Arg{ "verbose", verbose } })
: NvStreamDecoderOp(holoscan::ArgList{
holoscan::Arg{ "cuda_device_ordinal", cuda_device_ordinal }, holoscan::Arg{ "allocator", allocator },
holoscan::Arg{ "verbose", verbose }, holoscan::Arg{ "force_full_range", force_full_range } })
{
add_positional_condition_and_resource_args(this, args);
name_ = name;
Expand Down Expand Up @@ -83,13 +84,17 @@ allocator : Allocator
Output buffer allocator.
verbose : bool
Enable verbose logging (default: False).
force_full_range : bool
Force full-range NV12 to RGB conversion. Set True for encoders that
produce full-range YUV (e.g. OAK-D VPU). When False, auto-detects
from the H.264 bitstream VUI parameters (default: False).
name : str
Operator name (default: "nv_stream_decoder").
)doc")
.def(py::init<holoscan::Fragment*, const py::args&, int, std::shared_ptr<holoscan::Allocator>, bool,
.def(py::init<holoscan::Fragment*, const py::args&, int, std::shared_ptr<holoscan::Allocator>, bool, bool,
const std::string&>(),
"fragment"_a, "cuda_device_ordinal"_a = 0, "allocator"_a, "verbose"_a = false,
"name"_a = "nv_stream_decoder"s)
"force_full_range"_a = false, "name"_a = "nv_stream_decoder"s)
.def("initialize", &NvStreamDecoderOp::initialize)
.def("setup", &NvStreamDecoderOp::setup, "spec"_a);
}
Expand Down
1 change: 1 addition & 0 deletions examples/camera_streamer/teleop_camera_subgraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,7 @@ def _compose_rtp_sources(
cuda_device_ordinal=cuda_device,
allocator=allocator,
verbose=verbose,
force_full_range=cam_cfg.is_full_range,
)

if self._config.display_mode == DisplayMode.MONITOR:
Expand Down
Loading