4 changes: 4 additions & 0 deletions CONTRIBUTING.md
@@ -42,6 +42,10 @@ git clone git@github.com:pytorch/torchcodec.git
# Or, using https instead of ssh: git clone https://github.com/pytorch/torchcodec.git
cd torchcodec

# Optional, but recommended: define a persistent build directory, which speeds up
# subsequent builds.
export TORCHCODEC_CMAKE_BUILD_DIR="${PWD}/build"

pip install -e ".[dev]" --no-build-isolation -vv
# Or, for cuda support: ENABLE_CUDA=1 pip install -e ".[dev]" --no-build-isolation -vv
```
5 changes: 5 additions & 0 deletions setup.py
@@ -126,12 +126,17 @@ def _build_all_extensions_with_cmake(self):
f"-DTORCHCODEC_DISABLE_COMPILE_WARNING_AS_ERROR={torchcodec_disable_compile_warning_as_error}",
]

self.build_temp = os.getenv("TORCHCODEC_CMAKE_BUILD_DIR", self.build_temp)
print(f"Using {self.build_temp = }", flush=True)
Path(self.build_temp).mkdir(parents=True, exist_ok=True)

print("Calling cmake (configure)", flush=True)
subprocess.check_call(
["cmake", str(_ROOT_DIR)] + cmake_args, cwd=self.build_temp
)
print("Calling cmake --build", flush=True)
subprocess.check_call(["cmake", "--build", "."], cwd=self.build_temp)
print("Calling cmake --install", flush=True)
subprocess.check_call(["cmake", "--install", "."], cwd=self.build_temp)

def copy_extensions_to_source(self):
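The setup.py override above follows a simple pattern: read an optional environment variable, fall back to setuptools' default `build_temp`, and make sure the directory exists. A minimal sketch of the same logic outside of setuptools (the `resolve_build_dir` helper is hypothetical, for illustration only):

```python
import os
from pathlib import Path

def resolve_build_dir(default: str) -> Path:
    # TORCHCODEC_CMAKE_BUILD_DIR, when set, pins the CMake build tree to a
    # stable location so later builds can reuse the CMake cache and object
    # files instead of starting over in a fresh temp directory.
    build_dir = Path(os.getenv("TORCHCODEC_CMAKE_BUILD_DIR", default))
    build_dir.mkdir(parents=True, exist_ok=True)
    return build_dir

# Falls back to "build-tmp" unless the env var is set.
print(resolve_build_dir("build-tmp"))
```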
179 changes: 148 additions & 31 deletions src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -196,54 +196,171 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
UniqueAVFrame& avFrame,
FrameOutput& frameOutput,
std::optional<torch::Tensor> preAllocatedOutputTensor) {
// We check that avFrame->format == AV_PIX_FMT_CUDA. This only ensures the
// AVFrame is on GPU memory. It can be on CPU memory if the video isn't
// supported by NVDEC for whatever reason: NVDEC falls back to CPU decoding in
// this case, and our check fails.
// TODO: we could send the frame back into the CPU path, and rely on
// swscale/filtergraph to run the color conversion to properly output the
// frame.
TORCH_CHECK(
avFrame->format == AV_PIX_FMT_CUDA,
"Expected format to be AV_PIX_FMT_CUDA, got " +
std::string(av_get_pix_fmt_name((AVPixelFormat)avFrame->format)));
"Expected format to be AV_PIX_FMT_CUDA, got ",
(av_get_pix_fmt_name((AVPixelFormat)avFrame->format)
? av_get_pix_fmt_name((AVPixelFormat)avFrame->format)
: "unknown"),
". When that happens, it is probably because the video is not supported by NVDEC. "
"Try using the CPU device instead. "
"If the video is 10bit, we are tracking 10bit support in "
"https://github.com/pytorch/torchcodec/issues/776");

// Above we checked that the AVFrame was on GPU, but that's not enough, we
// also need to check that the AVFrame is in AV_PIX_FMT_NV12 format (8 bits),
// because this is what the NPP color conversion routines expect.
// TODO: we should investigate how we can perform color conversion for
// non-8-bit videos. This is supported on CPU.
TORCH_CHECK(
avFrame->hw_frames_ctx != nullptr,
"The AVFrame does not have a hw_frames_ctx. "
"That's unexpected, please report this to the TorchCodec repo.");

AVPixelFormat actualFormat =
reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data)
->sw_format;
TORCH_CHECK(
actualFormat == AV_PIX_FMT_NV12 || actualFormat == AV_PIX_FMT_P010LE,
"The AVFrame is ",
(av_get_pix_fmt_name(actualFormat) ? av_get_pix_fmt_name(actualFormat)
: "unknown"),
", but we expected AV_PIX_FMT_NV12 or AV_PIX_FMT_P010LE. "
"Try using the CPU device instead.");

auto frameDims =
getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, avFrame);
int height = frameDims.height;
int width = frameDims.width;
torch::Tensor& dst = frameOutput.data;
if (preAllocatedOutputTensor.has_value()) {
dst = preAllocatedOutputTensor.value();
auto shape = dst.sizes();
TORCH_CHECK(
(shape.size() == 3) && (shape[0] == height) && (shape[1] == width) &&
(shape[2] == 3),
"Expected tensor of shape ",
height,
"x",
width,
"x3, got ",
shape);
torch::Tensor intermediateTensor;

if (actualFormat == AV_PIX_FMT_P010LE) {
// For 10-bit, we need a 16-bit intermediate tensor, then convert to 8-bit
intermediateTensor = torch::empty(
{height, width, 3},
torch::TensorOptions().dtype(torch::kUInt16).device(device_));

if (preAllocatedOutputTensor.has_value()) {
dst = preAllocatedOutputTensor.value();
} else {
dst = allocateEmptyHWCTensor(height, width, device_);
}
} else {
dst = allocateEmptyHWCTensor(height, width, device_);
// For 8-bit formats, use the output tensor directly
if (preAllocatedOutputTensor.has_value()) {
dst = preAllocatedOutputTensor.value();
auto shape = dst.sizes();
TORCH_CHECK(
(shape.size() == 3) && (shape[0] == height) && (shape[1] == width) &&
(shape[2] == 3),
"Expected tensor of shape ",
height,
"x",
width,
"x3, got ",
shape);
} else {
dst = allocateEmptyHWCTensor(height, width, device_);
}
}

// Use the user-requested GPU for running the NPP kernel.
c10::cuda::CUDAGuard deviceGuard(device_);

NppiSize oSizeROI = {width, height};
Npp8u* input[2] = {avFrame->data[0], avFrame->data[1]};

NppStatus status;
if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
status = nppiNV12ToRGB_709CSC_8u_P2C3R(
input,
avFrame->linesize[0],
static_cast<Npp8u*>(dst.data_ptr()),
dst.stride(0),
oSizeROI);
} else {
status = nppiNV12ToRGB_8u_P2C3R(

if (actualFormat == AV_PIX_FMT_NV12) {
// 8-bit NV12 format
Npp8u* input[2] = {avFrame->data[0], avFrame->data[1]};

if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
status = nppiNV12ToRGB_709CSC_8u_P2C3R(
input,
avFrame->linesize[0],
static_cast<Npp8u*>(dst.data_ptr()),
dst.stride(0),
oSizeROI);
} else {
status = nppiNV12ToRGB_8u_P2C3R(
input,
avFrame->linesize[0],
static_cast<Npp8u*>(dst.data_ptr()),
dst.stride(0),
oSizeROI);
}
} else if (actualFormat == AV_PIX_FMT_P010LE) {
// 10-bit semi-planar format (like NV12 but 16-bit)
// P010LE has Y plane + interleaved UV plane, 10-bit data in high bits
const Npp16u* input[2] = {
reinterpret_cast<const Npp16u*>(avFrame->data[0]), // Y plane (16-bit)
reinterpret_cast<const Npp16u*>(
avFrame->data[1]) // UV plane (16-bit interleaved)
};

// Choose color matrix based on colorspace
const Npp32f (*aTwist)[4];

// TODO: use even more accurate values from
// https://ffmpeg.org/doxygen/trunk/yuv2rgb_8c_source.html#l00047
// (those values need to be divided by 65536 to get the floats).
// BT.709 matrix (HDTV)
static const Npp32f bt709Matrix[3][4] = {
{1.0f, 0.0f, 1.402f, 0.0f},
{1.0f, -0.344136f, -0.714136f, -32768.0f},
{1.0f, 1.772f, 0.0f, -32768.0f}
};

// BT.601 matrix (SDTV)
static const Npp32f bt601Matrix[3][4] = {
{1.0f, 0.0f, 1.596f, 0.0f},
{1.0f, -0.392f, -0.813f, -32768.0f},
{1.0f, 2.017f, 0.0f, -32768.0f}
};

if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
aTwist = bt709Matrix;
} else {
// Default to BT.601 for other colorspaces (including
// AVCOL_SPC_BT470BG and AVCOL_SPC_SMPTE170M).
aTwist = bt601Matrix;
}

// Create NPP stream context
NppStreamContext nppStreamCtx;
nppStreamCtx.hStream = nppGetStream();

int rSrcStep[2] = {
avFrame->linesize[0], avFrame->linesize[1]}; // Y and UV strides

status = nppiNV12ToRGB_16u_ColorTwist32f_P2C3R_Ctx(
input,
avFrame->linesize[0],
static_cast<Npp8u*>(dst.data_ptr()),
dst.stride(0),
oSizeROI);
rSrcStep,
reinterpret_cast<Npp16u*>(intermediateTensor.data_ptr()),
intermediateTensor.stride(0) * sizeof(uint16_t),
oSizeROI,
aTwist,
nppStreamCtx);

// Convert 16-bit to 8-bit: P010LE keeps each 10-bit sample in the high
// bits of a 16-bit word, so dividing by 256 (a right shift by 8) maps it
// to 8-bit.
if (status == NPP_SUCCESS) {
// Copy into dst so a caller-provided pre-allocated tensor is actually
// filled, rather than rebinding dst to a freshly allocated tensor.
dst.copy_(intermediateTensor.div(256).to(torch::kUInt8));
}
}
TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame.");

TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert frame.");

// Make the pytorch stream wait for the npp kernel to finish before using the
// output.
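To make the bit-depth arithmetic above concrete (the original comment conflated "divide by 4" with "divide by 256"): P010LE stores each 10-bit sample in the high bits of a little-endian 16-bit word, so dividing the 16-bit word by 256 is equivalent to dividing the underlying 10-bit sample by 4. A small torch sketch of that equivalence:

```python
import torch

# A 10-bit sample v is stored in P010LE as the 16-bit word (v << 6).
samples_10bit = torch.tensor([0, 512, 1023], dtype=torch.int32)
words_16bit = samples_10bit << 6

# Dividing the word by 256 (>> 8) equals dividing the sample by 4 (>> 2):
# both map the 10-bit range [0, 1023] onto the 8-bit range [0, 255].
assert torch.equal(words_16bit >> 8, samples_10bit >> 2)
print(words_16bit >> 8)  # tensor([  0, 128, 255])
```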
Binary file added test/resources/h264_10bits.mp4
Binary file added test/resources/h265_10bits.mp4
35 changes: 35 additions & 0 deletions test/test_decoders.py
@@ -26,12 +26,15 @@
AV1_VIDEO,
cpu_and_cuda,
get_ffmpeg_major_version,
H264_10BITS,
H265_10BITS,
H265_VIDEO,
in_fbcode,
NASA_AUDIO,
NASA_AUDIO_MP3,
NASA_AUDIO_MP3_44100,
NASA_VIDEO,
needs_cuda,
SINE_MONO_S16,
SINE_MONO_S32,
SINE_MONO_S32_44100,
@@ -1138,6 +1141,38 @@ def test_pts_to_dts_fallback(self, seek_mode):
with pytest.raises(AssertionError, match="not equal"):
torch.testing.assert_close(decoder[0], decoder[10])

@needs_cuda
# TODO: also parametrize over H264_10BITS once it decodes on CUDA.
@pytest.mark.parametrize("asset", (H265_10BITS,))
def test_10bit_videos_cuda(self, asset):
# Assert that we raise a proper error on different kinds of 10-bit videos.

# TODO: we should investigate how to support 10-bit videos on GPU.
# See https://github.com/pytorch/torchcodec/issues/776

from torchvision.io import write_png

decoder = VideoDecoder(asset.path, device="cuda")
gpu_frame = decoder.get_frame_at(0)
write_png(gpu_frame.data.cpu(), "gpu.png")

decoder = VideoDecoder(asset.path, device="cpu")
cpu_frame = decoder.get_frame_at(0)
write_png(cpu_frame.data, "cpu.png")

if asset is H265_10BITS:
match = "The AVFrame is p010le, but we expected AV_PIX_FMT_NV12."
else:
match = "Expected format to be AV_PIX_FMT_CUDA, got yuv420p10le."

@pytest.mark.parametrize("asset", (H264_10BITS, H265_10BITS))
def test_10bit_videos_cpu(self, asset):
# This just validates that we can decode 10-bit videos on CPU.
# TODO: validate against a reference that the decoded frames are correct.

decoder = VideoDecoder(asset.path)
decoder.get_frame_at(10)


class TestAudioDecoder:
@pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3, SINE_MONO_S32))
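The `match` strings in `test_10bit_videos_cuda` are computed but never asserted against. Presumably the intent is something like the sketch below, assuming the CUDA path still raises for the asset under test (the helper and its wiring are hypothetical, not part of this diff):

```python
import pytest

from torchcodec.decoders import VideoDecoder

def assert_raises_10bit_error(asset, match):
    # TORCH_CHECK failures surface in Python as RuntimeError.
    decoder = VideoDecoder(asset.path, device="cuda")
    with pytest.raises(RuntimeError, match=match):
        decoder.get_frame_at(0)
```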
22 changes: 22 additions & 0 deletions test/utils.py
@@ -367,6 +367,28 @@ def get_empty_chw_tensor(self, *, stream_index: int) -> torch.Tensor:
frames={}, # Automatically loaded from json file
)

# Video generated with:
# ffmpeg -f lavfi -i testsrc2=duration=1:size=200x200:rate=30 -c:v libx265 -pix_fmt yuv420p10le -preset fast -crf 23 h265_10bits.mp4
H265_10BITS = TestVideo(
filename="h265_10bits.mp4",
default_stream_index=0,
stream_infos={
0: TestVideoStreamInfo(width=200, height=200, num_color_channels=3),
},
frames={0: {}}, # Not needed yet
)

# Video generated with:
# ffmpeg -f lavfi -i testsrc2=duration=1:size=200x200:rate=30 -c:v libx264 -pix_fmt yuv420p10le -preset fast -crf 23 h264_10bits.mp4
H264_10BITS = TestVideo(
filename="h264_10bits.mp4",
default_stream_index=0,
stream_infos={
0: TestVideoStreamInfo(width=200, height=200, num_color_channels=3),
},
frames={0: {}}, # Not needed yet
)


@dataclass
class TestAudio(TestContainerFile):
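For reference, a sketch that regenerates both 10-bit assets from the ffmpeg commands quoted in the comments above (assumes an ffmpeg binary with libx264 and libx265 support is on PATH):

```python
import subprocess

for codec, name in (
    ("libx264", "h264_10bits.mp4"),
    ("libx265", "h265_10bits.mp4"),
):
    subprocess.run(
        ["ffmpeg", "-y", "-f", "lavfi",
         "-i", "testsrc2=duration=1:size=200x200:rate=30",
         "-c:v", codec, "-pix_fmt", "yuv420p10le",
         "-preset", "fast", "-crf", "23", name],
        check=True,
    )
```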