2 changes: 1 addition & 1 deletion .github/workflows/linux_wheel.yaml
@@ -85,7 +85,7 @@ jobs:
run: python -m pip install --upgrade pip
- name: Install PyTorch
run: |
- python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
+ python -m pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
- name: Install torchcodec from the wheel
run: |
wheel_path=`find pytorch/torchcodec/dist -type f -name "*.whl"`
2 changes: 1 addition & 1 deletion .github/workflows/macos_wheel.yaml
@@ -86,7 +86,7 @@ jobs:

- name: Install PyTorch
run: |
- python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
+ python -m pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
- name: Install torchcodec from the wheel
run: |
46 changes: 44 additions & 2 deletions .github/workflows/reference_resources.yaml
@@ -14,14 +14,51 @@ defaults:
shell: bash -l -eo pipefail {0}

jobs:
generate-matrix:
uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
with:
package-type: wheel
os: linux
test-infra-repository: pytorch/test-infra
test-infra-ref: main
with-xpu: disable
with-rocm: disable
with-cuda: disable
build-python-only: "disable"

build:
needs: generate-matrix
strategy:
fail-fast: false
name: Build and Upload Linux wheel
uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
with:
repository: meta-pytorch/torchcodec
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
pre-script: packaging/pre_build_script.sh
post-script: packaging/post_build_script.sh
smoke-test-script: packaging/fake_smoke_test.py
package-name: torchcodec
trigger-event: ${{ github.event_name }}
build-platform: "python-build-package"
build-command: "BUILD_AGAINST_ALL_FFMPEG_FROM_S3=1 python -m build --wheel -vvv --no-isolation"

test-reference-resource-generation:
needs: build
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ['3.10']
ffmpeg-version-for-tests: ['4.4.2', '5.1.2', '6.1.1', '7.0.1']
steps:
- uses: actions/download-artifact@v4
with:
name: meta-pytorch_torchcodec__${{ matrix.python-version }}_cpu_x86_64
path: pytorch/torchcodec/dist/
- name: Setup conda env
uses: conda-incubator/setup-miniconda@v2
with:
@@ -43,11 +80,16 @@ jobs:
# Note that we're installing stable - this is for running a script where we're a normal PyTorch
# user, not for building TorchCodec.
python -m pip install torch --index-url https://download.pytorch.org/whl/cpu
- python -m pip install numpy pillow
+ python -m pip install numpy pillow pytest

- name: Install torchcodec from the wheel
run: |
wheel_path=`find pytorch/torchcodec/dist -type f -name "*.whl"`
echo Installing $wheel_path
python -m pip install $wheel_path -vvv
- name: Check out repo
uses: actions/checkout@v3

- name: Run generation reference resources
run: |
- python test/generate_reference_resources.py
+ python -m test.generate_reference_resources
2 changes: 1 addition & 1 deletion .github/workflows/windows_wheel.yaml
@@ -90,7 +90,7 @@ jobs:
run: python -m pip install --upgrade pip
- name: Install PyTorch
run: |
- python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
+ python -m pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
- name: Install torchcodec from the wheel
run: |
wheel_path=`find pytorch/torchcodec/dist -type f -name "*.whl"`
3 changes: 2 additions & 1 deletion src/torchcodec/_core/FilterGraph.cpp
@@ -130,7 +130,8 @@ FilterGraph::FilterGraph(
TORCH_CHECK(
status >= 0,
"Failed to configure filter graph: ",
- getFFMPEGErrorStringFromErrorCode(status));
+ getFFMPEGErrorStringFromErrorCode(status),
+ ", provided filters: " + filtersContext.filtergraphStr);
}

UniqueAVFrame FilterGraph::convert(const UniqueAVFrame& avFrame) {
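With this change, a filtergraph configuration failure also names the offending expression. Assuming the status code maps to AVERROR(EINVAL), the message would read something like "Failed to configure filter graph: Invalid argument, provided filters: crop=300:200:50:35:exact=1" (the filter string here is illustrative).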
5 changes: 5 additions & 0 deletions src/torchcodec/_core/Frame.cpp
@@ -8,6 +8,11 @@

namespace facebook::torchcodec {

FrameDims::FrameDims(int height, int width) : height(height), width(width) {
TORCH_CHECK(height > 0, "FrameDims.height must be > 0, got: ", height);
TORCH_CHECK(width > 0, "FrameDims.width must be > 0, got: ", width);
}
Contributor Author:

Drive-by: when passing in a height and width, we should only be able to instantiate a FrameDims object with positive values. If we want a FrameDims object that has 0 for both values, that's just the default constructor. We should never have a FrameDims object with negative values.
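A minimal sketch of the invariant (illustrative values, not code from this PR):

```cpp
FrameDims ok(480, 640);   // height=480, width=640: passes both checks
FrameDims empty;          // default constructor: the only way to get 0x0
// FrameDims bad(0, 640); // would throw: TORCH_CHECK(height > 0, ...) fails
```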


FrameBatchOutput::FrameBatchOutput(
int64_t numFrames,
const FrameDims& outputDims,
2 changes: 1 addition & 1 deletion src/torchcodec/_core/Frame.h
@@ -19,7 +19,7 @@ struct FrameDims {

FrameDims() = default;

- FrameDims(int h, int w) : height(h), width(w) {}
+ FrameDims(int h, int w);
};

// All public video decoding entry points return either a FrameOutput or a
2 changes: 2 additions & 0 deletions src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -12,6 +12,7 @@
#include <sstream>
#include <stdexcept>
#include <string_view>
#include "Metadata.h"
#include "torch/types.h"

namespace facebook::torchcodec {
@@ -527,6 +528,7 @@ void SingleStreamDecoder::addVideoStream(
if (transform->getOutputFrameDims().has_value()) {
resizedOutputDims_ = transform->getOutputFrameDims().value();
}
transform->validate(streamMetadata);

// Note that we are claiming ownership of the transform objects passed in to
// us.
27 changes: 27 additions & 0 deletions src/torchcodec/_core/Transform.cpp
@@ -57,4 +57,31 @@ int ResizeTransform::getSwsFlags() const {
return toSwsInterpolation(interpolationMode_);
}

CropTransform::CropTransform(const FrameDims& dims, int x, int y)
: outputDims_(dims), x_(x), y_(y) {
TORCH_CHECK(x_ >= 0, "Crop x position must be >= 0, got: ", x_);
TORCH_CHECK(y_ >= 0, "Crop y position must be >= 0, got: ", y_);
}

std::string CropTransform::getFilterGraphCpu() const {
return "crop=" + std::to_string(outputDims_.width) + ":" +
std::to_string(outputDims_.height) + ":" + std::to_string(x_) + ":" +
std::to_string(y_) + ":exact=1";
}

std::optional<FrameDims> CropTransform::getOutputFrameDims() const {
return outputDims_;
}

void CropTransform::validate(const StreamMetadata& streamMetadata) const {
TORCH_CHECK(x_ <= streamMetadata.width, "Crop x position out of bounds");
TORCH_CHECK(
x_ + outputDims_.width <= streamMetadata.width,
"Crop x position out of bounds")
TORCH_CHECK(y_ <= streamMetadata.height, "Crop y position out of bounds");
TORCH_CHECK(
y_ + outputDims_.height <= streamMetadata.height,
"Crop y position out of bounds");
}

} // namespace facebook::torchcodec
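To make the height/width ordering concrete, here is a sketch that exercises only the API added above (values illustrative): PyTorch-style (height, width) in, FFmpeg-style width:height:x:y out.

```cpp
// exact=1 keeps subsampled video cropped at exactly the requested offsets
// instead of rounding them to the chroma grid.
CropTransform crop(FrameDims(/*height=*/200, /*width=*/300), /*x=*/50, /*y=*/35);
std::string expr = crop.getFilterGraphCpu();
// expr == "crop=300:200:50:35:exact=1"
```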
25 changes: 25 additions & 0 deletions src/torchcodec/_core/Transform.h
@@ -9,6 +9,7 @@
#include <optional>
#include <string>
#include "src/torchcodec/_core/Frame.h"
#include "src/torchcodec/_core/Metadata.h"

namespace facebook::torchcodec {

@@ -33,6 +34,16 @@ class Transform {
virtual bool isResize() const {
return false;
}

// The validity of some transforms depends on the characteristics of the
// AVStream they're being applied to. For example, some transforms will
// specify coordinates inside a frame, we need to validate that those are
// within the frame's bounds.
//
// Note that the validation function does not return anything. We expect
// invalid configurations to throw an exception.
virtual void validate(
[[maybe_unused]] const StreamMetadata& streamMetadata) const {}
};

class ResizeTransform : public Transform {
@@ -56,4 +67,18 @@ class ResizeTransform : public Transform {
InterpolationMode interpolationMode_;
};

class CropTransform : public Transform {
public:
CropTransform(const FrameDims& dims, int x, int y);

std::string getFilterGraphCpu() const override;
std::optional<FrameDims> getOutputFrameDims() const override;
void validate(const StreamMetadata& streamMetadata) const override;

private:
FrameDims outputDims_;
int x_;
int y_;
};

} // namespace facebook::torchcodec
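A sketch of the validate() contract in action, assuming StreamMetadata exposes optional width/height fields (stream dimensions here are illustrative):

```cpp
StreamMetadata streamMetadata;
streamMetadata.width = 640;
streamMetadata.height = 480;

CropTransform inBounds(FrameDims(200, 300), /*x=*/50, /*y=*/35);
inBounds.validate(streamMetadata);  // returns: 50+300 <= 640 and 35+200 <= 480

CropTransform outOfBounds(FrameDims(200, 300), /*x=*/400, /*y=*/35);
// outOfBounds.validate(streamMetadata);  // would throw: 400+300 > 640
```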
22 changes: 22 additions & 0 deletions src/torchcodec/_core/custom_ops.cpp
@@ -212,6 +212,26 @@ Transform* makeResizeTransform(
return new ResizeTransform(FrameDims(height, width));
}

// Crop transform specs take the form:
//
// "crop, <height>, <width>, <x>, <y>"
//
// Where "crop" is the string literal and <height>, <width>, <x> and <y> are
// positive integers. <x> and <y> are the x and y coordinates of the top left
// corner of the crop. Note that we follow the PyTorch convention of (height,
// width) for specifying image dimensions; FFmpeg uses (width, height).
Transform* makeCropTransform(
const std::vector<std::string>& cropTransformSpec) {
TORCH_CHECK(
cropTransformSpec.size() == 5,
"cropTransformSpec must have 5 elements including its name");
int height = checkedToPositiveInt(cropTransformSpec[1]);
int width = checkedToPositiveInt(cropTransformSpec[2]);
int x = checkedToPositiveInt(cropTransformSpec[3]);
int y = checkedToPositiveInt(cropTransformSpec[4]);
return new CropTransform(FrameDims(height, width), x, y);
}

std::vector<std::string> split(const std::string& str, char delimiter) {
std::vector<std::string> tokens;
std::string token;
@@ -239,6 +259,8 @@ std::vector<Transform*> makeTransforms(const std::string& transformSpecsRaw) {
auto name = transformSpec[0];
if (name == "resize") {
transforms.push_back(makeResizeTransform(transformSpec));
} else if (name == "crop") {
transforms.push_back(makeCropTransform(transformSpec));
} else {
TORCH_CHECK(false, "Invalid transform name: " + name);
}
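End to end, a crop spec maps to a transform as sketched below (spec value illustrative, using the functions added above):

```cpp
// "crop, 200, 300, 50, 35" -> CropTransform(FrameDims(200, 300), /*x=*/50, /*y=*/35),
// whose CPU filtergraph is "crop=300:200:50:35:exact=1".
std::vector<Transform*> transforms = makeTransforms("crop, 200, 300, 50, 35");
```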
54 changes: 42 additions & 12 deletions test/generate_reference_resources.py
@@ -12,9 +12,15 @@
import torch
from PIL import Image

from .utils import sanitize_filtergraph_expression
Contributor Author (@scotts, Oct 17, 2025):

Note that pulling in code from utils.py means we have to run this as a module, and since utils.py imports general TorchCodec stuff, we do a full build and install for the reference_resources.yaml workflow. I think that's a fair trade-off to ensure that the unit tests and the resource generation are using the same logic for writing and loading files.

Contributor:

Do we expect to pull in more things from utils.py? If it's just for sanitize_filtergraph_expression, I would have a slight preference for just duplicating it in both files, because the python -m stuff isn't immediately obvious. But that's fine too.


# Run this script to update the resources used in unit tests. The resources are all derived
# from source media already checked into the repo.

SCRIPT_DIR = Path(__file__).resolve().parent
TORCHCODEC_PATH = SCRIPT_DIR.parent
RESOURCES_DIR = TORCHCODEC_PATH / "test" / "resources"


def convert_image_to_tensor(image_path):
image_path = Path(image_path)
@@ -31,7 +37,18 @@ def convert_image_to_tensor(image_path):
image_path.unlink()


- def get_frame_by_index(video_path, frame, output_path, stream):
+ def get_frame_by_index(video_path, frame, output_path, stream, filters=None):
# Note that we have an explicit format conversion to rgb24 in our filtergraph
# specification, placed BEFORE any of the filters that we receive as input. This
# matches the behavior of the torchcodec filtergraph implementation.
#
# Without it, the color conversion would happen AFTER the filters, producing
# different color values for the same frame.
filtergraph = f"select='eq(n\\,{frame})',format=rgb24"
if filters is not None:
filtergraph = filtergraph + f",{filters}"

cmd = [
"ffmpeg",
"-y",
@@ -40,11 +57,11 @@ def get_frame_by_index(video_path, frame, output_path, stream):
"-map",
f"0:{stream}",
"-vf",
f"select=eq(n\\,{frame})",
"-vsync",
"vfr",
"-q:v",
"2",
filtergraph,
"-fps_mode",
"passthrough",
"-update",
"1",
output_path,
]
subprocess.run(cmd, check=True)
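For example, with frame=15 and the crop filter used below in generate_nasa_13013_references, the constructed filtergraph is select='eq(n\,15)',format=rgb24,crop=300:200:50:35:exact=1: FFmpeg selects the frame, converts it to rgb24, and only then crops.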
@@ -65,14 +82,9 @@ def get_frame_by_timestamp(video_path, timestamp, output_path):
subprocess.run(cmd, check=True)


- def main():
- SCRIPT_DIR = Path(__file__).resolve().parent
- TORCHCODEC_PATH = SCRIPT_DIR.parent
- RESOURCES_DIR = TORCHCODEC_PATH / "test" / "resources"
+ def generate_nasa_13013_references():
VIDEO_PATH = RESOURCES_DIR / "nasa_13013.mp4"

- # Last generated with ffmpeg version 4.3
- #
# Note: The naming scheme used here must match the naming scheme used to load
# tensors in ./utils.py.
STREAMS = [0, 3]
@@ -95,6 +107,16 @@ def main():
get_frame_by_timestamp(VIDEO_PATH, timestamp, output_bmp)
convert_image_to_tensor(output_bmp)

# Extract frames with specific filters. We have tests that assume these exact filters.
FRAMES = [0, 15, 200, 389]
crop_filter = "crop=300:200:50:35:exact=1"
for frame in FRAMES:
output_bmp = f"{VIDEO_PATH}.{sanitize_filtergraph_expression(crop_filter)}.stream3.frame{frame:06d}.bmp"
get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=3, filters=crop_filter)
convert_image_to_tensor(output_bmp)


def generate_h265_video_references():
# This video was generated by running the following:
# conda install -c conda-forge x265
# ./configure --enable-nonfree --enable-gpl --prefix=$(readlink -f ../bin) --enable-libx265 --enable-rpath --extra-ldflags=-Wl,-rpath=$CONDA_PREFIX/lib --enable-filter=drawtext --enable-libfontconfig --enable-libfreetype --enable-libharfbuzz
@@ -107,6 +129,8 @@ def main():
get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=0)
convert_image_to_tensor(output_bmp)


def generate_av1_video_references():
# This video was generated by running the following:
# ffmpeg -f lavfi -i testsrc=duration=5:size=640x360:rate=25,format=yuv420p -c:v libaom-av1 -crf 30 -colorspace bt709 -color_primaries bt709 -color_trc bt709 av1_video.mkv
# Note that this video only has 1 stream, at index 0.
@@ -119,5 +143,11 @@
convert_image_to_tensor(output_bmp)


def main():
generate_nasa_13013_references()
generate_h265_video_references()
generate_av1_video_references()


if __name__ == "__main__":
main()
4 binary files not shown.