Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/linux_wheel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ jobs:
run: python -m pip install --upgrade pip
- name: Install PyTorch
run: |
python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
python -m pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
- name: Install torchcodec from the wheel
run: |
wheel_path=`find pytorch/torchcodec/dist -type f -name "*.whl"`
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/macos_wheel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ jobs:

- name: Install PyTorch
run: |
python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
python -m pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
- name: Install torchcodec from the wheel
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/windows_wheel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ jobs:
run: python -m pip install --upgrade pip
- name: Install PyTorch
run: |
python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
python -m pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
- name: Install torchcodec from the wheel
run: |
wheel_path=`find pytorch/torchcodec/dist -type f -name "*.whl"`
Expand Down
3 changes: 2 additions & 1 deletion src/torchcodec/_core/FilterGraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ FilterGraph::FilterGraph(
TORCH_CHECK(
status >= 0,
"Failed to configure filter graph: ",
getFFMPEGErrorStringFromErrorCode(status));
getFFMPEGErrorStringFromErrorCode(status),
", provided filters: " + filtersContext.filtergraphStr);
}

UniqueAVFrame FilterGraph::convert(const UniqueAVFrame& avFrame) {
Expand Down
5 changes: 5 additions & 0 deletions src/torchcodec/_core/Frame.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@

namespace facebook::torchcodec {

// Validating constructor: a FrameDims built from explicit values must have
// strictly positive dimensions. (A zero-sized FrameDims is only obtainable
// through the default constructor.)
FrameDims::FrameDims(int h, int w) : height(h), width(w) {
  TORCH_CHECK(h > 0, "FrameDims.height must be > 0, got: ", h);
  TORCH_CHECK(w > 0, "FrameDims.width must be > 0, got: ", w);
}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Drive-by: when passing in a height and width, we should only be able to instantiate a FrameDims object with positive values. If we want a FrameDims object that has 0 for both values, that's just the default constructor. We should never have a FrameDims object with negative values.


FrameBatchOutput::FrameBatchOutput(
int64_t numFrames,
const FrameDims& outputDims,
Expand Down
2 changes: 1 addition & 1 deletion src/torchcodec/_core/Frame.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ struct FrameDims {

FrameDims() = default;

FrameDims(int h, int w) : height(h), width(w) {}
FrameDims(int h, int w);
};

// All public video decoding entry points return either a FrameOutput or a
Expand Down
2 changes: 2 additions & 0 deletions src/torchcodec/_core/SingleStreamDecoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <sstream>
#include <stdexcept>
#include <string_view>
#include "Metadata.h"
#include "torch/types.h"

namespace facebook::torchcodec {
Expand Down Expand Up @@ -523,6 +524,7 @@ void SingleStreamDecoder::addVideoStream(
if (transform->getOutputFrameDims().has_value()) {
resizedOutputDims_ = transform->getOutputFrameDims().value();
}
transform->validate(streamMetadata);

// Note that we are claiming ownership of the transform objects passed in to
// us.
Expand Down
21 changes: 21 additions & 0 deletions src/torchcodec/_core/Transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,25 @@ int ResizeTransform::getSwsFlags() const {
return toSwsInterpolation(interpolationMode_);
}

// Constructs a crop transform that produces frames of size `dims`, with the
// crop window's top-left corner at (x, y) in the source frame.
//
// The coordinate checks are `>= 0` (zero is a valid corner position), so the
// error messages say "non-negative" — the previous wording said "positive",
// which contradicted the actual condition. Whether the whole crop window fits
// inside the source frame is checked later, in validate(), once the stream
// metadata is known.
CropTransform::CropTransform(const FrameDims& dims, int x, int y)
    : outputDims_(dims), x_(x), y_(y) {
  TORCH_CHECK(x_ >= 0, "Crop x position must be non-negative, got: ", x_);
  TORCH_CHECK(y_ >= 0, "Crop y position must be non-negative, got: ", y_);
}

// Renders this crop as an FFmpeg filtergraph expression. Filtergraph uses
// (width, height) order, and exact=1 forces exact (non-aligned) cropping.
std::string CropTransform::getFilterGraphCpu() const {
  std::string spec = "crop=";
  spec += std::to_string(outputDims_.width);
  spec += ":";
  spec += std::to_string(outputDims_.height);
  spec += ":";
  spec += std::to_string(x_);
  spec += ":";
  spec += std::to_string(y_);
  spec += ":exact=1";
  return spec;
}

// A crop always has known output dimensions: the requested crop size.
std::optional<FrameDims> CropTransform::getOutputFrameDims() const {
  return std::make_optional(outputDims_);
}

void CropTransform::validate(const StreamMetadata& streamMetadata) const {
TORCH_CHECK(x_ <= streamMetadata.width, "Crop x position out of bounds");
TORCH_CHECK(y_ <= streamMetadata.height, "Crop y position out of bounds");
Comment on lines 77 to 81
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should/can we account for H and W of the crop as well? E.g.

TORCH_CHECK(x_ + W < streamMetadata.width, "Crop x position out of bounds");

}

} // namespace facebook::torchcodec
26 changes: 26 additions & 0 deletions src/torchcodec/_core/Transform.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@

#include <optional>
#include <string>
#include "Metadata.h"
#include "src/torchcodec/_core/Frame.h"
#include "src/torchcodec/_core/Metadata.h"

namespace facebook::torchcodec {

Expand All @@ -33,6 +35,16 @@ class Transform {
virtual bool isResize() const {
return false;
}

// The validity of some transforms depends on the characteristics of the
// AVStream they're being applied to. For example, some transforms will
// specify coordinates inside a frame, we need to validate that those are
// within the frame's bounds.
//
// Note that the validation function does not return anything. We expect
// invalid configurations to throw an exception.
virtual void validate(
[[maybe_unused]] const StreamMetadata& streamMetadata) const {}
};

class ResizeTransform : public Transform {
Expand All @@ -56,4 +68,18 @@ class ResizeTransform : public Transform {
InterpolationMode interpolationMode_;
};

// Crops a rectangular region out of each frame. The crop window's top-left
// corner sits at (x, y) in the source frame and its size is given by `dims`
// (height, width). On CPU this is realized as FFmpeg's "crop" filter with
// exact=1 (see getFilterGraphCpu). validate() checks the window against the
// stream's dimensions, since that cannot be done at construction time.
class CropTransform : public Transform {
 public:
  // dims: output (crop) size; x, y: top-left corner, must be non-negative.
  CropTransform(const FrameDims& dims, int x, int y);

  std::string getFilterGraphCpu() const override;
  std::optional<FrameDims> getOutputFrameDims() const override;
  void validate(const StreamMetadata& streamMetadata) const override;

 private:
  FrameDims outputDims_;  // size of the cropped output frame
  int x_;  // horizontal offset of the crop's top-left corner
  int y_;  // vertical offset of the crop's top-left corner
};

} // namespace facebook::torchcodec
22 changes: 22 additions & 0 deletions src/torchcodec/_core/custom_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,26 @@ Transform* makeResizeTransform(
return new ResizeTransform(FrameDims(height, width));
}

// Crop transform specs take the form:
//
// "crop, <width>, <height>, <x>, <y>"
//
// Where "crop" is the string literal and <width>, <height>, <x> and <y> are
// positive integers. Note that in this spec, we are following the
// filtergraph convention of (width, height). This makes it easier to compare it
// against actual filtergraph strings.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC x and y are essentially the top-left coordinates of the crop, let's make that explicit (it could also be e.g. the center coordinates of the crop).

Separately, I was hoping we could use H and W everywhere for consistency. Can you share the comparison against filtergraph strings that are made easier by using the W and H order?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@NicolasHug, I'm fine with (h, w). It just means that in the tests and in the resource generation, we'll have strings that look like:

        crop_spec = "crop, 200, 300, 50, 35"   # note: h=200, w=300
        crop_filtergraph = "crop=300:200:50:35:exact=1". # note: w=300, h=200

Comments can cover it. We'll have to deal with this awkwardness somewhere. I don't have a strong preference where, and I think it's probably good to keep all of TorchCodec in (h, w).

// Parses a crop transform spec, {"crop", <width>, <height>, <x>, <y>}, into a
// heap-allocated CropTransform (ownership is taken by the caller; see
// makeTransforms). Note the spec uses filtergraph's (width, height) order,
// while FrameDims is constructed as (height, width).
Transform* makeCropTransform(
    const std::vector<std::string>& cropTransformSpec) {
  TORCH_CHECK(
      cropTransformSpec.size() == 5,
      "cropTransformSpec must have 5 elements including its name");
  // NOTE(review): checkedToPositiveInt presumably rejects 0; if so, a crop
  // anchored at the top-left corner (x=0 or y=0) cannot be expressed here,
  // even though CropTransform itself accepts non-negative coordinates —
  // confirm the helper's semantics and whether x/y need a non-negative parse.
  int width = checkedToPositiveInt(cropTransformSpec[1]);
  int height = checkedToPositiveInt(cropTransformSpec[2]);
  int x = checkedToPositiveInt(cropTransformSpec[3]);
  int y = checkedToPositiveInt(cropTransformSpec[4]);
  return new CropTransform(FrameDims(height, width), x, y);
}

std::vector<std::string> split(const std::string& str, char delimiter) {
std::vector<std::string> tokens;
std::string token;
Expand Down Expand Up @@ -239,6 +259,8 @@ std::vector<Transform*> makeTransforms(const std::string& transformSpecsRaw) {
auto name = transformSpec[0];
if (name == "resize") {
transforms.push_back(makeResizeTransform(transformSpec));
} else if (name == "crop") {
transforms.push_back(makeCropTransform(transformSpec));
} else {
TORCH_CHECK(false, "Invalid transform name: " + name);
}
Expand Down
55 changes: 43 additions & 12 deletions test/generate_reference_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
# Run this script to update the resources used in unit tests. The resources are all derived
# from source media already checked into the repo.

SCRIPT_DIR = Path(__file__).resolve().parent
TORCHCODEC_PATH = SCRIPT_DIR.parent
RESOURCES_DIR = TORCHCODEC_PATH / "test" / "resources"


def convert_image_to_tensor(image_path):
image_path = Path(image_path)
Expand All @@ -31,7 +35,18 @@ def convert_image_to_tensor(image_path):
image_path.unlink()


def get_frame_by_index(video_path, frame, output_path, stream):
def get_frame_by_index(video_path, frame, output_path, stream, filters=None):
    # Note that we have an explicit format conversion to rgb24 in our filtergraph specification,
# which always happens BEFORE any of the filters that we receive as input. We do this to
# ensure that the color conversion happens BEFORE the filters, matching the behavior of the
# torchcodec filtergraph implementation.
#
# Not doing this would result in the color conversion happening AFTER the filters, which
# would result in different color values for the same frame.
filtergraph = f"select='eq(n\\,{frame})',format=rgb24"
if filters is not None:
filtergraph = filtergraph + f",{filters}"

cmd = [
"ffmpeg",
"-y",
Expand All @@ -40,13 +55,14 @@ def get_frame_by_index(video_path, frame, output_path, stream):
"-map",
f"0:{stream}",
"-vf",
f"select=eq(n\\,{frame})",
"-vsync",
"vfr",
"-q:v",
"2",
filtergraph,
"-fps_mode",
"passthrough",
"-update",
"1",
output_path,
]
print("===" + " ".join([str(x) for x in cmd]))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this debug leftover? Just checking, maybe it's intended and OK since this is just the generation util.

subprocess.run(cmd, check=True)


Expand All @@ -65,14 +81,9 @@ def get_frame_by_timestamp(video_path, timestamp, output_path):
subprocess.run(cmd, check=True)


def main():
SCRIPT_DIR = Path(__file__).resolve().parent
TORCHCODEC_PATH = SCRIPT_DIR.parent
RESOURCES_DIR = TORCHCODEC_PATH / "test" / "resources"
def generate_nasa_13013_references():
VIDEO_PATH = RESOURCES_DIR / "nasa_13013.mp4"

# Last generated with ffmpeg version 4.3
#
# Note: The naming scheme used here must match the naming scheme used to load
# tensors in ./utils.py.
STREAMS = [0, 3]
Expand All @@ -95,6 +106,18 @@ def main():
get_frame_by_timestamp(VIDEO_PATH, timestamp, output_bmp)
convert_image_to_tensor(output_bmp)

# Extract frames with specific filters. We have tests that assume these exact filters.
# We prepend format=rgb24 to ensure the color conversion happens before the crop,
# matching the behavior of the torchcodec filtergraph implementation.
FRAMES = [0, 15, 200, 389]
crop_filter = "crop=300:200:50:35:exact=1"
for frame in FRAMES:
output_bmp = f"{VIDEO_PATH}.{crop_filter}.stream3.frame{frame:06d}.bmp"
get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=3, filters=crop_filter)
convert_image_to_tensor(output_bmp)


def generate_h265_video_references():
# This video was generated by running the following:
# conda install -c conda-forge x265
# ./configure --enable-nonfree --enable-gpl --prefix=$(readlink -f ../bin) --enable-libx265 --enable-rpath --extra-ldflags=-Wl,-rpath=$CONDA_PREFIX/lib --enable-filter=drawtext --enable-libfontconfig --enable-libfreetype --enable-libharfbuzz
Expand All @@ -107,6 +130,8 @@ def main():
get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=0)
convert_image_to_tensor(output_bmp)


def generate_av1_video_references():
# This video was generated by running the following:
# ffmpeg -f lavfi -i testsrc=duration=5:size=640x360:rate=25,format=yuv420p -c:v libaom-av1 -crf 30 -colorspace bt709 -color_primaries bt709 -color_trc bt709 av1_video.mkv
# Note that this video only has 1 stream, at index 0.
Expand All @@ -119,5 +144,11 @@ def main():
convert_image_to_tensor(output_bmp)


def main():
    # Regenerate every family of reference resources, one source video at a
    # time, in the same order they appear in this file.
    generators = (
        generate_nasa_13013_references,
        generate_h265_video_references,
        generate_av1_video_references,
    )
    for generate in generators:
        generate()


if __name__ == "__main__":
main()
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading
Loading