-
Notifications
You must be signed in to change notification settings - Fork 74
C++ implementation of crop transform #967
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
b171309
3248f2f
74fe47f
c77fbd7
0511233
bba2696
4f121cb
07e7f60
67478dd
88bc94a
a15d458
8f4507b
fca8b83
8dfbee9
c1836b8
abb80eb
f819ac1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -57,4 +57,25 @@ int ResizeTransform::getSwsFlags() const { | |
| return toSwsInterpolation(interpolationMode_); | ||
| } | ||
|
|
||
| CropTransform::CropTransform(const FrameDims& dims, int x, int y) | ||
| : outputDims_(dims), x_(x), y_(y) { | ||
| TORCH_CHECK(x_ >= 0, "Crop x position must be positive, got: ", x_); | ||
| TORCH_CHECK(y_ >= 0, "Crop y position must be positive, got: ", y_); | ||
| } | ||
|
|
||
| std::string CropTransform::getFilterGraphCpu() const { | ||
| return "crop=" + std::to_string(outputDims_.width) + ":" + | ||
| std::to_string(outputDims_.height) + ":" + std::to_string(x_) + ":" + | ||
| std::to_string(y_) + ":exact=1"; | ||
NicolasHug marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| std::optional<FrameDims> CropTransform::getOutputFrameDims() const { | ||
| return outputDims_; | ||
| } | ||
|
|
||
| void CropTransform::validate(const StreamMetadata& streamMetadata) const { | ||
| TORCH_CHECK(x_ <= streamMetadata.width, "Crop x position out of bounds"); | ||
| TORCH_CHECK(y_ <= streamMetadata.height, "Crop y position out of bounds"); | ||
|
Comment on lines
77
to
81
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should/can we account for H and W of the crop as well? E.g. TORCH_CHECK(x_ + W < streamMetadata.width, "Crop x position out of bounds"); |
||
| } | ||
|
|
||
| } // namespace facebook::torchcodec | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -212,6 +212,26 @@ Transform* makeResizeTransform( | |
| return new ResizeTransform(FrameDims(height, width)); | ||
| } | ||
|
|
||
| // Crop transform specs take the form: | ||
| // | ||
| // "crop, <width>, <height>, <x>, <y>" | ||
| // | ||
| // Where "crop" is the string literal and <width>, <height>, <x> and <y> are | ||
| // positive integers. Note that that in this spec, we are following the | ||
| // filtergraph convention of (width, height). This makes it easier to compare it | ||
| // against actual filtergraph strings. | ||
|
||
| Transform* makeCropTransform( | ||
| const std::vector<std::string>& cropTransformSpec) { | ||
| TORCH_CHECK( | ||
| cropTransformSpec.size() == 5, | ||
| "cropTransformSpec must have 5 elements including its name"); | ||
| int width = checkedToPositiveInt(cropTransformSpec[1]); | ||
| int height = checkedToPositiveInt(cropTransformSpec[2]); | ||
| int x = checkedToPositiveInt(cropTransformSpec[3]); | ||
| int y = checkedToPositiveInt(cropTransformSpec[4]); | ||
| return new CropTransform(FrameDims(height, width), x, y); | ||
| } | ||
|
|
||
| std::vector<std::string> split(const std::string& str, char delimiter) { | ||
| std::vector<std::string> tokens; | ||
| std::string token; | ||
|
|
@@ -239,6 +259,8 @@ std::vector<Transform*> makeTransforms(const std::string& transformSpecsRaw) { | |
| auto name = transformSpec[0]; | ||
| if (name == "resize") { | ||
| transforms.push_back(makeResizeTransform(transformSpec)); | ||
| } else if (name == "crop") { | ||
| transforms.push_back(makeCropTransform(transformSpec)); | ||
| } else { | ||
| TORCH_CHECK(false, "Invalid transform name: " + name); | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,6 +15,10 @@ | |
| # Run this script to update the resources used in unit tests. The resources are all derived | ||
| # from source media already checked into the repo. | ||
|
|
||
# Anchor all paths on this script's own location so the resource generator
# works regardless of the current working directory.
SCRIPT_DIR = Path(__file__).resolve().parent
TORCHCODEC_PATH = SCRIPT_DIR.parent
RESOURCES_DIR = TORCHCODEC_PATH / "test" / "resources"
|
|
||
|
|
||
| def convert_image_to_tensor(image_path): | ||
| image_path = Path(image_path) | ||
|
|
@@ -31,7 +35,18 @@ def convert_image_to_tensor(image_path): | |
| image_path.unlink() | ||
|
|
||
|
|
||
| def get_frame_by_index(video_path, frame, output_path, stream): | ||
| def get_frame_by_index(video_path, frame, output_path, stream, filters=None): | ||
| # Note that we have an explicit format conversion to rgb24 in our filtergraph specification, | ||
| # which always happens BEFORE any of the filters that we receive as input. We do this to | ||
| # ensure that the color conversion happens BEFORE the filters, matching the behavior of the | ||
| # torchcodec filtergraph implementation. | ||
| # | ||
| # Not doing this would result in the color conversion happening AFTER the filters, which | ||
| # would result in different color values for the same frame. | ||
| filtergraph = f"select='eq(n\\,{frame})',format=rgb24" | ||
| if filters is not None: | ||
| filtergraph = filtergraph + f",{filters}" | ||
|
|
||
| cmd = [ | ||
| "ffmpeg", | ||
| "-y", | ||
|
|
@@ -40,13 +55,14 @@ def get_frame_by_index(video_path, frame, output_path, stream): | |
| "-map", | ||
| f"0:{stream}", | ||
| "-vf", | ||
| f"select=eq(n\\,{frame})", | ||
| "-vsync", | ||
| "vfr", | ||
| "-q:v", | ||
| "2", | ||
| filtergraph, | ||
| "-fps_mode", | ||
| "passthrough", | ||
| "-update", | ||
| "1", | ||
| output_path, | ||
| ] | ||
| print("===" + " ".join([str(x) for x in cmd])) | ||
|
||
| subprocess.run(cmd, check=True) | ||
|
|
||
|
|
||
|
|
@@ -65,14 +81,9 @@ def get_frame_by_timestamp(video_path, timestamp, output_path): | |
| subprocess.run(cmd, check=True) | ||
|
|
||
|
|
||
| def main(): | ||
| SCRIPT_DIR = Path(__file__).resolve().parent | ||
| TORCHCODEC_PATH = SCRIPT_DIR.parent | ||
| RESOURCES_DIR = TORCHCODEC_PATH / "test" / "resources" | ||
| def generate_nasa_13013_references(): | ||
| VIDEO_PATH = RESOURCES_DIR / "nasa_13013.mp4" | ||
|
|
||
| # Last generated with ffmpeg version 4.3 | ||
| # | ||
| # Note: The naming scheme used here must match the naming scheme used to load | ||
| # tensors in ./utils.py. | ||
| STREAMS = [0, 3] | ||
|
|
@@ -95,6 +106,18 @@ def main(): | |
| get_frame_by_timestamp(VIDEO_PATH, timestamp, output_bmp) | ||
| convert_image_to_tensor(output_bmp) | ||
|
|
||
| # Extract frames with specific filters. We have tests that assume these exact filters. | ||
| # We prepend format=rgb24 to ensure the color conversion happens before the crop, | ||
| # matching the behavior of the torchcodec filtergraph implementation. | ||
| FRAMES = [0, 15, 200, 389] | ||
| crop_filter = "crop=300:200:50:35:exact=1" | ||
| for frame in FRAMES: | ||
| output_bmp = f"{VIDEO_PATH}.{crop_filter}.stream3.frame{frame:06d}.bmp" | ||
| get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=3, filters=crop_filter) | ||
| convert_image_to_tensor(output_bmp) | ||
|
|
||
|
|
||
| def generate_h265_video_references(): | ||
| # This video was generated by running the following: | ||
| # conda install -c conda-forge x265 | ||
| # ./configure --enable-nonfree --enable-gpl --prefix=$(readlink -f ../bin) --enable-libx265 --enable-rpath --extra-ldflags=-Wl,-rpath=$CONDA_PREFIX/lib --enable-filter=drawtext --enable-libfontconfig --enable-libfreetype --enable-libharfbuzz | ||
|
|
@@ -107,6 +130,8 @@ def main(): | |
| get_frame_by_index(VIDEO_PATH, frame, output_bmp, stream=0) | ||
| convert_image_to_tensor(output_bmp) | ||
|
|
||
|
|
||
| def generate_av1_video_references(): | ||
| # This video was generated by running the following: | ||
| # ffmpeg -f lavfi -i testsrc=duration=5:size=640x360:rate=25,format=yuv420p -c:v libaom-av1 -crf 30 -colorspace bt709 -color_primaries bt709 -color_trc bt709 av1_video.mkv | ||
| # Note that this video only has 1 stream, at index 0. | ||
|
|
@@ -119,5 +144,11 @@ def main(): | |
| convert_image_to_tensor(output_bmp) | ||
|
|
||
|
|
||
def main():
    # Regenerate every family of reference resources. Each generator derives
    # its outputs from source media already checked into the repo.
    generate_nasa_13013_references()
    generate_h265_video_references()
    generate_av1_video_references()


if __name__ == "__main__":
    main()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Drive-by: when passing in a height and width, we should only be able to instantiate a
`FrameDims` object with positive values. If we want a `FrameDims` object that has 0 for both values, that's just the default constructor. We should never have a `FrameDims` object with negative values.