
Commit d9441d0

Author: pytorchbot
Committed: 2025-12-05 nightly release (4e412b7)
1 parent 71fac63 · commit d9441d0

File tree: 10 files changed, +302 −149 lines


README.md

Lines changed: 4 additions & 3 deletions
```diff
@@ -3,8 +3,8 @@
 # TorchCodec
 
 TorchCodec is a Python library for decoding video and audio data into PyTorch
-tensors, on CPU and CUDA GPU. It also supports audio encoding, and video
-encoding will come soon! It aims to be fast, easy to use, and well integrated
+tensors, on CPU and CUDA GPU. It also supports video and audio encoding on CPU!
+It aims to be fast, easy to use, and well integrated
 into the PyTorch ecosystem. If you want to use PyTorch to train ML models on
 videos and audio, TorchCodec is how you turn these into data.
 
@@ -130,7 +130,8 @@ The following table indicates the compatibility between versions of
 
 | `torchcodec`       | `torch`            | Python              |
 | ------------------ | ------------------ | ------------------- |
-| `main` / `nightly` | `main` / `nightly` | `>=3.10`, `<=3.13`  |
+| `main` / `nightly` | `main` / `nightly` | `>=3.10`, `<=3.14`  |
+| `0.9`              | `2.9`              | `>=3.10`, `<=3.14`  |
 | `0.8`              | `2.9`              | `>=3.10`, `<=3.13`  |
 | `0.7`              | `2.8`              | `>=3.9`, `<=3.13`   |
 | `0.6`              | `2.8`              | `>=3.9`, `<=3.13`   |
```
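The README pitch above amounts to a very small API surface. As a hedged sketch of the documented `VideoDecoder` usage ("video.mp4" is a placeholder path):

```python
# Minimal decode-to-tensor sketch; "video.mp4" is a placeholder.
from torchcodec.decoders import VideoDecoder

decoder = VideoDecoder("video.mp4")  # decodes on CPU by default
frame = decoder[0]                   # first frame as a uint8 tensor
print(frame.shape, frame.dtype)      # e.g. torch.Size([3, 720, 1280]) torch.uint8
```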

docs/source/api_ref_transforms.rst

Lines changed: 2 additions & 0 deletions
```diff
@@ -14,4 +14,6 @@ For a tutorial, see: TODO_DECODER_TRANSFORMS_TUTORIAL.
    :template: dataclass.rst
 
     DecoderTransform
+    CenterCrop
+    RandomCrop
     Resize
```

src/torchcodec/_core/Transform.cpp

Lines changed: 36 additions & 25 deletions
```diff
@@ -37,16 +37,22 @@ std::optional<FrameDims> ResizeTransform::getOutputFrameDims() const {
   return outputDims_;
 }
 
+CropTransform::CropTransform(const FrameDims& dims) : outputDims_(dims) {}
+
 CropTransform::CropTransform(const FrameDims& dims, int x, int y)
     : outputDims_(dims), x_(x), y_(y) {
   TORCH_CHECK(x_ >= 0, "Crop x position must be >= 0, got: ", x_);
   TORCH_CHECK(y_ >= 0, "Crop y position must be >= 0, got: ", y_);
 }
 
 std::string CropTransform::getFilterGraphCpu() const {
+  // For the FFmpeg filter crop, if the x and y coordinates are left
+  // unspecified, it defaults to a center crop.
+  std::string coordinates = x_.has_value()
+      ? (":" + std::to_string(x_.value()) + ":" + std::to_string(y_.value()))
+      : "";
   return "crop=" + std::to_string(outputDims_.width) + ":" +
-      std::to_string(outputDims_.height) + ":" + std::to_string(x_) + ":" +
-      std::to_string(y_) + ":exact=1";
+      std::to_string(outputDims_.height) + coordinates + ":exact=1";
 }
 
 std::optional<FrameDims> CropTransform::getOutputFrameDims() const {
@@ -69,29 +75,34 @@ void CropTransform::validate(const FrameDims& inputDims) const {
       inputDims.width,
       ")");
   TORCH_CHECK(
-      x_ <= inputDims.width,
-      "Crop x start position, ",
-      x_,
-      ", out of bounds of input width, ",
-      inputDims.width);
-  TORCH_CHECK(
-      x_ + outputDims_.width <= inputDims.width,
-      "Crop x end position, ",
-      x_ + outputDims_.width,
-      ", out of bounds of input width ",
-      inputDims.width);
-  TORCH_CHECK(
-      y_ <= inputDims.height,
-      "Crop y start position, ",
-      y_,
-      ", out of bounds of input height, ",
-      inputDims.height);
-  TORCH_CHECK(
-      y_ + outputDims_.height <= inputDims.height,
-      "Crop y end position, ",
-      y_ + outputDims_.height,
-      ", out of bounds of input height ",
-      inputDims.height);
+      x_.has_value() == y_.has_value(),
+      "Crop x and y values must be both set or both unset");
+  if (x_.has_value()) {
+    TORCH_CHECK(
+        x_.value() <= inputDims.width,
+        "Crop x start position, ",
+        x_.value(),
+        ", out of bounds of input width, ",
+        inputDims.width);
+    TORCH_CHECK(
+        x_.value() + outputDims_.width <= inputDims.width,
+        "Crop x end position, ",
+        x_.value() + outputDims_.width,
+        ", out of bounds of input width ",
+        inputDims.width);
+    TORCH_CHECK(
+        y_.value() <= inputDims.height,
+        "Crop y start position, ",
+        y_.value(),
+        ", out of bounds of input height, ",
+        inputDims.height);
+    TORCH_CHECK(
+        y_.value() + outputDims_.height <= inputDims.height,
+        "Crop y end position, ",
+        y_.value() + outputDims_.height,
+        ", out of bounds of input height ",
+        inputDims.height);
+  }
 }
 
 } // namespace facebook::torchcodec
```
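The rewritten `getFilterGraphCpu` relies on a property of FFmpeg's `crop` filter: when the `x:y` coordinates are omitted, the filter centers the crop on its own. A hypothetical Python mirror of the string construction, only to show the two filtergraph forms it can now emit:

```python
from typing import Optional

def crop_filtergraph(width: int, height: int,
                     x: Optional[int] = None, y: Optional[int] = None) -> str:
    """Hypothetical mirror of CropTransform::getFilterGraphCpu."""
    # FFmpeg's crop filter defaults to a center crop when x and y are omitted.
    coordinates = f":{x}:{y}" if x is not None else ""
    return f"crop={width}:{height}{coordinates}:exact=1"

print(crop_filtergraph(224, 224))          # crop=224:224:exact=1 (center crop)
print(crop_filtergraph(224, 224, 10, 20))  # crop=224:224:10:20:exact=1
```

Note the `width:height` order in the filtergraph string: FFmpeg's convention is (width, height), while the PyTorch-facing API uses (height, width).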

src/torchcodec/_core/Transform.h

Lines changed: 5 additions & 2 deletions
```diff
@@ -61,14 +61,17 @@ class CropTransform : public Transform {
  public:
   CropTransform(const FrameDims& dims, int x, int y);
 
+  // Becomes a center crop if x and y are not specified.
+  CropTransform(const FrameDims& dims);
+
   std::string getFilterGraphCpu() const override;
   std::optional<FrameDims> getOutputFrameDims() const override;
   void validate(const FrameDims& inputDims) const override;
 
  private:
   FrameDims outputDims_;
-  int x_;
-  int y_;
+  std::optional<int> x_;
+  std::optional<int> y_;
 };
 
 } // namespace facebook::torchcodec
```

src/torchcodec/_core/custom_ops.cpp

Lines changed: 19 additions & 0 deletions
```diff
@@ -288,6 +288,23 @@ Transform* makeCropTransform(
   return new CropTransform(FrameDims(height, width), x, y);
 }
 
+// CenterCrop transform specs take the form:
+//
+//   "center_crop, <height>, <width>"
+//
+// Where "center_crop" is the string literal and <height>, <width> are
+// positive integers. Note that we follow the PyTorch convention of (height,
+// width) for specifying image dimensions; FFmpeg uses (width, height).
+Transform* makeCenterCropTransform(
+    const std::vector<std::string>& cropTransformSpec) {
+  TORCH_CHECK(
+      cropTransformSpec.size() == 3,
+      "cropTransformSpec must have 3 elements including its name");
+  int height = checkedToPositiveInt(cropTransformSpec[1]);
+  int width = checkedToPositiveInt(cropTransformSpec[2]);
+  return new CropTransform(FrameDims(height, width));
+}
+
 std::vector<std::string> split(const std::string& str, char delimiter) {
   std::vector<std::string> tokens;
   std::string token;
@@ -317,6 +334,8 @@ std::vector<Transform*> makeTransforms(const std::string& transformSpecsRaw) {
       transforms.push_back(makeResizeTransform(transformSpec));
     } else if (name == "crop") {
       transforms.push_back(makeCropTransform(transformSpec));
+    } else if (name == "center_crop") {
+      transforms.push_back(makeCenterCropTransform(transformSpec));
     } else {
       TORCH_CHECK(false, "Invalid transform name: " + name);
     }
```
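`makeTransforms` splits the raw spec string and dispatches on each spec's first field; per the comment above and the `_make_transform_specs` docstring further down, specs are joined by semicolons and fields by commas. A hypothetical sketch of that grammar, with made-up spec values:

```python
# Hypothetical illustration of the transform-spec format handled by
# makeTransforms: ';' separates specs, ',' separates fields, the transform
# name comes first, and dimensions follow the PyTorch (height, width) order.
spec_string = "resize, 256, 256;center_crop, 224, 224"

for transform_spec in spec_string.split(";"):
    name, *args = (field.strip() for field in transform_spec.split(","))
    print(name, args)
# resize ['256', '256']
# center_crop ['224', '224']
```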

src/torchcodec/decoders/_video_decoder.py

Lines changed: 2 additions & 97 deletions
```diff
@@ -19,7 +19,8 @@
     create_decoder,
     ERROR_REPORTING_INSTRUCTIONS,
 )
-from torchcodec.transforms import DecoderTransform, RandomCrop, Resize
+from torchcodec.transforms import DecoderTransform
+from torchcodec.transforms._decoder_transforms import _make_transform_specs
 
 
 class VideoDecoder:
@@ -451,102 +452,6 @@ def _get_and_validate_stream_metadata(
     )
 
 
-def _make_transform_specs(
-    transforms: Optional[Sequence[Union[DecoderTransform, nn.Module]]],
-    input_dims: Tuple[Optional[int], Optional[int]],
-) -> str:
-    """Given a sequence of transforms, turn those into the specification string
-    the core API expects.
-
-    Args:
-        transforms: Optional sequence of transform objects. The objects can be
-            one of two types:
-                1. torchcodec.transforms.DecoderTransform
-                2. torchvision.transforms.v2.Transform, but our type annotation
-                   only mentions its base, nn.Module. We don't want to take a
-                   hard dependency on TorchVision.
-        input_dims: Optional (height, width) pair. Note that only some
-            transforms need to know the dimensions. If the user provides
-            transforms that don't need to know the dimensions, and that metadata
-            is missing, everything should still work. That means we assert their
-            existence as late as possible.
-
-    Returns:
-        String of transforms in the format the core API expects: transform
-        specifications separated by semicolons.
-    """
-    if transforms is None:
-        return ""
-
-    try:
-        from torchvision.transforms import v2
-
-        tv_available = True
-    except ImportError:
-        tv_available = False
-
-    # The following loop accomplishes two tasks:
-    #
-    #   1. Converts the transform to a DecoderTransform, if necessary. We
-    #      accept TorchVision transform objects and they must be converted
-    #      to their matching DecoderTransform.
-    #   2. Calculates what the input dimensions are to each transform.
-    #
-    # The order in our transforms list is semantically meaningful, as we
-    # actually have a pipeline where the output of one transform is the input
-    # to the next. For example, if we have the transforms list [A, B, C, D],
-    # then we should understand that as:
-    #
-    #   A -> B -> C -> D
-    #
-    # Where the frame produced by A is the input to B, the frame produced by B
-    # is the input to C, etc. This particularly matters for frame dimensions.
-    # Transforms can both:
-    #
-    #   1. Produce frames with arbitrary dimensions.
-    #   2. Rely on their input frame's dimensions to calculate ahead-of-time
-    #      what their runtime behavior will be.
-    #
-    # The consequence of the above facts is that we need to statically track
-    # frame dimensions in the pipeline while we pre-process it. The input
-    # frame's dimensions to A, our first transform, is always what we know from
-    # our metadata. For each transform, we always calculate its output
-    # dimensions from its input dimensions. We store these with the converted
-    # transform, to be all used together when we generate the specs.
-    converted_transforms: list[
-        Tuple[
-            DecoderTransform,
-            # A (height, width) pair where the values may be missing.
-            Tuple[Optional[int], Optional[int]],
-        ]
-    ] = []
-    curr_input_dims = input_dims
-    for transform in transforms:
-        if not isinstance(transform, DecoderTransform):
-            if not tv_available:
-                raise ValueError(
-                    f"The supplied transform, {transform}, is not a TorchCodec "
-                    "DecoderTransform. TorchCodec also accepts TorchVision "
-                    "v2 transforms, but TorchVision is not installed."
-                )
-            elif isinstance(transform, v2.Resize):
-                transform = Resize._from_torchvision(transform)
-            elif isinstance(transform, v2.RandomCrop):
-                transform = RandomCrop._from_torchvision(transform)
-            else:
-                raise ValueError(
-                    f"Unsupported transform: {transform}. Transforms must be "
-                    "either a TorchCodec DecoderTransform or a TorchVision "
-                    "v2 transform."
-                )
-
-        converted_transforms.append((transform, curr_input_dims))
-        output_dims = transform._get_output_dims()
-        curr_input_dims = output_dims if output_dims is not None else curr_input_dims
-
-    return ";".join([t._make_transform_spec(dims) for t, dims in converted_transforms])
-
-
 def _read_custom_frame_mappings(
     custom_frame_mappings: Union[str, bytes, io.RawIOBase, io.BufferedReader]
 ) -> tuple[Tensor, Tensor, Tensor]:
```
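The relocated `_make_transform_specs` docstring describes the key invariant: transforms form a pipeline, so the dimensions each one receives are either the previous transform's output dims or, when a transform does not fix its output size, the dims carried forward from before it. A minimal sketch of just that bookkeeping (the function and its names are hypothetical):

```python
from typing import Optional, Tuple

Dims = Tuple[Optional[int], Optional[int]]  # (height, width); values may be missing

def input_dims_per_transform(
    output_dims: list[Optional[Dims]], metadata_dims: Dims
) -> list[Dims]:
    """Hypothetical: the dims each pipeline stage receives."""
    seen, curr = [], metadata_dims
    for out in output_dims:
        seen.append(curr)
        # A transform with fixed output dims (e.g. a crop) overrides the
        # carried dims; one without (None) passes them through unchanged.
        curr = out if out is not None else curr
    return seen

# A 720x1280 stream through [Resize -> (256, 256), CenterCrop -> (224, 224)]:
print(input_dims_per_transform([(256, 256), (224, 224)], (720, 1280)))
# [(720, 1280), (256, 256)]
```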

src/torchcodec/transforms/__init__.py

Lines changed: 6 additions & 1 deletion
```diff
@@ -4,4 +4,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from ._decoder_transforms import DecoderTransform, RandomCrop, Resize  # noqa
+from ._decoder_transforms import (  # noqa
+    CenterCrop,
+    DecoderTransform,
+    RandomCrop,
+    Resize,
+)
```
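With `CenterCrop` now exported from `torchcodec.transforms`, end-to-end usage should look roughly like the sketch below. The `transforms=` parameter follows the `_make_transform_specs` plumbing shown earlier; the exact dataclass constructor arguments are assumptions, and "video.mp4" is a placeholder.

```python
from torchcodec.decoders import VideoDecoder
from torchcodec.transforms import CenterCrop, Resize

# Assumed signatures: the transforms are dataclasses taking a (height, width)
# size, passed to VideoDecoder via the `transforms` argument.
decoder = VideoDecoder(
    "video.mp4",
    transforms=[Resize(size=(256, 256)), CenterCrop(size=(224, 224))],
)
frame = decoder[0]  # decoded, resized, then center-cropped frame tensor
```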
