@@ -41,12 +41,43 @@ class DecoderTransform(ABC):
4141 def _make_transform_spec (
4242 self , input_dims : Tuple [Optional [int ], Optional [int ]]
4343 ) -> str :
44+ """Makes the transform spec that is used by the `VideoDecoder`.
45+
46+ Args:
47+ input_dims (Tuple[Optional[int], Optional[int]]): The dimensions of
48+ the input frame in the form (height, width). We cannot know the
49+ dimensions at object construction time because they depend on
50+ the video being decoded and upstream transforms in the same
51+ transform pipeline. Not all transforms need to know this; those
52+ that don't will ignore it. The individual values in the tuple are
53+ optional because the original values come from file metadata which
54+ may be missing. We maintain the optionality throughout the APIs so
55+ that we can decide as late as possible that it's necessary for the
56+ values to exist. That is, if the values are missing from the
57+ metadata and we have transforms which ignore the input dimensions,
58+ we want that to still work.
59+
60+ Note: This method is the moral equivalent of TorchVision's
61+ `Transform.make_params()`.
62+
63+ Returns:
64+ str: A string which contains the spec for the transform that the
65+ `VideoDecoder` knows what to do with.
66+ """
4467 pass
4568
46- # Transforms that change the dimensions of their input frame return a value.
47- # Transforms that don't return None; they can rely on this default
48- # implementation.
4969 def _get_output_dims (self ) -> Optional [Tuple [Optional [int ], Optional [int ]]]:
70+ """Get the dimensions of the output frame.
71+
72+ Transforms that change the frame dimensions need to override this
73+ method. Transforms that don't change the frame dimensions can rely on
74+ this default implementation.
75+
76+ Returns:
77+ Optional[Tuple[Optional[int], Optional[int]]]: The output dimensions.
78+ - None: The output dimensions are the same as the input dimensions.
79+ - (int, int): The (height, width) of the output frame.
80+ """
5081 return None
5182
5283
@@ -68,7 +99,7 @@ class Resize(DecoderTransform):
6899 Interpolation is always bilinear. Anti-aliasing is always on.
69100
70101 Args:
71- size: (sequence of int): Desired output size. Must be a sequence of
102+ size (Sequence[int]): Desired output size. Must be a sequence of
72103 the form (height, width).
73104 """
74105
@@ -117,13 +148,13 @@ class RandomCrop(DecoderTransform):
117148 Complementary TorchVision transform: :class:`~torchvision.transforms.v2.RandomCrop`.
118149 Padding of all kinds is disabled. The random location within the frame is
119150 determined during the initialization of the
120- :class:~` torchcodec.decoders.VideoDecoder` object that owns this transform.
151+ :class:`~torchcodec.decoders.VideoDecoder` object that owns this transform.
121152 As a consequence, each decoded frame in the video will be cropped at the
122153 same location. Videos with variable resolution may result in undefined
123154 behavior.
124155
125156 Args:
126- size: (sequence of int): Desired output size. Must be a sequence of
157+ size (Sequence[int]): Desired output size. Must be a sequence of
127158 the form (height, width).
128159 """
129160
0 commit comments