@@ -23,8 +23,8 @@ class DecoderTransform(ABC):
2323 decoded frames and applying the same kind of transform.
2424
2525 Most ``DecoderTransform`` objects have a complementary transform in TorchVision,
26- specificially in `torchvision.transforms.v2 <https://docs.pytorch.org/vision/stable/transforms.html>`_. For such transforms, we
27- ensure that:
26+ specifically in `torchvision.transforms.v2 <https://docs.pytorch.org/vision/stable/transforms.html>`_.
27+ For such transforms, we ensure that:
2828
2929 1. The names are the same.
3030 2. Default behaviors are the same.
@@ -74,7 +74,7 @@ def _make_transform_spec(self) -> str:
7474 return f"resize, { self .size [0 ]} , { self .size [1 ]} "
7575
7676 def _get_output_dims (self , input_dims : Tuple [int , int ]) -> Tuple [int , int ]:
77- return self .size
77+ return ( * self .size ,)
7878
7979 @classmethod
8080 def _from_torchvision (cls , resize_tv : nn .Module ):
@@ -102,20 +102,51 @@ def _from_torchvision(cls, resize_tv: nn.Module):
102102
103103@dataclass
104104class RandomCrop (DecoderTransform ):
105+ """Crop the decoded frame to a given size at a random location in the frame.
106+
107+ Complementary TorchVision transform: :class:`~torchvision.transforms.v2.RandomCrop`.
108+ Padding of all kinds is disabled. The random location within the frame is
109+ determined during the initialization of the
110+ :class:`~torchcodec.decoders.VideoDecoder` object that owns this transform.
111+ As a consequence, each decoded frame in the video will be cropped at the
112+ same location. Videos with variable resolution may result in undefined
113+ behavior.
114+
115+ Args:
116+ size (sequence of int): Desired output size. Must be a sequence of
117+ the form (height, width).
118+ """
105119
106120 size : Sequence [int ]
107121 _top : Optional [int ] = None
108122 _left : Optional [int ] = None
109123 _input_dims : Optional [Tuple [int , int ]] = None
110124
111125 def _make_transform_spec (self ) -> str :
112- assert len (self .size ) == 2
126+ if len (self .size ) != 2 :
127+ raise ValueError (
128+ f"RandomCrop's size must be a sequence of length 2, got { self .size } . "
129+ "This should never happen, please report a bug."
130+ )
131+
113132 if self ._top is None or self ._left is None :
114- assert self ._input_dims is not None
133+ # TODO: It would be very strange if only ONE of those is None. But should we
134+ # make it an error? We can continue, but it would probably mean
135+ # something bad happened. Dear reviewer, please register an opinion here:
136+ if self ._input_dims is None :
137+ raise ValueError (
138+ "RandomCrop's input_dims must be set before calling _make_transform_spec(). "
139+ "This should never happen, please report a bug."
140+ )
115141 if self ._input_dims [0 ] < self .size [0 ] or self ._input_dims [1 ] < self .size [1 ]:
116142 raise ValueError (
117143 f"Input dimensions { input_dims } are smaller than the crop size { self .size } ."
118144 )
145+
146+ # Note: This logic must match the logic in
147+ # torchvision.transforms.v2.RandomCrop.make_params(). Given
148+ # the same seed, they should get the same result. This is an
149+ # API guarantee with our users.
119150 self ._top = torch .randint (
120151 0 , self ._input_dims [0 ] - self .size [0 ] + 1 , size = ()
121152 )
@@ -144,17 +175,16 @@ def _from_torchvision(cls, random_crop_tv: nn.Module, input_dims: Tuple[int, int
144175 "TorchVision RandomCrop transform must not specify pad_if_needed."
145176 )
146177 if random_crop_tv .fill != 0 :
147- raise ValueError ("TorchVision RandomCrop must specify fill of 0." )
178+ raise ValueError ("TorchVision RandomCrop fill must be 0." )
148179 if random_crop_tv .padding_mode != "constant" :
149- raise ValueError (
150- "TorchVision RandomCrop must specify padding_mode of constant."
151- )
180+ raise ValueError ("TorchVision RandomCrop padding_mode must be constant." )
152181 if len (random_crop_tv .size ) != 2 :
153182 raise ValueError (
154183 "TorchVision RandcomCrop transform must have a (height, width) "
155184 f"pair for the size, got { random_crop_tv .size } ."
156185 )
157186 params = random_crop_tv .make_params (
187+ # TODO: deal with NCHW versus NHWC; video decoder knows
158188 torch .empty (size = (3 , * input_dims ), dtype = torch .uint8 )
159189 )
160190 assert random_crop_tv .size == (params ["height" ], params ["width" ])
0 commit comments