19 | 19 | create_decoder, |
20 | 20 | ERROR_REPORTING_INSTRUCTIONS, |
21 | 21 | ) |
22 | | -from torchcodec.transforms import DecoderTransform, RandomCrop, Resize |
| 22 | +from torchcodec.transforms import DecoderTransform |
| 23 | +from torchcodec.transforms._decoder_transforms import _make_transform_specs |
23 | 24 |
24 | 25 |
25 | 26 | class VideoDecoder: |
@@ -451,102 +452,6 @@ def _get_and_validate_stream_metadata( |
451 | 452 | ) |
452 | 453 |
453 | 454 |
454 | | -def _make_transform_specs( |
455 | | - transforms: Optional[Sequence[Union[DecoderTransform, nn.Module]]], |
456 | | - input_dims: Tuple[Optional[int], Optional[int]], |
457 | | -) -> str: |
458 | | - """Given a sequence of transforms, turn those into the specification string |
459 | | - the core API expects. |
460 | | -
461 | | - Args: |
462 | | - transforms: Optional sequence of transform objects. The objects can be |
463 | | - one of two types: |
464 | | - 1. torchcodec.transforms.DecoderTransform |
465 | | - 2. torchvision.transforms.v2.Transform, but our type annotation |
466 | | - only mentions its base, nn.Module. We don't want to take a |
467 | | - hard dependency on TorchVision. |
468 | | - input_dims: Optional (height, width) pair. Note that only some |
469 | | - transforms need to know the dimensions. If the user provides |
470 | | - transforms that don't need to know the dimensions, and that metadata |
471 | | - is missing, everything should still work. That means we assert their |
472 | | - existence as late as possible. |
473 | | -
474 | | - Returns: |
475 | | - String of transforms in the format the core API expects: transform |
476 | | - specifications separated by semicolons. |
477 | | - """ |
478 | | - if transforms is None: |
479 | | - return "" |
480 | | - |
481 | | - try: |
482 | | - from torchvision.transforms import v2 |
483 | | - |
484 | | - tv_available = True |
485 | | - except ImportError: |
486 | | - tv_available = False |
487 | | - |
488 | | - # The following loop accomplishes two tasks: |
489 | | - # |
490 | | - # 1. Converts the transform to a DecoderTransform, if necessary. We |
491 | | - # accept TorchVision transform objects and they must be converted |
492 | | - # to their matching DecoderTransform. |
493 | | - # 2. Calculates what the input dimensions are to each transform. |
494 | | - # |
495 | | - # The order in our transforms list is semantically meaningful, as we |
496 | | - # actually have a pipeline where the output of one transform is the input to |
497 | | - # the next. For example, if we have the transforms list [A, B, C, D], then |
498 | | - # we should understand that as: |
499 | | - # |
500 | | - # A -> B -> C -> D |
501 | | - # |
502 | | - # Where the frame produced by A is the input to B, the frame produced by B |
503 | | - # is the input to C, etc. This particularly matters for frame dimensions. |
504 | | - # Transforms can both: |
505 | | - # |
506 | | - # 1. Produce frames with arbitrary dimensions. |
507 | | - # 2. Rely on their input frame's dimensions to calculate ahead-of-time |
508 | | - # what their runtime behavior will be. |
509 | | - # |
510 | | - # The consequence of the above facts is that we need to statically track |
511 | | - # frame dimensions in the pipeline while we pre-process it. The input |
512 | | - frame's dimensions to A, our first transform, are always what we know from |
513 | | - # our metadata. For each transform, we always calculate its output |
514 | | - # dimensions from its input dimensions. We store these with the converted |
515 | | - # transform, to be all used together when we generate the specs. |
516 | | - converted_transforms: list[ |
517 | | - Tuple[ |
518 | | - DecoderTransform, |
519 | | - # A (height, width) pair where the values may be missing. |
520 | | - Tuple[Optional[int], Optional[int]], |
521 | | - ] |
522 | | - ] = [] |
523 | | - curr_input_dims = input_dims |
524 | | - for transform in transforms: |
525 | | - if not isinstance(transform, DecoderTransform): |
526 | | - if not tv_available: |
527 | | - raise ValueError( |
528 | | - f"The supplied transform, {transform}, is not a TorchCodec " |
529 | | - " DecoderTransform. TorchCodec also accepts TorchVision " |
530 | | - "v2 transforms, but TorchVision is not installed." |
531 | | - ) |
532 | | - elif isinstance(transform, v2.Resize): |
533 | | - transform = Resize._from_torchvision(transform) |
534 | | - elif isinstance(transform, v2.RandomCrop): |
535 | | - transform = RandomCrop._from_torchvision(transform) |
536 | | - else: |
537 | | - raise ValueError( |
538 | | - f"Unsupported transform: {transform}. Transforms must be " |
539 | | - "either a TorchCodec DecoderTransform or a TorchVision " |
540 | | - "v2 transform." |
541 | | - ) |
542 | | - |
543 | | - converted_transforms.append((transform, curr_input_dims)) |
544 | | - output_dims = transform._get_output_dims() |
545 | | - curr_input_dims = output_dims if output_dims is not None else curr_input_dims |
546 | | - |
547 | | - return ";".join([t._make_transform_spec(dims) for t, dims in converted_transforms]) |
548 | | - |
549 | | - |
550 | 455 | def _read_custom_frame_mappings( |
551 | 456 | custom_frame_mappings: Union[str, bytes, io.RawIOBase, io.BufferedReader] |
552 | 457 | ) -> tuple[Tensor, Tensor, Tensor]: |
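
The hunk above moves `_make_transform_specs` out of this file and into `torchcodec/transforms/_decoder_transforms.py`, re-importing it at the top. Below is a minimal usage sketch of the relocated helper, assuming its signature from the removed hunk (`transforms`, `input_dims`) is unchanged after the move; the TorchVision `v2` transforms and the `(1080, 1920)` metadata dimensions are illustrative values, not taken from this PR.

```python
# Sketch only: exercises the private helper at its new import location.
from torchcodec.transforms._decoder_transforms import _make_transform_specs
from torchvision.transforms import v2  # optional dependency; needed only for v2 transforms

# (height, width) from the stream metadata; either entry may be None if the
# container does not report it.
input_dims = (1080, 1920)

# Order is a pipeline: Resize runs first, so RandomCrop sees 720x1280 frames.
# Each TorchVision transform is converted to its matching DecoderTransform
# before the semicolon-separated spec string is built.
specs = _make_transform_specs(
    transforms=[v2.Resize(size=(720, 1280)), v2.RandomCrop(size=(224, 224))],
    input_dims=input_dims,
)
```

The dimension threading described in the removed comments matters here: RandomCrop's spec is computed against the 720x1280 output of Resize, not the original 1080x1920 metadata dimensions.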
|