"""

# %%
from typing import Any, Dict

import torch
from torchvision import tv_tensors
from torchvision.transforms import v2

# %%
# (Earlier parts of the tutorial, omitted here, define ``img``, ``bboxes`` and
# ``label``, and build a first custom transform whose
# ``forward(self, img, bboxes, label)`` method assumes the inputs always come in
# that fixed structure.)
#
# A key feature of the built-in Torchvision V2 transforms is that they can accept
# arbitrary input structures and return the same structure as output (with the
# transformed entries). For example, transforms can accept a single image, a
# tuple of ``(img, label)``, or an arbitrary nested dictionary as input. Here's
# an example with the built-in :class:`~torchvision.transforms.v2.RandomHorizontalFlip` transform:

structured_input = {
    "img": img,
    "annotations": (bboxes, label),
    "something that will be ignored": (1, "hello"),
    "another tensor that is ignored": torch.arange(10),
}
structured_output = v2.RandomHorizontalFlip(p=1)(structured_input)

assert isinstance(structured_output, dict)
assert structured_output["something that will be ignored"] == (1, "hello")
assert (structured_output["another tensor that is ignored"] == torch.arange(10)).all()
print(f"The input bboxes are:\n{structured_input['annotations'][0]}")
print(f"The transformed bboxes are:\n{structured_output['annotations'][0]}")

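# %%
# The same holds when built-in transforms are chained: a
# :class:`~torchvision.transforms.v2.Compose` pipeline also accepts and preserves
# the structure. The extra :class:`~torchvision.transforms.v2.RandomVerticalFlip`
# step below is just an arbitrary choice for illustration:

pipeline = v2.Compose([
    v2.RandomHorizontalFlip(p=1),
    v2.RandomVerticalFlip(p=1),
])
structured_output = pipeline(structured_input)
assert isinstance(structured_output, dict)
assert structured_output["something that will be ignored"] == (1, "hello")
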
# %%
# In order to support arbitrary inputs in your custom transform, you will need
# to inherit from :class:`~torchvision.transforms.v2.Transform` and override the
# ``transform()`` method (not the ``forward()`` method!).


class MyCustomTransform(v2.Transform):
    def transform(self, inpt: Any, params: Dict[str, Any]):
        if type(inpt) == torch.Tensor:
            # Exact type check on purpose: TVTensors such as BoundingBoxes are
            # tensor subclasses, so isinstance() would match them too.
            print(f"I'm transforming an image of shape {inpt.shape}")
            return inpt + 1  # dummy transformation
        elif isinstance(inpt, tv_tensors.BoundingBoxes):
            print(f"I'm transforming bounding boxes! {inpt.canvas_size = }")
            return tv_tensors.wrap(inpt + 100, like=inpt)  # dummy transformation


my_custom_transform = MyCustomTransform()
structured_output = my_custom_transform(structured_input)

assert isinstance(structured_output, dict)
assert structured_output["something that will be ignored"] == (1, "hello")
assert (structured_output["another tensor that is ignored"] == torch.arange(10)).all()
print(f"The input bboxes are:\n{structured_input['annotations'][0]}")
print(f"The transformed bboxes are:\n{structured_output['annotations'][0]}")

# %%
# An important thing to note is that when we call ``my_custom_transform`` on
# ``structured_input``, the input is flattened and then each individual part is
# passed to ``transform()``. That is, ``transform()`` receives the input image,
# then the bounding boxes, etc. It is then within ``transform()`` that you can
# decide how to transform each input, based on its type.
#
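# Purely to illustrate what "flattened" means here, the sketch below peeks at
# the flat list of leaves with ``torch.utils._pytree`` (a private PyTorch
# utility that the V2 transforms rely on internally); this is for illustration
# only and is nothing your own transform needs to call:

from torch.utils import _pytree as pytree

flat_inputs, spec = pytree.tree_flatten(structured_input)
print(f"The flattened input has {len(flat_inputs)} leaves:")
for leaf in flat_inputs:
    print(f"  - {type(leaf).__name__}")
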
# %%
# If you're curious why the other tensor (``torch.arange()``) didn't get passed
# to ``transform()``, see :ref:`passthrough_heuristic`.
#
# Finally, if your transform is random and needs parameters (e.g. whether to
# flip at all, or a random crop size) that must be identical across all the
# inputs of a single call, you can override ``make_params()``: it is called once
# per call, receives the flat list of inputs, and the dict it returns is passed
# to each ``transform()`` invocation as the ``params`` argument.
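#
# As a minimal sketch (the exact ``make_params()`` signature and the ``p`` /
# ``apply`` names below are assumptions for illustration, not part of the
# torchvision API), a random variant of the transform above could look like
# this:


class MyRandomTransform(MyCustomTransform):
    def __init__(self, p: float = 0.5):
        super().__init__()
        self.p = p

    def make_params(self, flat_inputs) -> Dict[str, Any]:
        # Sampled once per call, so the same decision is shared by the image,
        # the bounding boxes, and every other transformable input.
        return dict(apply=torch.rand(1).item() < self.p)

    def transform(self, inpt: Any, params: Dict[str, Any]):
        if not params["apply"]:
            return inpt  # leave this input untouched
        return super().transform(inpt, params)


structured_output = MyRandomTransform(p=1.0)(structured_input)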