
Commit 51a6574

Add SanitizeKeyPoints transform
1 parent 7a13ad0 commit 51a6574

File tree: 6 files changed (+266, -2 lines)

docs/source/transforms.rst

Lines changed: 3 additions & 0 deletions

@@ -413,6 +413,7 @@ Miscellaneous
     v2.RandomErasing
     v2.Lambda
     v2.SanitizeBoundingBoxes
+    v2.SanitizeKeyPoints
     v2.ClampBoundingBoxes
     v2.ClampKeyPoints
     v2.UniformTemporalSubsample
@@ -427,6 +428,7 @@ Functionals
     v2.functional.normalize
     v2.functional.erase
     v2.functional.sanitize_bounding_boxes
+    v2.functional.sanitize_keypoints
     v2.functional.clamp_bounding_boxes
     v2.functional.clamp_keypoints
     v2.functional.uniform_temporal_subsample
@@ -530,6 +532,7 @@ Developer tools
     v2.query_size
     v2.query_chw
     v2.get_bounding_boxes
+    v2.get_keypoints


 V1 API Reference

torchvision/transforms/v2/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -51,10 +51,11 @@
     LinearTransformation,
     Normalize,
     SanitizeBoundingBoxes,
+    SanitizeKeyPoints,
     ToDtype,
 )
 from ._temporal import UniformTemporalSubsample
 from ._type_conversion import PILToTensor, ToImage, ToPILImage, ToPureTensor
-from ._utils import check_type, get_bounding_boxes, has_all, has_any, query_chw, query_size
+from ._utils import check_type, get_bounding_boxes, get_keypoints, has_all, has_any, query_chw, query_size

 from ._deprecated import ToTensor  # usort: skip

torchvision/transforms/v2/_misc.py

Lines changed: 123 additions & 1 deletion

@@ -10,7 +10,15 @@
 from torchvision import transforms as _transforms, tv_tensors
 from torchvision.transforms.v2 import functional as F, Transform

-from ._utils import _parse_labels_getter, _setup_number_or_seq, _setup_size, get_bounding_boxes, has_any, is_pure_tensor
+from ._utils import (
+    _parse_labels_getter,
+    _setup_number_or_seq,
+    _setup_size,
+    get_bounding_boxes,
+    get_keypoints,
+    has_any,
+    is_pure_tensor,
+)


 # TODO: do we want/need to expose this?
@@ -459,3 +467,117 @@ def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
             return output
         else:
             return tv_tensors.wrap(output, like=inpt)
+
+
+class SanitizeKeyPoints(Transform):
+    """Remove keypoints outside of the image area and their corresponding labels (if any).
+
+    This transform removes keypoints or groups of keypoints and their associated labels that
+    have coordinates outside of their corresponding image or within ``min_valid_edge_distance`` pixels
+    from the image edges.
+    If you would instead like to clamp such keypoints to the image edges, use
+    :class:`~torchvision.transforms.v2.ClampKeyPoints`.
+
+    It is recommended to call it at the end of a pipeline, before passing the
+    input to the models.
+
+    Keypoints can be passed as a set of individual keypoints of shape ``[N_points, 2]`` or as a
+    set of objects (e.g., polygons or polygonal chains) consisting of a fixed number of keypoints
+    of shape ``[N_objects, ..., 2]``.
+    When groups of keypoints are passed (i.e., an at least 3-dimensional tensor), this transform
+    will only remove entire groups, not individual keypoints within a group.
+
+    Args:
+        min_valid_edge_distance (int, optional): The minimum distance that keypoints need to be away from the
+            closest image edge along any axis in order to be considered valid. For example, setting this to 0
+            will only invalidate/remove keypoints outside of the image area, while a value of 1 will also remove
+            keypoints lying exactly on the edge. Default is 0.
+        min_invalid_points (int or float, optional): Minimum number or fraction of invalid keypoints required
+            for a group of keypoints to be removed. For example, setting this to 1 will remove a group of keypoints
+            if any of its keypoints is invalid, while setting it to 2 will only remove groups with at least 2
+            invalid keypoints.
+            If a float in (0.0, 1.0) is passed, it represents a fraction of the total number of keypoints in
+            the group. For example, setting this to 0.3 will remove groups of keypoints with at least 30% invalid
+            keypoints.
+            Note that a value of ``1`` (integer) is very different from ``1.0`` (float). The former will remove
+            groups with any invalid keypoint, while the latter will only remove groups where all keypoints are
+            invalid. Default is 1.
+        labels_getter (callable or str or None, optional): indicates how to identify the labels in the input
+            (or anything else that needs to be sanitized along with the keypoints).
+            By default, this will try to find a "labels" key in the input (case-insensitive), if
+            the input is a dict or it is a tuple whose second element is a dict.
+
+            It can also be a callable that takes the same input as the transform, and returns either:
+
+            - A single tensor (the labels)
+            - A tuple/list of tensors, each of which will be subject to the same sanitization as the keypoints.
+
+            If ``labels_getter`` is None then only keypoints are sanitized.
+    """
+
+    def __init__(
+        self,
+        min_valid_edge_distance: int = 0,
+        min_invalid_points: int | float = 1,
+        labels_getter: Union[Callable[[Any], Any], str, None] = "default",
+    ) -> None:
+        super().__init__()
+        self.min_valid_edge_distance = min_valid_edge_distance
+        self.min_invalid_points = min_invalid_points
+        self.labels_getter = labels_getter
+        self._labels_getter = _parse_labels_getter(labels_getter)
+
+        if min_invalid_points <= 0:
+            raise ValueError(f"min_invalid_points must be > 0. Got {min_invalid_points}.")
+
+    def forward(self, *inputs: Any) -> Any:
+        inputs = inputs if len(inputs) > 1 else inputs[0]
+
+        labels = self._labels_getter(inputs)
+        if labels is not None:
+            msg = "The labels in the input to forward() must be a tensor or None, got {type} instead."
+            if isinstance(labels, torch.Tensor):
+                labels = (labels,)
+            elif isinstance(labels, (tuple, list)):
+                for entry in labels:
+                    if not isinstance(entry, torch.Tensor):
+                        # TODO: we don't need to enforce tensors, just that entries are indexable as t[bool_mask]
+                        raise ValueError(msg.format(type=type(entry)))
+            else:
+                raise ValueError(msg.format(type=type(labels)))
+
+        flat_inputs, spec = tree_flatten(inputs)
+        points = get_keypoints(flat_inputs)
+
+        if labels is not None:
+            for label in labels:
+                if points.shape[0] != label.shape[0]:
+                    raise ValueError(
+                        f"Number of keypoints (shape={points.shape}) must match the number of labels. "
+                        f"Found labels with shape={label.shape})."
+                    )
+
+        valid = F._misc._get_sanitize_keypoints_mask(
+            points,
+            canvas_size=points.canvas_size,
+            min_valid_edge_distance=self.min_valid_edge_distance,
+            min_invalid_points=self.min_invalid_points,
+        )
+
+        params = dict(valid=valid, labels=labels)
+        flat_outputs = [self.transform(inpt, params) for inpt in flat_inputs]
+
+        return tree_unflatten(flat_outputs, spec)
+
+    def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
+        is_label = params["labels"] is not None and any(inpt is label for label in params["labels"])
+        is_keypoints = isinstance(inpt, tv_tensors.KeyPoints)
+
+        if not (is_label or is_keypoints):
+            return inpt
+
+        output = inpt[params["valid"]]
+
+        if is_label:
+            return output
+        else:
+            return tv_tensors.wrap(output, like=inpt)
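
For readers unfamiliar with the v2 sanitization transforms, here is a minimal usage sketch (not part of this commit); the sample values and the expected outputs noted in the comments are assumptions derived from the docstring and mask logic above:

import torch
from torchvision import tv_tensors
from torchvision.transforms import v2

# Three keypoints on a 10x10 canvas; the last one lies outside the image.
points = tv_tensors.KeyPoints(
    [[2.0, 3.0], [9.0, 9.0], [15.0, 4.0]],
    canvas_size=(10, 10),
)
sample = {"keypoints": points, "labels": torch.tensor([0, 1, 2])}

# The default labels_getter finds the "labels" key of the dict, so the label
# of the removed keypoint is dropped along with it.
out = v2.SanitizeKeyPoints()(sample)
# Expected: out["keypoints"].shape == (2, 2) and out["labels"] == tensor([0, 1])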

torchvision/transforms/v2/_utils.py

Lines changed: 12 additions & 0 deletions

@@ -165,6 +165,18 @@ def get_bounding_boxes(flat_inputs: list[Any]) -> tv_tensors.BoundingBoxes:
         raise ValueError("No bounding boxes were found in the sample")


+def get_keypoints(flat_inputs: list[Any]) -> tv_tensors.KeyPoints:
+    """Return the keypoints in the input.
+
+    Assumes only one ``KeyPoints`` object is present.
+    """
+    # This assumes there is only one keypoint per sample as per the general convention
+    try:
+        return next(inpt for inpt in flat_inputs if isinstance(inpt, tv_tensors.KeyPoints))
+    except StopIteration:
+        raise ValueError("No keypoints were found in the sample")
+
+
 def query_chw(flat_inputs: list[Any]) -> tuple[int, int, int]:
     """Return Channel, Height, and Width."""
     chws = {
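
As a small illustration (not part of the diff), get_keypoints() mirrors get_bounding_boxes(): it expects the already-flattened sample that Transform.forward() produces via tree_flatten and returns the single KeyPoints entry, raising if none is found:

import torch
from torchvision import tv_tensors
from torchvision.transforms.v2 import get_keypoints

image = torch.rand(3, 10, 10)
points = tv_tensors.KeyPoints([[1.0, 2.0]], canvas_size=(10, 10))

# The helper scans the flat list and returns the KeyPoints object itself.
assert get_keypoints([image, points]) is points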

torchvision/transforms/v2/functional/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -156,6 +156,7 @@
     normalize_image,
     normalize_video,
     sanitize_bounding_boxes,
+    sanitize_keypoints,
     to_dtype,
     to_dtype_image,
     to_dtype_video,

torchvision/transforms/v2/functional/_misc.py

Lines changed: 125 additions & 0 deletions

@@ -442,3 +442,128 @@ def _get_sanitize_bounding_boxes_mask(
         valid &= (bounding_boxes[..., 4] <= image_w) & (bounding_boxes[..., 5] <= image_h)
         valid &= (bounding_boxes[..., 6] <= image_w) & (bounding_boxes[..., 7] <= image_h)
     return valid
+
+
+def sanitize_keypoints(
+    key_points: torch.Tensor,
+    canvas_size: Optional[tuple[int, int]] = None,
+    min_valid_edge_distance: int = 0,
+    min_invalid_points: int | float = 1,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Remove keypoints outside of the image area and return the corresponding indexing mask.
+
+    This function removes keypoints or groups of keypoints that have coordinates outside of their
+    corresponding image or within ``min_valid_edge_distance`` pixels from the image edges.
+    If you would instead like to clamp such keypoints to the image edges, use
+    :class:`~torchvision.transforms.v2.ClampKeyPoints`.
+
+    It is recommended to call it at the end of a pipeline, before passing the
+    input to the models.
+
+    Keypoints can be passed as a set of individual keypoints of shape ``[N_points, 2]`` or as a
+    set of objects (e.g., polygons or polygonal chains) consisting of a fixed number of keypoints
+    of shape ``[N_objects, ..., 2]``.
+    When groups of keypoints are passed (i.e., an at least 3-dimensional tensor), this function
+    will only remove entire groups, not individual keypoints within a group.
+
+    Args:
+        key_points (Tensor or :class:`~torchvision.tv_tensors.KeyPoints`): The keypoints to be sanitized.
+        canvas_size (tuple of int, optional): The canvas_size of the keypoints
+            (size of the corresponding image/video).
+            Must be left to ``None`` if ``key_points`` is a :class:`~torchvision.tv_tensors.KeyPoints` object.
+        min_valid_edge_distance (int, optional): The minimum distance that keypoints need to be away from the
+            closest image edge along any axis in order to be considered valid. For example, setting this to 0
+            will only invalidate/remove keypoints outside of the image area, while a value of 1 will also remove
+            keypoints lying exactly on the edge. Default is 0.
+        min_invalid_points (int or float, optional): Minimum number or fraction of invalid keypoints required
+            for a group of keypoints to be removed. For example, setting this to 1 will remove a group of keypoints
+            if any of its keypoints is invalid, while setting it to 2 will only remove groups with at least 2
+            invalid keypoints.
+            If a float in (0.0, 1.0) is passed, it represents a fraction of the total number of keypoints in
+            the group. For example, setting this to 0.3 will remove groups of keypoints with at least 30% invalid
+            keypoints.
+            Note that a value of ``1`` (integer) is very different from ``1.0`` (float). The former will remove
+            groups with any invalid keypoint, while the latter will only remove groups where all keypoints are
+            invalid. Default is 1.
+
+    Returns:
+        out (tuple of Tensors): The subset of valid keypoints, and the corresponding indexing mask.
+        The mask can then be used to subset other tensors (e.g. labels) that are associated with the keypoints.
+    """
+    if torch.jit.is_scripting() or is_pure_tensor(key_points):
+        if canvas_size is None:
+            raise ValueError(
+                "canvas_size cannot be None if key_points is a pure tensor. "
+                "Set it to an appropriate value or pass key_points as a tv_tensors.KeyPoints object."
+            )
+        valid = _get_sanitize_keypoints_mask(
+            key_points,
+            canvas_size=canvas_size,
+            min_valid_edge_distance=min_valid_edge_distance,
+            min_invalid_points=min_invalid_points,
+        )
+        key_points = key_points[valid]
+    else:
+        if not isinstance(key_points, tv_tensors.KeyPoints):
+            raise ValueError("key_points must be a tv_tensors.KeyPoints instance or a pure tensor.")
+        if canvas_size is not None:
+            raise ValueError(
+                "canvas_size must be None when key_points is a tv_tensors.KeyPoints instance. "
+                f"Got canvas_size={canvas_size}. "
+                "Leave it to None or pass key_points as a pure tensor."
+            )
+        valid = _get_sanitize_keypoints_mask(
+            key_points,
+            canvas_size=key_points.canvas_size,
+            min_valid_edge_distance=min_valid_edge_distance,
+            min_invalid_points=min_invalid_points,
+        )
+        key_points = tv_tensors.wrap(key_points[valid], like=key_points)
+
+    return key_points, valid
+
+
+def _get_sanitize_keypoints_mask(
+    key_points: torch.Tensor,
+    canvas_size: tuple[int, int],
+    min_valid_edge_distance: int = 0,
+    min_invalid_points: int | float = 1,
+) -> torch.Tensor:
+
+    image_h, image_w = canvas_size
+
+    # Bring keypoints tensor into canonical shape [N_instances, N_points, 2]
+    if key_points.ndim == 2:
+        key_points = key_points.unsqueeze(dim=1)
+    elif key_points.ndim > 3:
+        key_points = key_points.flatten(start_dim=1, end_dim=-2)
+
+    # Convert min_invalid_points from relative to absolute number of points
+    if min_invalid_points <= 0:
+        raise ValueError(f"min_invalid_points must be > 0. Got {min_invalid_points}.")
+    if isinstance(min_invalid_points, float):
+        min_invalid_points = math.ceil(min_invalid_points * key_points.shape[1])
+    if min_invalid_points > 1 and key_points.shape[1] == 1:
+        raise ValueError(
+            f"min_invalid_points was set to {min_invalid_points}, but key_points only contains a single point per "
+            "instance, so min_invalid_points must be 1."
+        )
+
+    # Compute distance of each point to the closest image edge
+    dists = torch.stack(
+        [
+            key_points[..., 0],  # x
+            image_w - 1 - key_points[..., 0],  # image_w - x
+            key_points[..., 1],  # y
+            image_h - 1 - key_points[..., 1],  # image_h - y
+        ],
+        dim=-1,
+    )
+    dists = dists.min(dim=-1).values  # [N_instances, N_points]
+
+    # Determine invalid points
+    invalid_points = dists < min_valid_edge_distance  # [N_instances, N_points]
+
+    # Determine valid instances
+    valid = invalid_points.sum(dim=-1) < min_invalid_points  # [N_instances]
+    return valid
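
To make the group semantics concrete, here is a short sketch of the functional form (an illustration, not part of the commit); it uses a pure tensor, so canvas_size must be passed explicitly, and the expected masks noted in the comments follow from the ceil-based fraction handling above:

import torch
from torchvision.transforms.v2 import functional as F

# Two triangles (groups of 3 keypoints each) on a 10x10 canvas; the second
# triangle has two of its three vertices outside the image.
triangles = torch.tensor(
    [
        [[1.0, 1.0], [4.0, 1.0], [2.0, 5.0]],
        [[8.0, 8.0], [12.0, 8.0], [8.0, 12.0]],
    ]
)

# Default min_invalid_points=1: any invalid vertex removes the whole group.
kept, valid = F.sanitize_keypoints(triangles, canvas_size=(10, 10))
# Expected: kept.shape == (1, 3, 2) and valid == tensor([True, False])

# Fractional threshold: ceil(0.8 * 3) = 3 vertices must be invalid before a
# group is dropped, so the second triangle (2/3 invalid) is kept this time.
kept, valid = F.sanitize_keypoints(triangles, canvas_size=(10, 10), min_invalid_points=0.8)
# Expected: valid == tensor([True, True])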
