Commit 40005ae

improved docs for PyTorch code
1 parent 82daf43 commit 40005ae

3 files changed: +94 −30 lines changed

dlclive/dlclive.py
Lines changed: 43 additions & 2 deletions

@@ -46,14 +46,55 @@ class DLCLive:
         TensorFlow only. Optional ConfigProto for the TensorFlow session.

     single_animal: bool, default=True
-        PyTorch only.
+        PyTorch only. If True, the predicted pose array returned by the runner will be of shape
+        (num_bodyparts, 3). As multi-animal pose estimation can be run with the PyTorch
+        engine, setting this to False means the returned pose array will be of shape
+        (num_detections, num_bodyparts, 3).

     device: str, optional, default=None
-        PyTorch only.
+        PyTorch only. The device on which to run inference, e.g. "cpu", "cuda" or
+        "cuda:0". If set to None or "auto", the device will be automatically selected
+        based on CUDA availability.

     top_down_config: dict, optional, default=None
+        PyTorch only. Configuration settings for top-down pose estimation models. Must
+        be provided when running top-down models and `top_down_dynamic` is None. The
+        parameters in the dict will be given to the `TopDownConfig` class (in
+        `dlclive/pose_estimation_pytorch/runner.py`). The `crop_size` does not need to
+        be set, as it will be read from the model configuration file.
+        Example parameters:
+        >>> # Running a top-down model with basic parameters
+        >>> top_down_config = {
+        >>>     "bbox_cutoff": 0.5,   # min confidence score for a bbox to be used
+        >>>     "max_detections": 3,  # max number of detections to return in a frame
+        >>> }
+        >>> # Running a top-down model with skip-frames
+        >>> top_down_config = {
+        >>>     "bbox_cutoff": 0.5,   # min confidence score for a bbox to be used
+        >>>     "max_detections": 3,  # max number of detections to return in a frame
+        >>>     "skip_frames": {      # only run the detector every 5 frames
+        >>>         "skip": 5,        # number of frames to skip between detections
+        >>>         "margin": 5,      # margin (in pixels) to use when generating bboxes
+        >>>     },
+        >>> }

     top_down_dynamic: dict, optional, default=None
+        PyTorch only. Single animal only. Top-down models do not need a detector to be
+        used for single animal pose estimation. This is equivalent to dynamic cropping
+        in TensorFlow or for bottom-up models, but crops are resized to the input size
+        required by the model. Pose estimation is never run on the full image. If no
+        animal is detected, the image is split into N by M "patches", and we run pose
+        estimation on the batch of patches. Pose is kept from the patch with the
+        highest likelihood. No need to provide the `top_down_crop_size` parameter, as it
+        is set using the model configuration file.
+        The parameters (except "type") will be passed to the `TopDownDynamicCropper`
+        class (in `dlclive/pose_estimation_pytorch/dynamic_cropping.py`).
+
+        Example parameters:
+        >>> top_down_dynamic = {
+        >>>     "type": "TopDownDynamicCropper",
+        >>>     "min_bbox_size": (50, 50),
+        >>> }

     cropping: list of int
         Cropping parameters in pixel number: [x1, x2, y1, y2]
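
Below is a minimal sketch of how these documented options could be passed when constructing a `DLCLive` object. The keyword names follow the docstring above; the import path is assumed, and the model path and parameter values are hypothetical.

from dlclive import DLCLive

# Hypothetical exported PyTorch model path; device=None or "auto" would pick
# the device automatically based on CUDA availability.
dlc_live = DLCLive(
    "path/to/exported_pytorch_model",
    device="cuda",
    single_animal=False,  # pose array will be (num_detections, num_bodyparts, 3)
    top_down_config={
        "bbox_cutoff": 0.5,   # min confidence score for a bbox to be used
        "max_detections": 3,  # max number of detections to return in a frame
    },
)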

dlclive/pose_estimation_pytorch/dynamic_cropping.py
Lines changed: 10 additions & 10 deletions

@@ -260,18 +260,19 @@ class TopDownDynamicCropper(DynamicCropper):

     def __init__(
         self,
-        top_down_crop_size: tuple[int, int],
-        patch_counts: tuple[int, int],
-        patch_overlap: int,
-        min_bbox_size: tuple[int, int],
-        threshold: float,
-        margin: int,
+        top_down_crop_size: tuple[int, int] = (256, 256),
+        patch_counts: tuple[int, int] = (4, 3),
+        patch_overlap: int = 50,
+        min_bbox_size: tuple[int, int] = (100, 100),
+        threshold: float = 0.6,
+        margin: int = 10,
         min_hq_keypoints: int = 2,
         bbox_from_hq: bool = False,
         store_crops: bool = False,
         **kwargs,
     ) -> None:
         super().__init__(threshold=threshold, margin=margin, **kwargs)
+        self.top_down_crop_size = top_down_crop_size
         self.min_bbox_size = min_bbox_size
         self.min_hq_keypoints = min_hq_keypoints
         self.bbox_from_hq = bbox_from_hq
@@ -280,8 +281,7 @@ def __init__(
         self._patch_overlap = patch_overlap
         self._patches = []
         self._patch_offsets = []
-        self._td_crop_size = top_down_crop_size
-        self._td_ratio = self._td_crop_size[0] / self._td_crop_size[1]
+        self._td_ratio = self.top_down_crop_size[0] / self.top_down_crop_size[1]

         self.crop_history = []
         self.store_crops = store_crops
@@ -363,7 +363,7 @@ def update(self, pose: torch.Tensor) -> torch.Tensor:
         )

         # offset and rescale the pose to the original image space
-        out_w, out_h = self._td_crop_size
+        out_w, out_h = self.top_down_crop_size
         offset_x, offset_y, w, h = self._crop
         scale_x, scale_y = w / out_w, h / out_h
         pose[..., 0] = (pose[..., 0] * scale_x) + offset_x
@@ -448,7 +448,7 @@ def _crop_bounding_box(
             The cropped and resized image.
         """
         x1, y1, w, h = bbox
-        out_w, out_h = self._td_crop_size
+        out_w, out_h = self.top_down_crop_size
         return F.resized_crop(image, y1, x1, h, w, [out_h, out_w])

     def _crop_patches(self, image: torch.Tensor) -> torch.Tensor:
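
Since every constructor argument now has a default, the cropper can be built with only the values one wants to override. A brief sketch under that assumption (import path taken from the file shown above, example values hypothetical):

from dlclive.pose_estimation_pytorch.dynamic_cropping import TopDownDynamicCropper

# Only override what differs from the defaults introduced in the diff above.
cropper = TopDownDynamicCropper(min_bbox_size=(50, 50))

# top_down_crop_size is now a public attribute, which is what lets the runner
# overwrite it with the crop size read from the model configuration file.
cropper.top_down_crop_size = (256, 256)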

dlclive/pose_estimation_pytorch/runner.py
Lines changed: 41 additions & 18 deletions

@@ -32,6 +32,15 @@ class SkipFrames:
     then the detector will only be run every `skip` frames. Between frames where the
     detector is run, bounding boxes will be computed from the pose estimated in the
     previous frame.
+
+    Every `N` frames, the detector will be run to detect bounding boxes for individuals.
+    In the "skipped" frames between the frames where the object detector is run, the
+    bounding boxes will be computed from the poses estimated in the previous frame (with
+    some margin added around the poses).
+
+    Attributes:
+        skip: The number of frames to skip between each run of the detector.
+        margin: The margin (in pixels) to use when generating bboxes.
     """

     skip: int
@@ -78,20 +87,28 @@ class TopDownConfig:
     """Configuration for top-down models.

     Attributes:
+        bbox_cutoff: The minimum score required for a bounding box to be considered.
+        max_detections: The maximum number of detections to keep in a frame. If None,
+            the `max_detections` will be set to the number of individuals in the model
+            configuration file when `read_config` is called.
         skip_frames: If defined, the detector will only be run every
             `skip_frames.skip` frames.
     """

-    bbox_cutoff: float
-    max_detections: int
+    bbox_cutoff: float = 0.6
+    max_detections: int | None = 30
     crop_size: tuple[int, int] = (256, 256)
     skip_frames: SkipFrames | None = None

-    def read_config(self, detector_cfg: dict) -> None:
-        crop = detector_cfg.get("data", {}).get("inference", {}).get("top_down_crop")
+    def read_config(self, model_cfg: dict) -> None:
+        crop = model_cfg.get("data", {}).get("inference", {}).get("top_down_crop")
         if crop is not None:
             self.crop_size = (crop["width"], crop["height"])

+        if self.max_detections is None:
+            individuals = model_cfg.get("metadata", {}).get("individuals", [])
+            self.max_detections = len(individuals)
+

 class PyTorchRunner(BaseRunner):
     """PyTorch runner for live pose estimation using DeepLabCut-Live.
@@ -242,7 +259,7 @@ def load_model(self) -> None:
             self.model = self.model.half()

         self.detector = None
-        if raw_data.get("detector") is not None:
+        if self.dynamic is None and raw_data.get("detector") is not None:
             self.detector = models.DETECTORS.build(self.cfg["detector"]["model"])
             self.detector.to(self.device)
             self.detector.load_state_dict(raw_data["detector"])
@@ -251,18 +268,23 @@ def load_model(self) -> None:
             if self.precision == "FP16":
                 self.detector = self.detector.half()

-        if self.cfg["method"] == "td" and self.detector is None:
-            crop_cfg = self.cfg["data"]["inference"]["top_down_crop"]
-            top_down_crop_size = crop_cfg["width"], crop_cfg["height"]
-            self.dynamic = dynamic_cropping.TopDownDynamicCropper(
-                top_down_crop_size,
-                patch_counts=(4, 3),
-                patch_overlap=50,
-                min_bbox_size=(250, 250),
-                threshold=0.6,
-                margin=25,
-                min_hq_keypoints=2,
-                bbox_from_hq=True,
+        if self.top_down_config is None:
+            self.top_down_config = TopDownConfig()
+
+        self.top_down_config.read_config(self.cfg)
+
+        if isinstance(self.dynamic, dynamic_cropping.TopDownDynamicCropper):
+            crop = self.cfg["data"]["inference"].get("top_down_crop", {})
+            w, h = crop.get("width", 256), crop.get("height", 256)
+            self.dynamic.top_down_crop_size = w, h
+
+        if (
+            self.cfg["method"] == "td"
+            and self.detector is None
+            and self.dynamic is None
+        ):
+            raise ValueError(
+                "Top-down models must either use a detector or a TopDownDynamicCropper."
             )

         self.transform = v2.Compose(
@@ -283,9 +305,10 @@ def read_config(self) -> dict:
     def _prepare_top_down(
         self, frame: torch.Tensor, detections: dict[str, torch.Tensor]
     ):
+        """Prepares a frame for top-down pose estimation."""
         bboxes, scores = detections["boxes"], detections["scores"]
         bboxes = bboxes[scores >= self.top_down_config.bbox_cutoff]
-        if len(bboxes) > 0:
+        if len(bboxes) > 0 and self.top_down_config.max_detections is not None:
            bboxes = bboxes[: self.top_down_config.max_detections]

         crops = []
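
For illustration, a small sketch of how the updated `TopDownConfig.read_config` fills in `crop_size` and `max_detections` from a model configuration dict. The config dict below is a hypothetical minimal example, and `SkipFrames` is assumed to be constructible from its documented `skip` and `margin` attributes.

from dlclive.pose_estimation_pytorch.runner import SkipFrames, TopDownConfig

# max_detections=None means "use the number of individuals in the model config".
config = TopDownConfig(
    bbox_cutoff=0.5,
    max_detections=None,
    skip_frames=SkipFrames(skip=5, margin=5),  # run the detector every 5 frames
)

# Hypothetical minimal model configuration with the keys read_config looks up.
model_cfg = {
    "data": {"inference": {"top_down_crop": {"width": 256, "height": 256}}},
    "metadata": {"individuals": ["animal_1", "animal_2", "animal_3"]},
}

config.read_config(model_cfg)
print(config.crop_size)       # (256, 256)
print(config.max_detections)  # 3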
