feat: Support CC-3DT++ for CR-3DT

RoyYang0714 · RoyYang0714 · commit c832021818e4 · 2025-08-05T00:44:49.000+02:00
diff --git a/scripts/eval_nusc/README.md b/scripts/eval_nusc/README.md
@@ -14,5 +14,17 @@ pip install -r nusc.txt
 - $VERSION is `mini` or `trainval` to select mini or validation split.
 
 ```bash
-bash eval_nusc.sh $WORK_DIR $VERSION
+# Detection
+python eval.py \
+--input $FOLDER_OF_PREDICTION \
+--version v1.0-$VERSION \
+--dataroot $NUSC_DATA_ROOT \
+--mode detection
+
+# Tracking
+python eval.py \
+--input $FOLDER_OF_PREDICTION \
+--version v1.0-$VERSION \
+--dataroot $NUSC_DATA_ROOT \
+--mode tracking
 ```
diff --git a/scripts/eval_nusc/eval.py b/scripts/eval_nusc/eval.py
@@ -1,4 +1,5 @@
 """nuScenes evaluation pipeline for Vis4D."""
+
 import argparse
 import os
 import json
@@ -13,35 +14,6 @@
 from nuscenes.eval.common.config import config_factory as track_configs
 
 
-def parse_arguments() -> argparse.Namespace:
-    """Parse arguments."""
-    parser = argparse.ArgumentParser(description="Vis4D for nuScenes eval.")
-    parser.add_argument(
-        "--input",
-        "-i",
-        help=("Path to save nuScenes format dection / tracking results."),
-    )
-    parser.add_argument(
-        "--version",
-        "-v",
-        choices=["v1.0-trainval", "v1.0-test", "v1.0-mini"],
-        help="NuScenes dataset version to convert.",
-    )
-    parser.add_argument(
-        "--dataroot",
-        "-d",
-        help="NuScenes dataset root.",
-    )
-    parser.add_argument(
-        "-m",
-        "--mode",
-        default="tracking",
-        choices=["tracking", "detection"],
-        help="Conversion mode: detection or tracking.",
-    )
-    return parser.parse_args()
-
-
 def eval_detection(
     version: str,
     dataroot: str,
@@ -114,7 +86,33 @@ def print_metric_summary(metric_summary_path: str) -> None:
 
 if __name__ == "__main__":
     """Main."""
-    args = parse_arguments()
+    parser = argparse.ArgumentParser(description="NuScenes eval for Vis4D.")
+    parser.add_argument(
+        "--input",
+        "-i",
+        help=(
+            "Folder path to the nuScenes format detection / tracking results."
+        ),
+    )
+    parser.add_argument(
+        "--version",
+        "-v",
+        choices=["v1.0-trainval", "v1.0-test", "v1.0-mini"],
+        help="NuScenes dataset version to convert.",
+    )
+    parser.add_argument(
+        "--dataroot",
+        "-d",
+        help="NuScenes dataset root.",
+    )
+    parser.add_argument(
+        "-m",
+        "--mode",
+        default="tracking",
+        choices=["tracking", "detection"],
+        help="Conversion mode: detection or tracking.",
+    )
+    args = parser.parse_args()
 
     if args.mode == "detection":
         metric = "detect_3d"
diff --git a/scripts/eval_nusc/eval_nusc.sh b/scripts/eval_nusc/eval_nusc.sh
diff --git a/vis4d/op/track3d/cc_3dt.py b/vis4d/op/track3d/cc_3dt.py
@@ -68,6 +68,7 @@ def __init__(
         nms_class_iou_thr: float = 0.7,
         nms_conf_thr: float = 0.5,
         with_cats: bool = True,
+        with_velocities: bool = False,
         bbox_affinity_weight: float = 0.5,
     ) -> None:
         """Creates an instance of the class.
@@ -83,10 +84,12 @@ def __init__(
                 another detection.
             nms_class_iou_thr (float): Maximum IoU of a high score detection
                 with another of a different class.
+            nms_conf_thr (float): Confidence threshold for NMS.
             with_cats (bool): If to consider category information for
                 tracking (i.e. all detections within a track must have
                 consistent category labels).
-            nms_conf_thr (float): Confidence threshold for NMS.
+            with_velocities (bool): If to use predicted velocities for
+                matching.
             bbox_affinity_weight (float): Weight of bbox affinity in the
                 overall affinity score.
         """
@@ -98,6 +101,7 @@ def __init__(
         self.nms_class_iou_thr = nms_class_iou_thr
         self.nms_conf_thr = nms_conf_thr
         self.with_cats = with_cats
+        self.with_velocities = with_velocities
         self.bbox_affinity_weight = bbox_affinity_weight
         self.feat_affinity_weight = 1 - bbox_affinity_weight
 
@@ -110,7 +114,8 @@ def _filter_detections(
         scores_3d: Tensor,
         class_ids: Tensor,
         embeddings: Tensor,
-    ) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:
+        velocities: Tensor | None = None,
+    ) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:
         """Remove overlapping objects across classes via nms.
 
         Args:
@@ -121,6 +126,7 @@ def _filter_detections(
             scores_3d (Tensor): [N,] Tensor of 3D confidence scores.
             class_ids (Tensor): [N,] Tensor of class ids.
             embeddings (Tensor): [N, C] tensor of appearance embeddings.
+            velocities (Tensor | None): [N, 3] Tensor of velocities.
 
         Returns:
             tuple[Tensor]: filtered detections, scores, class_ids,
@@ -142,6 +148,10 @@ def _filter_detections(
             detections_3d[inds],
             scores_3d[inds],
         )
+
+        if velocities is not None:
+            velocities = velocities[inds]
+
         valids = embeddings.new_ones((len(detections),), dtype=torch.bool)
 
         ious = bbox_iou(detections, detections)
@@ -158,25 +168,32 @@ def _filter_detections(
 
             if (ious[i, :i] > thr).any():
                 valids[i] = False
+
         detections = detections[valids]
         scores = scores[valids]
         detections_3d = detections_3d[valids]
         scores_3d = scores_3d[valids]
         class_ids = class_ids[valids]
         embeddings = embeddings[valids]
+
+        if velocities is not None:
+            velocities = velocities[valids]
+
         return (
             detections,
             scores,
             detections_3d,
             scores_3d,
             class_ids,
             embeddings,
+            velocities,
             inds[valids],
         )
 
-    @staticmethod
     def depth_ordering(
+        self,
         obsv_boxes_3d: Tensor,
+        obsv_velocities: Tensor | None,
         memory_boxes_3d_predict: Tensor,
         memory_boxes_3d: Tensor,
         memory_velocities: Tensor,
@@ -197,11 +214,11 @@ def depth_ordering(
 
         # Moving distance should be aligned
         motion_weight_list = []
-        obsv_velocities = (
+        moving_dist = (
             obsv_boxes_3d[:, :3, None]
             - memory_boxes_3d[:, :3, None].transpose(2, 0)
         ).transpose(1, 2)
-        for v in obsv_velocities:
+        for v in moving_dist:
             motion_weight_list.append(
                 F.pairwise_distance(  # pylint: disable=not-callable
                     v, memory_velocities[:, :3]
@@ -210,22 +227,41 @@ def depth_ordering(
         motion_weight = torch.cat(motion_weight_list, dim=0)
         motion_weight = torch.exp(-torch.div(motion_weight, 5.0))
 
-        # Moving direction should be aligned
-        # Set to 0.5 when two vector not within +-90 degree
-        cos_sim_list = []
-        obsv_direct = (
-            obsv_boxes_3d[:, :2, None]
-            - memory_boxes_3d[:, :2, None].transpose(2, 0)
-        ).transpose(1, 2)
-        for d in obsv_direct:
-            cos_sim_list.append(
-                F.cosine_similarity(  # pylint: disable=not-callable
-                    d, memory_velocities[:, :2]
-                ).unsqueeze(0)
+        # Velocity scores
+        if self.with_velocities:
+            assert (
+                obsv_velocities is not None
+            ), "Please provide velocities if with_velocities=True!"
+
+            velsim_weight_list = []
+            obsvvv_velocities = obsv_velocities.unsqueeze(1).expand_as(
+                moving_dist
             )
-        cos_sim = torch.cat(cos_sim_list, dim=0)
-        cos_sim = torch.add(cos_sim, 1.0)
-        cos_sim = torch.div(cos_sim, 2.0)
+            for v in obsvvv_velocities:
+                velsim_weight_list.append(
+                    F.pairwise_distance(
+                        v, memory_velocities[:, -3:]
+                    ).unsqueeze(0)
+                )
+            velsim_weight = torch.cat(velsim_weight_list, dim=0)
+            cos_sim = torch.exp(-velsim_weight / 5.0)
+        else:
+            # Moving direction should be aligned
+            # Rescaled to [0, 1]; < 0.5 when vectors are > 90 degrees apart
+            cos_sim_list = []
+            obsv_direct = (
+                obsv_boxes_3d[:, :2, None]
+                - memory_boxes_3d[:, :2, None].transpose(2, 0)
+            ).transpose(1, 2)
+            for d in obsv_direct:
+                cos_sim_list.append(
+                    F.cosine_similarity(  # pylint: disable=not-callable
+                        d, memory_velocities[:, :2]
+                    ).unsqueeze(0)
+                )
+            cos_sim = torch.cat(cos_sim_list, dim=0)
+            cos_sim = torch.add(cos_sim, 1.0)
+            cos_sim = torch.div(cos_sim, 2.0)
 
         scores_depth = (
             cos_sim * centroid_weight + (1.0 - cos_sim) * motion_weight
@@ -242,6 +278,7 @@ def __call__(
         detection_scores_3d: Tensor,
         detection_class_ids: Tensor,
         detection_embeddings: Tensor,
+        obs_velocities: Tensor | None = None,
         memory_boxes_3d: Tensor | None = None,
         memory_track_ids: Tensor | None = None,
         memory_class_ids: Tensor | None = None,
@@ -260,6 +297,7 @@ def __call__(
             detection_scores_3d (Tensor): [N,] confidence scores in 3D.
             detection_class_ids (Tensor): [N,] class indices.
             detection_embeddings (Tensor): [N, C] appearance embeddings.
+            obs_velocities (Tensor | None): [N, 3] velocities of detections.
             memory_boxes_3d (Tensor): [M, 7] boxes in memory.
             memory_track_ids (Tensor): [M,] track ids in memory.
             memory_class_ids (Tensor): [M,] class indices in memory.
@@ -280,6 +318,7 @@ def __call__(
             detection_scores_3d,
             detection_class_ids,
             detection_embeddings,
+            obs_velocities,
             permute_inds,
         ) = self._filter_detections(
             detections,
@@ -289,6 +328,7 @@ def __call__(
             detection_scores_3d,
             detection_class_ids,
             detection_embeddings,
+            obs_velocities,
         )
 
         if with_depth_confidence:
@@ -324,6 +364,7 @@ def __call__(
             # Depth Ordering
             scores_depth = self.depth_ordering(
                 detections_3d,
+                obs_velocities,
                 memory_boxes_3d_predict,
                 memory_boxes_3d,
                 memory_velocities,
diff --git a/vis4d/state/track3d/cc_3dt.py b/vis4d/state/track3d/cc_3dt.py
@@ -60,6 +60,7 @@ def __init__(
         num_frames: int = 5,
         fps: int = 2,
         update_3d_score: bool = True,
+        use_velocities: bool = False,
         add_backdrops: bool = True,
     ) -> None:
         """Creates an instance of the class."""
@@ -88,6 +89,7 @@ def __init__(
         self.fps = fps
         self.update_3d_score = update_3d_score
         self.add_backdrops = add_backdrops
+        self.use_velocities = use_velocities
 
     def reset(self) -> None:
         """Empty the memory."""
@@ -289,6 +291,7 @@ def __call__(
             memory_boxes_3d_predict = None
             memory_velocities = None
 
+        obs_velocities = boxes_3d[:, 9:]
         obs_boxes_3d = torch.cat(
             [boxes_3d[:, :6], boxes_3d[:, 8].unsqueeze(1)], dim=1
         )
@@ -301,6 +304,7 @@ def __call__(
             scores_3d,
             class_ids,
             embeddings,
+            obs_velocities,
             memory_boxes_3d,
             memory_track_ids,
             memory_class_ids,
@@ -512,6 +516,12 @@ def update_track(
             + velocity
         ) / (self.tracklets[track_id]["acc_frame"] + 1)
 
+        # Overwrite velocity[4:] with the predicted velocity when enabled
+        if self.use_velocities:
+            self.tracklets[track_id]["velocity"][4:] = self.tracklets[
+                track_id
+            ]["box_3d"][9:12]
+
         self.tracklets[track_id]["last_frame"] = frame_id
         self.tracklets[track_id]["acc_frame"] += 1
 
diff --git a/vis4d/vis/image/util.py b/vis4d/vis/image/util.py
@@ -186,7 +186,7 @@ def preprocess_boxes3d(
     list[list[tuple[float, float, float]]],
     list[str],
     list[tuple[int, int, int]],
-    list[int],
+    list[int | None],
 ]:
     """Preprocesses bounding boxes.
 
@@ -233,7 +233,7 @@ def preprocess_boxes3d(
     corners_proc: list[list[tuple[float, float, float]]] = []
     colors_proc: list[tuple[int, int, int]] = []
     labels_proc: list[str] = []
-    track_ids_proc: list[int] = []
+    track_ids_proc: list[int | None] = []
 
     if len(mask) == 1:
         if not mask[0]:
@@ -281,8 +281,7 @@ def preprocess_boxes3d(
             category = None
 
         labels_proc.append(_get_box_label(category, score, track_id))
-        if track_id is not None:
-            track_ids_proc.append(track_id)
+        track_ids_proc.append(track_id)
     return centers_proc, corners_proc, labels_proc, colors_proc, track_ids_proc
 
 
diff --git a/vis4d/zoo/cc_3dt/cc_3dt_pp_kf3d.py b/vis4d/zoo/cc_3dt/cc_3dt_pp_kf3d.py