
Commit 0829b88

Add files via upload
1 parent e34653b commit 0829b88

17 files changed (+2290, -0 lines)

sam3/perflib/__init__.py

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

import os

is_enabled = False
if os.getenv("USE_PERFLIB", "1") == "1":
    # print("Enabled the use of perflib.\n", end="")
    is_enabled = True
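
This flag gives downstream modules a single switch for the optimized code paths, controlled by the USE_PERFLIB environment variable. A minimal sketch of the dispatch pattern this enables; dispatch_mask_iou and naive_mask_iou are illustrative helpers, not part of this commit:

import torch

import sam3.perflib as perflib


def naive_mask_iou(a, b):
    # O(N*M) pairwise IoU over boolean masks; illustrative slow path only
    a = a.flatten(1).float()  # (N, H*W)
    b = b.flatten(1).float()  # (M, H*W)
    inter = a @ b.T
    union = a.sum(1)[:, None] + b.sum(1)[None, :] - inter
    return inter / union.clamp(min=1)


def dispatch_mask_iou(a, b):
    # Take the optimized kernel only when perflib is enabled (USE_PERFLIB=1)
    if perflib.is_enabled:
        from sam3.perflib.masks_ops import mask_iou

        return mask_iou(a, b)
    return naive_mask_iou(a, b)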

sam3/perflib/associate_det_trk.py

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

from collections import defaultdict

import torch
import torch.nn.functional as F
from sam3.perflib.masks_ops import mask_iou
from scipy.optimize import linear_sum_assignment


def associate_det_trk(
    det_masks,
    track_masks,
    iou_threshold=0.5,
    iou_threshold_trk=0.5,
    det_scores=None,
    new_det_thresh=0.0,
):
    """
    Optimized implementation of detection <-> track association that minimizes DtoH syncs.

    Args:
        det_masks: (N, H, W) tensor of predicted masks
        track_masks: (M, H, W) tensor of track masks
        iou_threshold: min IoU for a detection to count as matching a track
        iou_threshold_trk: min IoU for a Hungarian match to mark a track as matched
        det_scores: optional (N,) tensor of detection scores
        new_det_thresh: min score for an unmatched detection to be considered 'new'

    Returns:
        new_det_indices: list of indices in det_masks considered 'new'
        unmatched_trk_indices: list of indices in track_masks considered 'unmatched'
        det_to_matched_trk: dict mapping each detection index to the track indices it matched
        matched_det_scores: dict mapping each matched track index to [det_score, det_score * iou]
    """
    with torch.autograd.profiler.record_function("perflib: associate_det_trk"):
        assert isinstance(det_masks, torch.Tensor), "det_masks should be a tensor"
        assert isinstance(track_masks, torch.Tensor), "track_masks should be a tensor"
        if det_masks.size(0) == 0 or track_masks.size(0) == 0:
            return list(range(det_masks.size(0))), [], {}, {}  # all detections are new

        if list(det_masks.shape[-2:]) != list(track_masks.shape[-2:]):
            # resize to the smaller spatial size to save GPU memory
            if det_masks.shape[-2:].numel() < track_masks.shape[-2:].numel():
                track_masks = (
                    F.interpolate(
                        track_masks.unsqueeze(1).float(),
                        size=det_masks.shape[-2:],
                        mode="bilinear",
                        align_corners=False,
                    ).squeeze(1)
                    > 0
                )
            else:
                # resize detections to track size
                det_masks = (
                    F.interpolate(
                        det_masks.unsqueeze(1).float(),
                        size=track_masks.shape[-2:],
                        mode="bilinear",
                        align_corners=False,
                    ).squeeze(1)
                    > 0
                )

        det_masks = det_masks > 0
        track_masks = track_masks > 0

        iou = mask_iou(det_masks, track_masks)  # (N, M)
        igeit = iou >= iou_threshold
        igeit_any_dim_1 = igeit.any(dim=1)
        igeit_trk = iou >= iou_threshold_trk

        # One batched DtoH transfer for everything the CPU-side loops need
        iou_list = iou.cpu().numpy().tolist()
        igeit_list = igeit.cpu().numpy().tolist()
        igeit_any_dim_1_list = igeit_any_dim_1.cpu().numpy().tolist()
        igeit_trk_list = igeit_trk.cpu().numpy().tolist()

        det_scores_list = (
            det_scores
            if det_scores is None
            else det_scores.cpu().float().numpy().tolist()
        )

        # Hungarian matching for tracks (one-to-one: each track matches at most one detection)
        # For detections: allow many tracks to match to the same detection (many-to-one)

        # Hungarian matching: maximize IoU for tracks
        cost_matrix = 1 - iou.cpu().numpy()  # Hungarian solves for minimum cost
        row_ind, col_ind = linear_sum_assignment(cost_matrix)

        def branchy_hungarian_better_uses_the_cpu(
            cost_matrix, row_ind, col_ind, iou_list, det_masks, track_masks
        ):
            matched_trk = set()
            matched_det = set()
            # track index -> [det_score, det_score * iou] of the matched detection mask
            matched_det_scores = {}
            for d, t in zip(row_ind, col_ind):
                if det_scores_list is not None:
                    matched_det_scores[t] = [
                        det_scores_list[d],
                        det_scores_list[d] * iou_list[d][t],
                    ]
                if igeit_trk_list[d][t]:
                    matched_trk.add(t)
                    matched_det.add(d)

            # Tracks not matched by Hungarian assignment above threshold are unmatched
            unmatched_trk_indices = [
                t for t in range(track_masks.size(0)) if t not in matched_trk
            ]

            # For detections: allow many tracks to match to the same detection (many-to-one)
            # So, a detection is 'new' if it does not match any track above threshold
            assert track_masks.size(0) == igeit.size(
                1
            )  # Needed for loop optimization below
            new_det_indices = []
            for d in range(det_masks.size(0)):
                if not igeit_any_dim_1_list[d]:
                    if det_scores_list is not None and det_scores_list[d] >= new_det_thresh:
                        new_det_indices.append(d)

            # for each detection, which tracks it matched to (above threshold)
            det_to_matched_trk = defaultdict(list)
            for d in range(det_masks.size(0)):
                for t in range(track_masks.size(0)):
                    if igeit_list[d][t]:
                        det_to_matched_trk[d].append(t)

            return (
                new_det_indices,
                unmatched_trk_indices,
                det_to_matched_trk,
                matched_det_scores,
            )

        return branchy_hungarian_better_uses_the_cpu(
            cost_matrix, row_ind, col_ind, iou_list, det_masks, track_masks
        )
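
A quick usage sketch with random boolean masks; shapes, scores, and the threshold are arbitrary and only meant to show the four-tuple return:

import torch

from sam3.perflib.associate_det_trk import associate_det_trk

det_masks = torch.rand(4, 64, 64) > 0.5    # N=4 candidate detections
track_masks = torch.rand(3, 64, 64) > 0.5  # M=3 existing tracks
det_scores = torch.rand(4)

new_dets, unmatched_trks, det_to_trk, matched_scores = associate_det_trk(
    det_masks, track_masks, det_scores=det_scores, new_det_thresh=0.1
)
print("new detections:", new_dets)          # detections with no track above IoU threshold
print("unmatched tracks:", unmatched_trks)  # tracks the Hungarian step left unmatched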

sam3/perflib/compile.py

Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

import torch


def recursive_fn_factory(fn):
    def recursive_fn(b):
        if isinstance(b, dict):
            return {k: recursive_fn(b[k]) for k in b}
        if isinstance(b, list):
            return [recursive_fn(t) for t in b]
        if isinstance(b, tuple):
            return tuple(recursive_fn(t) for t in b)
        if isinstance(b, torch.Tensor):
            return fn(b)
        # Yes, writing out an explicit whitelist of trivial types is tedious,
        # but so are the bugs that come from silently not applying fn where it
        # was expected to be applied.
        if b is None:
            return b
        trivial_types = [bool, int]
        for t in trivial_types:
            if isinstance(b, t):
                return b
        raise TypeError(f"Unexpected type {type(b)}")

    return recursive_fn


recursive_contiguous = recursive_fn_factory(lambda x: x.contiguous())
recursive_clone = recursive_fn_factory(torch.clone)


def compile_wrapper(
    fn, *, mode="max-autotune", fullgraph=True, dynamic=False, name=None
):
    compiled_fn = torch.compile(fn, mode=mode, fullgraph=fullgraph, dynamic=dynamic)

    def compiled_fn_wrapper(*args, **kwargs):
        with torch.autograd.profiler.record_function(
            f"compiled {fn}" if name is None else name
        ):
            cont_args = recursive_contiguous(args)
            cont_kwargs = recursive_contiguous(kwargs)
            result = compiled_fn(*cont_args, **cont_kwargs)
            cloned_result = recursive_clone(result)
            return cloned_result

    return compiled_fn_wrapper


def shape_logging_wrapper(fn, keep_kwargs, enable_logging=False):
    """
    Wraps a function and prints the shapes of all tensor inputs.
    Only prints when a new combination of shapes is seen.
    Thread-safe.

    Args:
        fn: Function to wrap
        keep_kwargs: Names of keyword arguments whose shapes should be tracked
        enable_logging: Boolean flag to enable/disable logging
    """
    seen_shapes = set()

    def get_shape(obj):
        if isinstance(obj, torch.Tensor):
            return obj.shape
        elif isinstance(obj, (list, tuple)):
            if len(obj) == 1:
                return get_shape(obj[0])
            return tuple(get_shape(x) for x in obj)
        elif isinstance(obj, dict):
            return tuple(sorted((k, get_shape(v)) for k, v in obj.items()))
        else:
            return type(obj).__name__

    def wrapper(*args, **kwargs):
        shapes = tuple(get_shape(arg) for arg in args) + tuple(
            (k, get_shape(v))
            for k, v in kwargs.items()
            if isinstance(v, (torch.Tensor, list)) and k in keep_kwargs
        )
        if shapes not in seen_shapes:
            seen_shapes.add(shapes)
            if enable_logging:
                print(f"[ShapeLogger] New input shapes for {fn.__qualname__}: {shapes}")
        return fn(*args, **kwargs)

    # Allow toggling the flag at runtime
    wrapper.enable_logging = enable_logging

    def set_logging(enabled=False):
        nonlocal enable_logging
        enable_logging = enabled
        wrapper.enable_logging = enable_logging

    wrapper.set_logging = set_logging
    return wrapper
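
A sketch of how the two wrappers might be combined, assuming a CUDA device; fused_bias_gelu is an illustrative stand-in for a real workload. Logging is kept outside the compiled region so the print does not break fullgraph compilation:

import torch

from sam3.perflib.compile import compile_wrapper, shape_logging_wrapper


def fused_bias_gelu(x, bias):
    return torch.nn.functional.gelu(x + bias)


fast = compile_wrapper(fused_bias_gelu, name="fused_bias_gelu")
logged = shape_logging_wrapper(fast, keep_kwargs=(), enable_logging=True)

x = torch.randn(8, 256, device="cuda")
bias = torch.randn(256, device="cuda")
out = logged(x, bias)      # first call: logs the new shape combo, triggers compilation
out = logged(x, bias)      # same shapes again: silent, reuses the compiled graph
logged.set_logging(False)  # logging can be toggled at runtime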
sam3/perflib/connected_components.py

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
import logging

import torch

try:
    from cc_torch import get_connected_components

    HAS_CC_TORCH = True
except ImportError:
    logging.debug(
        "cc_torch not found. Consider installing it for better performance:"
        " pip install git+https://github.com/ronghanghu/cc_torch.git"
    )
    HAS_CC_TORCH = False


def connected_components_cpu_single(values: torch.Tensor):
    assert values.dim() == 2
    from skimage.measure import label

    labels, num = label(values.cpu().numpy(), return_num=True)
    labels = torch.from_numpy(labels)
    counts = torch.zeros_like(labels)
    for i in range(1, num + 1):
        cur_mask = labels == i
        cur_count = cur_mask.sum()
        counts[cur_mask] = cur_count
    return labels, counts


def connected_components_cpu(input_tensor: torch.Tensor):
    out_shape = input_tensor.shape
    if input_tensor.dim() == 4 and input_tensor.shape[1] == 1:
        input_tensor = input_tensor.squeeze(1)
    else:
        assert (
            input_tensor.dim() == 3
        ), "Input tensor must be (B, H, W) or (B, 1, H, W)."

    batch_size = input_tensor.shape[0]
    labels_list = []
    counts_list = []
    for b in range(batch_size):
        labels, counts = connected_components_cpu_single(input_tensor[b])
        labels_list.append(labels)
        counts_list.append(counts)
    labels_tensor = torch.stack(labels_list, dim=0).to(input_tensor.device)
    counts_tensor = torch.stack(counts_list, dim=0).to(input_tensor.device)
    return labels_tensor.view(out_shape), counts_tensor.view(out_shape)


def connected_components(input_tensor: torch.Tensor):
    """
    Computes connected-components labeling on a batch of 2D tensors, using the best available backend.

    Args:
        input_tensor (torch.Tensor): A (B, H, W) or (B, 1, H, W) integer tensor. Non-zero values
            are considered foreground. Bool tensors are also accepted.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: Both tensors have the same shape as input_tensor.
            - A tensor with dense labels. Background is 0.
            - A tensor with the size of the connected component for each pixel.
    """
    if input_tensor.dim() == 3:
        input_tensor = input_tensor.unsqueeze(1)

    assert (
        input_tensor.dim() == 4 and input_tensor.shape[1] == 1
    ), "Input tensor must be (B, H, W) or (B, 1, H, W)."

    if input_tensor.is_cuda:
        if HAS_CC_TORCH:
            return get_connected_components(input_tensor.to(torch.uint8))
        else:
            # triton fallback
            from sam3.perflib.triton.connected_components import (
                connected_components_triton,
            )

            return connected_components_triton(input_tensor)

    # CPU fallback
    return connected_components_cpu(input_tensor)
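
Example call on CPU tensors; the import path assumes this file lands at sam3/perflib/connected_components.py as inferred above:

import torch

from sam3.perflib.connected_components import connected_components

# Two small binary masks, (B, 1, H, W); non-zero pixels are foreground
masks = torch.zeros(2, 1, 8, 8, dtype=torch.uint8)
masks[0, 0, 1:3, 1:3] = 1  # a 2x2 blob -> component of size 4
masks[0, 0, 5:7, 5:8] = 1  # a separate 2x3 blob -> component of size 6
masks[1, 0, 0, :] = 1      # a single-row component of size 8

labels, counts = connected_components(masks)
print(labels[0, 0])  # dense component ids; background stays 0
print(counts[0, 0])  # per-pixel size of the containing component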

sam3/perflib/fa3.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved

import torch


@torch.library.custom_op("flash::flash_attn_func", mutates_args=())
def flash_attn_func_op(
    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor
) -> torch.Tensor:
    from flash_attn_interface import flash_attn_func as fa3

    # fa3 returns (output, softmax_lse); this op exposes only the attention output
    return fa3(q, k, v)[0]


def flash_attn_func(q, k, v):
    dtype = torch.float8_e4m3fn
    return flash_attn_func_op(q.to(dtype), k.to(dtype), v.to(dtype)).to(q.dtype)


@flash_attn_func_op.register_fake
def _(q, k, v, **kwargs):
    # The underlying kernel has two outputs:
    # 1. output: (batch, seq_len, num_heads, head_dim)
    # 2. softmax_lse: (batch, num_heads, seq_len) with dtype=torch.float32
    # Only the output is returned here, and it needs to be bfloat16, not float8!
    meta_q = torch.empty_like(q, dtype=torch.bfloat16).contiguous()
    return meta_q
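
A sketch of calling the FP8 wrapper; it assumes a Hopper-class GPU with the flash-attn 3 flash_attn_interface installed, and uses the (batch, seq_len, num_heads, head_dim) layout noted in the fake registration:

import torch

from sam3.perflib.fa3 import flash_attn_func

q = torch.randn(2, 1024, 8, 128, device="cuda", dtype=torch.bfloat16)
k = torch.randn(2, 1024, 8, 128, device="cuda", dtype=torch.bfloat16)
v = torch.randn(2, 1024, 8, 128, device="cuda", dtype=torch.bfloat16)

# Inputs are cast to float8_e4m3fn inside flash_attn_func; the output is cast back to q.dtype
out = flash_attn_func(q, k, v)
print(out.shape, out.dtype)  # torch.Size([2, 1024, 8, 128]) torch.bfloat16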
