InterDigitalInc
diff --git a/cfgs/vision_model/default.yaml b/cfgs/vision_model/default.yaml
Lines changed: 18 additions & 0 deletions
diff --git a/compressai_vision/datasets/image.py b/compressai_vision/datasets/image.py
Lines changed: 5 additions & 3 deletions
diff --git a/compressai_vision/datasets/utils.py b/compressai_vision/datasets/utils.py
Lines changed: 10 additions & 8 deletions
diff --git a/compressai_vision/model_wrappers/__init__.py b/compressai_vision/model_wrappers/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/compressai_vision/model_wrappers/sam.py b/compressai_vision/model_wrappers/sam.py
Lines changed: 37 additions & 23 deletions
@@ -47,6 +47,24 @@ sam_vit_h_4b8939:
   weights: "weights/segment_anything/sam_vit_h_4b8939.pth"
   splits: "imgenc"
 
+sam2_hiera_image_model:
+  model_path_prefix: ${..model_root_path}
+  cfg: "models/sam2/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml"
+  weights: "weights/sam2/sam2.1_hiera_base_plus.pt"
+  # weights: "weights/sam2/sam2.1_hiera_large.pt"
+  # weights: "weights/sam2/sam2.1_hiera_small.pt"
+  # weights: "weights/sam2/sam2.1_hiera_tiny.pt"
+  splits: "backbone"
+
+sam2_hiera_video_model:
+  model_path_prefix: ${..model_root_path}
+  cfg: "models/sam2/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml"
+  weights: "weights/sam2/sam2.1_hiera_base_plus.pt"
+  # weights: "weights/sam2/sam2.1_hiera_large.pt"
+  # weights: "weights/sam2/sam2.1_hiera_small.pt"
+  # weights: "weights/sam2/sam2.1_hiera_tiny.pt"
+  splits: "backbone"
+
 jde_1088x608:
   model_path_prefix: ${..model_root_path}
   cfg: "models/Towards-Realtime-MOT/cfg/yolov3_1088x608.cfg"
 
@@ -45,7 +45,6 @@
 from detectron2.data.samplers import InferenceSampler
 from detectron2.data.transforms import AugmentationList
 from detectron2.utils.serialize import PicklableWrapper
-from jde.utils.io import read_results
 from PIL import Image
 from torch.utils.data import Dataset
 
@@ -308,10 +307,12 @@ def __init__(self, root, dataset_name, imgs_folder, **kwargs):
         self.collate_fn = bypass_collator
 
         _dataset = DatasetFromList(self.dataset, copy=False)
-        mapper = SAMCustomMapper()
+        mapper = SAMCustomMapper(
+            augmentation_bypass=kwargs["input_augmentation_bypass"]
+        )
 
         self.mapDataset = MapDataset(_dataset, mapper)
-        self._org_mapper_func = PicklableWrapper(SAMCustomMapper())
+        self._org_mapper_func = PicklableWrapper(mapper)
 
         metaData = MetadataCatalog.get(dataset_name)
         try:
@@ -551,6 +552,7 @@ def __init__(
             dataset_name=dataset_name,
             ext=ext,
         )
+        from jde.utils.io import read_results
 
         self.data_type = "mot"
         gt_frame_dict = read_results(
 
@@ -307,13 +307,14 @@ def __call__(self, dataset_dict):
 
 
 class SAMCustomMapper:
-    def __init__(self, img_size=1024):
+    def __init__(self, augmentation_bypass=False, img_size=1024):
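         """
         Args:
+            augmentation_bypass: if True, the RGB image is stored as-is
+                (no resize, no tensor conversion) instead of the SAM input
             img_size: single value - target size to SAM as input
         """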
         from segment_anything.utils.transforms import ResizeLongestSide
 
+        self.augmentation_bypass = augmentation_bypass
         self.target_size = img_size
         self.transform = ResizeLongestSide(img_size)
 
@@ -335,16 +336,17 @@ def __call__(self, dataset_dict):
         org_img = cv2.imread(dataset_dict["file_name"])  # return img in BGR by default
         dataset_dict["height"], dataset_dict["width"], _ = org_img.shape
 
-        # h = dataset_dict["height"]
-        # w = dataset_dict["width"]
-
         # BGR --> RGB (SAM requires RGB input)
         org_img = org_img[..., ::-1]
-        input_image = self.transform.apply_image(org_img)
-        input_image = torch.tensor(input_image)
-        input_image = input_image.permute(2, 0, 1).contiguous()[None, :, :, :]
 
-        dataset_dict["image"] = input_image
+        if self.augmentation_bypass:
+            dataset_dict["image"] = org_img.copy()
+        else:
+            input_image = self.transform.apply_image(org_img)
+            input_image = torch.tensor(input_image)
+            input_image = input_image.permute(2, 0, 1).contiguous()[None, :, :, :]
+
+            dataset_dict["image"] = input_image
 
         return dataset_dict
 
 
@@ -27,7 +27,7 @@
 # OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from . import detectron2, jde, rtmo, sam, yolox
+from . import detectron2, jde, rtmo, sam, sam2, yolox
 from .base_wrapper import BaseWrapper
 
 __all__ = ["BaseWrapper"]
@@ -1,3 +1,32 @@
+# Copyright (c) 2025, InterDigital Communications, Inc
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the disclaimer
+# below) provided that the following conditions are met:
+
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# * Neither the name of InterDigital Communications, Inc nor the names of its
+#   contributors may be used to endorse or promote products derived from this
+#   software without specific prior written permission.
+
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 import base64
 import csv
 import os
@@ -44,8 +73,10 @@ def __repr__(self):
 
 
 def mask_to_bbx(mask):
-    mask = mask.cpu()
-    mask = np.array(mask)
+    if not isinstance(mask, np.ndarray):
+        mask = mask.cpu()
+        mask = np.array(mask)
+
     mask = np.squeeze(mask)
     h, w = mask.shape[-2:]
     rows, cols = np.where(mask)
@@ -228,43 +259,31 @@ def _image_encoder_to_output(
             dense_prompt_embeddings=prompt_feature[1],
             multimask_output=False,
         )
-        # print("len low_res_masks", len(low_res_masks))
+
         # post process mask
         masks = F.interpolate(
             low_res_masks,
             (self.image_encoder.img_size, self.image_encoder.img_size),
             mode="bilinear",
             align_corners=False,
         )
-        masks = masks[
-            ..., : input_img_size[0], : input_img_size[1]
-        ]  # [..., : 793, : 1024]
+        masks = masks[..., : input_img_size[0], : input_img_size[1]]
         masks = F.interpolate(
             masks,
             (org_img_size["height"], org_img_size["width"]),
             mode="bilinear",
             align_corners=False,
         )
 
-        # masks1 = self.postprocess_masks(
-        #        masks= low_res_masks,
-        #        input_size=input_img_size,
-        #        original_size=org_img_size,
-        #    )
         mask_threshold = 0.0
         masks = masks > mask_threshold
-        # print("len masks", len(masks), masks[0].shape)
-        # name = '/t/vic/hevc_simulations/rosen/build/main-20250423-sam1/masks' + str(input_img_size[0]) + '.pt'
-        # torch.save(masks[0], name) #"/t/vic/hevc_simulations/rosen/build/main-20250423-sam1/masks.pt")
 
         # post process result
         processed_results = []
         boxes = mask_to_bbx(masks[0])
-        # print("boxes", boxes)
         boxes = Boxes(torch.tensor(np.array([boxes])))
         scores = torch.tensor([iou_pred])
-        classes = torch.tensor(object_classes)  # 48 for sandwich,
-        # masks = torch.rand(1, 683, 1024)  # Example binary mask
+        classes = torch.tensor(object_classes)
 
         from detectron2.structures import Instances
 
@@ -273,14 +292,9 @@ def _image_encoder_to_output(
         instances.set("pred_boxes", boxes)
         instances.set("scores", scores)
         instances.set("pred_classes", classes)
-        instances.set("pred_masks", masks[0])  # ✅ Now a real tensor
+        instances.set("pred_masks", masks[0])
 
-        # Wrap in result
-        # result = [f"{{'instances': {instances}}}"]
-        # print("result", result)
-        # print("instances", instances.get_fields().keys(), len(instances))
         processed_results.append({"instances": instances})
-        # print("processed_results", len(processed_results))
         return processed_results
 
     @torch.no_grad()