@@ -13,8 +13,8 @@
 
 from detectron2.structures import ImageList, Instances
 from segment_anything import (  # , Instances
-    SamAutomaticMaskGenerator,
-    SamPredictor,
+    # SamAutomaticMaskGenerator,
+    # SamPredictor,
     sam_model_registry,
 )
 from torch.nn import functional as F
@@ -50,6 +50,7 @@ def __repr__(self):
 
 
 def mask_to_bbx(mask):
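+    # np.array() cannot convert a CUDA tensor, so move the mask to CPU first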
+    mask = mask.cpu()
     mask = np.array(mask)
     mask = np.squeeze(mask)
     h, w = mask.shape[-2:]
@@ -81,17 +82,26 @@ class SAM(BaseWrapper):
     def __init__(self, device: str, **kwargs):
         super().__init__(device)
 
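+        # resolve cfg/weights paths; "default" falls back to the module-level root_path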
+        _path_prefix = (
+            f"{root_path}"
+            if kwargs["model_path_prefix"] == "default"
+            else kwargs["model_path_prefix"]
+        )
+        self.model_info = {
+            "cfg": f"{_path_prefix}/{kwargs['cfg']}",
+            "weights": f"{_path_prefix}/{kwargs['weights']}",
+        }
+
         self.model = (
-            sam_model_registry["vit_h"](checkpoint=kwargs["weights"]).to(device).eval()
+            sam_model_registry["vit_h"](checkpoint=self.model_info["weights"])
+            .to(device)
+            .eval()
         )
-        self.model.load_state_dict(torch.load(kwargs["weights"]))
 
-        self.backbone = self.model.image_encoder
+        self.image_encoder = self.model.image_encoder
         self.prompt_encoder = self.model.prompt_encoder
         self.head = self.model.mask_decoder
 
-        # SamPredictor(self.model)
-        # print(SamPredictor)
         self.supported_split_points = Split_Points
 
         assert "splits" in kwargs, "Split layer ids must be provided"
@@ -106,18 +116,31 @@ def __init__(self, device: str, **kwargs):
             zip(self.split_layer_list, [None] * len(self.split_layer_list))
         )
 
-        self.annotation_file = "/o/projects/proj-river/ctc_sequences/vcm_testdata/samtest/annotations/mpeg-oiv6-segmentation-coco_fortest.json"
-
     @property
     def SPLIT_IMGENC(self):
         return str(self.supported_split_points.ImageEncoder)
 
-    def input_to_features(self, x, device: str) -> Dict:
+    @staticmethod
+    def prompt_inputs(file_name):
+        # [TODO] should be improved...
+        prompt_link = file_name.replace("/images/", "/prompts/").replace(".jpg", ".txt")
+
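+        # assumed layout: one whitespace-separated line per file; the first two
+        # tokens are an (x, y) point prompt and the last token is the class id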
+        with open(prompt_link, "r") as f:
+            line = f.readline()
+            parts = line.strip().split()
+            prompts = list(map(int, parts[:2]))
+            object_classes = [int(parts[-1])]
+
+        return prompts, object_classes
+
+    def input_to_features(self, x: list, device: str) -> Dict:
         """Computes deep features at the intermediate layer(s) all the way from the input"""
         self.model = self.model.to(device).eval()
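+        # only single-image batches are supported for now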
+        assert isinstance(x, list) and len(x) == 1
 
         if self.split_id == self.SPLIT_IMGENC:
-            return self._input_to_image_encoder(x)
+            return self._input_to_image_encoder(x, device)
         else:
             self.logger.error(f"Not supported split point {self.split_id}")
 
@@ -129,48 +152,37 @@ def features_to_output(self, x: Dict, device: str):
         self.model = self.model.to(device).eval()
 
         if self.split_id == self.SPLIT_IMGENC:
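+            # point prompts are looked up from a side file derived from the
+            # image's file name (see prompt_inputs above)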
+            assert "file_name" in x
+
+            prompts, object_classes = self.prompt_inputs(x["file_name"])
+
             return self._image_encoder_to_output(
                 x["data"],
                 x["org_input_size"],
                 x["input_size"],
-                x["prompts"],
-                x["object_classes"],
+                prompts,
+                object_classes,
+                device,
             )
         else:
             self.logger.error(f"Not supported split point {self.split_id}")
 
         raise NotImplementedError
 
     @torch.no_grad()
-    def _input_to_image_encoder(self, x):
+    def _input_to_image_encoder(self, x, device):
         """Computes and returns the encoded image all the way from the input"""
-        # TODO pre_processing
-        # print("AAAAA _input_to_image_encoder", x, '\n')
-        # imgs = ImageList(x)
-        imgs = x[0]["image"]
-        feature = {}
-        feature["backbone"] = self.backbone(imgs)
-
-        prompt_link = (
-            x[0]["file_name"].replace("/images/", "/prompts/").replace(".jpg", ".txt")
-        )
-        # print("AAAAA prompt_link", prompt_link)
-
-        with open(prompt_link, "r") as f:
-            line = f.readline()
-            # first_two = list(map(int, line.strip().split()[:2]))
-            parts = line.strip().split()
-            prompts = list(map(int, parts[:2]))
-            object_classes = [int(line.strip().split()[-1])]
+        assert len(x) == 1
 
-        image_sizes = [x[0]["height"], x[0]["width"]]
-        # print("AAAAA image_sizes", image_sizes, int(image_sizes[0]) * int(image_sizes[1])),
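+        # keep the pre-padding (H, W) so the padded mask can be cropped back
+        # later; SAM's preprocess normalizes pixels and pads the image square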
+        img = x[0]["image"].to(device)
+        input_size = list(img.size()[2:])
+        feature = {}
+        input_img = self.model.preprocess(img)
+        feature["backbone"] = self.image_encoder(input_img)
 
         return {
             "data": feature,
-            "input_size": image_sizes,
-            "prompts": prompts,
-            "object_classes": object_classes,
+            "input_size": input_size,
         }
 
     @torch.no_grad()
@@ -181,45 +193,6 @@ def get_input_size(self, x):
         image_sizes = [x[0]["height"], x[0]["width"]]
         return image_sizes  # [1024, 1024]
 
-    @torch.no_grad()
-    def get_prompts(self, x):
-        """Computes prompts"""
-        prompt_link = (
-            x[0]["file_name"].replace("/images/", "/prompts/").replace(".jpg", ".txt")
-        )
-        # print("AAAAA prompt_link", prompt_link)
-
-        with open(prompt_link, "r") as f:
-            line = f.readline()
-            # first_two = list(map(int, line.strip().split()[:2]))
-            parts = line.strip().split()
-            prompts = list(map(int, parts[:2]))
-            object_classes = [int(line.strip().split()[-1])]
-
-        image_sizes = [x[0]["height"], x[0]["width"]]
-        # print("AAAAA image_sizes", image_sizes, int(image_sizes[0]) * int(image_sizes[1])),
-
-        return prompts
-
-    @torch.no_grad()
-    def get_object_classes(self, x):
-        """Computes input image size to the network"""
-        prompt_link = (
-            x[0]["file_name"].replace("/images/", "/prompts/").replace(".jpg", ".txt")
-        )
-        # print("AAAAA prompt_link", prompt_link)
-
-        with open(prompt_link, "r") as f:
-            line = f.readline()
-            # first_two = list(map(int, line.strip().split()[:2]))
-            parts = line.strip().split()
-            prompts = list(map(int, parts[:2]))
-            object_classes = [int(line.strip().split()[-1])]
-
-        image_sizes = [x[0]["height"], x[0]["width"]]
-        # print("AAAAA image_sizes", image_sizes, int(image_sizes[0]) * int(image_sizes[1])),
-        return object_classes
-
     @torch.no_grad()
     def _image_encoder_to_output(
         self,
@@ -228,6 +201,7 @@ def _image_encoder_to_output(
         input_img_size: List,
         prompts: List,
         object_classes: List,
+        device,
     ):
         """
         Performs the downstream task using the encoded image feature
@@ -237,7 +211,7 @@ def _image_encoder_to_output(
 
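+        # reshape the single (x, y) prompt into the (B, N, 2) point / (B, N)
+        # label layout that SAM's prompt encoder expects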
         input_points = [prompts]  # [[469, 295]]  # prompts["points"]
         input_points = np.array(input_points)
-        input_points_ = torch.tensor(input_points)
+        input_points_ = torch.tensor(input_points, device=device)
         input_points_ = input_points_.unsqueeze(-1)
         input_points_ = input_points_.permute(2, 0, 1)
 
@@ -246,7 +220,7 @@
         input_labels_ = input_labels_.unsqueeze(-1)
         input_labels_ = input_labels_.permute(1, 0)
 
-        points = (torch.tensor(input_points_), torch.tensor(input_labels_))
+        points = (input_points_, torch.tensor(input_labels_, device=device))
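+        # the prompt encoder returns a (sparse, dense) embedding pair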
         prompt_feature = self.prompt_encoder(points=points, boxes=None, masks=None)
         image_pe = self.prompt_encoder.get_dense_pe()
 
@@ -261,7 +235,7 @@
         # post process mask
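+        # upsample the low-res mask logits to the padded encoder resolution,
+        # crop away the padding, then resize to the original image size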
         masks = F.interpolate(
             low_res_masks,
-            (1024, 1024),
+            (self.image_encoder.img_size, self.image_encoder.img_size),
             mode="bilinear",
             align_corners=False,
         )
@@ -270,7 +244,7 @@
         ]  # [..., : 793, : 1024]
         masks = F.interpolate(
             masks,
-            (input_img_size[0], input_img_size[1]),
+            (org_img_size["height"], org_img_size["width"]),
             mode="bilinear",
             align_corners=False,
         )
@@ -314,14 +288,26 @@ def _image_encoder_to_output(
     def forward(self, x):
         """Completes the downstream task in an end-to-end manner, all the way from the input"""
         # test
-        enc = self._input_to_image_encoder(self, x)
-        dec = self._image_encoder_to_output(enc)
+        enc_res = self._input_to_image_encoder([x], self.device)
 
-        return dec
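+        # re-key the raw features with the configured split-layer names and
+        # move each tensor onto the wrapper's device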
+        # (assumes dict ordering matches the split-layer list)
+        enc_res["data"] = {
+            k: v.to(device=self.device)
+            for k, v in zip(self.split_layer_list, enc_res["data"].values())
+        }
+
+        prompts, object_classes = self.prompt_inputs(x["file_name"])
+
+        dec_res = self._image_encoder_to_output(
+            enc_res["data"],
+            {"height": x["height"], "width": x["width"]},
+            enc_res["input_size"],
+            prompts,
+            object_classes,
+            device=self.device,
+        )
 
-    # @property
-    # def cfg(self):
-    #     return self._cfg
+        return dec_res
 
 
 @register_vision_model("sam_vit_h_4b8939")