Fix issues with iterative prompting

constantinpape · constantinpape · commit f984d9c8f2c3 · 2023-08-10T19:10:42.000+02:00
diff --git a/finetuning/livecell/evaluation/iterative_prompting.py b/finetuning/livecell/evaluation/iterative_prompting.py
@@ -10,7 +10,7 @@ def main():
 
     run_inference_with_iterative_prompting(
         checkpoint, model_type, image_paths, gt_paths,
-        prediction_root, use_boxes=False,
+        prediction_root, use_boxes=False, batch_size=16,
     )
 
 
diff --git a/micro_sam/evaluation/inference.py b/micro_sam/evaluation/inference.py
@@ -425,10 +425,9 @@ def run_inference_with_prompts(
 
 def _save_segmentation(masks, prediction_path):
     # masks to segmentation
-    masks = masks.numpy().squeeze()
+    masks = masks.cpu().numpy().squeeze().astype("bool")
     shape = masks.shape[-2:]
-    masks = {"segmentation": mask for mask in masks}
-    breakpoint()
+    masks = [{"segmentation": mask, "area": mask.sum()} for mask in masks]
     segmentation = mask_data_to_segmentation(masks, shape, with_background=True)
     imageio.imwrite(prediction_path, segmentation)
 
@@ -441,7 +440,7 @@ def _run_inference_with_iterative_prompting_for_image(
     device,
     use_boxes,
     prediction_paths,
-    batch_size=64
+    batch_size,
 ):
     assert len(prediction_paths) == n_iterations, f"{len(prediction_paths)}, {n_iterations}"
     to_sam_inputs = ConvertToSamInputs()
@@ -453,9 +452,6 @@ def _run_inference_with_iterative_prompting_for_image(
 
     n_pos = 0 if use_boxes else 1
     batched_inputs, sampled_ids = to_sam_inputs(image, gt, n_pos=n_pos, n_neg=0, get_boxes=use_boxes)
-    sampled_binary_y = torch.stack([
-        torch.stack([_gt == idx for idx in sampled]) for _gt, sampled in zip(gt, sampled_ids)
-    ]).to(torch.float32)
 
     input_images = torch.stack([model.preprocess(x=x["image"].to(device)) for x in batched_inputs], dim=0)
     image_embeddings = model.image_embeddings_oft(input_images)
@@ -471,15 +467,19 @@ def _run_inference_with_iterative_prompting_for_image(
         for batch_idx in range(n_batches):
             batch_start = batch_idx * batch_size
             batch_stop = min((batch_idx + 1) * batch_size, n_samples)
-            tmp_batched_inputs = deepcopy(batched_inputs)
-            for k, v in tmp_batched_inputs[0].items():
-                if k == "point_coords":
-                    tmp_batched_inputs[0]["point_coords"] = v[batch_start:batch_stop]
-                if k == "point_labels":
-                    tmp_batched_inputs[0]["point_labels"] = v[batch_start:batch_stop]
+
+            this_batched_inputs = [{
+                k: v[batch_start:batch_stop] if k in ("point_coords", "point_labels") else v
+                for k, v in batched_inputs[0].items()
+            }]
+
+            sampled_binary_y = torch.stack([
+                torch.stack([_gt == idx for idx in sampled[batch_start:batch_stop]])[:, None]
+                for _gt, sampled in zip(gt, sampled_ids)
+            ]).to(torch.float32)
 
             batched_outputs = model(
-                tmp_batched_inputs,
+                this_batched_inputs,
                 multimask_output=multimasking if iteration == 0 else False,
                 image_embeddings=image_embeddings
             )
@@ -499,7 +499,7 @@ def _run_inference_with_iterative_prompting_for_image(
             masks = (masks > 0.5).to(torch.float32)
             final_masks.append(masks)
 
-            for _pred, _gt, _inp, logits in zip(masks, sampled_binary_y, tmp_batched_inputs, logits_masks):
+            for _pred, _gt, _inp, logits in zip(masks, sampled_binary_y, this_batched_inputs, logits_masks):
                 next_coords, next_labels = prompt_generator(_gt, _pred, _inp["point_coords"], _inp["point_labels"])
                 _inp["point_coords"], _inp["point_labels"], _inp["mask_inputs"] = next_coords, next_labels, logits
 
@@ -515,6 +515,7 @@ def run_inference_with_iterative_prompting(
     prediction_root: Union[str, os.PathLike],
     use_boxes: bool,
     n_iterations: int = 8,
+    batch_size: int = 32,
 ) -> None:
     """
 
@@ -545,5 +546,5 @@ def run_inference_with_iterative_prompting(
 
         with torch.no_grad():
             _run_inference_with_iterative_prompting_for_image(
-                model, image, gt, n_iterations, device, use_boxes, prediction_paths,
+                model, image, gt, n_iterations, device, use_boxes, prediction_paths, batch_size,
             )
diff --git a/micro_sam/prompt_generators.py b/micro_sam/prompt_generators.py
@@ -199,46 +199,62 @@ def __init__(self, device=None):
         self.device = device if device is not None else "cuda" if torch.cuda.is_available() else "cpu"
 
     def get_positive_points(self, pos_region, overlap_region):
-        tmp_pos_loc = torch.where(pos_region)
-        # condiion below where there is no room for improvement for the model
-        # hence we put a positive point in the "already correct" regions
-        if torch.stack(tmp_pos_loc).shape[-1] == 0:
-            tmp_pos_loc = torch.where(overlap_region)
-
-        pos_index = np.random.choice(len(tmp_pos_loc[1]))
-        pos_coordinates = int(tmp_pos_loc[1][pos_index]), int(tmp_pos_loc[2][pos_index])
-        pos_coordinates = pos_coordinates[::-1]
-        pos_labels = 1
+        positive_locations = [torch.where(pos_reg) for pos_reg in pos_region]
+        # we may have objects without a positive region (= missing true foreground)
+        # in this case we just sample a point where the model was already correct
+        positive_locations = [
+            torch.where(ovlp_reg) if len(pos_loc[0]) == 0 else pos_loc
+            for pos_loc, ovlp_reg in zip(positive_locations, overlap_region)
+        ]
+        # we sample one location for each object in the batch
+        sampled_indices = [np.random.choice(len(pos_loc[0])) for pos_loc in positive_locations]
+        # get the corresponding coordinates (Note that we flip the axis order here due to the expected order of SAM)
+        pos_coordinates = [
+            [pos_loc[-1][idx], pos_loc[-2][idx]] for pos_loc, idx in zip(positive_locations, sampled_indices)
+        ]
+
+        # make sure that we still have the correct batch size
+        assert len(pos_coordinates) == pos_region.shape[0]
+        pos_labels = [1] * len(pos_coordinates)
+
         return pos_coordinates, pos_labels
 
-    def get_negative_points(self, neg_region, true_object, gt):
-        tmp_neg_loc = torch.where(neg_region)
-        if torch.stack(tmp_neg_loc).shape[-1] == 0:
-            tmp_true_loc = torch.where(true_object)
-            x_coords, y_coords = tmp_true_loc[1], tmp_true_loc[2]
-            bbox = torch.stack([torch.min(x_coords), torch.min(y_coords),
-                                torch.max(x_coords) + 1, torch.max(y_coords) + 1])
-            bbox_mask = torch.zeros_like(true_object).squeeze(0)
-            bbox_mask[bbox[0]:bbox[2], bbox[1]:bbox[3]] = 1
-            bbox_mask = bbox_mask[None].to(self.device)
-
-            # NOTE: FIX: here we add dilation to the bbox because in some case we couldn't find objects at all
-            # TODO: just expand the pixels of bbox
-            dilated_bbox_mask = dilation(bbox_mask[None], torch.ones(3, 3).to(self.device)).squeeze(0)
-            background_mask = abs(dilated_bbox_mask - true_object)
-            tmp_neg_loc = torch.where(background_mask)
-
-            # there is a chance that the object is small to not return a decent-sized bounding box
-            # hence we might not find points sometimes there as well, hence we sample points from true background
-            if torch.stack(tmp_neg_loc).shape[-1] == 0:
-                tmp_neg_loc = torch.where(gt == 0)
+    # TODO get rid of this looped implementation and use proper batched computation instead
+    def get_negative_points(self, negative_region_batched, true_object_batched, gt_batched):
+        negative_coordinates, negative_labels = [], []
 
-        neg_index = np.random.choice(len(tmp_neg_loc[1]))
-        neg_coordinates = int(tmp_neg_loc[1][neg_index]), int(tmp_neg_loc[2][neg_index])
-        neg_coordinates = neg_coordinates[::-1]
-        neg_labels = 0
+        for neg_region, true_object, gt in zip(negative_region_batched, true_object_batched, gt_batched):
 
-        return neg_coordinates, neg_labels
+            tmp_neg_loc = torch.where(neg_region)
+            if torch.stack(tmp_neg_loc).shape[-1] == 0:
+                tmp_true_loc = torch.where(true_object)
+                x_coords, y_coords = tmp_true_loc[1], tmp_true_loc[2]
+                bbox = torch.stack([torch.min(x_coords), torch.min(y_coords),
+                                    torch.max(x_coords) + 1, torch.max(y_coords) + 1])
+                bbox_mask = torch.zeros_like(true_object).squeeze(0)
+                bbox_mask[bbox[0]:bbox[2], bbox[1]:bbox[3]] = 1
+                bbox_mask = bbox_mask[None].to(self.device)
+
+                # NOTE: FIX: here we add dilation to the bbox because in some case we couldn't find objects at all
+                # TODO: just expand the pixels of bbox
+                dilated_bbox_mask = dilation(bbox_mask[None], torch.ones(3, 3).to(self.device)).squeeze(0)
+                background_mask = abs(dilated_bbox_mask - true_object)
+                tmp_neg_loc = torch.where(background_mask)
+
+                # there is a chance that the object is small to not return a decent-sized bounding box
+                # hence we might not find points sometimes there as well, hence we sample points from true background
+                if torch.stack(tmp_neg_loc).shape[-1] == 0:
+                    tmp_neg_loc = torch.where(gt == 0)
+
+            neg_index = np.random.choice(len(tmp_neg_loc[1]))
+            neg_coordinates = [tmp_neg_loc[1][neg_index], tmp_neg_loc[2][neg_index]]
+            neg_coordinates = neg_coordinates[::-1]
+            neg_labels = 0
+
+            negative_coordinates.append(neg_coordinates)
+            negative_labels.append(neg_labels)
+
+        return negative_coordinates, negative_labels
 
     def __call__(
         self,
@@ -249,6 +265,7 @@ def __call__(
     ):
         """Generate the prompts for each object iteratively in the segmentation.
         """
+        assert gt.shape == object_mask.shape
         true_object = gt.to(self.device)
         expected_diff = (object_mask - true_object)
         neg_region = (expected_diff == 1).to(torch.float)
@@ -257,8 +274,12 @@ def __call__(
 
         pos_coordinates, pos_labels = self.get_positive_points(pos_region, overlap_region)
         neg_coordinates, neg_labels = self.get_negative_points(neg_region, true_object, gt)
+        assert len(pos_coordinates) == len(pos_labels) == len(neg_coordinates) == len(neg_labels)
+
+        pos_coordinates, neg_coordinates = torch.tensor(pos_coordinates)[:, None], torch.tensor(neg_coordinates)[:, None]
+        pos_labels, neg_labels = torch.tensor(pos_labels)[:, None], torch.tensor(neg_labels)[:, None]
 
-        net_coords = torch.cat([current_points, torch.tensor([[pos_coordinates, neg_coordinates]])], dim=1)
-        net_labels = torch.cat([current_labels, torch.tensor([[pos_labels, neg_labels]])], dim=1)
+        net_coords = torch.cat([current_points, pos_coordinates, neg_coordinates], dim=1)
+        net_labels = torch.cat([current_labels, pos_labels, neg_labels], dim=1)
 
         return net_coords, net_labels

Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,7 @@ def main():`
`10`	`10`
`11`	`11`	`run_inference_with_iterative_prompting(`
`12`	`12`	`checkpoint, model_type, image_paths, gt_paths,`
`13`		`- prediction_root, use_boxes=False,`
	`13`	`+ prediction_root, use_boxes=False, batch_size=16,`
`14`	`14`	`)`
`15`	`15`
`16`	`16`