clip box value (#361)

fsx950223 · web-flow · commit df587df77b7d · 2020-05-06T11:28:19.000-07:00
Also changes the box output format of detection (from [y, x, height, width] to [y_min, x_min, y_max, x_max]).
diff --git a/efficientdet/anchors.py b/efficientdet/anchors.py
@@ -265,6 +265,7 @@ def _generate_detections_tf(cls_outputs,
                             classes,
                             image_id,
                             image_scale,
+                            image_size,
                             min_score_thresh=MIN_SCORE_THRESH,
                             max_boxes_to_draw=MAX_DETECTIONS_PER_IMAGE,
                             soft_nms_sigma=0.0,
@@ -303,7 +304,7 @@ def _generate_detections_tf(cls_outputs,
 
   Returns:
     detections: detection results in a tensor with each row representing
-      [image_id, y, x, height, width, score, class]
+      [image_id, ymin, xmin, ymax, xmax, score, class]
   """
   logging.info('Using tf version of post-processing.')
   anchor_boxes = tf.gather(anchor_boxes, indices)
@@ -330,15 +331,13 @@ def _generate_detections_tf(cls_outputs,
     detections = tf.gather(all_detections, top_detection_idx)
     scores = detections[:, 4]
     boxes = detections[:, :4]
-  height = boxes[:, 2] - boxes[:, 0]
-  width = boxes[:, 3] - boxes[:, 1]
 
   detections = tf.stack([
-      tf.cast(tf.tile(image_id, [tf.shape(top_detection_idx)[0]]), tf.float32),
-      boxes[:, 0] * image_scale,
-      boxes[:, 1] * image_scale,
-      height * image_scale,
-      width * image_scale,
+      tf.cast(tf.tile(image_id, tf.shape(top_detection_idx)), tf.float32),
+      tf.clip_by_value(boxes[:, 0], 0, image_size[0]) * image_scale,
+      tf.clip_by_value(boxes[:, 1], 0, image_size[1]) * image_scale,
+      tf.clip_by_value(boxes[:, 2], 0, image_size[0]) * image_scale,
+      tf.clip_by_value(boxes[:, 3], 0, image_size[1]) * image_scale,
       scores,
       tf.cast(tf.gather(classes, top_detection_idx) + 1, tf.float32)
   ], axis=1)
@@ -566,6 +565,7 @@ def generate_detections(self,
                           classes,
                           image_id,
                           image_scale,
+                          image_size,
                           min_score_thresh=MIN_SCORE_THRESH,
                           max_boxes_to_draw=MAX_DETECTIONS_PER_IMAGE,
                           disable_pyfun=None):
@@ -579,6 +579,7 @@ def generate_detections(self,
           classes,
           image_id,
           image_scale,
+          image_size,
           min_score_thresh=min_score_thresh,
           max_boxes_to_draw=max_boxes_to_draw)
     else:
diff --git a/efficientdet/inference.py b/efficientdet/inference.py
@@ -248,13 +248,15 @@ def det_post_process_combined(params, cls_outputs, box_outputs, scales,
       tf.tile(
           tf.expand_dims(tf.range(batch_size), axis=1), [1, max_boxes_to_draw]),
       dtype=tf.float32)
-  y = nmsed_boxes[..., 0] * scales
-  x = nmsed_boxes[..., 1] * scales
-  height = nmsed_boxes[..., 2] * scales - y
-  width = nmsed_boxes[..., 3] * scales - x
+  image_size = params['image_size']
+  ymin = tf.clip_by_value(nmsed_boxes[..., 0], 0, image_size[0]) * scales
+  xmin = tf.clip_by_value(nmsed_boxes[..., 1], 0, image_size[1]) * scales
+  ymax = tf.clip_by_value(nmsed_boxes[..., 2], 0, image_size[0]) * scales
+  xmax = tf.clip_by_value(nmsed_boxes[..., 3], 0, image_size[1]) * scales
+
   detection_list = [
-      # Format: (image_ids, y, x, height, width, score, class)
-      image_ids, y, x, height, width, nmsed_scores,
+      # Format: (image_ids, ymin, xmin, ymax, xmax, score, class)
+      image_ids, ymin, xmin, ymax, xmax, nmsed_scores,
       tf.cast(nmsed_classes + 1, tf.float32)
   ]
   detections = tf.stack(detection_list, axis=2, name='detections')
@@ -281,7 +283,7 @@ def det_post_process(params: Dict[Any, Any], cls_outputs: Dict[int, tf.Tensor],
 
   Returns:
     detections_batch: a batch of detection results. Each detection is a tensor
-      with each row representing [image_id, x, y, width, height, score, class].
+      with each row representing [image_id, ymin, xmin, ymax, xmax, score, class].
   """
   if not params['batch_size']:
     # Use combined version for dynamic batch size.
@@ -318,6 +320,7 @@ def det_post_process(params: Dict[Any, Any], cls_outputs: Dict[int, tf.Tensor],
         classes_per_sample,
         image_id=[index],
         image_scale=[scales[index]],
+        image_size=params['image_size'],
         min_score_thresh=min_score_thresh,
         max_boxes_to_draw=max_boxes_to_draw,
         disable_pyfun=params.get('disable_pyfun'))
@@ -412,7 +415,7 @@ def visualize_image_prediction(image,
   Args:
     image: Image content in shape of [height, width, 3].
     prediction: a list of vector, with each vector has the format of [image_id,
-      y, x, height, width, score, class].
+      ymin, xmin, ymax, xmax, score, class].
     disable_pyfun: disable pyfunc for faster post processing.
     label_id_mapping: a map from label id to name.
     **kwargs: extra parameters for vistualization, such as min_score_thresh,
@@ -430,7 +433,7 @@ def visualize_image_prediction(image,
     boxes[:, [0, 1, 2, 3]] = boxes[:, [1, 0, 3, 2]]
 
   label_id_mapping = label_id_mapping or coco_id_mapping
-  boxes[:, 2:4] += boxes[:, 0:2]
+
   return visualize_image(image, boxes, classes, scores, label_id_mapping,
                          **kwargs)