Normalize anchors during TFLite post-processing in object detection.

ziyeqinghan · fyangf · commit 056cd615b9d6 · 2023-03-17T10:36:36.000-07:00
PiperOrigin-RevId: 486704284
diff --git a/official/vision/modeling/factory.py b/official/vision/modeling/factory.py
@@ -306,6 +306,11 @@ def build_retinanet(
     decoder_features = decoder(backbone_features)
     _ = head(decoder_features)
 
+  # Add `input_image_size` to `tflite_post_processing_config`.
+  tflite_post_processing_config = generator_config.tflite_post_processing.as_dict(
+  )
+  tflite_post_processing_config['input_image_size'] = (input_specs.shape[1],
+                                                       input_specs.shape[2])
   detection_generator_obj = detection_generator.MultilevelDetectionGenerator(
       apply_nms=generator_config.apply_nms,
       pre_nms_top_k=generator_config.pre_nms_top_k,
@@ -315,8 +320,7 @@ def build_retinanet(
       nms_version=generator_config.nms_version,
       use_cpu_nms=generator_config.use_cpu_nms,
       soft_nms_sigma=generator_config.soft_nms_sigma,
-      tflite_post_processing_config=generator_config.tflite_post_processing
-      .as_dict())
+      tflite_post_processing_config=tflite_post_processing_config)
 
   model = retinanet_model.RetinaNetModel(
       backbone,
diff --git a/official/vision/modeling/layers/detection_generator.py b/official/vision/modeling/layers/detection_generator.py
@@ -533,6 +533,12 @@ def _generate_detections_tflite(raw_boxes: Mapping[str, tf.Tensor],
   wa = anchors[..., 3] - anchors[..., 1]
   anchors = tf.stack([ycenter_a, xcenter_a, ha, wa], axis=-1)
 
+  # TFLite's object detection APIs require anchors normalized by image size.
+  height, width = config['input_image_size']
+  normalize_factor = tf.constant([height, width, height, width],
+                                 dtype=tf.float32)
+  anchors = anchors / normalize_factor
+
   # There is no TF equivalent for TFLite's custom post-processing op.
   # So we add an 'empty' composite function here, that is legalized to the
   # custom op with MLIR.
diff --git a/official/vision/modeling/layers/detection_generator_test.py b/official/vision/modeling/layers/detection_generator_test.py
@@ -148,7 +148,8 @@ def testDetectionsOutputShape(self, nms_version, has_att_heads, use_cpu_nms,
         'max_classes_per_detection': 1,
         'use_regular_nms': use_regular_nms,
         'nms_score_threshold': 0.01,
-        'nms_iou_threshold': 0.5
+        'nms_iou_threshold': 0.5,
+        'input_image_size': [224, 224],
     }
     kwargs = {
         'apply_nms': True,
@@ -253,7 +254,8 @@ def test_serialize_deserialize(self):
         'max_classes_per_detection': 1,
         'use_regular_nms': True,
         'nms_score_threshold': 0.01,
-        'nms_iou_threshold': 0.5
+        'nms_iou_threshold': 0.5,
+        'input_image_size': [224, 224],
     }
     kwargs = {
         'apply_nms': True,