|
| 1 | +# Copyright 2020 Google Research. All Rights Reserved. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | +# ============================================================================== |
| 15 | +"""IoU utils for box regression with iou losses. |
| 16 | +
|
| 17 | +Distance-IoU Loss: Faster and Better Learning for Bounding Box Regression. |
| 18 | +https://arxiv.org/pdf/1911.08287.pdf |
| 19 | +""" |
| 20 | + |
| 21 | +from __future__ import absolute_import |
| 22 | +from __future__ import division |
| 23 | +# gtype import |
| 24 | +from __future__ import print_function |
| 25 | + |
| 26 | +import math |
| 27 | +import numpy as np |
| 28 | +import tensorflow.compat.v1 as tf |
| 29 | +from typing import Union, Text |
| 30 | + |
| 31 | +FloatType = Union[tf.Tensor, float, np.float32, np.float64] |
| 32 | + |
| 33 | + |
def _get_v(b1_height: FloatType, b1_width: FloatType, b2_height: FloatType,
           b2_width: FloatType) -> tf.Tensor:
  """Get the consistency measurement of aspect ratio for ciou.

  Computes

    v = 4 / pi^2 * (atan(b1_width / b1_height) - atan(b2_width / b2_height))^2

  with a hand-written gradient instead of the automatically derived one.

  Args:
    b1_height: height of the first box. Captured by the inner function, i.e.
      it is not one of the inputs the custom gradient is defined for.
    b1_width: width of the first box. Captured like `b1_height`.
    b2_height: height of the second box; first input of the custom-gradient
      function.
    b2_width: width of the second box; second input of the custom-gradient
      function.

  Returns:
    The aspect-ratio consistency term `v` as a `Tensor`.
  """

  @tf.custom_gradient
  def _get_grad_v(height, width):
    """Returns v together with the function that backpropagates through it."""
    # divide_no_nan yields 0 for a zero height, so degenerate boxes give
    # atan(0) rather than NaN.
    arctan = tf.atan(tf.math.divide_no_nan(b1_width, b1_height)) - tf.atan(
        tf.math.divide_no_nan(width, height))
    v = 4 * ((arctan / math.pi)**2)

    def _grad_v(dv, variables=None):
      """Backpropagates dv to (height, width) and, if given, to variables."""
      # The 1 / (height^2 + width^2) factor of the exact derivative is not
      # applied here.
      # NOTE(review): presumably the stability simplification from the DIoU /
      # CIoU paper -- confirm.
      gdw = dv * 8 * arctan * height / (math.pi**2)
      gdh = -dv * 8 * arctan * width / (math.pi**2)
      input_grads = [gdh, gdw]
      if variables is None:
        # tf.custom_gradient only passes `variables` when the forward pass
        # used some; without them only the input gradients are expected, and
        # tf.gradients(v, None, ...) must not be called.
        return input_grads
      return input_grads, tf.gradients(v, variables, grad_ys=dv)

    return v, _grad_v

  return _get_grad_v(b2_height, b2_width)
| 53 | + |
| 54 | + |
def _iou_per_anchor(pred_boxes: FloatType,
                    target_boxes: FloatType,
                    iou_type: Text = 'iou') -> tf.Tensor:
  """Computing the IoU for a single anchor.

  All quantities are computed elementwise on the four coordinate tensors, so
  every box in a batch gets its own value for each supported `iou_type`.

  Args:
    pred_boxes: predicted boxes, with coordinate [y_min, x_min, y_max, x_max].
    target_boxes: target boxes, with coordinate [y_min, x_min, y_max, x_max].
    iou_type: one of ['iou', 'ciou', 'diou', 'giou'].

  Returns:
    IoU / GIoU / DIoU / CIoU value as a float `Tensor` (not yet turned into a
    loss; the caller subtracts it).
  """
  # t_ denotes target boxes and p_ denotes predicted boxes.
  t_ymin, t_xmin, t_ymax, t_xmax = target_boxes
  p_ymin, p_xmin, p_ymax, p_xmax = pred_boxes

  # Negative extents (max < min) are clamped to zero.
  zero = tf.convert_to_tensor(0.0, t_ymin.dtype)
  p_width = tf.maximum(zero, p_xmax - p_xmin)
  p_height = tf.maximum(zero, p_ymax - p_ymin)
  t_width = tf.maximum(zero, t_xmax - t_xmin)
  t_height = tf.maximum(zero, t_ymax - t_ymin)
  p_area = p_width * p_height
  t_area = t_width * t_height

  intersect_ymin = tf.maximum(p_ymin, t_ymin)
  intersect_xmin = tf.maximum(p_xmin, t_xmin)
  intersect_ymax = tf.minimum(p_ymax, t_ymax)
  intersect_xmax = tf.minimum(p_xmax, t_xmax)
  intersect_width = tf.maximum(zero, intersect_xmax - intersect_xmin)
  intersect_height = tf.maximum(zero, intersect_ymax - intersect_ymin)
  intersect_area = intersect_width * intersect_height

  union_area = p_area + t_area - intersect_area
  # divide_no_nan returns 0 where the union is empty.
  iou_v = tf.math.divide_no_nan(intersect_area, union_area)
  if iou_type == 'iou':
    return iou_v  # iou is the simplest form.

  # Smallest box enclosing both the predicted and the target box.
  enclose_ymin = tf.minimum(p_ymin, t_ymin)
  enclose_xmin = tf.minimum(p_xmin, t_xmin)
  enclose_ymax = tf.maximum(p_ymax, t_ymax)
  enclose_xmax = tf.maximum(p_xmax, t_xmax)

  assert iou_type in ('giou', 'diou', 'ciou')
  if iou_type == 'giou':  # giou is the generalized iou.
    enclose_width = tf.maximum(zero, enclose_xmax - enclose_xmin)
    enclose_height = tf.maximum(zero, enclose_ymax - enclose_ymin)
    enclose_area = enclose_width * enclose_height
    giou_v = iou_v - tf.math.divide_no_nan(
        (enclose_area - union_area), enclose_area)
    return giou_v

  assert iou_type in ('diou', 'ciou')
  # Squared center distance and squared enclosing-box diagonal, computed
  # elementwise. A tf.linalg.norm call without `axis` would reduce over every
  # element of the stacked tensors and mix all boxes of a batch into one
  # scalar; it would also take a square root only to square it again.
  center_dy = (t_ymin + t_ymax) / 2 - (p_ymin + p_ymax) / 2
  center_dx = (t_xmin + t_xmax) / 2 - (p_xmin + p_xmax) / 2
  center_dist_sq = tf.square(center_dy) + tf.square(center_dx)
  diag_length_sq = (
      tf.square(enclose_ymax - enclose_ymin) +
      tf.square(enclose_xmax - enclose_xmin))
  diou_v = iou_v - tf.math.divide_no_nan(center_dist_sq, diag_length_sq)
  if iou_type == 'diou':  # diou is the distance iou.
    return diou_v

  assert iou_type == 'ciou'
  # v is the aspect-ratio consistency term; alpha is its trade-off weight.
  v = _get_v(p_height, p_width, t_height, t_width)
  alpha = tf.math.divide_no_nan(v, ((1 - iou_v) + v))
  return diou_v - alpha * v  # the last one is ciou.
| 121 | + |
| 122 | + |
def iou_loss(pred_boxes: FloatType,
             target_boxes: FloatType,
             iou_type: Text = 'iou') -> tf.Tensor:
  """A unified interface for computing various IoU losses.

  Let B denote the predicted box and B_gt the target (ground truth) box:

    IoU = |B & B_gt| / |B | B_gt|

    GIoU = IoU - |C - B U B_gt| / |C|, where C is the smallest box covering B
      and B_gt.

    DIoU = IoU - E(B, B_gt)^2 / c^2, where E is the Euclidean distance between
      the center points of B and B_gt, and c is the diagonal length of the
      smallest box covering the two boxes.

    CIoU = DIoU - a * v, where a = v / ((1 - IoU) + v) is the trade-off weight
      and v measures the consistency of aspect ratio:
        v = 4 / pi^2 * (arctan(w_gt / h_gt) - arctan(w / h))^2
      with (w_gt, h_gt) and (w, h) the width and height of the target and
      predicted box respectively.

  The last axis of both inputs is split into groups of four coordinates. For
  each group a mask is built that is 0 where the target box has zero area and
  1 elsewhere; the predicted coordinates are multiplied by that mask and the
  group's loss is `mask - metric`, with metric one of {IoU, GIoU, DIoU, CIoU}.

  Args:
    pred_boxes: predicted boxes, with coordinate [y_min, x_min, y_max, x_max]*.
      It can be multiple anchors, with each anchor box has four coordinates.
    target_boxes: target boxes, with coordinate [y_min, x_min, y_max, x_max]*.
      It can be multiple anchors, with each anchor box has four coordinates.
    iou_type: one of ['iou', 'ciou', 'diou', 'giou'].

  Returns:
    IoU loss float `Tensor`: the single group's loss when the last axis holds
    exactly four coordinates, otherwise the sum of the losses of all groups.

  Raises:
    ValueError: if `iou_type` is not one of the supported types.
  """
  if iou_type not in ('iou', 'ciou', 'diou', 'giou'):
    raise ValueError(
        'Unknown loss_type {}, not iou/ciou/diou/giou'.format(iou_type))

  pred_boxes = tf.convert_to_tensor(pred_boxes, tf.float32)
  target_boxes = tf.cast(target_boxes, pred_boxes.dtype)

  # One tensor per coordinate, laid out as (y_min, x_min, y_max, x_max) per
  # anchor along the last axis.
  pred_coords = tf.unstack(pred_boxes, None, axis=-1)
  target_coords = tf.unstack(target_boxes, None, axis=-1)
  assert len(pred_coords) == len(target_coords)
  assert len(pred_coords) % 4 == 0

  per_anchor_losses = []
  for start in range(0, len(pred_coords), 4):
    stop = start + 4
    anchor_target = target_coords[start:stop]

    # Targets whose (signed) area is zero get a mask value of 0.
    t_ymin, t_xmin, t_ymax, t_xmax = anchor_target
    target_area = (t_ymax - t_ymin) * (t_xmax - t_xmin)
    valid = tf.cast(tf.not_equal(target_area, 0), t_ymin.dtype)

    # Loss is written as mask - iou(masked prediction, target).
    anchor_pred = [coord * valid for coord in pred_coords[start:stop]]
    anchor_iou = tf.squeeze(
        _iou_per_anchor(anchor_pred, anchor_target, iou_type))
    per_anchor_losses.append(valid - anchor_iou)

  if len(per_anchor_losses) == 1:
    return per_anchor_losses[0]
  return tf.reduce_sum(tf.stack(per_anchor_losses), 0)
| 186 | + |
0 commit comments