
Commit 841be50

Inference output as 2 tensors: [b, n, 1, 4] and [b, n, cls]
1 parent 56227b1 commit 841be50

8 files changed: +43 additions, -32 deletions


README.md

Lines changed: 3 additions & 2 deletions
```diff
@@ -109,12 +109,13 @@ you can use darknet2pytorch to convert it yourself, or download my converted mod
 
 - Inference output
 
-    Inference output is of shape `[batch, num_boxes, 4 + num_classes]` in which `[batch, num_boxes, 4]` is x_center, y_center, width, height of bounding boxes, and `[batch, num_boxes, num_classes]` is confidences of bounding box for all classes.
+    There are 2 inference outputs.
+    - One is locations of bounding boxes, its shape is `[batch, num_boxes, 1, 4]` which represents x1, y1, x2, y2 of each bounding box.
+    - The other one is scores of bounding boxes which is of shape `[batch, num_boxes, num_classes]` indicating scores of all classes for each bounding box.
 
     Until now, still a small piece of post-processing including NMS is required. We are trying to minimize time and complexity of post-processing.
 
 
-
 # 3. Darknet2ONNX (Evolving)
 
 - **This script is to convert the official pretrained darknet model into ONNX**
```
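A minimal sketch (not part of the commit) of how the two output tensors can be consumed downstream; the shapes, the 0.4 threshold, and the random scores are placeholders:

```python
import numpy as np

# Placeholder shapes for illustration: batch = 1, num_boxes = 3, num_classes = 80.
boxes = np.zeros((1, 3, 1, 4), dtype=np.float32)     # x1, y1, x2, y2 per box
confs = np.random.rand(1, 3, 80).astype(np.float32)  # class scores per box

boxes_xyxy = boxes[:, :, 0, :]   # squeeze the singleton axis -> [batch, num_boxes, 4]
max_conf = confs.max(axis=2)     # best score per box
max_id = confs.argmax(axis=2)    # best class index per box

keep = max_conf[0] > 0.4         # confidence filter; NMS would follow
print(boxes_xyxy[0][keep].shape, max_id[0][keep])
```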

demo_darknet2onnx.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -43,7 +43,7 @@ def detect(session, image_src):
 
     outputs = session.run(None, {input_name: img_in})
 
-    boxes = post_processing(img_in, 0.4, 0.6, outputs[0])
+    boxes = post_processing(img_in, 0.4, 0.6, outputs)
 
     num_classes = 80
     if num_classes == 20:
```
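Because the graph now has two outputs, `session.run` returns a list of two arrays. A self-contained sketch, assuming an exported model file with a hypothetical name and a dummy input:

```python
import numpy as np
import onnxruntime

# Hypothetical model path and input size, for illustration only.
session = onnxruntime.InferenceSession("yolov4_1_3_608_608.onnx")
input_name = session.get_inputs()[0].name
img_in = np.random.rand(1, 3, 608, 608).astype(np.float32)

# Outputs come back in graph order: boxes first, then confs.
boxes, confs = session.run(None, {input_name: img_in})
print(boxes.shape, confs.shape)   # e.g. (1, N, 1, 4) and (1, N, num_classes)
```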

demo_pytorch2onnx.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -31,7 +31,7 @@ def transform_to_onnx(weight_file, batch_size, n_classes, IN_IMAGE_H, IN_IMAGE_W
                       export_params=True,
                       opset_version=11,
                       do_constant_folding=True,
-                      input_names=['input'], output_names=['output'],
+                      input_names=['input'], output_names=['boxes', 'confs'],
                       dynamic_axes=None)
 
     print('Onnx model exporting done')
```
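A standalone sketch of exporting a model that returns a (boxes, confs) tuple with two named ONNX outputs; the tiny module, its fixed box count, and the file name are placeholders, not the repo's detector:

```python
import torch

class TwoOutputNet(torch.nn.Module):
    """Placeholder module returning a (boxes, confs) tuple like the detector."""
    def forward(self, x):
        scale = x.mean()
        boxes = scale * torch.ones(x.shape[0], 10, 1, 4)   # [batch, num_boxes, 1, 4]
        confs = scale * torch.ones(x.shape[0], 10, 80)     # [batch, num_boxes, num_classes]
        return boxes, confs

model = TwoOutputNet()
dummy = torch.randn(1, 3, 608, 608)

# Each element of the returned tuple gets its own name in the ONNX graph.
torch.onnx.export(model, dummy, "two_outputs.onnx",
                  export_params=True,
                  opset_version=11,
                  do_constant_folding=True,
                  input_names=['input'], output_names=['boxes', 'confs'],
                  dynamic_axes=None)
```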

demo_trt.py

Lines changed: 3 additions & 4 deletions
```diff
@@ -162,17 +162,16 @@ def detect(context, buffers, image_src, image_size, num_classes):
 
     print('Len of outputs: ', len(trt_outputs))
 
-    trt_output = trt_outputs[0].reshape(1, -1, 4 + num_classes)
+    trt_outputs[0] = trt_outputs[0].reshape(1, -1, 1, 4)
+    trt_outputs[1] = trt_outputs[1].reshape(1, -1, num_classes)
 
     tb = time.time()
 
-    print(trt_output.shape)
-
     print('-----------------------------------')
     print(' TRT inference time: %f' % (tb - ta))
     print('-----------------------------------')
 
-    boxes = post_processing(img_in, 0.4, 0.6, trt_output)
+    boxes = post_processing(img_in, 0.4, 0.6, trt_outputs)
 
     return boxes
 
```
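TensorRT host buffers come back as flat float arrays, so each one is reshaped into the tensor layout it represents. A small numpy sketch of just that step, with the box count and class count chosen arbitrarily:

```python
import numpy as np

num_boxes, num_classes = 22743, 80

# Stand-ins for the two flat host buffers copied back from the GPU.
trt_outputs = [
    np.zeros(1 * num_boxes * 1 * 4, dtype=np.float32),
    np.zeros(1 * num_boxes * num_classes, dtype=np.float32),
]

# Same reshape as in demo_trt.py; batch size is fixed to 1 here.
trt_outputs[0] = trt_outputs[0].reshape(1, -1, 1, 4)
trt_outputs[1] = trt_outputs[1].reshape(1, -1, num_classes)
print(trt_outputs[0].shape, trt_outputs[1].shape)
```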

tool/darknet2onnx.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -24,7 +24,7 @@ def transform_to_onnx(cfgfile, weightfile, batch_size=1):
                       export_params=True,
                       opset_version=11,
                       do_constant_folding=True,
-                      input_names=['input'], output_names=['output'],
+                      input_names=['input'], output_names=['boxes', 'confs'],
                       dynamic_axes=None)
 
     print('Onnx model exporting done')
```
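One way to check that the exported graph really carries the two renamed outputs is to inspect it with the onnx package; a short sketch assuming a hypothetical output file name:

```python
import onnx

# Hypothetical file name; use whatever transform_to_onnx actually wrote out.
model = onnx.load("yolov4_1_3_608_608.onnx")
print([out.name for out in model.graph.output])   # expected: ['boxes', 'confs']
```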

tool/torch_utils.py

Lines changed: 2 additions & 4 deletions
```diff
@@ -56,14 +56,12 @@ def get_region_boxes(boxes_and_confs):
         boxes_list.append(item[0])
         confs_list.append(item[1])
 
-    # boxes: [batch, num1 + num2 + num3, 4]
+    # boxes: [batch, num1 + num2 + num3, 1, 4]
     # confs: [batch, num1 + num2 + num3, num_classes]
     boxes = torch.cat(boxes_list, dim=1)
     confs = torch.cat(confs_list, dim=1)
-
-    output = torch.cat((boxes, confs), dim=2)
 
-    return output
+    return [boxes, confs]
 
 
 def convert2cpu(gpu_matrix):
```
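Each YOLO head contributes a (boxes, confs) pair, and the pairs are concatenated along the box dimension instead of being fused into one tensor. A toy sketch with made-up head sizes:

```python
import torch

num_classes = 80
# Three YOLO heads with made-up box counts per head.
boxes_and_confs = [
    (torch.zeros(1, n, 1, 4), torch.zeros(1, n, num_classes))
    for n in (4332, 1083, 300)
]

boxes_list = [item[0] for item in boxes_and_confs]
confs_list = [item[1] for item in boxes_and_confs]

boxes = torch.cat(boxes_list, dim=1)   # [1, 4332 + 1083 + 300, 1, 4]
confs = torch.cat(confs_list, dim=1)   # [1, 4332 + 1083 + 300, num_classes]
print(boxes.shape, confs.shape)
```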

tool/utils.py

Lines changed: 15 additions & 12 deletions
```diff
@@ -62,8 +62,8 @@ def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False):
     # print(boxes.shape)
     x1 = boxes[:, 0]
     y1 = boxes[:, 1]
-    x2 = boxes[:, 0] + boxes[:, 2]
-    y2 = boxes[:, 1] + boxes[:, 3]
+    x2 = boxes[:, 2]
+    y2 = boxes[:, 3]
 
     areas = (x2 - x1) * (y2 - y1)
     order = confs.argsort()[::-1]
@@ -113,10 +113,10 @@ def get_color(c, x, max_val):
     height = img.shape[0]
     for i in range(len(boxes)):
         box = boxes[i]
-        x1 = int((box[0] - box[2] / 2.0) * width)
-        y1 = int((box[1] - box[3] / 2.0) * height)
-        x2 = int((box[0] + box[2] / 2.0) * width)
-        y2 = int((box[1] + box[3] / 2.0) * height)
+        x1 = int(box[0] * width)
+        y1 = int(box[1] * height)
+        x2 = int(box[2] * width)
+        y2 = int(box[3] * height)
 
         if color:
             rgb = color
@@ -171,16 +171,19 @@ def post_processing(img, conf_thresh, nms_thresh, output):
     # strides = [8, 16, 32]
     # anchor_step = len(anchors) // num_anchors
 
+    # [batch, num, 1, 4]
+    box_array = output[0]
+    # [batch, num, num_classes]
+    confs = output[1]
+
     t1 = time.time()
 
-    if type(output).__name__ != 'ndarray':
-        output = output.cpu().detach().numpy()
+    if type(box_array).__name__ != 'ndarray':
+        box_array = box_array.cpu().detach().numpy()
+        confs = confs.cpu().detach().numpy()
 
     # [batch, num, 4]
-    box_array = output[:, :, :4]
-
-    # [batch, num, num_classes]
-    confs = output[:, :, 4:]
+    box_array = box_array[:, :, 0]
 
     # [batch, num, num_classes] --> [batch, num]
     max_conf = np.max(confs, axis=2)
```
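With boxes already in corner form, NMS can use x1, y1, x2, y2 directly instead of converting from center/size. A minimal IoU sketch in that format (a standalone helper, not the repo's exact nms_cpu):

```python
import numpy as np

def iou_xyxy(box, boxes):
    # box: [4], boxes: [N, 4], both as x1, y1, x2, y2 in the same coordinate scale.
    xx1 = np.maximum(box[0], boxes[:, 0])
    yy1 = np.maximum(box[1], boxes[:, 1])
    xx2 = np.minimum(box[2], boxes[:, 2])
    yy2 = np.minimum(box[3], boxes[:, 3])
    inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
    area = (box[2] - box[0]) * (box[3] - box[1])
    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    return inter / (area + areas - inter)

print(iou_xyxy(np.array([0.1, 0.1, 0.5, 0.5]),
               np.array([[0.1, 0.1, 0.5, 0.5], [0.3, 0.3, 0.7, 0.7]])))
```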

tool/yolo_layer.py

Lines changed: 17 additions & 7 deletions
```diff
@@ -94,16 +94,20 @@ def yolo_forward_alternative(output, conf_thresh, num_classes, anchors, num_anch
     print(anchor_tensor.size())
     bwh *= anchor_tensor
 
-    # Shape: [batch, num_anchors, 4, H * W] --> [batch, num_anchors * H * W, 4]
-    boxes = torch.cat((bxy, bwh), dim=2).permute(0, 1, 3, 2).reshape(batch, num_anchors * H * W, 4)
+    bx1y1 = bxy - bwh * 0.5
+    bx2y2 = bxy + bwh
+
+    # Shape: [batch, num_anchors, 4, H * W] --> [batch, num_anchors * H * W, 1, 4]
+    boxes = torch.cat((bx1y1, bx2y2), dim=2).permute(0, 1, 3, 2).reshape(batch, num_anchors * H * W, 1, 4)
+    # boxes = boxes.repeat(1, 1, num_classes, 1)
 
     print(normal_tensor.size())
     boxes *= normal_tensor
 
     det_confs = det_confs.view(batch, num_anchors * H * W, 1)
     confs = cls_confs * det_confs
 
-    # boxes: [batch, num_anchors * H * W, 4]
+    # boxes: [batch, num_anchors * H * W, 1, 4]
     # confs: [batch, num_anchors * H * W, num_classes]
 
     return boxes, confs
@@ -231,17 +235,23 @@ def yolo_forward(output, conf_thresh, num_classes, anchors, num_anchors, scale_x
     bw = bx_bw[:, num_anchors:].view(batch, num_anchors * H * W, 1)
     bh = by_bh[:, num_anchors:].view(batch, num_anchors * H * W, 1)
 
-    # Shape: [batch, num_anchors * h * w, 4]
-    boxes = torch.cat((bx, by, bw, bh), dim=2).view(batch, num_anchors * H * W, 4)
+    bx1 = bx - bw * 0.5
+    by1 = by - bh * 0.5
+    bx2 = bx1 + bw
+    by2 = by1 + bh
+
+    # Shape: [batch, num_anchors * h * w, 4] -> [batch, num_anchors * h * w, 1, 4]
+    boxes = torch.cat((bx1, by1, bx2, by2), dim=2).view(batch, num_anchors * H * W, 1, 4)
+    # boxes = boxes.repeat(1, 1, num_classes, 1)
 
-    # boxes: [batch, num_anchors * H * W, num_classes, 4]
+    # boxes: [batch, num_anchors * H * W, 1, 4]
     # cls_confs: [batch, num_anchors * H * W, num_classes]
     # det_confs: [batch, num_anchors * H * W]
 
     det_confs = det_confs.view(batch, num_anchors * H * W, 1)
     confs = cls_confs * det_confs
 
-    # boxes: [batch, num_anchors * H * W, 4]
+    # boxes: [batch, num_anchors * H * W, 1, 4]
     # confs: [batch, num_anchors * H * W, num_classes]
 
     return boxes, confs
```
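The core of this change, isolated as a sketch: center/size predictions become corner boxes and gain a singleton axis so the final layout is [batch, N, 1, 4]. Shapes here are arbitrary and the tensors are random stand-ins for the decoded predictions:

```python
import torch

batch, n = 1, 6
# Made-up center-format predictions, each of shape [batch, n, 1].
bx, by = torch.rand(batch, n, 1), torch.rand(batch, n, 1)
bw, bh = torch.rand(batch, n, 1), torch.rand(batch, n, 1)

# Center/size -> corners, as in yolo_forward.
bx1 = bx - bw * 0.5
by1 = by - bh * 0.5
bx2 = bx1 + bw
by2 = by1 + bh

boxes = torch.cat((bx1, by1, bx2, by2), dim=2).view(batch, n, 1, 4)
print(boxes.shape)   # torch.Size([1, 6, 1, 4])
```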
