Commit 1ff5a10

Add support for dynamic batch size
1 parent 6d015a4 commit 1ff5a10

File tree

8 files changed: +226 -141 lines changed


README.md

Lines changed: 32 additions & 13 deletions

````diff
@@ -89,7 +89,6 @@ See following sections for more details of conversions.
 | ------------------- | ----------: | ----------: | ----------: | ----------: | ----------: | ----------: |
 | DarkNet (YOLOv4 paper)| 0.471 | 0.710 | 0.510 | 0.278 | 0.525 | 0.636 |
 | Pytorch (TianXiaomo)| 0.466 | 0.704 | 0.505 | 0.267 | 0.524 | 0.629 |
-| ONNX | incoming | incoming | incoming | incoming | incoming | incoming |
 | TensorRT FP32 + BatchedNMSPlugin | 0.472| 0.708 | 0.511 | 0.273 | 0.530 | 0.637 |
 | TensorRT FP16 + BatchedNMSPlugin | 0.472| 0.708 | 0.511 | 0.273 | 0.530 | 0.636 |
 
@@ -99,7 +98,6 @@ See following sections for more details of conversions.
 | ------------------- | ----------: | ----------: | ----------: | ----------: | ----------: | ----------: |
 | DarkNet (YOLOv4 paper)| 0.412 | 0.628 | 0.443 | 0.204 | 0.444 | 0.560 |
 | Pytorch (TianXiaomo)| 0.404 | 0.615 | 0.436 | 0.196 | 0.438 | 0.552 |
-| ONNX | incoming | incoming | incoming | incoming | incoming | incoming |
 | TensorRT FP32 + BatchedNMSPlugin | 0.412| 0.625 | 0.445 | 0.200 | 0.446 | 0.564 |
 | TensorRT FP16 + BatchedNMSPlugin | 0.412| 0.625 | 0.445 | 0.200 | 0.446 | 0.563 |
 
@@ -163,10 +161,11 @@ Until now, still a small piece of post-processing including NMS is required. We
 python demo_darknet2onnx.py <cfgFile> <weightFile> <imageFile> <batchSize>
 ```
 
-This script will generate 2 ONNX models.
+## 3.1 Dynamic or static batch size
 
-- One is for running the demo (batch_size=1)
-- The other one is what you want to generate (batch_size=batchSize)
+- **A positive batch size generates an ONNX model with a static batch size; otherwise the batch size is dynamic**
+- A dynamic batch size generates only one ONNX model
+- A static batch size generates 2 ONNX models; the extra one is for running the demo (batch_size=1)
 
 # 4. Pytorch2ONNX (Evolving)
 
@@ -195,34 +194,54 @@ Until now, still a small piece of post-processing including NMS is required. We
 python demo_pytorch2onnx.py yolov4.pth dog.jpg 8 80 416 416
 ```
 
-This script will generate 2 ONNX models.
+## 4.1 Dynamic or static batch size
 
-- One is for running the demo (batch_size=1)
-- The other one is what you want to generate (batch_size=batch_size)
+- **A positive batch size generates an ONNX model with a static batch size; otherwise the batch size is dynamic**
+- A dynamic batch size generates only one ONNX model
+- A static batch size generates 2 ONNX models; the extra one is for running the demo (batch_size=1)
 
 
 # 5. ONNX2TensorRT (Evolving)
 
 - **TensorRT version Recommended: 7.0, 7.1**
 
+## 5.1 Convert from ONNX with a static batch size
+
 - **Run the following command to convert the YOLOv4 ONNX model into a TensorRT engine**
 
 ```sh
 trtexec --onnx=<onnx_file> --explicitBatch --saveEngine=<tensorRT_engine_file> --workspace=<size_in_megabytes> --fp16
 ```
 - Note: If you want to use int8 mode in conversion, extra int8 calibration is needed.
 
-- **Run the demo**
+## 5.2 Convert from ONNX with a dynamic batch size
+
+- **Run the following command to convert the YOLOv4 ONNX model into a TensorRT engine**
 
 ```sh
-python demo_trt.py <tensorRT_engine_file> <input_image> <input_H> <input_W>
+trtexec --onnx=<onnx_file> \
+    --minShapes=input:<shape_of_min_batch> --optShapes=input:<shape_of_opt_batch> --maxShapes=input:<shape_of_max_batch> \
+    --workspace=<size_in_megabytes> --saveEngine=<tensorRT_engine_file> --fp16
 ```
+- For example:
+
+```sh
+trtexec --onnx=yolov4_-1_3_320_512_dynamic.onnx \
+    --minShapes=input:1x3x320x512 --optShapes=input:4x3x320x512 --maxShapes=input:8x3x320x512 \
+    --workspace=2048 --saveEngine=yolov4_-1_3_320_512_dynamic.engine --fp16
+```
+
+## 5.3 Run the demo
+
+```sh
+python demo_trt.py <tensorRT_engine_file> <input_image> <input_H> <input_W>
+```
 
-- This demo here only works when batchSize=1, but you can update this demo a little for batched inputs.
+- This demo only works with batchSize=1 (for a dynamic engine, 1 must lie within the engine's dynamic range), but you can update it a little for other static or dynamic batch sizes.
 
-- Note1: input_H and input_W should agree with the input size in the original ONNX file.
+- Note1: input_H and input_W should agree with the input size in the original ONNX file.
 
-- Note2: extra NMS operations are needed for the tensorRT output. This demo uses python NMS code from `tool/utils.py`.
+- Note2: extra NMS operations are needed for the TensorRT output. This demo uses Python NMS code from `tool/utils.py`.
 
 
 # 6. ONNX2Tensorflow
````
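
Before building a TensorRT engine, it is worth confirming that an exported dynamic-batch ONNX model really accepts more than one batch size. A minimal onnxruntime sketch (not part of this commit; the file name simply follows the `yolov4_-1_3_<H>_<W>_dynamic.onnx` convention above, with H=320 and W=512):

```python
import numpy as np
import onnxruntime

# Dynamic-batch model exported by demo_darknet2onnx.py / demo_pytorch2onnx.py
session = onnxruntime.InferenceSession("yolov4_-1_3_320_512_dynamic.onnx")
input_name = session.get_inputs()[0].name  # "input"

for batch in (1, 4, 8):
    x = np.random.rand(batch, 3, 320, 512).astype(np.float32)
    boxes, confs = session.run(None, {input_name: x})
    print(batch, boxes.shape, confs.shape)  # leading dimension should equal batch
```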

demo_darknet2onnx.py

Lines changed: 7 additions & 4 deletions

```diff
@@ -12,10 +12,13 @@
 
 def main(cfg_file, weight_file, image_path, batch_size):
 
-    # Transform to onnx as specified batch size
-    transform_to_onnx(cfg_file, weight_file, batch_size)
-    # Transform to onnx for demo
-    onnx_path_demo = transform_to_onnx(cfg_file, weight_file, 1)
+    if batch_size <= 0:
+        onnx_path_demo = transform_to_onnx(cfg_file, weight_file, batch_size)
+    else:
+        # Transform to onnx as specified batch size
+        transform_to_onnx(cfg_file, weight_file, batch_size)
+        # Transform to onnx for demo
+        onnx_path_demo = transform_to_onnx(cfg_file, weight_file, 1)
 
     session = onnxruntime.InferenceSession(onnx_path_demo)
     # session = onnx.load(onnx_path)
```
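
In short, a non-positive batch size now short-circuits to a single dynamic export, while a positive one keeps the old two-model behavior. A small usage sketch (hypothetical cfg/weights paths; H and W come from the cfg file):

```python
from tool.darknet2onnx import transform_to_onnx

# batch_size <= 0: one dynamic model, named yolov4_-1_3_<H>_<W>_dynamic.onnx
dynamic_path = transform_to_onnx('yolov4.cfg', 'yolov4.weights', -1)

# batch_size > 0: a static model (yolov4_8_3_<H>_<W>_static.onnx) for
# deployment, plus a batch-1 static model that the demo actually runs
transform_to_onnx('yolov4.cfg', 'yolov4.weights', 8)
demo_path = transform_to_onnx('yolov4.cfg', 'yolov4.weights', 1)
```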

demo_pytorch2onnx.py

Lines changed: 48 additions & 21 deletions

```diff
@@ -19,32 +19,59 @@ def transform_to_onnx(weight_file, batch_size, n_classes, IN_IMAGE_H, IN_IMAGE_W
     pretrained_dict = torch.load(weight_file, map_location=torch.device('cuda'))
     model.load_state_dict(pretrained_dict)
 
-    x = torch.randn((batch_size, 3, IN_IMAGE_H, IN_IMAGE_W), requires_grad=True) # .cuda()
-
-    onnx_file_name = "yolov4_{}_3_{}_{}.onnx".format(batch_size, IN_IMAGE_H, IN_IMAGE_W)
-
-    # Export the model
-    print('Export the onnx model ...')
-    torch.onnx.export(model,
-                      x,
-                      onnx_file_name,
-                      export_params=True,
-                      opset_version=11,
-                      do_constant_folding=True,
-                      input_names=['input'], output_names=['boxes', 'confs'],
-                      dynamic_axes=None)
-
-    print('Onnx model exporting done')
-    return onnx_file_name
+    input_names = ["input"]
+    output_names = ['boxes', 'confs']
+
+    dynamic = False
+    if batch_size <= 0:
+        dynamic = True
+
+    if dynamic:
+        x = torch.randn((1, 3, IN_IMAGE_H, IN_IMAGE_W), requires_grad=True)
+        onnx_file_name = "yolov4_-1_3_{}_{}_dynamic.onnx".format(IN_IMAGE_H, IN_IMAGE_W)
+        dynamic_axes = {"input": {0: "batch_size"}, "boxes": {0: "batch_size"}, "confs": {0: "batch_size"}}
+        # Export the model
+        print('Export the onnx model ...')
+        torch.onnx.export(model,
+                          x,
+                          onnx_file_name,
+                          export_params=True,
+                          opset_version=11,
+                          do_constant_folding=True,
+                          input_names=input_names, output_names=output_names,
+                          dynamic_axes=dynamic_axes)
+
+        print('Onnx model exporting done')
+        return onnx_file_name
+
+    else:
+        x = torch.randn((batch_size, 3, IN_IMAGE_H, IN_IMAGE_W), requires_grad=True)
+        onnx_file_name = "yolov4_{}_3_{}_{}_static.onnx".format(batch_size, IN_IMAGE_H, IN_IMAGE_W)
+        # Export the model
+        print('Export the onnx model ...')
+        torch.onnx.export(model,
+                          x,
+                          onnx_file_name,
+                          export_params=True,
+                          opset_version=11,
+                          do_constant_folding=True,
+                          input_names=input_names, output_names=output_names,
+                          dynamic_axes=None)
+
+        print('Onnx model exporting done')
+        return onnx_file_name
 
 
 
 def main(weight_file, image_path, batch_size, n_classes, IN_IMAGE_H, IN_IMAGE_W):
 
-    # Transform to onnx as specified batch size
-    transform_to_onnx(weight_file, batch_size, n_classes, IN_IMAGE_H, IN_IMAGE_W)
-    # Transform to onnx for demo
-    onnx_path_demo = transform_to_onnx(weight_file, 1, n_classes, IN_IMAGE_H, IN_IMAGE_W)
+    if batch_size <= 0:
+        onnx_path_demo = transform_to_onnx(weight_file, batch_size, n_classes, IN_IMAGE_H, IN_IMAGE_W)
+    else:
+        # Transform to onnx as specified batch size
+        transform_to_onnx(weight_file, batch_size, n_classes, IN_IMAGE_H, IN_IMAGE_W)
+        # Transform to onnx for demo
+        onnx_path_demo = transform_to_onnx(weight_file, 1, n_classes, IN_IMAGE_H, IN_IMAGE_W)
 
     session = onnxruntime.InferenceSession(onnx_path_demo)
     # session = onnx.load(onnx_path)
```
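
One way to check that `dynamic_axes` took effect is to inspect the exported graph: dimension 0 of the input should carry the symbolic name "batch_size" rather than a fixed integer. A short sketch (assuming a 416x416 export and the `onnx` package):

```python
import onnx

model = onnx.load("yolov4_-1_3_416_416_dynamic.onnx")
dim0 = model.graph.input[0].type.tensor_type.shape.dim[0]
print(dim0.dim_param or dim0.dim_value)  # expected: "batch_size"
```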

demo_trt.py

Lines changed: 13 additions & 3 deletions

```diff
@@ -73,13 +73,20 @@ def __repr__(self):
         return self.__str__()
 
 # Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
-def allocate_buffers(engine):
+def allocate_buffers(engine, batch_size):
     inputs = []
     outputs = []
     bindings = []
     stream = cuda.Stream()
     for binding in engine:
-        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
+
+        size = trt.volume(engine.get_binding_shape(binding)) * batch_size
+        dims = engine.get_binding_shape(binding)
+
+        # in case batch dimension is -1 (dynamic)
+        if dims[0] < 0:
+            size *= -1
+
         dtype = trt.nptype(engine.get_binding_dtype(binding))
         # Allocate host and device buffers
         host_mem = cuda.pagelocked_empty(size, dtype)
```
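
The sign flip above is terse, so a note on the arithmetic: for a dynamic engine, `get_binding_shape` reports the batch dimension as -1, which makes `trt.volume` negative; multiplying by `batch_size` and then by -1 yields `batch_size` copies of the per-sample volume. The same logic as a self-contained sketch (illustration only, no engine required):

```python
def buffer_size(binding_shape, batch_size):
    # Mirrors allocate_buffers above: the volume is negative when dim 0 is -1.
    volume = 1
    for d in binding_shape:
        volume *= d
    size = volume * batch_size
    if binding_shape[0] < 0:  # dynamic batch dimension
        size *= -1
    return size

assert buffer_size((-1, 3, 320, 512), 4) == 4 * 3 * 320 * 512  # dynamic engine
assert buffer_size((8, 3, 320, 512), 1) == 8 * 3 * 320 * 512   # static engine
```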

```diff
@@ -112,7 +119,10 @@ def do_inference(context, bindings, inputs, outputs, stream):
 
 def main(engine_path, image_path, image_size):
     with get_engine(engine_path) as engine, engine.create_execution_context() as context:
-        buffers = allocate_buffers(engine)
+        buffers = allocate_buffers(engine, 1)
+        IN_IMAGE_H, IN_IMAGE_W = image_size
+        context.set_binding_shape(0, (1, 3, IN_IMAGE_H, IN_IMAGE_W))
+
         image_src = cv2.imread(image_path)
 
         num_classes = 80
```
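
With a dynamic engine, the concrete input shape must be set on the execution context before inference, which is what `set_binding_shape` does; the demo pins it to batch 1. A hedged sketch of how this could be generalized to larger batches (`allocate_buffers` is the helper from demo_trt.py above, and `n` must lie within the min/max range the engine was built with):

```python
def prepare_batch(engine, context, n, in_h, in_w):
    """Size the buffers and fix the input shape for a batch of n images."""
    buffers = allocate_buffers(engine, n)             # helper defined above
    context.set_binding_shape(0, (n, 3, in_h, in_w))  # binding 0 is "input"
    return buffers
```

The n preprocessed images would then be stacked into one contiguous float32 array before being copied into the pagelocked host buffer.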

models.py

Lines changed: 10 additions & 7 deletions

```diff
@@ -20,17 +20,20 @@ def __init__(self):
 
     def forward(self, x, target_size, inference=False):
         assert (x.data.dim() == 4)
-        _, _, tH, tW = target_size
+        # _, _, tH, tW = target_size
 
         if inference:
-            B = x.data.size(0)
-            C = x.data.size(1)
-            H = x.data.size(2)
-            W = x.data.size(3)
 
-            return x.view(B, C, H, 1, W, 1).expand(B, C, H, tH // H, W, tW // W).contiguous().view(B, C, tH, tW)
+            # B = x.data.size(0)
+            # C = x.data.size(1)
+            # H = x.data.size(2)
+            # W = x.data.size(3)
+
+            return x.view(x.size(0), x.size(1), x.size(2), 1, x.size(3), 1).\
+                    expand(x.size(0), x.size(1), x.size(2), target_size[2] // x.size(2), x.size(3), target_size[3] // x.size(3)).\
+                    contiguous().view(x.size(0), x.size(1), target_size[2], target_size[3])
         else:
-            return F.interpolate(x, size=(tH, tW), mode='nearest')
+            return F.interpolate(x, size=(target_size[2], target_size[3]), mode='nearest')
 
 
 class Conv_Bn_Activation(nn.Module):
```
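
Two things changed in this forward pass: the B/C/H/W locals read through `x.data.size(...)` are gone, and the target size comes straight from `target_size[2]` and `target_size[3]`. Keeping every shape access on `x.size(...)` (rather than on the detached `.data`) appears to be what lets the ONNX exporter keep the batch dimension symbolic. The view/expand chain itself is ordinary nearest-neighbor upsampling whenever the target is an integer multiple of the input, which a quick standalone check confirms (my own sketch, not repo code):

```python
import torch
import torch.nn.functional as F

x = torch.randn(2, 3, 4, 6)
tH, tW = 8, 12  # integer multiples of the input height and width

up = x.view(x.size(0), x.size(1), x.size(2), 1, x.size(3), 1) \
      .expand(x.size(0), x.size(1), x.size(2), tH // x.size(2),
              x.size(3), tW // x.size(3)) \
      .contiguous().view(x.size(0), x.size(1), tH, tW)

assert torch.equal(up, F.interpolate(x, size=(tH, tW), mode='nearest'))
```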

tool/darknet2onnx.py

Lines changed: 10 additions & 9 deletions

```diff
@@ -3,23 +3,23 @@
 from tool.darknet2pytorch import Darknet
 
 
-def transform_to_onnx(cfgfile, weightfile, batch_size=1, dynamic=False):
+def transform_to_onnx(cfgfile, weightfile, batch_size=1):
     model = Darknet(cfgfile)
 
     model.print_network()
     model.load_weights(weightfile)
     print('Loading weights from %s... Done!' % (weightfile))
 
-    # model.cuda()
+    dynamic = False
+    if batch_size <= 0:
+        dynamic = True
 
-    x = torch.randn((batch_size, 3, model.height, model.width), requires_grad=True) # .cuda()
+    input_names = ["input"]
+    output_names = ['boxes', 'confs']
 
     if dynamic:
-
-        onnx_file_name = "yolov4_{}_3_{}_{}_dyna.onnx".format(batch_size, model.height, model.width)
-        input_names = ["input"]
-        output_names = ['boxes', 'confs']
-
+        x = torch.randn((1, 3, model.height, model.width), requires_grad=True)
+        onnx_file_name = "yolov4_-1_3_{}_{}_dynamic.onnx".format(model.height, model.width)
         dynamic_axes = {"input": {0: "batch_size"}, "boxes": {0: "batch_size"}, "confs": {0: "batch_size"}}
         # Export the model
         print('Export the onnx model ...')
@@ -36,14 +36,15 @@ def transform_to_onnx(cfgfile, weightfile, batch_size=1, dynamic=False):
         return onnx_file_name
 
     else:
+        x = torch.randn((batch_size, 3, model.height, model.width), requires_grad=True)
         onnx_file_name = "yolov4_{}_3_{}_{}_static.onnx".format(batch_size, model.height, model.width)
         torch.onnx.export(model,
                           x,
                           onnx_file_name,
                           export_params=True,
                           opset_version=11,
                           do_constant_folding=True,
-                          input_names=['input'], output_names=['boxes', 'confs'],
+                          input_names=input_names, output_names=output_names,
                           dynamic_axes=None)
 
         print('Onnx model exporting done')
```
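
For reference, the `dynamic_axes` mapping is the only thing that distinguishes the two export paths: it names which axis of each input and output is allowed to vary at runtime. A minimal standalone illustration with a stand-in module (hypothetical names; any single-input network behaves the same way):

```python
import torch

net = torch.nn.Conv2d(3, 8, kernel_size=3, padding=1)
dummy = torch.randn(1, 3, 32, 32)

torch.onnx.export(net, dummy, "conv_dynamic.onnx",
                  export_params=True, opset_version=11,
                  input_names=["input"], output_names=["out"],
                  dynamic_axes={"input": {0: "batch_size"},
                                "out": {0: "batch_size"}})
```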

tool/darknet2pytorch.py

Lines changed: 13 additions & 21 deletions

```diff
@@ -55,15 +55,12 @@ def __init__(self, stride=2):
         self.stride = stride
 
     def forward(self, x):
-        stride = self.stride
         assert (x.data.dim() == 4)
-        B = x.data.size(0)
-        C = x.data.size(1)
-        H = x.data.size(2)
-        W = x.data.size(3)
-        ws = stride
-        hs = stride
-        x = x.view(B, C, H, 1, W, 1).expand(B, C, H, stride, W, stride).contiguous().view(B, C, H * stride, W * stride)
+
+        x = x.view(x.size(0), x.size(1), x.size(2), 1, x.size(3), 1).\
+                expand(x.size(0), x.size(1), x.size(2), self.stride, x.size(3), self.stride).contiguous().\
+                view(x.size(0), x.size(1), x.size(2) * self.stride, x.size(3) * self.stride)
+
         return x
 
 
@@ -73,14 +70,9 @@ def __init__(self, stride):
         self.stride = stride
 
     def forward(self, x):
-        x_numpy = x.cpu().detach().numpy()
-        H = x_numpy.shape[2]
-        W = x_numpy.shape[3]
-
-        H = H * self.stride
-        W = W * self.stride
+        assert (x.data.dim() == 4)
 
-        out = F.interpolate(x, size=(H, W), mode='nearest')
+        out = F.interpolate(x, size=(x.size(2) * self.stride, x.size(3) * self.stride), mode='nearest')
         return out
 
 
@@ -246,15 +238,15 @@ def create_network(self, blocks):
         conv_id = 0
         for block in blocks:
             if block['type'] == 'net':
-                prev_filters = int(float(block['channels']))
+                prev_filters = int(block['channels'])
                 continue
             elif block['type'] == 'convolutional':
                 conv_id = conv_id + 1
-                batch_normalize = int(float(block['batch_normalize']))
-                filters = int(float(block['filters']))
-                kernel_size = int(float(block['size']))
-                stride = int(float(block['stride']))
-                is_pad = int(float(block['pad']))
+                batch_normalize = int(block['batch_normalize'])
+                filters = int(block['filters'])
+                kernel_size = int(block['size'])
+                stride = int(block['stride'])
+                is_pad = int(block['pad'])
                 pad = (kernel_size - 1) // 2 if is_pad else 0
                 activation = block['activation']
                 model = nn.Sequential()
```
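
A note on the `Upsample_interpolate` rewrite above: the old `x.cpu().detach().numpy()` round-trip forced a device-to-host copy on every forward pass and hid the spatial sizes from the tracer, while `x.size(2)` and `x.size(3)` provide the same numbers without leaving the tensor API. A quick sketch (my own, under those assumptions) showing that a traced module of this form still accepts a different batch size, which is the property the dynamic-batch export relies on:

```python
import torch
import torch.nn.functional as F

class Up(torch.nn.Module):
    def __init__(self, stride=2):
        super().__init__()
        self.stride = stride

    def forward(self, x):
        return F.interpolate(x, size=(x.size(2) * self.stride,
                                      x.size(3) * self.stride), mode='nearest')

traced = torch.jit.trace(Up(), torch.randn(1, 3, 8, 8))
print(traced(torch.randn(4, 3, 8, 8)).shape)  # torch.Size([4, 3, 16, 16])
```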
