Merge pull request #163 from CoinCheung/dev

CoinCheung · web-flow · commit ad67baa4d779 · 2021-07-13T17:35:01.000+08:00
more test on the recent modifications
diff --git a/README.md b/README.md
@@ -6,13 +6,14 @@ My implementation of [BiSeNetV1](https://arxiv.org/abs/1808.00897) and [BiSeNetV
 mIOUs and fps on cityscapes val set:
 | none | ss | ssc | msf | mscf | fps(fp16/fp32) | link |
 |------|:--:|:---:|:---:|:----:|:---:|:----:|
-| bisenetv1 | 75.10 | 76.90 | 77.22 | 78.73 | 60/19 | [download](https://github.com/CoinCheung/BiSeNet/releases/download/0.0.0/model_final_v1_city.pth) |
-| bisenetv2 | 74.95 | 75.58 | 76.53 | 77.08 | 50/16 | [download](https://github.com/CoinCheung/BiSeNet/releases/download/0.0.0/model_final_v2_city.pth) |
+| bisenetv1 | 75.44 | 76.94 | 77.45 | 78.86 | 68/23 | [download](https://github.com/CoinCheung/BiSeNet/releases/download/0.0.0/model_final_v1_city_new.pth) |
+| bisenetv2 | 74.95 | 75.58 | 76.53 | 77.08 | 59/21 | [download](https://github.com/CoinCheung/BiSeNet/releases/download/0.0.0/model_final_v2_city.pth) |
+
 
 mIOUs on cocostuff val2017 set:
 | none | ss | ssc | msf | mscf | link |
 |------|:--:|:---:|:---:|:----:|:----:|
-| bisenetv1 | 31.89 | 31.62 | 32.81 | 32.72 | [download](https://github.com/CoinCheung/BiSeNet/releases/download/0.0.0/model_final_v1_coco.pth) |
+| bisenetv1 | 31.49 | 31.42 | 32.46 | 32.55 | [download](https://github.com/CoinCheung/BiSeNet/releases/download/0.0.0/model_final_v1_coco_new.pth) |
 | bisenetv2 | 30.49 | 30.55 | 31.81 | 31.73 | [download](https://github.com/CoinCheung/BiSeNet/releases/download/0.0.0/model_final_v2_coco.pth) |
 
 > Where **ss** means single scale evaluation, **ssc** means single scale crop evaluation, **msf** means multi-scale evaluation with flip augment, and **mscf** means multi-scale crop evaluation with flip evaluation. The eval scales and crop size of multi-scales evaluation can be found in [configs](./configs/).
diff --git a/dist_train.sh b/dist_train.sh
diff --git a/lib/models/bisenetv1.py b/lib/models/bisenetv1.py
@@ -99,15 +99,16 @@ def __init__(self, in_chan, out_chan, *args, **kwargs):
         self.conv = ConvBNReLU(in_chan, out_chan, ks=3, stride=1, padding=1)
         self.conv_atten = nn.Conv2d(out_chan, out_chan, kernel_size= 1, bias=False)
         self.bn_atten = BatchNorm2d(out_chan)
-        self.sigmoid_atten = nn.Sigmoid()
+        #  self.sigmoid_atten = nn.Sigmoid()
         self.init_weight()
 
     def forward(self, x):
         feat = self.conv(x)
         atten = torch.mean(feat, dim=(2, 3), keepdim=True)
         atten = self.conv_atten(atten)
         atten = self.bn_atten(atten)
-        atten = self.sigmoid_atten(atten)
+        #  atten = self.sigmoid_atten(atten)
+        atten = atten.sigmoid()
         out = torch.mul(feat, atten)
         return out
 
@@ -206,30 +207,39 @@ class FeatureFusionModule(nn.Module):
     def __init__(self, in_chan, out_chan, *args, **kwargs):
         super(FeatureFusionModule, self).__init__()
         self.convblk = ConvBNReLU(in_chan, out_chan, ks=1, stride=1, padding=0)
-        self.conv1 = nn.Conv2d(out_chan,
-                out_chan//4,
-                kernel_size = 1,
-                stride = 1,
-                padding = 0,
-                bias = False)
-        self.conv2 = nn.Conv2d(out_chan//4,
+        ## use conv-bn instead of a 2-layer mlp, so that tensorrt 7.2.3.4 works in fp16 mode
+        self.conv = nn.Conv2d(out_chan,
                 out_chan,
                 kernel_size = 1,
                 stride = 1,
                 padding = 0,
                 bias = False)
-        self.relu = nn.ReLU(inplace=True)
-        self.sigmoid = nn.Sigmoid()
+        self.bn = nn.BatchNorm2d(out_chan)
+        #  self.conv1 = nn.Conv2d(out_chan,
+        #          out_chan//4,
+        #          kernel_size = 1,
+        #          stride = 1,
+        #          padding = 0,
+        #          bias = False)
+        #  self.conv2 = nn.Conv2d(out_chan//4,
+        #          out_chan,
+        #          kernel_size = 1,
+        #          stride = 1,
+        #          padding = 0,
+        #          bias = False)
+        #  self.relu = nn.ReLU(inplace=True)
         self.init_weight()
 
     def forward(self, fsp, fcp):
         fcat = torch.cat([fsp, fcp], dim=1)
         feat = self.convblk(fcat)
         atten = torch.mean(feat, dim=(2, 3), keepdim=True)
-        atten = self.conv1(atten)
-        atten = self.relu(atten)
-        atten = self.conv2(atten)
-        atten = self.sigmoid(atten)
+        atten = self.conv(atten)
+        atten = self.bn(atten)
+        #  atten = self.conv1(atten)
+        #  atten = self.relu(atten)
+        #  atten = self.conv2(atten)
+        atten = atten.sigmoid()
         feat_atten = torch.mul(feat, atten)
         feat_out = feat_atten + feat
         return feat_out
diff --git a/tensorrt/CMakeLists.txt b/tensorrt/CMakeLists.txt
@@ -6,6 +6,7 @@ set(CMAKE_CXX_FLAGS "-std=c++14 -O1")
 
 
 link_directories(/usr/local/cuda/lib64)
+# set(OpenCV_DIR "/opt/opencv/lib/cmake/opencv4")
 
 
 find_package(CUDA REQUIRED)
diff --git a/tensorrt/README.md b/tensorrt/README.md
@@ -62,6 +62,5 @@ Likewise, you do not need to worry about this anymore with 7.2.3.4.
 
 3. The speed(fps) is tested on a single nvidia Tesla T4 gpu with `batchsize=1` and `cropsize=(1024,2048)`. Please note that T4 gpu is almost 2 times slower than 2080ti, you should evaluate the speed considering your own platform and cropsize. Also note that the performance would be affected if your gpu is concurrently working on other tasks. Please make sure no other program is running on your gpu when you test the speed.
 
-4. ~On my platform, after compiling with tensorrt, the model size of bisenetv1 is 33Mb(fp16) and 133Mb(fp32), and the size of bisenetv2 is 29Mb(fp16) and 54Mb(fp32). However, the fps of bisenetv1 is 60(fp16) and 19(fp32), while the fps of bisenetv2 is 50(fp16) and 16(fp32). It is obvious that bisenetv2 has fewer parameters than bisenetv1, but the speed is otherwise. I am not sure whether it is because tensorrt has worse optimization strategy in some ops used in bisenetv2(such as depthwise convolution) or because of the limitation of the gpu on different ops. Please tell me if you have better idea on this.~  
-Not tested.
+4. On my platform, after compiling with tensorrt, the model size of bisenetv1 is 29Mb(fp16) and 128Mb(fp32), and the size of bisenetv2 is 16Mb(fp16) and 42Mb(fp32). However, the fps of bisenetv1 is 68(fp16) and 23(fp32), while the fps of bisenetv2 is 59(fp16) and 21(fp32). Bisenetv2 clearly has fewer parameters than bisenetv1, yet it is not faster. I am not sure whether this is because tensorrt has a worse optimization strategy for some ops used in bisenetv2 (such as depthwise convolution) or because of the limitations of the gpu on different ops. Please tell me if you have a better idea about this.  
 
diff --git a/tools/conver_to_trt.py b/tools/conver_to_trt.py
@@ -15,6 +15,7 @@
 parse = argparse.ArgumentParser()
 parse.add_argument('--config', dest='config', type=str, default='configs/bisenetv2.py',)
 parse.add_argument('--weight-path', type=str, default='./res/model_final.pth',)
+parse.add_argument('--fp16', action='store_true')
 parse.add_argument('--outpath', dest='out_pth', type=str,
         default='model.trt')
 args = parse.parse_args()
@@ -23,12 +24,16 @@
 cfg = set_cfg_from_file(args.config)
 if cfg.use_sync_bn: cfg.use_sync_bn = False
 
-net = model_factory[cfg.model_type](19, output_aux=False).cuda()
-net.load_state_dict(torch.load(args.weight_pth))
+net = model_factory[cfg.model_type](cfg.n_cats, aux_mode='pred')
+net.load_state_dict(torch.load(args.weight_path), strict=False)
+net.cuda()
 net.eval()
 
 
 #  dummy_input = torch.randn(1, 3, *cfg.crop_size)
 dummy_input = torch.randn(1, 3, 1024, 2048).cuda()
 
-trt_model = torch2trt(net, [dummy_input, ])
+trt_model = torch2trt(net, [dummy_input, ], fp16_mode=args.fp16, max_workspace=1 << 30)
+
+with open(args.out_pth, 'wb') as fw:
+    fw.write(trt_model.engine.serialize())
diff --git a/tools/export_libtorch.py b/tools/export_libtorch.py
@@ -0,0 +1,37 @@
+import argparse
+import os.path as osp
+import sys
+sys.path.insert(0, '.')
+
+import torch
+
+from lib.models import model_factory
+from configs import set_cfg_from_file
+
+torch.set_grad_enabled(False)
+
+
+parse = argparse.ArgumentParser()
+parse.add_argument('--config', dest='config', type=str,
+        default='configs/bisenetv2.py',)
+parse.add_argument('--weight-path', dest='weight_pth', type=str,
+        default='model_final.pth')
+parse.add_argument('--outpath', dest='out_pth', type=str,
+        default='model.pt')
+args = parse.parse_args()
+
+
+cfg = set_cfg_from_file(args.config)
+if cfg.use_sync_bn: cfg.use_sync_bn = False
+
+net = model_factory[cfg.model_type](cfg.n_cats, aux_mode='pred')
+net.load_state_dict(torch.load(args.weight_pth), strict=False)
+net.eval()
+
+
+#  dummy_input = torch.randn(1, 3, *cfg.crop_size)
+dummy_input = torch.randn(1, 3, 1024, 2048)
+script_module = torch.jit.trace(net, dummy_input)
+#  script_module.save(args.out_pth, _use_new_zipfile_serialization=False)
+script_module.save(args.out_pth)
+

Original file line number	Diff line number	Diff line change
`@@ -6,6 +6,7 @@ set(CMAKE_CXX_FLAGS "-std=c++14 -O1")`
`6`	`6`
`7`	`7`
`8`	`8`	`link_directories(/usr/local/cuda/lib64)`
	`9`	`+# set(OpenCV_DIR "/opt/opencv/lib/cmake/opencv4")`
`9`	`10`
`10`	`11`
`11`	`12`	`find_package(CUDA REQUIRED)`