Skip to content

Commit a54e37a

Browse files
authored
Dev (#240)
* rebase to master * little modification * augmentation implemented with torch * use cuda11.3+torch11, torchrun * refactor * support other value of ignore label * discard distributed
1 parent f9231b7 commit a54e37a

File tree

14 files changed

+121
-98
lines changed

14 files changed

+121
-98
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ tensorrt/build/*
114114
datasets/coco/train.txt
115115
datasets/coco/val.txt
116116
pretrained/*
117-
dist_train.sh
117+
run.sh
118118
openvino/build/*
119119
openvino/output*
120120
*.onnx

README.md

Lines changed: 5 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,11 @@ Triton Inference Server(TIS) provides a service solution of deployment. You can
4646
My platform is like this:
4747

4848
* ubuntu 18.04
49-
* nvidia Tesla T4 gpu, driver 450.51.05
50-
* cuda 10.2
51-
* cudnn 7
49+
* nvidia Tesla T4 gpu, driver 450.80.02
50+
* cuda 10.2/11.3
51+
* cudnn 8
5252
* miniconda python 3.8.8
53-
* pytorch 1.8.1
53+
* pytorch 1.11.0
5454

5555

5656
## get start
@@ -114,33 +114,7 @@ Then you need to change the field of `im_root` and `train/val_im_anns` in the co
114114

115115
## train
116116

117-
I used the following command to train the models:
118-
119-
```bash
120-
# bisenetv1 cityscapes
121-
export CUDA_VISIBLE_DEVICES=0,1
122-
cfg_file=configs/bisenetv1_city.py
123-
NGPUS=2
124-
python -m torch.distributed.launch --nproc_per_node=$NGPUS tools/train_amp.py --config $cfg_file
125-
126-
# bisenetv2 cityscapes
127-
export CUDA_VISIBLE_DEVICES=0,1
128-
cfg_file=configs/bisenetv2_city.py
129-
NGPUS=2
130-
python -m torch.distributed.launch --nproc_per_node=$NGPUS tools/train_amp.py --config $cfg_file
131-
132-
# bisenetv1 cocostuff
133-
export CUDA_VISIBLE_DEVICES=0,1,2,3
134-
cfg_file=configs/bisenetv1_coco.py
135-
NGPUS=4
136-
python -m torch.distributed.launch --nproc_per_node=$NGPUS tools/train_amp.py --config $cfg_file
137-
138-
# bisenetv2 cocostuff
139-
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
140-
cfg_file=configs/bisenetv2_coco.py
141-
NGPUS=8
142-
python -m torch.distributed.launch --nproc_per_node=$NGPUS tools/train_amp.py --config $cfg_file
143-
```
117+
The commands I used to train the models can be found [here](./dist_train.sh).
144118

145119
Note:
146120
1. though `bisenetv2` has fewer flops, it requires many more training iterations. The training time of `bisenetv1` is shorter.

dist_train.sh

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
2+
'''
3+
NOTE: replace torchrun with torch.distributed.launch if you use an older version of pytorch. I suggest you use the same version as I do, since I have not tested compatibility with older versions after updating.
4+
'''
5+
6+
7+
## bisenetv1 cityscapes
8+
export CUDA_VISIBLE_DEVICES=0,1
9+
cfg_file=configs/bisenetv1_city.py
10+
NGPUS=2
11+
torchrun --nproc_per_node=$NGPUS tools/train_amp.py --config $cfg_file
12+
13+
14+
## bisenetv2 cityscapes
15+
export CUDA_VISIBLE_DEVICES=0,1
16+
cfg_file=configs/bisenetv2_city.py
17+
NGPUS=2
18+
torchrun --nproc_per_node=$NGPUS tools/train_amp.py --config $cfg_file
19+
20+
21+
## bisenetv1 cocostuff
22+
export CUDA_VISIBLE_DEVICES=0,1,2,3
23+
cfg_file=configs/bisenetv1_coco.py
24+
NGPUS=4
25+
torchrun --nproc_per_node=$NGPUS tools/train_amp.py --config $cfg_file
26+
27+
28+
## bisenetv2 cocostuff
29+
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
30+
cfg_file=configs/bisenetv2_coco.py
31+
NGPUS=8
32+
torchrun --nproc_per_node=$NGPUS tools/train_amp.py --config $cfg_file

lib/base_dataset.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def __init__(self, dataroot, annpath, trans_func=None, mode='train'):
2424
self.mode = mode
2525
self.trans_func = trans_func
2626

27+
self.lb_ignore = -100
2728
self.lb_map = None
2829

2930
with open(annpath, 'r') as fr:
@@ -50,7 +51,8 @@ def __getitem__(self, idx):
5051
return img.detach(), label.unsqueeze(0).detach()
5152

5253
def get_image(self, impth, lbpth):
53-
img, label = cv2.imread(impth)[:, :, ::-1], cv2.imread(lbpth, 0)
54+
img = cv2.imread(impth)[:, :, ::-1].copy()
55+
label = cv2.imread(lbpth, 0)
5456
return img, label
5557

5658
def __len__(self):

lib/coco.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,9 @@
4848
class CocoStuff(BaseDataset):
4949

5050
def __init__(self, dataroot, annpath, trans_func=None, mode='train'):
51-
super(CocoStuff, self).__init__(dataroot, annpath, trans_func, mode)
51+
super(CocoStuff, self).__init__(
52+
dataroot, annpath, trans_func, mode)
5253
self.n_cats = 171 # 91 stuff, 91 thing, 11 of thing have no annos
53-
self.lb_ignore = 255
5454

5555
## label mapping, remove non-existing labels
5656
missing = [11, 25, 28, 29, 44, 65, 67, 68, 70, 82, 90]

lib/get_dataloader.py

Lines changed: 4 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -10,48 +10,25 @@
1010

1111

1212

13-
class TransformationTrain(object):
1413

15-
def __init__(self, scales, cropsize):
16-
self.trans_func = T.Compose([
17-
T.RandomResizedCrop(scales, cropsize),
18-
T.RandomHorizontalFlip(),
19-
T.ColorJitter(
20-
brightness=0.4,
21-
contrast=0.4,
22-
saturation=0.4
23-
),
24-
])
2514

26-
def __call__(self, im_lb):
27-
im_lb = self.trans_func(im_lb)
28-
return im_lb
29-
30-
31-
class TransformationVal(object):
32-
33-
def __call__(self, im_lb):
34-
im, lb = im_lb['im'], im_lb['lb']
35-
return dict(im=im, lb=lb)
36-
37-
38-
def get_data_loader(cfg, mode='train', distributed=True):
15+
def get_data_loader(cfg, mode='train'):
3916
if mode == 'train':
40-
trans_func = TransformationTrain(cfg.scales, cfg.cropsize)
17+
trans_func = T.TransformationTrain(cfg.scales, cfg.cropsize)
4118
batchsize = cfg.ims_per_gpu
4219
annpath = cfg.train_im_anns
4320
shuffle = True
4421
drop_last = True
4522
elif mode == 'val':
46-
trans_func = TransformationVal()
23+
trans_func = T.TransformationVal()
4724
batchsize = cfg.eval_ims_per_gpu
4825
annpath = cfg.val_im_anns
4926
shuffle = False
5027
drop_last = False
5128

5229
ds = eval(cfg.dataset)(cfg.im_root, annpath, trans_func=trans_func, mode=mode)
5330

54-
if distributed:
31+
if dist.is_initialized():
5532
assert dist.is_available(), "dist should be initialzed"
5633
if mode == 'train':
5734
assert not cfg.max_iter is None

lib/logger.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ def setup_logger(name, logpth):
1919
try:
2020
logging.basicConfig(level=log_level, format=FORMAT, filename=logfile, force=True)
2121
except Exception:
22+
for hl in logging.root.handlers: logging.root.removeHandler(hl)
2223
logging.basicConfig(level=log_level, format=FORMAT, filename=logfile)
2324
logging.root.addHandler(logging.StreamHandler())
2425

lib/ohem_ce_loss.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,30 +10,30 @@
1010
# import ohem_cpp
1111
# class OhemCELoss(nn.Module):
1212
#
13-
# def __init__(self, thresh, ignore_lb=255):
13+
# def __init__(self, thresh, lb_ignore=255):
1414
# super(OhemCELoss, self).__init__()
1515
# self.score_thresh = thresh
16-
# self.ignore_lb = ignore_lb
17-
# self.criteria = nn.CrossEntropyLoss(ignore_index=ignore_lb, reduction='mean')
16+
# self.lb_ignore = lb_ignore
17+
# self.criteria = nn.CrossEntropyLoss(ignore_index=lb_ignore, reduction='mean')
1818
#
1919
# def forward(self, logits, labels):
20-
# n_min = labels[labels != self.ignore_lb].numel() // 16
20+
# n_min = labels[labels != self.lb_ignore].numel() // 16
2121
# labels = ohem_cpp.score_ohem_label(
22-
# logits, labels, self.ignore_lb, self.score_thresh, n_min).detach()
22+
# logits, labels, self.lb_ignore, self.score_thresh, n_min).detach()
2323
# loss = self.criteria(logits, labels)
2424
# return loss
2525

2626

2727
class OhemCELoss(nn.Module):
2828

29-
def __init__(self, thresh, ignore_lb=255):
29+
def __init__(self, thresh, lb_ignore=255):
3030
super(OhemCELoss, self).__init__()
3131
self.thresh = -torch.log(torch.tensor(thresh, requires_grad=False, dtype=torch.float)).cuda()
32-
self.ignore_lb = ignore_lb
33-
self.criteria = nn.CrossEntropyLoss(ignore_index=ignore_lb, reduction='none')
32+
self.lb_ignore = lb_ignore
33+
self.criteria = nn.CrossEntropyLoss(ignore_index=lb_ignore, reduction='none')
3434

3535
def forward(self, logits, labels):
36-
n_min = labels[labels != self.ignore_lb].numel() // 16
36+
n_min = labels[labels != self.lb_ignore].numel() // 16
3737
loss = self.criteria(logits, labels).view(-1)
3838
loss_hard = loss[loss > self.thresh]
3939
if loss_hard.numel() < n_min:

lib/transform_cv2.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,30 @@ def __call__(self, im_lb):
151151
return im_lb
152152

153153

154+
class TransformationTrain(object):
155+
156+
def __init__(self, scales, cropsize):
157+
self.trans_func = Compose([
158+
RandomResizedCrop(scales, cropsize),
159+
RandomHorizontalFlip(),
160+
ColorJitter(
161+
brightness=0.4,
162+
contrast=0.4,
163+
saturation=0.4
164+
),
165+
])
166+
167+
def __call__(self, im_lb):
168+
im_lb = self.trans_func(im_lb)
169+
return im_lb
170+
171+
172+
class TransformationVal(object):
173+
174+
def __call__(self, im_lb):
175+
im, lb = im_lb['im'], im_lb['lb']
176+
return dict(im=im, lb=lb)
177+
154178

155179

156180
if __name__ == '__main__':

openvino/main.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
#include <inference_engine.hpp>
1010

1111

12-
std::string mdpth("../output_v2/model_v2.xml");
12+
std::string mdpth("../output_v2/model_v2_city.xml");
1313
std::string device("CPU"); // GNA does not support argmax, my cpu does not have an integrated gpu
1414
std::string impth("../../example.png");
1515
std::string savepth("./res.jpg");

0 commit comments

Comments
 (0)