diff --git a/lib/layer_utils/anchor_target_layer.py b/lib/layer_utils/anchor_target_layer.py
index 67cc0cc..8859320 100644
--- a/lib/layer_utils/anchor_target_layer.py
+++ b/lib/layer_utils/anchor_target_layer.py
@@ -9,11 +9,11 @@ from __future__ import print_function
 import os
-from model.config import cfg
+from lib.model.config import cfg
 import numpy as np
 import numpy.random as npr
-from utils.bbox import bbox_overlaps
-from model.bbox_transform import bbox_transform
+from lib.utils.bbox import bbox_overlaps
+from lib.model.bbox_transform import bbox_transform
 import torch
 
 def anchor_target_layer(rpn_cls_score, gt_boxes, im_info, _feat_stride, all_anchors, num_anchors):
@@ -160,4 +160,4 @@ def _compute_targets(ex_rois, gt_rois):
   assert ex_rois.shape[1] == 4
   assert gt_rois.shape[1] == 5
 
-  return bbox_transform(torch.from_numpy(ex_rois), torch.from_numpy(gt_rois[:, :4])).numpy()
\ No newline at end of file
+  return bbox_transform(torch.from_numpy(ex_rois), torch.from_numpy(gt_rois[:, :4])).numpy()
diff --git a/lib/layer_utils/proposal_layer.py b/lib/layer_utils/proposal_layer.py
index 527e2f3..cc592f3 100644
--- a/lib/layer_utils/proposal_layer.py
+++ b/lib/layer_utils/proposal_layer.py
@@ -8,9 +8,9 @@ from __future__ import print_function
 
 import numpy as np
-from model.config import cfg
-from model.bbox_transform import bbox_transform_inv, clip_boxes
-from model.nms_wrapper import nms
+from lib.model.config import cfg
+from lib.model.bbox_transform import bbox_transform_inv, clip_boxes, bbox_transform_inv_batch
+from lib.model.nms_wrapper import nms, nms_batch
 import torch
 
@@ -52,3 +52,45 @@ def proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride,
   blob = torch.cat((batch_inds, proposals), 1)
 
   return blob, scores
+
+
+def proposal_layer_batch(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors, network_device):
+  """A simplified version compared to fast/er RCNN
+     For details please see the technical report
+  """
+  if type(cfg_key) == bytes:
+    cfg_key = cfg_key.decode('utf-8')
+  pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
+  post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
+  nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
+
+  # Get the scores and bounding boxes
+  scores = rpn_cls_prob[:, :, :, num_anchors:]
+  rpn_bbox_pred = rpn_bbox_pred.view((rpn_bbox_pred.size(0), -1, 4))
+  scores = scores.contiguous().view(scores.size(0), -1, 1)
+  proposals = bbox_transform_inv_batch(anchors, rpn_bbox_pred)  # NOTE: flagged by the author as a potential bug
+  proposals = list(map(lambda x: clip_boxes(x, im_info[:2]), proposals))
+
+  blobs, scoress = [], []
+  for i in range(scores.size(0)):
+    # Pick the top region proposals
+    score, order = scores[i].view(-1).sort(descending=True)
+    if pre_nms_topN > 0:
+      order = order[:pre_nms_topN]
+      score = score[:pre_nms_topN].view(-1, 1)
+    proposal = proposals[i][order.data, :]
+
+    # Non-maximal suppression
+    keep = nms_batch(torch.cat((proposal, score), 1).data, nms_thresh, network_device)
+
+    # Pick the top region proposals after NMS
+    if post_nms_topN > 0:
+      keep = keep[:post_nms_topN]
+    proposal = proposal[keep]
+    score = score[keep]
+
+    # The batch index is 0 within each per-image blob
+    batch_inds = proposal.new_zeros(proposal.size(0), 1)
+    blob = torch.cat((batch_inds, proposal), 1)
+    blobs.append(blob)
+    scoress.append(score)
+  return blobs, scoress
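For reference, a minimal standalone sketch of the per-image selection loop that `proposal_layer_batch` performs, with `torchvision.ops.nms` standing in for the repo's `nms_batch` (the torchvision dependency, box counts, and thresholds are assumptions for illustration):

```python
# Sketch of one iteration of the loop in proposal_layer_batch:
# sort by objectness, keep pre-NMS top-N, run NMS, keep post-NMS top-N.
import torch
from torchvision.ops import nms  # stand-in for the repo's nms_batch

def select_proposals(boxes, scores, pre_top_n=6000, post_top_n=300, iou_thresh=0.7):
    score, order = scores.sort(descending=True)
    order, score = order[:pre_top_n], score[:pre_top_n]
    boxes = boxes[order]
    keep = nms(boxes, score, iou_thresh)[:post_top_n]
    # prepend the batch index, which is 0 inside each per-image blob
    batch_inds = boxes.new_zeros(keep.numel(), 1)
    return torch.cat([batch_inds, boxes[keep]], 1), score[keep]

xy = torch.rand(1000, 2) * 100
boxes = torch.cat([xy, xy + torch.rand(1000, 2) * 20], 1)  # valid x1,y1,x2,y2
blob, kept_scores = select_proposals(boxes, torch.rand(1000))
```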
diff --git a/lib/layer_utils/proposal_target_layer.py b/lib/layer_utils/proposal_target_layer.py
index 87f7533..40f82d4 100644
--- a/lib/layer_utils/proposal_target_layer.py
+++ b/lib/layer_utils/proposal_target_layer.py
@@ -10,9 +10,9 @@
 import numpy as np
 import numpy.random as npr
-from model.config import cfg
-from model.bbox_transform import bbox_transform
-from utils.bbox import bbox_overlaps
+from lib.model.config import cfg
+from lib.model.bbox_transform import bbox_transform
+from lib.utils.bbox import bbox_overlaps
 import torch
diff --git a/lib/layer_utils/proposal_top_layer.py b/lib/layer_utils/proposal_top_layer.py
index 97f44ad..84aaeee 100644
--- a/lib/layer_utils/proposal_top_layer.py
+++ b/lib/layer_utils/proposal_top_layer.py
@@ -8,11 +8,56 @@ from __future__ import print_function
 
 import numpy as np
-from model.config import cfg
-from model.bbox_transform import bbox_transform_inv, clip_boxes
+from lib.model.config import cfg
+from lib.model.bbox_transform import bbox_transform_inv, clip_boxes
 import numpy.random as npr
 
 import torch
+import signal
+
+def proposal_top_layer_batch(rpn_cls_probs, rpn_bbox_preds, im_info, _feat_stride, anchors, num_anchors, network_device):
+  """A layer that just selects the top region proposals
+     without using non-maximal suppression.
+     For details please see the technical report
+  """
+  rpn_top_n = cfg.TEST.RPN_TOP_N
+
+  scores = rpn_cls_probs[:, :, :, num_anchors:]
+
+  rpn_bbox_preds = rpn_bbox_preds.view(rpn_bbox_preds.size(0), -1, 4)
+  scores = scores.contiguous().view(rpn_bbox_preds.size(0), -1, 1)
+
+  blobs, scoress = [], []
+  for i in range(scores.size(0)):
+    score = scores[i]
+    length = score.size(0)
+    if length < rpn_top_n:
+      # Random selection, maybe unnecessary and loses good proposals
+      # But such case rarely happens
+      top_inds = torch.from_numpy(npr.choice(length, size=rpn_top_n, replace=True)).long().cuda(network_device)
+    else:
+      top_inds = score.sort(0, descending=True)[1]
+      top_inds = top_inds[:rpn_top_n]
+      top_inds = top_inds.view(rpn_top_n)
+
+    # Do the selection here
+    anchor = anchors[top_inds, :].contiguous()
+    rpn_bbox_pred = rpn_bbox_preds[i][top_inds, :].contiguous()
+    score = score[top_inds].contiguous()
+
+    # Convert anchors into proposals via bbox transformations
+    proposal = bbox_transform_inv(anchor, rpn_bbox_pred)
+
+    # Clip predicted boxes to image
+    proposal = clip_boxes(proposal, im_info[:2])
+
+    # Output rois blob; the batch index is 0 within each per-image blob
+    batch_inds = proposal.new_zeros(proposal.size(0), 1)
+    blob = torch.cat([batch_inds, proposal], 1)
+    blobs.append(blob)
+    scoress.append(score)
+  return blobs, scoress
 
 def proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, im_info, _feat_stride, anchors, num_anchors):
   """A layer that just selects the top region proposals
@@ -43,10 +88,12 @@ def proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, im_info, _feat_stride, ancho
   # Convert anchors into proposals via bbox transformations
   proposals = bbox_transform_inv(anchors, rpn_bbox_pred)
+  #proposals = torch.zeros((5000,4))
 
   # Clip predicted boxes to image
   proposals = clip_boxes(proposals, im_info[:2])
 
+
   # Output rois blob
   # Our RPN implementation only supports a single input image, so all
   # batch inds are 0
diff --git a/lib/layer_utils/roi_align/src/cuda/crop_and_resize_kernel.cu.o b/lib/layer_utils/roi_align/src/cuda/crop_and_resize_kernel.cu.o
new file mode 100644
index 0000000..07f5997
Binary files /dev/null and b/lib/layer_utils/roi_align/src/cuda/crop_and_resize_kernel.cu.o differ
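When an image yields fewer anchors than `cfg.TEST.RPN_TOP_N`, the batch variant above tops up the index set by re-sampling with replacement. A standalone sketch of that branch (CPU tensors and sizes are illustrative assumptions; the real code moves indices to the network device):

```python
# Sketch of the index-selection branch in proposal_top_layer_batch: too few
# anchors -> sample indices with replacement; otherwise sort and truncate.
import torch
import numpy.random as npr

def top_inds_for(scores, rpn_top_n):
    length = scores.numel()
    if length < rpn_top_n:
        # random top-up, mirroring the npr.choice(..., replace=True) path
        return torch.from_numpy(npr.choice(length, size=rpn_top_n, replace=True)).long()
    return scores.sort(0, descending=True)[1][:rpn_top_n]

print(top_inds_for(torch.rand(10), 16).shape)    # torch.Size([16]) via re-sampling
print(top_inds_for(torch.rand(5000), 16).shape)  # torch.Size([16]) via sort
```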
diff --git a/lib/layer_utils/roi_pooling/_ext/roi_pooling/__init__.py b/lib/layer_utils/roi_pooling/_ext/roi_pooling/__init__.py
index 9da23b1..d900ec5 100644
--- a/lib/layer_utils/roi_pooling/_ext/roi_pooling/__init__.py
+++ b/lib/layer_utils/roi_pooling/_ext/roi_pooling/__init__.py
@@ -6,7 +6,10 @@ def _import_symbols(locals):
   for symbol in dir(_lib):
     fn = getattr(_lib, symbol)
-    locals[symbol] = _wrap_function(fn, _ffi)
+    if callable(fn):
+      locals[symbol] = _wrap_function(fn, _ffi)
+    else:
+      locals[symbol] = fn
     __all__.append(symbol)
 
 _import_symbols(locals())
diff --git a/lib/layer_utils/roi_pooling/src/cuda/roi_pooling_kernel.cu.o b/lib/layer_utils/roi_pooling/src/cuda/roi_pooling_kernel.cu.o
new file mode 100644
index 0000000..799cb6f
Binary files /dev/null and b/lib/layer_utils/roi_pooling/src/cuda/roi_pooling_kernel.cu.o differ
diff --git a/lib/layer_utils/snippets.py b/lib/layer_utils/snippets.py
index 14bf77f..ee5e4e2 100644
--- a/lib/layer_utils/snippets.py
+++ b/lib/layer_utils/snippets.py
@@ -8,7 +8,7 @@ from __future__ import print_function
 
 import numpy as np
-from layer_utils.generate_anchors import generate_anchors
+from lib.layer_utils.generate_anchors import generate_anchors
 
 def generate_anchors_pre(height, width, feat_stride, anchor_scales=(8,16,32), anchor_ratios=(0.5,1,2)):
   """ A wrapper function to generate anchors given different scales
diff --git a/lib/make.sh b/lib/make.sh
index 163bf11..5811347 100644
--- a/lib/make.sh
+++ b/lib/make.sh
@@ -11,7 +11,7 @@
 cd layer_utils/roi_pooling/src/cuda
 echo "Compiling roi_pooling kernels by nvcc..."
 nvcc -c -o roi_pooling_kernel.cu.o roi_pooling_kernel.cu -x cu -Xcompiler -fPIC $CUDA_ARCH
 cd ../../
-python build.py
+python3 build.py
 cd ../../
 
 # Build RoIAlign
@@ -19,7 +19,7 @@
 cd layer_utils/roi_align/src/cuda
 echo 'Compiling crop_and_resize kernels by nvcc...'
 nvcc -c -o crop_and_resize_kernel.cu.o crop_and_resize_kernel.cu -x cu -Xcompiler -fPIC $CUDA_ARCH
 cd ../../
-python build.py
+python3 build.py
 cd ../../
 
 # Build NMS
@@ -27,5 +27,5 @@
 cd nms/src/cuda
 echo "Compiling nms kernels by nvcc..."
 nvcc -c -o nms_kernel.cu.o nms_kernel.cu -x cu -Xcompiler -fPIC $CUDA_ARCH
 cd ../../
-python build.py
+python3 build.py
 cd ../
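The `callable(fn)` guard added to `_import_symbols` matters because the cffi-generated `_lib` object can expose plain data attributes alongside functions, and passing those through `_wrap_function` raises. A self-contained mock of the guarded loop (the `_lib` contents and the no-op wrapper are hypothetical stand-ins):

```python
# Mock of the guarded _import_symbols loop: only callables are wrapped,
# plain attributes are re-exported as-is. The FakeLib contents are made up.
from types import SimpleNamespace

_lib = SimpleNamespace(roi_pool_forward=lambda *a: 0, SOME_CONSTANT=42)

def _wrap_function(fn):
    return fn  # stand-in for torch.utils.ffi._wrap_function

__all__ = []
def _import_symbols(namespace):
    for symbol in vars(_lib):
        fn = getattr(_lib, symbol)
        namespace[symbol] = _wrap_function(fn) if callable(fn) else fn
        __all__.append(symbol)

_import_symbols(globals())
print(SOME_CONSTANT)  # 42, exported without wrapping
```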
diff --git a/lib/model/bbox_transform.py b/lib/model/bbox_transform.py
index 66916a8..8b80eae 100644
--- a/lib/model/bbox_transform.py
+++ b/lib/model/bbox_transform.py
@@ -32,6 +32,34 @@ def bbox_transform(ex_rois, gt_rois):
 
   return targets
 
+def bbox_transform_inv_batch(boxes, deltas):
+  # Inputs should both be tensors or both be Variables, on the same device
+  if len(boxes) == 0:
+    return deltas.detach() * 0
+
+  widths = boxes[:, 2] - boxes[:, 0] + 1.0
+  heights = boxes[:, 3] - boxes[:, 1] + 1.0
+  ctr_x = boxes[:, 0] + 0.5 * widths
+  ctr_y = boxes[:, 1] + 0.5 * heights
+
+  dx = deltas[:, :, 0::4]
+  dy = deltas[:, :, 1::4]
+  dw = deltas[:, :, 2::4]
+  dh = deltas[:, :, 3::4]
+
+  pred_ctr_x = dx * widths.repeat(deltas.size(0), 1).unsqueeze(-1) + ctr_x.repeat(deltas.size(0), 1).unsqueeze(-1)
+  pred_ctr_y = dy * heights.repeat(deltas.size(0), 1).unsqueeze(-1) + ctr_y.repeat(deltas.size(0), 1).unsqueeze(-1)
+  pred_w = torch.exp(dw) * widths.repeat(deltas.size(0), 1).unsqueeze(-1)
+  pred_h = torch.exp(dh) * heights.repeat(deltas.size(0), 1).unsqueeze(-1)
+
+  pred_boxes = torch.cat(\
+    [_.unsqueeze(2) for _ in [pred_ctr_x - 0.5 * pred_w,\
+                              pred_ctr_y - 0.5 * pred_h,\
+                              pred_ctr_x + 0.5 * pred_w,\
+                              pred_ctr_y + 0.5 * pred_h]], 2).view(deltas.size(0), len(boxes), -1)
+
+  return pred_boxes
+
 def bbox_transform_inv(boxes, deltas):
   # Input should be both tensor or both Variable and on the same device
   if len(boxes) == 0:
@@ -46,7 +74,7 @@ def bbox_transform_inv(boxes, deltas):
   dy = deltas[:, 1::4]
   dw = deltas[:, 2::4]
   dh = deltas[:, 3::4]
-
+
   pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1)
   pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1)
   pred_w = torch.exp(dw) * widths.unsqueeze(1)
@@ -61,6 +89,7 @@ def bbox_transform_inv(boxes, deltas):
 
   return pred_boxes
 
+
 def clip_boxes(boxes, im_shape):
   """
   Clip boxes to image boundaries.
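A quick self-check that the batched transform matches the single-image one in the RPN case it is used for, i.e. deltas of shape (B, N, 4). It assumes the repo root is on PYTHONPATH; note that for more than one 4-vector per anchor the two flattened layouts differ, which may be what the author's bug note in `proposal_layer_batch` refers to.

```python
# Self-check: bbox_transform_inv_batch on a batch of RPN deltas should
# match bbox_transform_inv applied image-by-image. Shapes are illustrative.
import torch
from lib.model.bbox_transform import bbox_transform_inv, bbox_transform_inv_batch

anchors = torch.tensor([[0., 0., 15., 15.], [8., 8., 31., 31.]])
deltas = torch.randn(3, 2, 4)  # batch of 3 images, 2 anchors, one 4-vector each

batched = bbox_transform_inv_batch(anchors, deltas)
for b in range(deltas.size(0)):
    per_image = bbox_transform_inv(anchors, deltas[b])
    assert torch.allclose(batched[b], per_image, atol=1e-5)
```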
diff --git a/lib/model/config.py b/lib/model/config.py
index 4c16a49..56b2486 100644
--- a/lib/model/config.py
+++ b/lib/model/config.py
@@ -39,7 +39,7 @@
 # Whether to double the learning rate for bias
 __C.TRAIN.DOUBLE_BIAS = True
 
-# Whether to initialize the weights with truncated normal distribution 
+# Whether to initialize the weights with truncated normal distribution
 __C.TRAIN.TRUNCATED = False
 
 # Whether to have weight decay on bias as well
@@ -50,7 +50,7 @@
 # Whether to use aspect-ratio grouping of training images, introduced merely for saving
 # GPU memory
-__C.TRAIN.ASPECT_GROUPING = False
+__C.TRAIN.ASPECT_GROUPING = True
 
 # The number of snapshots kept, older ones are deleted to save space
 __C.TRAIN.SNAPSHOT_KEPT = 3
@@ -139,7 +139,7 @@
 __C.TRAIN.RPN_BATCHSIZE = 256
 
 # NMS threshold used on RPN proposals
-__C.TRAIN.RPN_NMS_THRESH = 0.7
+__C.TRAIN.RPN_NMS_THRESH = 0.8
 
 # Number of top scoring boxes to keep before apply NMS to RPN proposals
 __C.TRAIN.RPN_PRE_NMS_TOP_N = 12000
@@ -155,7 +155,7 @@
 # Set to -1.0 to use uniform example weighting
 __C.TRAIN.RPN_POSITIVE_WEIGHT = -1.0
 
-# Whether to use all ground truth bounding boxes for training, 
+# Whether to use all ground truth bounding boxes for training,
 # For COCO, setting USE_ALL_GT to False will exclude boxes that are flagged as ''iscrowd''
 __C.TRAIN.USE_ALL_GT = True
 
@@ -213,8 +213,8 @@
 __C.RESNET = edict()
 
-# Option to set if max-pooling is appended after crop_and_resize. 
-# if true, the region will be resized to a square of 2xPOOLING_SIZE, 
+# Option to set if max-pooling is appended after crop_and_resize.
+# if true, the region will be resized to a square of 2xPOOLING_SIZE,
 # then 2x2 max-pooling is applied; otherwise the region will be directly
 # resized to a square of POOLING_SIZE
 __C.RESNET.MAX_POOL = False
@@ -267,7 +267,7 @@
 __C.EXP_DIR = 'default'
 
 # Use GPU implementation of non-maximum suppression
-__C.USE_GPU_NMS = True
+__C.USE_GPU_NMS = False
 
 # Default pooling mode
 __C.POOLING_MODE = 'crop'
diff --git a/lib/model/nms_wrapper.py b/lib/model/nms_wrapper.py
index 3e45e6a..2015548 100644
--- a/lib/model/nms_wrapper.py
+++ b/lib/model/nms_wrapper.py
@@ -8,9 +8,15 @@
 from __future__ import division
 from __future__ import print_function
 
-from nms.pth_nms import pth_nms
+from lib.nms.pth_nms import pth_nms
+from lib.nms.pth_nms import pth_nms_batch
 
+def nms_batch(dets, thresh, network_device):
+  """Dispatch to either CPU or GPU NMS implementations.
+  Accept dets as tensor"""
+  return pth_nms_batch(dets, thresh, network_device)
+
 def nms(dets, thresh):
   """Dispatch to either CPU or GPU NMS implementations.
   Accept dets as tensor"""
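Since `cfg` is an EasyDict, the defaults flipped above can equally be overridden per run instead of baking them into `config.py`; a minimal sketch, assuming the repo root is on PYTHONPATH:

```python
# Sketch: overriding at run time the config flags this patch changes.
from lib.model.config import cfg

cfg.USE_GPU_NMS = False           # fall back to the CPU NMS kernels
cfg.TRAIN.RPN_NMS_THRESH = 0.8    # looser RPN NMS keeps more proposals
cfg.TRAIN.ASPECT_GROUPING = True  # group training images by aspect ratio
```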
diff --git a/lib/model/test.py b/lib/model/test.py
index 19678ab..7cdd473 100644
--- a/lib/model/test.py
+++ b/lib/model/test.py
@@ -16,15 +16,17 @@
 import os
 import math
 
-from utils.timer import Timer
-from model.nms_wrapper import nms
-from utils.blob import im_list_to_blob
+from lib.utils.timer import Timer
+from lib.model.nms_wrapper import nms
+from lib.utils.blob import im_list_to_blob
 
-from model.config import cfg, get_output_dir
-from model.bbox_transform import clip_boxes, bbox_transform_inv
+from lib.model.config import cfg, get_output_dir
+from lib.model.bbox_transform import clip_boxes, bbox_transform_inv
 
 import torch
 
+
+
 def _get_image_blob(im):
   """Converts an image into a network input.
   Arguments:
@@ -59,6 +61,44 @@
 
   return blob, np.array(im_scale_factors)
 
+
+def _get_image_blob_batch(ims):
+  """Converts a batch of images into a network input.
+  Arguments:
+    ims (ndarray): a batch of color images in BGR order
+  Returns:
+    blob (ndarray): a data blob holding an image pyramid
+    im_scale_factors (list): list of image scales (relative to the inputs)
+      used in the image pyramid
+  """
+  im_origs = ims.astype(np.float32, copy=True)
+  im_origs -= cfg.PIXEL_MEANS
+
+  im_shape = im_origs.shape
+  im_size_min = np.min(im_shape[1:3])
+  im_size_max = np.max(im_shape[1:3])
+
+  processed_ims = []
+  im_scale_factors = []
+
+  for target_size in cfg.TEST.SCALES:
+    im_scale = float(target_size) / float(im_size_min)
+    # Prevent the biggest axis from being more than MAX_SIZE
+    if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE:
+      im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max)
+    ims = np.array(list(map(lambda im_orig: cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale,
+                    interpolation=cv2.INTER_LINEAR), im_origs)))
+    im_scale_factors.append(im_scale)
+    processed_ims.append(ims)
+
+  processed_ims = np.concatenate(processed_ims)
+  # Create a blob to hold the input images
+  blob = im_list_to_blob(processed_ims)
+
+  return blob, np.array(im_scale_factors)
+
+
 def _get_blobs(im):
   """Convert an image and RoIs within that image into network inputs."""
   blobs = {}
@@ -66,6 +106,14 @@
 
   return blobs, im_scale_factors
 
+def _get_blobs_batch(ims):
+  """Convert a batch of images and RoIs within them into network inputs."""
+  blobs = {}
+  blobs['data'], im_scale_factors = _get_image_blob_batch(ims)
+
+  return blobs, im_scale_factors
+
+
 def _clip_boxes(boxes, im_shape):
   """Clip boxes to image boundaries."""
   # x1 >= 0
@@ -93,7 +141,7 @@
   blobs['im_info'] = np.array([im_blob.shape[1], im_blob.shape[2], im_scales[0]], dtype=np.float32)
 
   _, scores, bbox_pred, rois = net.test_image(blobs['data'], blobs['im_info'])
-
+
   boxes = rois[:, 1:5] / im_scales[0]
   scores = np.reshape(scores, [scores.shape[0], -1])
   bbox_pred = np.reshape(bbox_pred, [bbox_pred.shape[0], -1])
@@ -108,6 +156,34 @@
 
   return scores, pred_boxes
 
+
+def im_detect_batch(net, ims):
+  blobs, im_scales = _get_blobs_batch(ims)
+
+  im_blob = blobs['data']
+  blobs['im_info'] = np.array([im_blob.shape[1], im_blob.shape[2], im_scales[0]], dtype=np.float32)
+
+  _, scores, bbox_preds, rois = net.test_images(blobs['data'], blobs['im_info'])
+
+  scoress, pred_boxess = [], []
+  for i in range(len(ims)):
+    boxes = rois[i][:, 1:5] / im_scales[0]
+    score = np.reshape(scores[i], [scores[i].shape[0], -1])
+    bbox_pred = np.reshape(bbox_preds[i], [bbox_preds[i].shape[0], -1])
+    if cfg.TEST.BBOX_REG:
+      # Apply bounding-box regression deltas
+      box_deltas = bbox_pred
+      pred_boxes = bbox_transform_inv(torch.from_numpy(boxes), torch.from_numpy(box_deltas)).numpy()
+      pred_boxes = _clip_boxes(pred_boxes, ims[i].shape)
+    else:
+      # Simply repeat the boxes, once for each class
+      pred_boxes = np.tile(boxes, (1, score.shape[1]))
+    pred_boxess.append(pred_boxes)
+    scoress.append(score)
+
+  return scoress, pred_boxess
+
+
 def apply_nms(all_boxes, thresh):
   """Apply non-maximum suppression to all predicted boxes output by the
   test_net method.
@@ -192,4 +268,3 @@ def test_net(net, imdb, weights_filename, max_per_image=100, thresh=0.):
 
   print('Evaluating detections')
   imdb.evaluate_detections(all_boxes, output_dir)
-
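A standalone sketch of the per-image post-processing inside `im_detect_batch` for the no-regression branch: class scores are flattened and each RoI is tiled once per class. Array sizes are illustrative assumptions.

```python
# Sketch of one loop iteration of im_detect_batch with cfg.TEST.BBOX_REG off:
# undo the test-time rescaling, flatten scores, tile boxes per class.
import numpy as np

num_rois, num_classes = 300, 21
rois = np.random.rand(num_rois, 5).astype(np.float32)    # [batch_ind, x1, y1, x2, y2]
scores = np.random.rand(num_rois, num_classes).astype(np.float32)

im_scale = 1.5
boxes = rois[:, 1:5] / im_scale
score = np.reshape(scores, [scores.shape[0], -1])
pred_boxes = np.tile(boxes, (1, score.shape[1]))         # (num_rois, num_classes * 4)
assert pred_boxes.shape == (num_rois, num_classes * 4)
```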
diff --git a/lib/nets/mobilenet_v1.py b/lib/nets/mobilenet_v1.py
index ff78726..dbb8e36 100644
--- a/lib/nets/mobilenet_v1.py
+++ b/lib/nets/mobilenet_v1.py
@@ -15,8 +15,8 @@
 import numpy as np
 from collections import namedtuple, OrderedDict
 
-from nets.network import Network
-from model.config import cfg
+from lib.nets.network import Network
+from lib.model.config import cfg
 
 # The following is adapted from:
 # https://github.com/tensorflow/models/blob/master/slim/nets/mobilenet_v1.py
@@ -193,7 +193,7 @@ def normal_init(m, mean, stddev, truncated=False):
       else:
         m.weight.data.normal_(mean, stddev)
       if m.bias is not None: m.bias.data.zero_()
-
+
     self.mobilenet.apply(lambda m: normal_init(m, 0, 0.09, True))
     normal_init(self.rpn_net, 0, 0.01, cfg.TRAIN.TRUNCATED)
     normal_init(self.rpn_cls_score_net, 0, 0.01, cfg.TRAIN.TRUNCATED)
@@ -215,12 +215,12 @@ def _head_to_tail(self, pool5):
   def _init_head_tail(self):
     self.mobilenet = mobilenet_v1_base()
 
-    # Fix blocks 
+    # Fix blocks
     assert (0 <= cfg.MOBILENET.FIXED_LAYERS <= 12)
     for m in list(self.mobilenet.children())[:cfg.MOBILENET.FIXED_LAYERS]:
       for p in m.parameters():
        p.requires_grad = False
-
+
     def set_bn_fix(m):
      classname = m.__class__.__name__
      if classname.find('BatchNorm') != -1:
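`normal_init(m, 0, 0.09, True)` above requests the truncated branch, which this hunk's context elides. A sketch of the truncated-normal initialization as it is commonly implemented in this codebase's lineage (the `fmod` trick is an assumption about the elided branch, not shown by the diff):

```python
# Sketch of normal_init with truncated=True: clamp magnitudes to 2 sigma
# by sampling a standard normal and folding it with fmod before scaling.
import torch
import torch.nn as nn

def normal_init(m, mean, stddev, truncated=False):
    if truncated:
        m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean)  # |z| <= 2 sigma
    else:
        m.weight.data.normal_(mean, stddev)
    if m.bias is not None:
        m.bias.data.zero_()

conv = nn.Conv2d(3, 8, 3)
normal_init(conv, 0, 0.09, truncated=True)
assert conv.weight.data.abs().max() <= 2 * 0.09 + 1e-6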
diff --git a/lib/nets/network.py b/lib/nets/network.py
index 989c47e..c97634e 100644
--- a/lib/nets/network.py
+++ b/lib/nets/network.py
@@ -14,20 +14,22 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.autograd import Variable
+import signal
+import lib.utils.timer
 
-import utils.timer
+from lib.layer_utils.snippets import generate_anchors_pre
+from lib.layer_utils.proposal_layer import proposal_layer
+from lib.layer_utils.proposal_top_layer import proposal_top_layer
+from lib.layer_utils.proposal_layer import proposal_layer_batch
+from lib.layer_utils.proposal_top_layer import proposal_top_layer_batch
+from lib.layer_utils.anchor_target_layer import anchor_target_layer
+from lib.layer_utils.proposal_target_layer import proposal_target_layer
+from lib.utils.visualization import draw_bounding_boxes
 
-from layer_utils.snippets import generate_anchors_pre
-from layer_utils.proposal_layer import proposal_layer
-from layer_utils.proposal_top_layer import proposal_top_layer
-from layer_utils.anchor_target_layer import anchor_target_layer
-from layer_utils.proposal_target_layer import proposal_target_layer
-from utils.visualization import draw_bounding_boxes
+from lib.layer_utils.roi_pooling.roi_pool import RoIPoolFunction
+from lib.layer_utils.roi_align.crop_and_resize import CropAndResizeFunction
 
-from layer_utils.roi_pooling.roi_pool import RoIPoolFunction
-from layer_utils.roi_align.crop_and_resize import CropAndResizeFunction
-
-from model.config import cfg
+from lib.model.config import cfg
 
 import tensorboardX as tb
 
@@ -47,7 +49,8 @@
     self._event_summaries = {}
     self._image_gt_summaries = {}
    self._variables_to_fix = {}
-    self._device = 'cuda'
+    self._cuda_device = 0
+    self._device = "cuda"
 
   def _add_gt_image(self):
     # add back mean
@@ -88,6 +91,21 @@
 
     return rois, rpn_scores
 
+  def _proposal_top_layer_batch(self, rpn_cls_prob, rpn_bbox_pred):
+    rois, rpn_scores = proposal_top_layer_batch(\
+                                    rpn_cls_prob, rpn_bbox_pred, self._im_info,
+                                    self._feat_stride, self._anchors, self._num_anchors, self._cuda_device)
+    return rois, rpn_scores
+
+  def _proposal_layer_batch(self, rpn_cls_prob, rpn_bbox_pred):
+    rois, rpn_scores = proposal_layer_batch(\
+                                    rpn_cls_prob, rpn_bbox_pred, self._im_info, self._mode,
+                                    self._feat_stride, self._anchors, self._num_anchors, self._cuda_device)
+
+    return rois, rpn_scores
+
+
+
   def _roi_pool_layer(self, bottom, rois):
     return RoIPoolFunction(cfg.POOLING_SIZE, cfg.POOLING_SIZE, 1. / 16.)(bottom, rois)
 
@@ -115,7 +133,7 @@
     width = bottom.size(3)
 
     pre_pool_size = cfg.POOLING_SIZE * 2 if max_pool else cfg.POOLING_SIZE
-    crops = CropAndResizeFunction(pre_pool_size, pre_pool_size)(bottom, 
+    crops = CropAndResizeFunction(pre_pool_size, pre_pool_size)(bottom,
      torch.cat([y1/(height-1),x1/(width-1),y2/(height-1),x2/(width-1)], 1), rois[:, 0].int())
     if max_pool:
       crops = F.max_pool2d(crops, 2, 2)
@@ -126,10 +144,10 @@
       anchor_target_layer(
       rpn_cls_score.data, self._gt_boxes.data.cpu().numpy(), self._im_info, self._feat_stride, self._anchors.data.cpu().numpy(), self._num_anchors)
 
-    rpn_labels = torch.from_numpy(rpn_labels).float().to(self._device) #.set_shape([1, 1, None, None])
-    rpn_bbox_targets = torch.from_numpy(rpn_bbox_targets).float().to(self._device)#.set_shape([1, None, None, self._num_anchors * 4])
-    rpn_bbox_inside_weights = torch.from_numpy(rpn_bbox_inside_weights).float().to(self._device)#.set_shape([1, None, None, self._num_anchors * 4])
-    rpn_bbox_outside_weights = torch.from_numpy(rpn_bbox_outside_weights).float().to(self._device)#.set_shape([1, None, None, self._num_anchors * 4])
+    rpn_labels = torch.from_numpy(rpn_labels).float().cuda(self._cuda_device) #.set_shape([1, 1, None, None])
+    rpn_bbox_targets = torch.from_numpy(rpn_bbox_targets).float().cuda(self._cuda_device)#.set_shape([1, None, None, self._num_anchors * 4])
+    rpn_bbox_inside_weights = torch.from_numpy(rpn_bbox_inside_weights).float().cuda(self._cuda_device)#.set_shape([1, None, None, self._num_anchors * 4])
+    rpn_bbox_outside_weights = torch.from_numpy(rpn_bbox_outside_weights).float().cuda(self._cuda_device)#.set_shape([1, None, None, self._num_anchors * 4])
 
     rpn_labels = rpn_labels.long()
     self._anchor_targets['rpn_labels'] = rpn_labels
@@ -165,7 +183,7 @@
     anchors, anchor_length = generate_anchors_pre(\
                                           height, width,
                                           self._feat_stride, self._anchor_scales, self._anchor_ratios)
-    self._anchors = torch.from_numpy(anchors).to(self._device)
+    self._anchors = torch.from_numpy(anchors).cuda(self._cuda_device)
     self._anchor_length = anchor_length
 
   def _smooth_l1_loss(self, bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights, sigma=1.0, dim=[1]):
@@ -234,7 +252,7 @@
     # change it so that the score has 2 as its channel size
     rpn_cls_score_reshape = rpn_cls_score.view(1, 2, -1, rpn_cls_score.size()[-1]) # batch * 2 * (num_anchors*h) * w
     rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape, dim=1)
-
+
     # Move channel to the last dimenstion, to fit the input of python functions
     rpn_cls_prob = rpn_cls_prob_reshape.view_as(rpn_cls_score).permute(0, 2, 3, 1) # batch * h * w * (num_anchors * 2)
     rpn_cls_score = rpn_cls_score.permute(0, 2, 3, 1) # batch * h * w * (num_anchors * 2)
@@ -265,6 +283,47 @@
 
     return rois
 
+  def _region_proposal_batch(self, net_conv):
+    rpn = F.relu(self.rpn_net(net_conv))
+    self._act_summaries['rpn'] = rpn
+
+    rpn_cls_score = self.rpn_cls_score_net(rpn) # batch * (num_anchors * 2) * h * w
+
+    # change it so that the score has 2 as its channel size
+    rpn_cls_score_reshape = rpn_cls_score.view(rpn_cls_score.size(0), 2, -1, rpn_cls_score.size()[-1]) # batch * 2 * (num_anchors*h) * w
+    rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape, dim=1)
+
+    # Move channel to the last dimension, to fit the input of python functions
+    rpn_cls_prob = rpn_cls_prob_reshape.view_as(rpn_cls_score).permute(0, 2, 3, 1) # batch * h * w * (num_anchors * 2)
+    rpn_cls_score = rpn_cls_score.permute(0, 2, 3, 1) # batch * h * w * (num_anchors * 2)
+    rpn_cls_score_reshape = rpn_cls_score_reshape.permute(0, 2, 3, 1).contiguous() # batch * (num_anchors*h) * w * 2
+    rpn_cls_pred = torch.max(rpn_cls_score_reshape.view(rpn_cls_score.size(0), -1, 2), 2)[1]
+
+    rpn_bbox_pred = self.rpn_bbox_pred_net(rpn)
+    rpn_bbox_pred = rpn_bbox_pred.permute(0, 2, 3, 1).contiguous() # batch * h * w * (num_anchors*4)
+
+    if self._mode == 'TRAIN':
+      rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred) # rois, roi_scores are variables
+      rpn_labels = self._anchor_target_layer(rpn_cls_score)
+      rois, _ = self._proposal_target_layer(rois, roi_scores)
+    else:
+      if cfg.TEST.MODE == 'nms':
+        rois, _ = self._proposal_layer_batch(rpn_cls_prob, rpn_bbox_pred)
+      elif cfg.TEST.MODE == 'top':
+        rois, _ = self._proposal_top_layer_batch(rpn_cls_prob, rpn_bbox_pred)
+      else:
+        raise NotImplementedError
+
+    self._predictions["rpn_cls_score"] = rpn_cls_score
+    self._predictions["rpn_cls_score_reshape"] = rpn_cls_score_reshape
+    self._predictions["rpn_cls_prob"] = rpn_cls_prob
+    self._predictions["rpn_cls_pred"] = rpn_cls_pred
+    self._predictions["rpn_bbox_pred"] = rpn_bbox_pred
+    self._predictions["rois"] = rois
+
+    return rois
+
+
   def _region_classification(self, fc7):
     cls_score = self.cls_score_net(fc7)
     cls_pred = torch.max(cls_score, 1)[1]
@@ -278,6 +337,19 @@
 
     return cls_prob, bbox_pred
 
+  def _region_classification_batch(self, fc7):
+    cls_score = self.cls_score_net(fc7)
+    cls_pred = torch.max(cls_score, 1)[1]
+    cls_prob = F.softmax(cls_score, dim=1)
+    bbox_pred = self.bbox_pred_net(fc7)
+
+    self._predictions["cls_score"].append(cls_score)
+    self._predictions["cls_pred"].append(cls_pred)
+    self._predictions["cls_prob"].append(cls_prob)
+    self._predictions["bbox_pred"].append(bbox_pred)
+
+    return cls_prob, bbox_pred
+
   def _image_to_head(self):
     raise NotImplementedError
 
@@ -309,7 +381,7 @@
     self.rpn_net = nn.Conv2d(self._net_conv_channels, cfg.RPN_CHANNELS, [3, 3], padding=1)
 
     self.rpn_cls_score_net = nn.Conv2d(cfg.RPN_CHANNELS, self._num_anchors * 2, [1, 1])
-
+
     self.rpn_bbox_pred_net = nn.Conv2d(cfg.RPN_CHANNELS, self._num_anchors * 4, [1, 1])
 
     self.cls_score_net = nn.Linear(self._fc7_channels, self._num_classes)
@@ -343,7 +415,7 @@
         summaries.append(self._add_train_summary(k, var))
 
     self._image_gt_summaries = {}
-
+
     return summaries
 
   def _predict(self):
@@ -353,7 +425,7 @@
 
     # build the anchors for the image
     self._anchor_component(net_conv.size(2), net_conv.size(3))
-
+
     rois = self._region_proposal(net_conv)
     if cfg.POOLING_MODE == 'crop':
       pool5 = self._crop_pool_layer(net_conv, rois)
@@ -365,12 +437,64 @@
     fc7 = self._head_to_tail(pool5)
 
     cls_prob, bbox_pred = self._region_classification(fc7)
-
+
     for k in self._predictions.keys():
       self._score_summaries[k] = self._predictions[k]
 
     return rois, cls_prob, bbox_pred
 
+
+  def _predict_batch(self):
+    # This is just _build_network in tf-faster-rcnn
+    torch.backends.cudnn.benchmark = False
+    net_conv = self._image_to_head()
+
+    # build the anchors for the image
+    self._anchor_component(net_conv.size(2), net_conv.size(3))
+    # net_conv.size : (bs, 1024, h, w)
+    rois = self._region_proposal_batch(net_conv)
+    if cfg.POOLING_MODE == 'crop':
+      pool5 = list(map(lambda x: self._crop_pool_layer(net_conv, x), rois))
+    else:
+      pool5 = self._roi_pool_layer(net_conv, rois)
+
+    if self._mode == 'TRAIN':
+      torch.backends.cudnn.benchmark = True # benchmark because the input sizes are now fixed
+    fc7 = list(map(self._head_to_tail, pool5))
+
+    self._predictions["cls_score"] = []
+    self._predictions["cls_pred"] = []
+    self._predictions["cls_prob"] = []
+    self._predictions["bbox_pred"] = []
+    cls_probs_bbox_preds = list(map(self._region_classification_batch, fc7))
+    cls_prob, bbox_pred = [x[0] for x in cls_probs_bbox_preds], [x[1] for x in cls_probs_bbox_preds]
+
+    for k in self._predictions.keys():
+      self._score_summaries[k] = self._predictions[k]
+
+    return rois, cls_prob, bbox_pred
+
+  def forward_batch(self, image, im_info, gt_boxes=None, mode='TRAIN'):
+    self._image_gt_summaries['image'] = image
+    self._image_gt_summaries['gt_boxes'] = gt_boxes
+    self._image_gt_summaries['im_info'] = im_info
+
+    self._image = torch.from_numpy(image.transpose([0,3,1,2])).cuda(self._cuda_device)
+    self._im_info = im_info # No need to change; actually it can be a list
+    self._gt_boxes = torch.from_numpy(gt_boxes).cuda(self._cuda_device) if gt_boxes is not None else None
+
+    self._mode = mode
+
+    rois, cls_prob, bbox_preds = self._predict_batch()
+
+    if mode == 'TEST':
+      stds = list(map(lambda x: x.data.new(cfg.TRAIN.BBOX_NORMALIZE_STDS).repeat(self._num_classes).unsqueeze(0).expand_as(x), bbox_preds))
+      means = list(map(lambda x: x.data.new(cfg.TRAIN.BBOX_NORMALIZE_MEANS).repeat(self._num_classes).unsqueeze(0).expand_as(x), bbox_preds))
+      self._predictions["bbox_pred"] = list(map(lambda i: bbox_preds[i].mul(stds[i]).add(means[i]), range(len(stds))))
+    else:
+      self._add_losses() # compute losses
+
   def forward(self, image, im_info, gt_boxes=None, mode='TRAIN'):
     self._image_gt_summaries['image'] = image
     self._image_gt_summaries['gt_boxes'] = gt_boxes
@@ -391,6 +515,8 @@
     else:
       self._add_losses() # compute losses
 
+
+
   def init_weights(self):
     def normal_init(m, mean, stddev, truncated=False):
       """
@@ -402,7 +528,7 @@
       else:
         m.weight.data.normal_(mean, stddev)
       m.bias.data.zero_()
-
+
     normal_init(self.rpn_net, 0, 0.01, cfg.TRAIN.TRUNCATED)
     normal_init(self.rpn_cls_score_net, 0, 0.01, cfg.TRAIN.TRUNCATED)
     normal_init(self.rpn_bbox_pred_net, 0, 0.01, cfg.TRAIN.TRUNCATED)
@@ -412,7 +538,7 @@
   # Extract the head feature maps, for example for vgg16 it is conv5_3
   # only useful during testing mode
   def extract_head(self, image):
-    feat = self._layers["head"](torch.from_numpy(image.transpose([0,3,1,2])).to(self._device))
+    feat = self._layers["head"](torch.from_numpy(image.transpose([0,3,1,2])).cuda(self._cuda_device))
     return feat
 
   # only useful during testing mode
@@ -426,6 +552,16 @@
                                                      self._predictions['rois'].data.cpu().numpy()
     return cls_score, cls_prob, bbox_pred, rois
 
+  def test_images(self, images, im_info):
+    self.eval()
+    with torch.no_grad():
+      self.forward_batch(images, im_info, None, mode='TEST')
+    cls_score, cls_prob, bbox_pred, rois = list(map(lambda x: x.data.cpu().numpy(), self._predictions['cls_score'])), \
+                                           list(map(lambda x: x.data.cpu().numpy(), self._predictions['cls_prob'])), \
+                                           list(map(lambda x: x.data.cpu().numpy(), self._predictions['bbox_pred'])), \
+                                           list(map(lambda x: x.data.cpu().numpy(), self._predictions['rois']))
+    return cls_score, cls_prob, bbox_pred, rois
+
   def delete_intermediate_states(self):
     # Delete intermediate result to save memory
     for d in [self._losses, self._predictions, self._anchor_targets, self._proposal_targets]:
@@ -482,9 +618,8 @@
 
   def load_state_dict(self, state_dict):
     """
-    Because we remove the definition of fc layer in resnet now, it will fail when loading 
+    Because we remove the definition of fc layer in resnet now, it will fail when loading
     the model trained before.
     To provide back compatibility, we overwrite the load_state_dict
     """
     nn.Module.load_state_dict(self, {k: state_dict[k] for k in list(self.state_dict())})
-
diff --git a/lib/nets/resnet_v1.py b/lib/nets/resnet_v1.py
index 350cfa2..9342f04 100644
--- a/lib/nets/resnet_v1.py
+++ b/lib/nets/resnet_v1.py
@@ -7,10 +7,10 @@
 from __future__ import division
 from __future__ import print_function
 
-from nets.network import Network
-from model.config import cfg
+from lib.nets.network import Network
+from lib.model.config import cfg
 
-import utils.timer
+import lib.utils.timer
 
 import torch
 import torch.nn as nn
@@ -110,7 +110,7 @@
     return net_conv
 
   def _head_to_tail(self, pool5):
-    fc7 = self.resnet.layer4(pool5).mean(3).mean(2) # average pooling after layer4
+    fc7 = self.resnet.module.layer4(pool5).mean(3).mean(2) # average pooling after layer4
     return fc7
 
   def _init_head_tail(self):
@@ -128,7 +128,7 @@
       # other numbers are not supported
       raise NotImplementedError
 
-    # Fix blocks 
+    # Fix blocks
     for p in self.resnet.bn1.parameters(): p.requires_grad=False
     for p in self.resnet.conv1.parameters(): p.requires_grad=False
     assert (0 <= cfg.RESNET.FIXED_BLOCKS < 4)
@@ -147,7 +147,7 @@
     self.resnet.apply(set_bn_fix)
 
     # Build resnet.
-    self._layers['head'] = nn.Sequential(self.resnet.conv1, self.resnet.bn1,self.resnet.relu, 
+    self._layers['head'] = nn.Sequential(self.resnet.conv1, self.resnet.bn1,self.resnet.relu,
       self.resnet.maxpool,self.resnet.layer1,self.resnet.layer2,self.resnet.layer3)
 
   def train(self, mode=True):
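The switch from `self.resnet.layer4` to `self.resnet.module.layer4` suggests the backbone is now wrapped in `nn.DataParallel`, whose submodules live on the wrapper's `.module` attribute. A minimal sketch of why the extra hop is needed (the wrapper and layer names here are assumptions for illustration):

```python
# Sketch: attributes of a module wrapped in nn.DataParallel are reached
# through wrapper.module, hence resnet.module.layer4 in _head_to_tail.
import torch.nn as nn

backbone = nn.Sequential()
backbone.layer4 = nn.Linear(4, 4)

wrapped = nn.DataParallel(backbone)
assert wrapped.module.layer4 is backbone.layer4
# wrapped.layer4 would raise AttributeError on the wrapper itself
```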
diff --git a/lib/nets/vgg16.py b/lib/nets/vgg16.py
index c204dba..f22e2bb 100644
--- a/lib/nets/vgg16.py
+++ b/lib/nets/vgg16.py
@@ -7,8 +7,8 @@
 from __future__ import division
 from __future__ import print_function
 
-from nets.network import Network
-from model.config import cfg
+from lib.nets.network import Network
+from lib.model.config import cfg
 
 import torch
 import torch.nn as nn
@@ -40,7 +40,7 @@ def _init_head_tail(self):
   def _image_to_head(self):
     net_conv = self._layers['head'](self._image)
     self._act_summaries['conv'] = net_conv
-
+
     return net_conv
 
   def _head_to_tail(self, pool5):
@@ -50,4 +50,4 @@
     return fc7
 
   def load_pretrained_cnn(self, state_dict):
-    self.vgg.load_state_dict({k:v for k,v in state_dict.items() if k in self.vgg.state_dict()})
\ No newline at end of file
+    self.vgg.load_state_dict({k:v for k,v in state_dict.items() if k in self.vgg.state_dict()})
diff --git a/lib/nms/_ext/__init__.py b/lib/nms/_ext/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/lib/nms/_ext/nms/__init__.py b/lib/nms/_ext/nms/__init__.py
new file mode 100644
index 0000000..d71786f
--- /dev/null
+++ b/lib/nms/_ext/nms/__init__.py
@@ -0,0 +1,15 @@
+
+from torch.utils.ffi import _wrap_function
+from ._nms import lib as _lib, ffi as _ffi
+
+__all__ = []
+def _import_symbols(locals):
+  for symbol in dir(_lib):
+    fn = getattr(_lib, symbol)
+    if callable(fn):
+      locals[symbol] = _wrap_function(fn, _ffi)
+    else:
+      locals[symbol] = fn
+    __all__.append(symbol)
+
+_import_symbols(locals())
diff --git a/lib/nms/pth_nms.py b/lib/nms/pth_nms.py
index 5dac09d..d84c0f7 100644
--- a/lib/nms/pth_nms.py
+++ b/lib/nms/pth_nms.py
@@ -2,6 +2,49 @@
 from ._ext import nms
 import numpy as np
 
+def pth_nms_batch(dets, thresh, network_device):
+  """
+  dets has to be a tensor
+  """
+  if not dets.is_cuda:
+    x1 = dets[:, 0]
+    y1 = dets[:, 1]
+    x2 = dets[:, 2]
+    y2 = dets[:, 3]
+    scores = dets[:, 4]
+
+    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    order = scores.sort(0, descending=True)[1]
+    # order = torch.from_numpy(np.ascontiguousarray(scores.numpy().argsort()[::-1])).long()
+
+    keep = torch.LongTensor(dets.size(0))
+    num_out = torch.LongTensor(1)
+    nms.cpu_nms(keep, num_out, dets, order, areas, thresh)
+
+    return keep[:num_out[0]]
+  else:
+    x1 = dets[:, 0]
+    y1 = dets[:, 1]
+    x2 = dets[:, 2]
+    y2 = dets[:, 3]
+    scores = dets[:, 4]
+
+    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    order = scores.sort(0, descending=True)[1]
+    # order = torch.from_numpy(np.ascontiguousarray(scores.cpu().numpy().argsort()[::-1])).long().cuda()
+
+    dets = dets[order].contiguous()
+
+    keep = torch.LongTensor(dets.size(0))
+    num_out = torch.LongTensor(1)
+    # keep = torch.cuda.LongTensor(dets.size(0))
+    # num_out = torch.cuda.LongTensor(1)
+    nms.gpu_nms(keep, num_out, dets, thresh)
+
+    return order[keep[:num_out[0]].cuda(network_device)].contiguous()
+    # return order[keep[:num_out[0]]].contiguous()
+
+
 def pth_nms(dets, thresh):
   """
   dets has to be a tensor
@@ -43,4 +86,3 @@
 
   return order[keep[:num_out[0]].cuda()].contiguous()
   # return order[keep[:num_out[0]]].contiguous()
-
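In the GPU branch above, `pth_nms_batch` sorts `dets` by score before calling the kernel, so the `keep` indices it gets back are positions in the sorted array and must be mapped through `order` to recover original indices. A pure-PyTorch reference that makes this bookkeeping explicit (the greedy loop stands in for the `nms.gpu_nms` kernel; box values are illustrative):

```python
# Reference for the index mapping in pth_nms_batch's GPU branch: the kernel
# sees score-sorted boxes, so order[keep] converts back to original indices.
import torch

def nms_reference(dets, thresh):
    scores = dets[:, 4]
    order = scores.sort(0, descending=True)[1]
    boxes = dets[order, :4]  # what the kernel operates on
    areas = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)
    live = torch.ones(boxes.size(0), dtype=torch.bool)
    keep = []
    for i in range(boxes.size(0)):
        if not live[i]:
            continue
        keep.append(i)
        xx1 = torch.max(boxes[i, 0], boxes[:, 0])
        yy1 = torch.max(boxes[i, 1], boxes[:, 1])
        xx2 = torch.min(boxes[i, 2], boxes[:, 2])
        yy2 = torch.min(boxes[i, 3], boxes[:, 3])
        inter = (xx2 - xx1 + 1).clamp(min=0) * (yy2 - yy1 + 1).clamp(min=0)
        iou = inter / (areas[i] + areas - inter)
        # box i suppresses itself too, but the loop has already passed it
        live &= iou <= thresh
    return order[torch.tensor(keep)]  # sorted positions -> original indices

dets = torch.tensor([[0., 0., 10., 10., 0.9],
                     [1., 1., 10., 10., 0.8],
                     [50., 50., 60., 60., 0.7]])
print(nms_reference(dets, 0.7))  # tensor([0, 2]): box 1 is suppressed by box 0
```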
diff --git a/lib/nms/src/cuda/nms_kernel.cu.o b/lib/nms/src/cuda/nms_kernel.cu.o
new file mode 100644
index 0000000..47504d2
Binary files /dev/null and b/lib/nms/src/cuda/nms_kernel.cu.o differ