diff --git a/lib/layer_utils/anchor_target_layer.py b/lib/layer_utils/anchor_target_layer.py
index 67cc0cc..8859320 100644
--- a/lib/layer_utils/anchor_target_layer.py
+++ b/lib/layer_utils/anchor_target_layer.py
@@ -9,11 +9,11 @@ from __future__ import print_function
 import os
-from model.config import cfg
+from lib.model.config import cfg
 import numpy as np
 import numpy.random as npr
-from utils.bbox import bbox_overlaps
-from model.bbox_transform import bbox_transform
+from lib.utils.bbox import bbox_overlaps
+from lib.model.bbox_transform import bbox_transform
 import torch
 
 def anchor_target_layer(rpn_cls_score, gt_boxes, im_info, _feat_stride, all_anchors, num_anchors):
@@ -160,4 +160,4 @@ def _compute_targets(ex_rois, gt_rois):
   assert ex_rois.shape[1] == 4
   assert gt_rois.shape[1] == 5
 
-  return bbox_transform(torch.from_numpy(ex_rois), torch.from_numpy(gt_rois[:, :4])).numpy()
\ No newline at end of file
+  return bbox_transform(torch.from_numpy(ex_rois), torch.from_numpy(gt_rois[:, :4])).numpy()
diff --git a/lib/layer_utils/proposal_layer.py b/lib/layer_utils/proposal_layer.py
index 527e2f3..cc592f3 100644
--- a/lib/layer_utils/proposal_layer.py
+++ b/lib/layer_utils/proposal_layer.py
@@ -8,9 +8,9 @@ from __future__ import print_function
 
 import numpy as np
-from model.config import cfg
-from model.bbox_transform import bbox_transform_inv, clip_boxes
-from model.nms_wrapper import nms
+from lib.model.config import cfg
+from lib.model.bbox_transform import bbox_transform_inv, clip_boxes, bbox_transform_inv_batch
+from lib.model.nms_wrapper import nms, nms_batch
 import torch
 
@@ -52,3 +52,45 @@ def proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride,
   blob = torch.cat((batch_inds, proposals), 1)
 
   return blob, scores
+
+
+def proposal_layer_batch(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors, network_device):
+  """A simplified version compared to fast/er RCNN
+     For details please see the technical report
+  """
+  if type(cfg_key) == bytes:
+    cfg_key = cfg_key.decode('utf-8')
+  pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
+  post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
+  nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
+
+  # Get the scores and bounding boxes
+  scores = rpn_cls_prob[:, :, :, num_anchors:]
+  rpn_bbox_pred = rpn_bbox_pred.view((rpn_bbox_pred.size(0), -1, 4))
+  scores = scores.contiguous().view(scores.size(0), -1, 1)
+  proposals = bbox_transform_inv_batch(anchors, rpn_bbox_pred)  # NOTE: flagged by the author as a potential bug
+  proposals = list(map(lambda x: clip_boxes(x, im_info[:2]), proposals))
+
+  blobs, scoress = [], []
+  for i in range(scores.size(0)):
+    # Pick the top region proposals
+    score, order = scores[i].view(-1).sort(descending=True)
+    if pre_nms_topN > 0:
+      order = order[:pre_nms_topN]
+      score = score[:pre_nms_topN].view(-1, 1)
+    proposal = proposals[i][order.data, :]
+
+    # Non-maximal suppression
+    keep = nms_batch(torch.cat((proposal, score), 1).data, nms_thresh, network_device)
+
+    # Pick the top region proposals after NMS
+    if post_nms_topN > 0:
+      keep = keep[:post_nms_topN]
+    proposal = proposal[keep]
+    score = score[keep]
+
+    # The batch index is 0 within each per-image blob
+    batch_inds = proposal.new_zeros(proposal.size(0), 1)
+    blob = torch.cat((batch_inds, proposal), 1)
+    blobs.append(blob)
+    scoress.append(score)
+  return blobs, scoress
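For reference, a minimal standalone sketch of the per-image selection loop that `proposal_layer_batch` performs, with `torchvision.ops.nms` standing in for the repo's `nms_batch` (the torchvision dependency, box counts, and thresholds are assumptions for illustration):

```python
# Sketch of one iteration of the loop in proposal_layer_batch:
# sort by objectness, keep pre-NMS top-N, run NMS, keep post-NMS top-N.
import torch
from torchvision.ops import nms  # stand-in for the repo's nms_batch

def select_proposals(boxes, scores, pre_top_n=6000, post_top_n=300, iou_thresh=0.7):
    score, order = scores.sort(descending=True)
    order, score = order[:pre_top_n], score[:pre_top_n]
    boxes = boxes[order]
    keep = nms(boxes, score, iou_thresh)[:post_top_n]
    # prepend the batch index, which is 0 inside each per-image blob
    batch_inds = boxes.new_zeros(keep.numel(), 1)
    return torch.cat([batch_inds, boxes[keep]], 1), score[keep]

xy = torch.rand(1000, 2) * 100
boxes = torch.cat([xy, xy + torch.rand(1000, 2) * 20], 1)  # valid x1,y1,x2,y2
blob, kept_scores = select_proposals(boxes, torch.rand(1000))
```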
diff --git a/lib/layer_utils/proposal_target_layer.py b/lib/layer_utils/proposal_target_layer.py
index 87f7533..40f82d4 100644
--- a/lib/layer_utils/proposal_target_layer.py
+++ b/lib/layer_utils/proposal_target_layer.py
@@ -10,9 +10,9 @@
 import numpy as np
 import numpy.random as npr
-from model.config import cfg
-from model.bbox_transform import bbox_transform
-from utils.bbox import bbox_overlaps
+from lib.model.config import cfg
+from lib.model.bbox_transform import bbox_transform
+from lib.utils.bbox import bbox_overlaps
 import torch
diff --git a/lib/layer_utils/proposal_top_layer.py b/lib/layer_utils/proposal_top_layer.py
index 97f44ad..84aaeee 100644
--- a/lib/layer_utils/proposal_top_layer.py
+++ b/lib/layer_utils/proposal_top_layer.py
@@ -8,11 +8,56 @@ from __future__ import print_function
 
 import numpy as np
-from model.config import cfg
-from model.bbox_transform import bbox_transform_inv, clip_boxes
+from lib.model.config import cfg
+from lib.model.bbox_transform import bbox_transform_inv, clip_boxes
 import numpy.random as npr
 
 import torch
+import signal
+
+def proposal_top_layer_batch(rpn_cls_probs, rpn_bbox_preds, im_info, _feat_stride, anchors, num_anchors, network_device):
+  """A layer that just selects the top region proposals
+     without using non-maximal suppression.
+     For details please see the technical report
+  """
+  rpn_top_n = cfg.TEST.RPN_TOP_N
+
+  scores = rpn_cls_probs[:, :, :, num_anchors:]
+
+  rpn_bbox_preds = rpn_bbox_preds.view(rpn_bbox_preds.size(0), -1, 4)
+  scores = scores.contiguous().view(rpn_bbox_preds.size(0), -1, 1)
+
+  blobs, scoress = [], []
+  for i in range(scores.size(0)):
+    score = scores[i]
+    length = score.size(0)
+    if length < rpn_top_n:
+      # Random selection, maybe unnecessary and loses good proposals
+      # But such case rarely happens
+      top_inds = torch.from_numpy(npr.choice(length, size=rpn_top_n, replace=True)).long().cuda(network_device)
+    else:
+      top_inds = score.sort(0, descending=True)[1]
+      top_inds = top_inds[:rpn_top_n]
+      top_inds = top_inds.view(rpn_top_n)
+
+    # Do the selection here
+    anchor = anchors[top_inds, :].contiguous()
+    rpn_bbox_pred = rpn_bbox_preds[i][top_inds, :].contiguous()
+    score = score[top_inds].contiguous()
+
+    # Convert anchors into proposals via bbox transformations
+    proposal = bbox_transform_inv(anchor, rpn_bbox_pred)
+
+    # Clip predicted boxes to image
+    proposal = clip_boxes(proposal, im_info[:2])
+
+    # Output rois blob; the batch index is 0 within each per-image blob
+    batch_inds = proposal.new_zeros(proposal.size(0), 1)
+    blob = torch.cat([batch_inds, proposal], 1)
+    blobs.append(blob)
+    scoress.append(score)
+  return blobs, scoress
 
 def proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, im_info, _feat_stride, anchors, num_anchors):
   """A layer that just selects the top region proposals
@@ -43,10 +88,12 @@ def proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, im_info, _feat_stride, ancho
   # Convert anchors into proposals via bbox transformations
   proposals = bbox_transform_inv(anchors, rpn_bbox_pred)
+  #proposals = torch.zeros((5000,4))
 
   # Clip predicted boxes to image
   proposals = clip_boxes(proposals, im_info[:2])
 
+
   # Output rois blob
   # Our RPN implementation only supports a single input image, so all
   # batch inds are 0
diff --git a/lib/layer_utils/roi_align/src/cuda/crop_and_resize_kernel.cu.o b/lib/layer_utils/roi_align/src/cuda/crop_and_resize_kernel.cu.o
new file mode 100644
index 0000000..07f5997
Binary files /dev/null and b/lib/layer_utils/roi_align/src/cuda/crop_and_resize_kernel.cu.o differ
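When an image yields fewer anchors than `cfg.TEST.RPN_TOP_N`, the batch variant above tops up the index set by re-sampling with replacement. A standalone sketch of that branch (CPU tensors and sizes are illustrative assumptions; the real code moves indices to the network device):

```python
# Sketch of the index-selection branch in proposal_top_layer_batch: too few
# anchors -> sample indices with replacement; otherwise sort and truncate.
import torch
import numpy.random as npr

def top_inds_for(scores, rpn_top_n):
    length = scores.numel()
    if length < rpn_top_n:
        # random top-up, mirroring the npr.choice(..., replace=True) path
        return torch.from_numpy(npr.choice(length, size=rpn_top_n, replace=True)).long()
    return scores.sort(0, descending=True)[1][:rpn_top_n]

print(top_inds_for(torch.rand(10), 16).shape)    # torch.Size([16]) via re-sampling
print(top_inds_for(torch.rand(5000), 16).shape)  # torch.Size([16]) via sort
```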
diff --git a/lib/layer_utils/roi_pooling/_ext/roi_pooling/__init__.py b/lib/layer_utils/roi_pooling/_ext/roi_pooling/__init__.py
index 9da23b1..d900ec5 100644
--- a/lib/layer_utils/roi_pooling/_ext/roi_pooling/__init__.py
+++ b/lib/layer_utils/roi_pooling/_ext/roi_pooling/__init__.py
@@ -6,7 +6,10 @@ def _import_symbols(locals):
   for symbol in dir(_lib):
     fn = getattr(_lib, symbol)
-    locals[symbol] = _wrap_function(fn, _ffi)
+    if callable(fn):
+      locals[symbol] = _wrap_function(fn, _ffi)
+    else:
+      locals[symbol] = fn
     __all__.append(symbol)
 
 _import_symbols(locals())
diff --git a/lib/layer_utils/roi_pooling/src/cuda/roi_pooling_kernel.cu.o b/lib/layer_utils/roi_pooling/src/cuda/roi_pooling_kernel.cu.o
new file mode 100644
index 0000000..799cb6f
Binary files /dev/null and b/lib/layer_utils/roi_pooling/src/cuda/roi_pooling_kernel.cu.o differ
diff --git a/lib/layer_utils/snippets.py b/lib/layer_utils/snippets.py
index 14bf77f..ee5e4e2 100644
--- a/lib/layer_utils/snippets.py
+++ b/lib/layer_utils/snippets.py
@@ -8,7 +8,7 @@ from __future__ import print_function
 
 import numpy as np
-from layer_utils.generate_anchors import generate_anchors
+from lib.layer_utils.generate_anchors import generate_anchors
 
 def generate_anchors_pre(height, width, feat_stride, anchor_scales=(8,16,32), anchor_ratios=(0.5,1,2)):
   """ A wrapper function to generate anchors given different scales
diff --git a/lib/make.sh b/lib/make.sh
index 163bf11..5811347 100644
--- a/lib/make.sh
+++ b/lib/make.sh
@@ -11,7 +11,7 @@
 cd layer_utils/roi_pooling/src/cuda
 echo "Compiling roi_pooling kernels by nvcc..."
 nvcc -c -o roi_pooling_kernel.cu.o roi_pooling_kernel.cu -x cu -Xcompiler -fPIC $CUDA_ARCH
 cd ../../
-python build.py
+python3 build.py
 cd ../../
 
 # Build RoIAlign
@@ -19,7 +19,7 @@
 cd layer_utils/roi_align/src/cuda
 echo 'Compiling crop_and_resize kernels by nvcc...'
 nvcc -c -o crop_and_resize_kernel.cu.o crop_and_resize_kernel.cu -x cu -Xcompiler -fPIC $CUDA_ARCH
 cd ../../
-python build.py
+python3 build.py
 cd ../../
 
 # Build NMS
@@ -27,5 +27,5 @@
 cd nms/src/cuda
 echo "Compiling nms kernels by nvcc..."
 nvcc -c -o nms_kernel.cu.o nms_kernel.cu -x cu -Xcompiler -fPIC $CUDA_ARCH
 cd ../../
-python build.py
+python3 build.py
 cd ../
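The `callable(fn)` guard added to `_import_symbols` matters because the cffi-generated `_lib` object can expose plain data attributes alongside functions, and passing those through `_wrap_function` raises. A self-contained mock of the guarded loop (the `_lib` contents and the no-op wrapper are hypothetical stand-ins):

```python
# Mock of the guarded _import_symbols loop: only callables are wrapped,
# plain attributes are re-exported as-is. The FakeLib contents are made up.
from types import SimpleNamespace

_lib = SimpleNamespace(roi_pool_forward=lambda *a: 0, SOME_CONSTANT=42)

def _wrap_function(fn):
    return fn  # stand-in for torch.utils.ffi._wrap_function

__all__ = []
def _import_symbols(namespace):
    for symbol in vars(_lib):
        fn = getattr(_lib, symbol)
        namespace[symbol] = _wrap_function(fn) if callable(fn) else fn
        __all__.append(symbol)

_import_symbols(globals())
print(SOME_CONSTANT)  # 42, exported without wrapping
```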
diff --git a/lib/model/bbox_transform.py b/lib/model/bbox_transform.py
index 66916a8..8b80eae 100644
--- a/lib/model/bbox_transform.py
+++ b/lib/model/bbox_transform.py
@@ -32,6 +32,34 @@ def bbox_transform(ex_rois, gt_rois):
 
   return targets
 
+def bbox_transform_inv_batch(boxes, deltas):
+  # Inputs should both be tensors or both be Variables, on the same device
+  if len(boxes) == 0:
+    return deltas.detach() * 0
+
+  widths = boxes[:, 2] - boxes[:, 0] + 1.0
+  heights = boxes[:, 3] - boxes[:, 1] + 1.0
+  ctr_x = boxes[:, 0] + 0.5 * widths
+  ctr_y = boxes[:, 1] + 0.5 * heights
+
+  dx = deltas[:, :, 0::4]
+  dy = deltas[:, :, 1::4]
+  dw = deltas[:, :, 2::4]
+  dh = deltas[:, :, 3::4]
+
+  pred_ctr_x = dx * widths.repeat(deltas.size(0), 1).unsqueeze(-1) + ctr_x.repeat(deltas.size(0), 1).unsqueeze(-1)
+  pred_ctr_y = dy * heights.repeat(deltas.size(0), 1).unsqueeze(-1) + ctr_y.repeat(deltas.size(0), 1).unsqueeze(-1)
+  pred_w = torch.exp(dw) * widths.repeat(deltas.size(0), 1).unsqueeze(-1)
+  pred_h = torch.exp(dh) * heights.repeat(deltas.size(0), 1).unsqueeze(-1)
+
+  pred_boxes = torch.cat(\
+    [_.unsqueeze(2) for _ in [pred_ctr_x - 0.5 * pred_w,\
+                              pred_ctr_y - 0.5 * pred_h,\
+                              pred_ctr_x + 0.5 * pred_w,\
+                              pred_ctr_y + 0.5 * pred_h]], 2).view(deltas.size(0), len(boxes), -1)
+
+  return pred_boxes
+
 def bbox_transform_inv(boxes, deltas):
   # Input should be both tensor or both Variable and on the same device
   if len(boxes) == 0:
@@ -46,7 +74,7 @@ def bbox_transform_inv(boxes, deltas):
   dy = deltas[:, 1::4]
   dw = deltas[:, 2::4]
   dh = deltas[:, 3::4]
-
+
   pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1)
   pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1)
   pred_w = torch.exp(dw) * widths.unsqueeze(1)
@@ -61,6 +89,7 @@ def bbox_transform_inv(boxes, deltas):
 
   return pred_boxes
 
+
 def clip_boxes(boxes, im_shape):
   """
   Clip boxes to image boundaries.
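A quick self-check that the batched transform matches the single-image one in the RPN case it is used for, i.e. deltas of shape (B, N, 4). It assumes the repo root is on PYTHONPATH; note that for more than one 4-vector per anchor the two flattened layouts differ, which may be what the author's bug note in `proposal_layer_batch` refers to.

```python
# Self-check: bbox_transform_inv_batch on a batch of RPN deltas should
# match bbox_transform_inv applied image-by-image. Shapes are illustrative.
import torch
from lib.model.bbox_transform import bbox_transform_inv, bbox_transform_inv_batch

anchors = torch.tensor([[0., 0., 15., 15.], [8., 8., 31., 31.]])
deltas = torch.randn(3, 2, 4)  # batch of 3 images, 2 anchors, one 4-vector each

batched = bbox_transform_inv_batch(anchors, deltas)
for b in range(deltas.size(0)):
    per_image = bbox_transform_inv(anchors, deltas[b])
    assert torch.allclose(batched[b], per_image, atol=1e-5)
```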
diff --git a/lib/model/config.py b/lib/model/config.py
index 4c16a49..56b2486 100644
--- a/lib/model/config.py
+++ b/lib/model/config.py
@@ -39,7 +39,7 @@
 # Whether to double the learning rate for bias
 __C.TRAIN.DOUBLE_BIAS = True
 
-# Whether to initialize the weights with truncated normal distribution 
+# Whether to initialize the weights with truncated normal distribution
 __C.TRAIN.TRUNCATED = False
 
 # Whether to have weight decay on bias as well
@@ -50,7 +50,7 @@
 # Whether to use aspect-ratio grouping of training images, introduced merely for saving
 # GPU memory
-__C.TRAIN.ASPECT_GROUPING = False
+__C.TRAIN.ASPECT_GROUPING = True
 
 # The number of snapshots kept, older ones are deleted to save space
 __C.TRAIN.SNAPSHOT_KEPT = 3
@@ -139,7 +139,7 @@
 __C.TRAIN.RPN_BATCHSIZE = 256
 
 # NMS threshold used on RPN proposals
-__C.TRAIN.RPN_NMS_THRESH = 0.7
+__C.TRAIN.RPN_NMS_THRESH = 0.8
 
 # Number of top scoring boxes to keep before apply NMS to RPN proposals
 __C.TRAIN.RPN_PRE_NMS_TOP_N = 12000
@@ -155,7 +155,7 @@
 # Set to -1.0 to use uniform example weighting
 __C.TRAIN.RPN_POSITIVE_WEIGHT = -1.0
 
-# Whether to use all ground truth bounding boxes for training, 
+# Whether to use all ground truth bounding boxes for training,
 # For COCO, setting USE_ALL_GT to False will exclude boxes that are flagged as ''iscrowd''
 __C.TRAIN.USE_ALL_GT = True
 
@@ -213,8 +213,8 @@
 __C.RESNET = edict()
 
-# Option to set if max-pooling is appended after crop_and_resize. 
-# if true, the region will be resized to a square of 2xPOOLING_SIZE, 
+# Option to set if max-pooling is appended after crop_and_resize.
+# if true, the region will be resized to a square of 2xPOOLING_SIZE,
 # then 2x2 max-pooling is applied; otherwise the region will be directly
 # resized to a square of POOLING_SIZE
 __C.RESNET.MAX_POOL = False
@@ -267,7 +267,7 @@
 __C.EXP_DIR = 'default'
 
 # Use GPU implementation of non-maximum suppression
-__C.USE_GPU_NMS = True
+__C.USE_GPU_NMS = False
 
 # Default pooling mode
 __C.POOLING_MODE = 'crop'
diff --git a/lib/model/nms_wrapper.py b/lib/model/nms_wrapper.py
index 3e45e6a..2015548 100644
--- a/lib/model/nms_wrapper.py
+++ b/lib/model/nms_wrapper.py
@@ -8,9 +8,15 @@
 from __future__ import division
 from __future__ import print_function
 
-from nms.pth_nms import pth_nms
+from lib.nms.pth_nms import pth_nms
+from lib.nms.pth_nms import pth_nms_batch
 
+def nms_batch(dets, thresh, network_device):
+  """Dispatch to either CPU or GPU NMS implementations.
+  Accept dets as tensor"""
+  return pth_nms_batch(dets, thresh, network_device)
+
 def nms(dets, thresh):
   """Dispatch to either CPU or GPU NMS implementations.
   Accept dets as tensor"""
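Since `cfg` is an EasyDict, the defaults flipped above can equally be overridden per run instead of baking them into `config.py`; a minimal sketch, assuming the repo root is on PYTHONPATH:

```python
# Sketch: overriding at run time the config flags this patch changes.
from lib.model.config import cfg

cfg.USE_GPU_NMS = False           # fall back to the CPU NMS kernels
cfg.TRAIN.RPN_NMS_THRESH = 0.8    # looser RPN NMS keeps more proposals
cfg.TRAIN.ASPECT_GROUPING = True  # group training images by aspect ratio
```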
diff --git a/lib/model/test.py b/lib/model/test.py
index 19678ab..7cdd473 100644
--- a/lib/model/test.py
+++ b/lib/model/test.py
@@ -16,15 +16,17 @@
 import os
 import math
 
-from utils.timer import Timer
-from model.nms_wrapper import nms
-from utils.blob import im_list_to_blob
+from lib.utils.timer import Timer
+from lib.model.nms_wrapper import nms
+from lib.utils.blob import im_list_to_blob
 
-from model.config import cfg, get_output_dir
-from model.bbox_transform import clip_boxes, bbox_transform_inv
+from lib.model.config import cfg, get_output_dir
+from lib.model.bbox_transform import clip_boxes, bbox_transform_inv
 
 import torch
 
+
+
 def _get_image_blob(im):
   """Converts an image into a network input.
   Arguments:
@@ -59,6 +61,44 @@
 
   return blob, np.array(im_scale_factors)
 
+
+def _get_image_blob_batch(ims):
+  """Converts a batch of images into a network input.
+  Arguments:
+    ims (ndarray): a batch of color images in BGR order
+  Returns:
+    blob (ndarray): a data blob holding an image pyramid
+    im_scale_factors (list): list of image scales (relative to the inputs)
+      used in the image pyramid
+  """
+  im_origs = ims.astype(np.float32, copy=True)
+  im_origs -= cfg.PIXEL_MEANS
+
+  im_shape = im_origs.shape
+  im_size_min = np.min(im_shape[1:3])
+  im_size_max = np.max(im_shape[1:3])
+
+  processed_ims = []
+  im_scale_factors = []
+
+  for target_size in cfg.TEST.SCALES:
+    im_scale = float(target_size) / float(im_size_min)
+    # Prevent the biggest axis from being more than MAX_SIZE
+    if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE:
+      im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max)
+    ims = np.array(list(map(lambda im_orig: cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale,
+                    interpolation=cv2.INTER_LINEAR), im_origs)))
+    im_scale_factors.append(im_scale)
+    processed_ims.append(ims)
+
+  processed_ims = np.concatenate(processed_ims)
+  # Create a blob to hold the input images
+  blob = im_list_to_blob(processed_ims)
+
+  return blob, np.array(im_scale_factors)
+
+
 def _get_blobs(im):
   """Convert an image and RoIs within that image into network inputs."""
   blobs = {}
@@ -66,6 +106,14 @@
 
   return blobs, im_scale_factors
 
+def _get_blobs_batch(ims):
+  """Convert a batch of images and RoIs within them into network inputs."""
+  blobs = {}
+  blobs['data'], im_scale_factors = _get_image_blob_batch(ims)
+
+  return blobs, im_scale_factors
+
+
 def _clip_boxes(boxes, im_shape):
   """Clip boxes to image boundaries."""
   # x1 >= 0
@@ -93,7 +141,7 @@
   blobs['im_info'] = np.array([im_blob.shape[1], im_blob.shape[2], im_scales[0]], dtype=np.float32)
 
   _, scores, bbox_pred, rois = net.test_image(blobs['data'], blobs['im_info'])
-
+
   boxes = rois[:, 1:5] / im_scales[0]
   scores = np.reshape(scores, [scores.shape[0], -1])
   bbox_pred = np.reshape(bbox_pred, [bbox_pred.shape[0], -1])
@@ -108,6 +156,34 @@
 
   return scores, pred_boxes
 
+
+def im_detect_batch(net, ims):
+  blobs, im_scales = _get_blobs_batch(ims)
+
+  im_blob = blobs['data']
+  blobs['im_info'] = np.array([im_blob.shape[1], im_blob.shape[2], im_scales[0]], dtype=np.float32)
+
+  _, scores, bbox_preds, rois = net.test_images(blobs['data'], blobs['im_info'])
+
+  scoress, pred_boxess = [], []
+  for i in range(len(ims)):
+    boxes = rois[i][:, 1:5] / im_scales[0]
+    score = np.reshape(scores[i], [scores[i].shape[0], -1])
+    bbox_pred = np.reshape(bbox_preds[i], [bbox_preds[i].shape[0], -1])
+    if cfg.TEST.BBOX_REG:
+      # Apply bounding-box regression deltas
+      box_deltas = bbox_pred
+      pred_boxes = bbox_transform_inv(torch.from_numpy(boxes), torch.from_numpy(box_deltas)).numpy()
+      pred_boxes = _clip_boxes(pred_boxes, ims[i].shape)
+    else:
+      # Simply repeat the boxes, once for each class
+      pred_boxes = np.tile(boxes, (1, score.shape[1]))
+    pred_boxess.append(pred_boxes)
+    scoress.append(score)
+
+  return scoress, pred_boxess
+
+
 def apply_nms(all_boxes, thresh):
   """Apply non-maximum suppression to all predicted boxes output by the
   test_net method.
@@ -192,4 +268,3 @@ def test_net(net, imdb, weights_filename, max_per_image=100, thresh=0.):
 
   print('Evaluating detections')
   imdb.evaluate_detections(all_boxes, output_dir)
-
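A standalone sketch of the per-image post-processing inside `im_detect_batch` for the no-regression branch: class scores are flattened and each RoI is tiled once per class. Array sizes are illustrative assumptions.

```python
# Sketch of one loop iteration of im_detect_batch with cfg.TEST.BBOX_REG off:
# undo the test-time rescaling, flatten scores, tile boxes per class.
import numpy as np

num_rois, num_classes = 300, 21
rois = np.random.rand(num_rois, 5).astype(np.float32)    # [batch_ind, x1, y1, x2, y2]
scores = np.random.rand(num_rois, num_classes).astype(np.float32)

im_scale = 1.5
boxes = rois[:, 1:5] / im_scale
score = np.reshape(scores, [scores.shape[0], -1])
pred_boxes = np.tile(boxes, (1, score.shape[1]))         # (num_rois, num_classes * 4)
assert pred_boxes.shape == (num_rois, num_classes * 4)
```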
diff --git a/lib/nets/mobilenet_v1.py b/lib/nets/mobilenet_v1.py
index ff78726..dbb8e36 100644
--- a/lib/nets/mobilenet_v1.py
+++ b/lib/nets/mobilenet_v1.py
@@ -15,8 +15,8 @@
 import numpy as np
 from collections import namedtuple, OrderedDict
 
-from nets.network import Network
-from model.config import cfg
+from lib.nets.network import Network
+from lib.model.config import cfg
 
 # The following is adapted from:
 # https://github.com/tensorflow/models/blob/master/slim/nets/mobilenet_v1.py
@@ -193,7 +193,7 @@ def normal_init(m, mean, stddev, truncated=False):
       else:
         m.weight.data.normal_(mean, stddev)
       if m.bias is not None: m.bias.data.zero_()
-
+
     self.mobilenet.apply(lambda m: normal_init(m, 0, 0.09, True))
     normal_init(self.rpn_net, 0, 0.01, cfg.TRAIN.TRUNCATED)
     normal_init(self.rpn_cls_score_net, 0, 0.01, cfg.TRAIN.TRUNCATED)
@@ -215,12 +215,12 @@ def _head_to_tail(self, pool5):
   def _init_head_tail(self):
     self.mobilenet = mobilenet_v1_base()
 
-    # Fix blocks 
+    # Fix blocks
     assert (0 <= cfg.MOBILENET.FIXED_LAYERS <= 12)
     for m in list(self.mobilenet.children())[:cfg.MOBILENET.FIXED_LAYERS]:
       for p in m.parameters():
        p.requires_grad = False
-
+
     def set_bn_fix(m):
      classname = m.__class__.__name__
      if classname.find('BatchNorm') != -1:
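`normal_init(m, 0, 0.09, True)` above requests the truncated branch, which this hunk's context elides. A sketch of the truncated-normal initialization as it is commonly implemented in this codebase's lineage (the `fmod` trick is an assumption about the elided branch, not shown by the diff):

```python
# Sketch of normal_init with truncated=True: clamp magnitudes to 2 sigma
# by sampling a standard normal and folding it with fmod before scaling.
import torch
import torch.nn as nn

def normal_init(m, mean, stddev, truncated=False):
    if truncated:
        m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean)  # |z| <= 2 sigma
    else:
        m.weight.data.normal_(mean, stddev)
    if m.bias is not None:
        m.bias.data.zero_()

conv = nn.Conv2d(3, 8, 3)
normal_init(conv, 0, 0.09, truncated=True)
assert conv.weight.data.abs().max() <= 2 * 0.09 + 1e-6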
diff --git a/lib/nets/network.py b/lib/nets/network.py
index 989c47e..c97634e 100644
--- a/lib/nets/network.py
+++ b/lib/nets/network.py
@@ -14,20 +14,22 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.autograd import Variable
+import signal
+import lib.utils.timer
 
-import utils.timer
+from lib.layer_utils.snippets import generate_anchors_pre
+from lib.layer_utils.proposal_layer import proposal_layer
+from lib.layer_utils.proposal_top_layer import proposal_top_layer
+from lib.layer_utils.proposal_layer import proposal_layer_batch
+from lib.layer_utils.proposal_top_layer import proposal_top_layer_batch
+from lib.layer_utils.anchor_target_layer import anchor_target_layer
+from lib.layer_utils.proposal_target_layer import proposal_target_layer
+from lib.utils.visualization import draw_bounding_boxes
 
-from layer_utils.snippets import generate_anchors_pre
-from layer_utils.proposal_layer import proposal_layer
-from layer_utils.proposal_top_layer import proposal_top_layer
-from layer_utils.anchor_target_layer import anchor_target_layer
-from layer_utils.proposal_target_layer import proposal_target_layer
-from utils.visualization import draw_bounding_boxes
+from lib.layer_utils.roi_pooling.roi_pool import RoIPoolFunction
+from lib.layer_utils.roi_align.crop_and_resize import CropAndResizeFunction
 
-from layer_utils.roi_pooling.roi_pool import RoIPoolFunction
-from layer_utils.roi_align.crop_and_resize import CropAndResizeFunction
-
-from model.config import cfg
+from lib.model.config import cfg
 
 import tensorboardX as tb
 
@@ -47,7 +49,8 @@
     self._event_summaries = {}
     self._image_gt_summaries = {}
    self._variables_to_fix = {}
-    self._device = 'cuda'
+    self._cuda_device = 0
+    self._device = "cuda"
 
   def _add_gt_image(self):
     # add back mean
@@ -88,6 +91,21 @@
 
     return rois, rpn_scores
 
+  def _proposal_top_layer_batch(self, rpn_cls_prob, rpn_bbox_pred):
+    rois, rpn_scores = proposal_top_layer_batch(\
+                                    rpn_cls_prob, rpn_bbox_pred, self._im_info,
+                                    self._feat_stride, self._anchors, self._num_anchors, self._cuda_device)
+    return rois, rpn_scores
+
+  def _proposal_layer_batch(self, rpn_cls_prob, rpn_bbox_pred):
+    rois, rpn_scores = proposal_layer_batch(\
+                                    rpn_cls_prob, rpn_bbox_pred, self._im_info, self._mode,
+                                    self._feat_stride, self._anchors, self._num_anchors, self._cuda_device)
+
+    return rois, rpn_scores
+
+
+
   def _roi_pool_layer(self, bottom, rois):
     return RoIPoolFunction(cfg.POOLING_SIZE, cfg.POOLING_SIZE, 1. / 16.)(bottom, rois)
 
@@ -115,7 +133,7 @@
     width = bottom.size(3)
 
     pre_pool_size = cfg.POOLING_SIZE * 2 if max_pool else cfg.POOLING_SIZE
-    crops = CropAndResizeFunction(pre_pool_size, pre_pool_size)(bottom, 
+    crops = CropAndResizeFunction(pre_pool_size, pre_pool_size)(bottom,
      torch.cat([y1/(height-1),x1/(width-1),y2/(height-1),x2/(width-1)], 1), rois[:, 0].int())
     if max_pool:
       crops = F.max_pool2d(crops, 2, 2)
@@ -126,10 +144,10 @@
       anchor_target_layer(
       rpn_cls_score.data, self._gt_boxes.data.cpu().numpy(), self._im_info, self._feat_stride, self._anchors.data.cpu().numpy(), self._num_anchors)
 
-    rpn_labels = torch.from_numpy(rpn_labels).float().to(self._device) #.set_shape([1, 1, None, None])
-    rpn_bbox_targets = torch.from_numpy(rpn_bbox_targets).float().to(self._device)#.set_shape([1, None, None, self._num_anchors * 4])
-    rpn_bbox_inside_weights = torch.from_numpy(rpn_bbox_inside_weights).float().to(self._device)#.set_shape([1, None, None, self._num_anchors * 4])
-    rpn_bbox_outside_weights = torch.from_numpy(rpn_bbox_outside_weights).float().to(self._device)#.set_shape([1, None, None, self._num_anchors * 4])
+    rpn_labels = torch.from_numpy(rpn_labels).float().cuda(self._cuda_device) #.set_shape([1, 1, None, None])
+    rpn_bbox_targets = torch.from_numpy(rpn_bbox_targets).float().cuda(self._cuda_device)#.set_shape([1, None, None, self._num_anchors * 4])
+    rpn_bbox_inside_weights = torch.from_numpy(rpn_bbox_inside_weights).float().cuda(self._cuda_device)#.set_shape([1, None, None, self._num_anchors * 4])
+    rpn_bbox_outside_weights = torch.from_numpy(rpn_bbox_outside_weights).float().cuda(self._cuda_device)#.set_shape([1, None, None, self._num_anchors * 4])
 
     rpn_labels = rpn_labels.long()
     self._anchor_targets['rpn_labels'] = rpn_labels
@@ -165,7 +183,7 @@
     anchors, anchor_length = generate_anchors_pre(\
                                           height, width,
                                           self._feat_stride, self._anchor_scales, self._anchor_ratios)
-    self._anchors = torch.from_numpy(anchors).to(self._device)
+    self._anchors = torch.from_numpy(anchors).cuda(self._cuda_device)
     self._anchor_length = anchor_length
 
   def _smooth_l1_loss(self, bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights, sigma=1.0, dim=[1]):
@@ -234,7 +252,7 @@
     # change it so that the score has 2 as its channel size
     rpn_cls_score_reshape = rpn_cls_score.view(1, 2, -1, rpn_cls_score.size()[-1]) # batch * 2 * (num_anchors*h) * w
     rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape, dim=1)
-
+
     # Move channel to the last dimenstion, to fit the input of python functions
     rpn_cls_prob = rpn_cls_prob_reshape.view_as(rpn_cls_score).permute(0, 2, 3, 1) # batch * h * w * (num_anchors * 2)
     rpn_cls_score = rpn_cls_score.permute(0, 2, 3, 1) # batch * h * w * (num_anchors * 2)
@@ -265,6 +283,47 @@
 
     return rois
 
+  def _region_proposal_batch(self, net_conv):
+    rpn = F.relu(self.rpn_net(net_conv))
+    self._act_summaries['rpn'] = rpn
+
+    rpn_cls_score = self.rpn_cls_score_net(rpn) # batch * (num_anchors * 2) * h * w
+
+    # change it so that the score has 2 as its channel size
+    rpn_cls_score_reshape = rpn_cls_score.view(rpn_cls_score.size(0), 2, -1, rpn_cls_score.size()[-1]) # batch * 2 * (num_anchors*h) * w
+    rpn_cls_prob_reshape = F.softmax(rpn_cls_score_reshape, dim=1)
+
+    # Move channel to the last dimension, to fit the input of python functions
+    rpn_cls_prob = rpn_cls_prob_reshape.view_as(rpn_cls_score).permute(0, 2, 3, 1) # batch * h * w * (num_anchors * 2)
+    rpn_cls_score = rpn_cls_score.permute(0, 2, 3, 1) # batch * h * w * (num_anchors * 2)
+    rpn_cls_score_reshape = rpn_cls_score_reshape.permute(0, 2, 3, 1).contiguous() # batch * (num_anchors*h) * w * 2
+    rpn_cls_pred = torch.max(rpn_cls_score_reshape.view(rpn_cls_score.size(0), -1, 2), 2)[1]
+
+    rpn_bbox_pred = self.rpn_bbox_pred_net(rpn)
+    rpn_bbox_pred = rpn_bbox_pred.permute(0, 2, 3, 1).contiguous() # batch * h * w * (num_anchors*4)
+
+    if self._mode == 'TRAIN':
+      rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred) # rois, roi_scores are variables
+      rpn_labels = self._anchor_target_layer(rpn_cls_score)
+      rois, _ = self._proposal_target_layer(rois, roi_scores)
+    else:
+      if cfg.TEST.MODE == 'nms':
+        rois, _ = self._proposal_layer_batch(rpn_cls_prob, rpn_bbox_pred)
+      elif cfg.TEST.MODE == 'top':
+        rois, _ = self._proposal_top_layer_batch(rpn_cls_prob, rpn_bbox_pred)
+      else:
+        raise NotImplementedError
+
+    self._predictions["rpn_cls_score"] = rpn_cls_score
+    self._predictions["rpn_cls_score_reshape"] = rpn_cls_score_reshape
+    self._predictions["rpn_cls_prob"] = rpn_cls_prob
+    self._predictions["rpn_cls_pred"] = rpn_cls_pred
+    self._predictions["rpn_bbox_pred"] = rpn_bbox_pred
+    self._predictions["rois"] = rois
+
+    return rois
+
+
   def _region_classification(self, fc7):
     cls_score = self.cls_score_net(fc7)
     cls_pred = torch.max(cls_score, 1)[1]
@@ -278,6 +337,19 @@
 
     return cls_prob, bbox_pred
 
+  def _region_classification_batch(self, fc7):
+    cls_score = self.cls_score_net(fc7)
+    cls_pred = torch.max(cls_score, 1)[1]
+    cls_prob = F.softmax(cls_score, dim=1)
+    bbox_pred = self.bbox_pred_net(fc7)
+
+    self._predictions["cls_score"].append(cls_score)
+    self._predictions["cls_pred"].append(cls_pred)
+    self._predictions["cls_prob"].append(cls_prob)
+    self._predictions["bbox_pred"].append(bbox_pred)
+
+    return cls_prob, bbox_pred
+
   def _image_to_head(self):
     raise NotImplementedError
 
@@ -309,7 +381,7 @@
     self.rpn_net = nn.Conv2d(self._net_conv_channels, cfg.RPN_CHANNELS, [3, 3], padding=1)
 
     self.rpn_cls_score_net = nn.Conv2d(cfg.RPN_CHANNELS, self._num_anchors * 2, [1, 1])
-
+
     self.rpn_bbox_pred_net = nn.Conv2d(cfg.RPN_CHANNELS, self._num_anchors * 4, [1, 1])
 
     self.cls_score_net = nn.Linear(self._fc7_channels, self._num_classes)
@@ -343,7 +415,7 @@
         summaries.append(self._add_train_summary(k, var))
 
     self._image_gt_summaries = {}
-
+
     return summaries
 
   def _predict(self):
@@ -353,7 +425,7 @@
 
     # build the anchors for the image
     self._anchor_component(net_conv.size(2), net_conv.size(3))
-
+
     rois = self._region_proposal(net_conv)
     if cfg.POOLING_MODE == 'crop':
       pool5 = self._crop_pool_layer(net_conv, rois)
@@ -365,12 +437,64 @@
     fc7 = self._head_to_tail(pool5)
 
     cls_prob, bbox_pred = self._region_classification(fc7)
-
+
     for k in self._predictions.keys():
       self._score_summaries[k] = self._predictions[k]
 
     return rois, cls_prob, bbox_pred
 
+
+  def _predict_batch(self):
+    # This is just _build_network in tf-faster-rcnn
+    torch.backends.cudnn.benchmark = False
+    net_conv = self._image_to_head()
+
+    # build the anchors for the image
+    self._anchor_component(net_conv.size(2), net_conv.size(3))
+    # net_conv.size : (bs, 1024, h, w)
+    rois = self._region_proposal_batch(net_conv)
+    if cfg.POOLING_MODE == 'crop':
+      pool5 = list(map(lambda x: self._crop_pool_layer(net_conv, x), rois))
+    else:
+      pool5 = self._roi_pool_layer(net_conv, rois)
+
+    if self._mode == 'TRAIN':
+      torch.backends.cudnn.benchmark = True # benchmark because the input sizes are now fixed
+    fc7 = list(map(self._head_to_tail, pool5))
+
+    self._predictions["cls_score"] = []
+    self._predictions["cls_pred"] = []
+    self._predictions["cls_prob"] = []
+    self._predictions["bbox_pred"] = []
+    cls_probs_bbox_preds = list(map(self._region_classification_batch, fc7))
+    cls_prob, bbox_pred = [x[0] for x in cls_probs_bbox_preds], [x[1] for x in cls_probs_bbox_preds]
+
+    for k in self._predictions.keys():
+      self._score_summaries[k] = self._predictions[k]
+
+    return rois, cls_prob, bbox_pred
+
+  def forward_batch(self, image, im_info, gt_boxes=None, mode='TRAIN'):
+    self._image_gt_summaries['image'] = image
+    self._image_gt_summaries['gt_boxes'] = gt_boxes
+    self._image_gt_summaries['im_info'] = im_info
+
+    self._image = torch.from_numpy(image.transpose([0,3,1,2])).cuda(self._cuda_device)
+    self._im_info = im_info # No need to change; actually it can be a list
+    self._gt_boxes = torch.from_numpy(gt_boxes).cuda(self._cuda_device) if gt_boxes is not None else None
+
+    self._mode = mode
+
+    rois, cls_prob, bbox_preds = self._predict_batch()
+
+    if mode == 'TEST':
+      stds = list(map(lambda x: x.data.new(cfg.TRAIN.BBOX_NORMALIZE_STDS).repeat(self._num_classes).unsqueeze(0).expand_as(x), bbox_preds))
+      means = list(map(lambda x: x.data.new(cfg.TRAIN.BBOX_NORMALIZE_MEANS).repeat(self._num_classes).unsqueeze(0).expand_as(x), bbox_preds))
+      self._predictions["bbox_pred"] = list(map(lambda i: bbox_preds[i].mul(stds[i]).add(means[i]), range(len(stds))))
+    else:
+      self._add_losses() # compute losses
+
   def forward(self, image, im_info, gt_boxes=None, mode='TRAIN'):
     self._image_gt_summaries['image'] = image
     self._image_gt_summaries['gt_boxes'] = gt_boxes
@@ -391,6 +515,8 @@
     else:
       self._add_losses() # compute losses
 
+
+
   def init_weights(self):
     def normal_init(m, mean, stddev, truncated=False):
       """
@@ -402,7 +528,7 @@
       else:
         m.weight.data.normal_(mean, stddev)
       m.bias.data.zero_()
-
+
     normal_init(self.rpn_net, 0, 0.01, cfg.TRAIN.TRUNCATED)
     normal_init(self.rpn_cls_score_net, 0, 0.01, cfg.TRAIN.TRUNCATED)
     normal_init(self.rpn_bbox_pred_net, 0, 0.01, cfg.TRAIN.TRUNCATED)
@@ -412,7 +538,7 @@
   # Extract the head feature maps, for example for vgg16 it is conv5_3
   # only useful during testing mode
   def extract_head(self, image):
-    feat = self._layers["head"](torch.from_numpy(image.transpose([0,3,1,2])).to(self._device))
+    feat = self._layers["head"](torch.from_numpy(image.transpose([0,3,1,2])).cuda(self._cuda_device))
     return feat
 
   # only useful during testing mode
@@ -426,6 +552,16 @@
                                                      self._predictions['rois'].data.cpu().numpy()
     return cls_score, cls_prob, bbox_pred, rois
 
+  def test_images(self, images, im_info):
+    self.eval()
+    with torch.no_grad():
+      self.forward_batch(images, im_info, None, mode='TEST')
+    cls_score, cls_prob, bbox_pred, rois = list(map(lambda x: x.data.cpu().numpy(), self._predictions['cls_score'])), \
+                                           list(map(lambda x: x.data.cpu().numpy(), self._predictions['cls_prob'])), \
+                                           list(map(lambda x: x.data.cpu().numpy(), self._predictions['bbox_pred'])), \
+                                           list(map(lambda x: x.data.cpu().numpy(), self._predictions['rois']))
+    return cls_score, cls_prob, bbox_pred, rois
+
   def delete_intermediate_states(self):
     # Delete intermediate result to save memory
     for d in [self._losses, self._predictions, self._anchor_targets, self._proposal_targets]:
@@ -482,9 +618,8 @@
 
   def load_state_dict(self, state_dict):
     """
-    Because we remove the definition of fc layer in resnet now, it will fail when loading 
+    Because we remove the definition of fc layer in resnet now, it will fail when loading
     the model trained before.
     To provide back compatibility, we overwrite the load_state_dict
     """
     nn.Module.load_state_dict(self, {k: state_dict[k] for k in list(self.state_dict())})
-
diff --git a/lib/nets/resnet_v1.py b/lib/nets/resnet_v1.py
index 350cfa2..9342f04 100644
--- a/lib/nets/resnet_v1.py
+++ b/lib/nets/resnet_v1.py
@@ -7,10 +7,10 @@
 from __future__ import division
 from __future__ import print_function
 
-from nets.network import Network
-from model.config import cfg
+from lib.nets.network import Network
+from lib.model.config import cfg
 
-import utils.timer
+import lib.utils.timer
 
 import torch
 import torch.nn as nn
@@ -110,7 +110,7 @@
     return net_conv
 
   def _head_to_tail(self, pool5):
-    fc7 = self.resnet.layer4(pool5).mean(3).mean(2) # average pooling after layer4
+    fc7 = self.resnet.module.layer4(pool5).mean(3).mean(2) # average pooling after layer4
     return fc7
 
   def _init_head_tail(self):
@@ -128,7 +128,7 @@
       # other numbers are not supported
       raise NotImplementedError
 
-    # Fix blocks 
+    # Fix blocks
     for p in self.resnet.bn1.parameters(): p.requires_grad=False
     for p in self.resnet.conv1.parameters(): p.requires_grad=False
     assert (0 <= cfg.RESNET.FIXED_BLOCKS < 4)
@@ -147,7 +147,7 @@
     self.resnet.apply(set_bn_fix)
 
     # Build resnet.
-    self._layers['head'] = nn.Sequential(self.resnet.conv1, self.resnet.bn1,self.resnet.relu, 
+    self._layers['head'] = nn.Sequential(self.resnet.conv1, self.resnet.bn1,self.resnet.relu,
       self.resnet.maxpool,self.resnet.layer1,self.resnet.layer2,self.resnet.layer3)
 
   def train(self, mode=True):
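The switch from `self.resnet.layer4` to `self.resnet.module.layer4` suggests the backbone is now wrapped in `nn.DataParallel`, whose submodules live on the wrapper's `.module` attribute. A minimal sketch of why the extra hop is needed (the wrapper and layer names here are assumptions for illustration):

```python
# Sketch: attributes of a module wrapped in nn.DataParallel are reached
# through wrapper.module, hence resnet.module.layer4 in _head_to_tail.
import torch.nn as nn

backbone = nn.Sequential()
backbone.layer4 = nn.Linear(4, 4)

wrapped = nn.DataParallel(backbone)
assert wrapped.module.layer4 is backbone.layer4
# wrapped.layer4 would raise AttributeError on the wrapper itself
```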
diff --git a/lib/nets/vgg16.py b/lib/nets/vgg16.py
index c204dba..f22e2bb 100644
--- a/lib/nets/vgg16.py
+++ b/lib/nets/vgg16.py
@@ -7,8 +7,8 @@
 from __future__ import division
 from __future__ import print_function
 
-from nets.network import Network
-from model.config import cfg
+from lib.nets.network import Network
+from lib.model.config import cfg
 
 import torch
 import torch.nn as nn
@@ -40,7 +40,7 @@ def _init_head_tail(self):
   def _image_to_head(self):
     net_conv = self._layers['head'](self._image)
     self._act_summaries['conv'] = net_conv
-
+
     return net_conv
 
   def _head_to_tail(self, pool5):
@@ -50,4 +50,4 @@
     return fc7
 
   def load_pretrained_cnn(self, state_dict):
-    self.vgg.load_state_dict({k:v for k,v in state_dict.items() if k in self.vgg.state_dict()})
\ No newline at end of file
+    self.vgg.load_state_dict({k:v for k,v in state_dict.items() if k in self.vgg.state_dict()})
diff --git a/lib/nms/_ext/__init__.py b/lib/nms/_ext/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/lib/nms/_ext/nms/__init__.py b/lib/nms/_ext/nms/__init__.py
new file mode 100644
index 0000000..d71786f
--- /dev/null
+++ b/lib/nms/_ext/nms/__init__.py
@@ -0,0 +1,15 @@
+
+from torch.utils.ffi import _wrap_function
+from ._nms import lib as _lib, ffi as _ffi
+
+__all__ = []
+def _import_symbols(locals):
+  for symbol in dir(_lib):
+    fn = getattr(_lib, symbol)
+    if callable(fn):
+      locals[symbol] = _wrap_function(fn, _ffi)
+    else:
+      locals[symbol] = fn
+    __all__.append(symbol)
+
+_import_symbols(locals())
diff --git a/lib/nms/pth_nms.py b/lib/nms/pth_nms.py
index 5dac09d..d84c0f7 100644
--- a/lib/nms/pth_nms.py
+++ b/lib/nms/pth_nms.py
@@ -2,6 +2,49 @@
 from ._ext import nms
 import numpy as np
 
+def pth_nms_batch(dets, thresh, network_device):
+  """
+  dets has to be a tensor
+  """
+  if not dets.is_cuda:
+    x1 = dets[:, 0]
+    y1 = dets[:, 1]
+    x2 = dets[:, 2]
+    y2 = dets[:, 3]
+    scores = dets[:, 4]
+
+    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    order = scores.sort(0, descending=True)[1]
+    # order = torch.from_numpy(np.ascontiguousarray(scores.numpy().argsort()[::-1])).long()
+
+    keep = torch.LongTensor(dets.size(0))
+    num_out = torch.LongTensor(1)
+    nms.cpu_nms(keep, num_out, dets, order, areas, thresh)
+
+    return keep[:num_out[0]]
+  else:
+    x1 = dets[:, 0]
+    y1 = dets[:, 1]
+    x2 = dets[:, 2]
+    y2 = dets[:, 3]
+    scores = dets[:, 4]
+
+    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    order = scores.sort(0, descending=True)[1]
+    # order = torch.from_numpy(np.ascontiguousarray(scores.cpu().numpy().argsort()[::-1])).long().cuda()
+
+    dets = dets[order].contiguous()
+
+    keep = torch.LongTensor(dets.size(0))
+    num_out = torch.LongTensor(1)
+    # keep = torch.cuda.LongTensor(dets.size(0))
+    # num_out = torch.cuda.LongTensor(1)
+    nms.gpu_nms(keep, num_out, dets, thresh)
+
+    return order[keep[:num_out[0]].cuda(network_device)].contiguous()
+    # return order[keep[:num_out[0]]].contiguous()
+
+
 def pth_nms(dets, thresh):
   """
   dets has to be a tensor
@@ -43,4 +86,3 @@
 
   return order[keep[:num_out[0]].cuda()].contiguous()
   # return order[keep[:num_out[0]]].contiguous()
-
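In the GPU branch above, `pth_nms_batch` sorts `dets` by score before calling the kernel, so the `keep` indices it gets back are positions in the sorted array and must be mapped through `order` to recover original indices. A pure-PyTorch reference that makes this bookkeeping explicit (the greedy loop stands in for the `nms.gpu_nms` kernel; box values are illustrative):

```python
# Reference for the index mapping in pth_nms_batch's GPU branch: the kernel
# sees score-sorted boxes, so order[keep] converts back to original indices.
import torch

def nms_reference(dets, thresh):
    scores = dets[:, 4]
    order = scores.sort(0, descending=True)[1]
    boxes = dets[order, :4]  # what the kernel operates on
    areas = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)
    live = torch.ones(boxes.size(0), dtype=torch.bool)
    keep = []
    for i in range(boxes.size(0)):
        if not live[i]:
            continue
        keep.append(i)
        xx1 = torch.max(boxes[i, 0], boxes[:, 0])
        yy1 = torch.max(boxes[i, 1], boxes[:, 1])
        xx2 = torch.min(boxes[i, 2], boxes[:, 2])
        yy2 = torch.min(boxes[i, 3], boxes[:, 3])
        inter = (xx2 - xx1 + 1).clamp(min=0) * (yy2 - yy1 + 1).clamp(min=0)
        iou = inter / (areas[i] + areas - inter)
        # box i suppresses itself too, but the loop has already passed it
        live &= iou <= thresh
    return order[torch.tensor(keep)]  # sorted positions -> original indices

dets = torch.tensor([[0., 0., 10., 10., 0.9],
                     [1., 1., 10., 10., 0.8],
                     [50., 50., 60., 60., 0.7]])
print(nms_reference(dets, 0.7))  # tensor([0, 2]): box 1 is suppressed by box 0
```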
diff --git a/lib/nms/src/cuda/nms_kernel.cu.o b/lib/nms/src/cuda/nms_kernel.cu.o
new file mode 100644
index 0000000..47504d2
Binary files /dev/null and b/lib/nms/src/cuda/nms_kernel.cu.o differ