diff --git a/detection/retinaface/retinaface.py b/detection/retinaface/retinaface.py
index 5b274e3da..78054b797 100644
--- a/detection/retinaface/retinaface.py
+++ b/detection/retinaface/retinaface.py
@@ -1,19 +1,14 @@
 from __future__ import print_function
 import sys
 import os
-import datetime
-import time
 import numpy as np
 import mxnet as mx
 from mxnet import ndarray as nd
 import cv2
-#from rcnn import config
 from rcnn.logger import logger
-#from rcnn.processing.bbox_transform import nonlinear_pred, clip_boxes, landmark_pred
 from rcnn.processing.bbox_transform import clip_boxes
 from rcnn.processing.generate_anchor import generate_anchors_fpn, anchors_plane
 from rcnn.processing.nms import gpu_nms_wrapper, cpu_nms_wrapper
-from rcnn.processing.bbox_transform import bbox_overlaps
 
 
 class RetinaFace:
@@ -32,25 +27,28 @@ def __init__(self,
         self.nms_threshold = nms
         self.vote = vote
         self.nocrop = nocrop
-        self.debug = False
         self.fpn_keys = []
         self.anchor_cfg = None
+        
+        # Initialize preprocessing parameters
         pixel_means = [0.0, 0.0, 0.0]
         pixel_stds = [1.0, 1.0, 1.0]
         pixel_scale = 1.0
         self.preprocess = False
-        _ratio = (1., )
+        _ratio = (1.,)
         fmc = 3
+        
+        # Network-specific configurations
         if network == 'ssh' or network == 'vgg':
             pixel_means = [103.939, 116.779, 123.68]
             self.preprocess = True
         elif network == 'net3':
-            _ratio = (1., )
+            _ratio = (1.,)
         elif network == 'net3a':
             _ratio = (1., 1.5)
-        elif network == 'net6':  #like pyramidbox or s3fd
+        elif network == 'net6':
             fmc = 6
-        elif network == 'net5':  #retinaface
+        elif network == 'net5':
             fmc = 5
         elif network == 'net5a':
             fmc = 5
@@ -60,560 +58,368 @@ def __init__(self,
         elif network == 'net4a':
             fmc = 4
             _ratio = (1., 1.5)
-        elif network == 'x5':
-            fmc = 5
-            pixel_means = [103.52, 116.28, 123.675]
-            pixel_stds = [57.375, 57.12, 58.395]
-        elif network == 'x3':
-            fmc = 3
-            pixel_means = [103.52, 116.28, 123.675]
-            pixel_stds = [57.375, 57.12, 58.395]
-        elif network == 'x3a':
-            fmc = 3
-            _ratio = (1., 1.5)
+        elif network in ['x5', 'x3', 'x3a']:
+            fmc = 5 if network == 'x5' else 3
             pixel_means = [103.52, 116.28, 123.675]
             pixel_stds = [57.375, 57.12, 58.395]
+            if network == 'x3a':
+                _ratio = (1., 1.5)
         else:
             assert False, 'network setting error %s' % network
 
-        if fmc == 3:
-            self._feat_stride_fpn = [32, 16, 8]
-            self.anchor_cfg = {
-                '32': {
-                    'SCALES': (32, 16),
-                    'BASE_SIZE': 16,
-                    'RATIOS': _ratio,
-                    'ALLOWED_BORDER': 9999
-                },
-                '16': {
-                    'SCALES': (8, 4),
-                    'BASE_SIZE': 16,
-                    'RATIOS': _ratio,
-                    'ALLOWED_BORDER': 9999
-                },
-                '8': {
-                    'SCALES': (2, 1),
-                    'BASE_SIZE': 16,
-                    'RATIOS': _ratio,
-                    'ALLOWED_BORDER': 9999
-                },
-            }
-        elif fmc == 4:
-            self._feat_stride_fpn = [32, 16, 8, 4]
-            self.anchor_cfg = {
-                '32': {
-                    'SCALES': (32, 16),
-                    'BASE_SIZE': 16,
-                    'RATIOS': _ratio,
-                    'ALLOWED_BORDER': 9999
-                },
-                '16': {
-                    'SCALES': (8, 4),
-                    'BASE_SIZE': 16,
-                    'RATIOS': _ratio,
-                    'ALLOWED_BORDER': 9999
-                },
-                '8': {
-                    'SCALES': (2, 1),
-                    'BASE_SIZE': 16,
-                    'RATIOS': _ratio,
-                    'ALLOWED_BORDER': 9999
-                },
-                '4': {
-                    'SCALES': (2, 1),
-                    'BASE_SIZE': 16,
-                    'RATIOS': _ratio,
-                    'ALLOWED_BORDER': 9999
-                },
-            }
-        elif fmc == 6:
-            self._feat_stride_fpn = [128, 64, 32, 16, 8, 4]
-            self.anchor_cfg = {
-                '128': {
-                    'SCALES': (32, ),
-                    'BASE_SIZE': 16,
-                    'RATIOS': _ratio,
-                    'ALLOWED_BORDER': 9999
-                },
-                '64': {
-                    'SCALES': (16, ),
-                    'BASE_SIZE': 16,
-                    'RATIOS': _ratio,
-                    'ALLOWED_BORDER': 9999
-                },
-                '32': {
-                    'SCALES': (8, ),
-                    'BASE_SIZE': 16,
-                    'RATIOS': _ratio,
-                    'ALLOWED_BORDER': 9999
-                },
-                '16': {
-                    'SCALES': (4, ),
-                    'BASE_SIZE': 16,
-                    'RATIOS': _ratio,
-                    'ALLOWED_BORDER': 9999
-                },
-                '8': {
-                    'SCALES': (2, ),
-                    'BASE_SIZE': 16,
-                    'RATIOS': _ratio,
-                    'ALLOWED_BORDER': 9999
-                },
-                '4': {
-                    'SCALES': (1, ),
-                    'BASE_SIZE': 16,
-                    'RATIOS': _ratio,
-                    'ALLOWED_BORDER': 9999
-                },
-            }
-        elif fmc == 5:
-            self._feat_stride_fpn = [64, 32, 16, 8, 4]
-            self.anchor_cfg = {}
-            _ass = 2.0**(1.0 / 3)
-            _basescale = 1.0
-            for _stride in [4, 8, 16, 32, 64]:
-                key = str(_stride)
-                value = {
-                    'BASE_SIZE': 16,
-                    'RATIOS': _ratio,
-                    'ALLOWED_BORDER': 9999
-                }
-                scales = []
-                for _ in range(3):
-                    scales.append(_basescale)
-                    _basescale *= _ass
-                value['SCALES'] = tuple(scales)
-                self.anchor_cfg[key] = value
-
+        # Configure FPN strides and anchors
+        self._configure_fpn(fmc, _ratio)
+        
         print(self._feat_stride_fpn, self.anchor_cfg)
 
-        for s in self._feat_stride_fpn:
-            self.fpn_keys.append('stride%s' % s)
-
+        # Generate anchors
+        self.fpn_keys = ['stride%s' % s for s in self._feat_stride_fpn]
         dense_anchor = False
-        #self._anchors_fpn = dict(zip(self.fpn_keys, generate_anchors_fpn(base_size=fpn_base_size, scales=self._scales, ratios=self._ratios)))
         self._anchors_fpn = dict(
-            zip(
-                self.fpn_keys,
-                generate_anchors_fpn(dense_anchor=dense_anchor,
-                                     cfg=self.anchor_cfg)))
-        for k in self._anchors_fpn:
-            v = self._anchors_fpn[k].astype(np.float32)
-            self._anchors_fpn[k] = v
-
-        self._num_anchors = dict(
             zip(self.fpn_keys,
-                [anchors.shape[0] for anchors in self._anchors_fpn.values()]))
-        #self._bbox_pred = nonlinear_pred
-        #self._landmark_pred = landmark_pred
+                generate_anchors_fpn(dense_anchor=dense_anchor, cfg=self.anchor_cfg)))
+        
+        # Convert to float32 and cache num_anchors
+        for k in self._anchors_fpn:
+            self._anchors_fpn[k] = self._anchors_fpn[k].astype(np.float32)
+        
+        self._num_anchors = {k: anchors.shape[0] 
+                            for k, anchors in zip(self.fpn_keys, self._anchors_fpn.values())}
+        
+        # Load model
         sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
+        
+        # Setup context and NMS
         if self.ctx_id >= 0:
             self.ctx = mx.gpu(self.ctx_id)
             self.nms = gpu_nms_wrapper(self.nms_threshold, self.ctx_id)
         else:
             self.ctx = mx.cpu()
             self.nms = cpu_nms_wrapper(self.nms_threshold)
+        
+        # Cache preprocessing parameters as numpy arrays
         self.pixel_means = np.array(pixel_means, dtype=np.float32)
         self.pixel_stds = np.array(pixel_stds, dtype=np.float32)
         self.pixel_scale = float(pixel_scale)
+        
+        # Precompute reversed pixel parameters for optimization
+        self.pixel_means_reversed = self.pixel_means[::-1]
+        self.pixel_stds_reversed = self.pixel_stds[::-1]
+        
         print('means', self.pixel_means)
-        self.use_landmarks = False
-        if len(sym) // len(self._feat_stride_fpn) >= 3:
-            self.use_landmarks = True
+        
+        # Detect landmarks and cascade usage
+        self.use_landmarks = len(sym) // len(self._feat_stride_fpn) >= 3
         print('use_landmarks', self.use_landmarks)
-        self.cascade = 0
-        if float(len(sym)) // len(self._feat_stride_fpn) > 3.0:
-            self.cascade = 1
+        
+        self.cascade = 1 if float(len(sym)) // len(self._feat_stride_fpn) > 3.0 else 0
         print('cascade', self.cascade)
-        #self.bbox_stds = [0.1, 0.1, 0.2, 0.2]
-        #self.landmark_std = 0.1
-        self.bbox_stds = [1.0, 1.0, 1.0, 1.0]
+        
+        # Bbox and landmark standards
+        self.bbox_stds = np.array([1.0, 1.0, 1.0, 1.0], dtype=np.float32)
         self.landmark_std = 1.0
-
-        if self.debug:
-            c = len(sym) // len(self._feat_stride_fpn)
-            sym = sym[(c * 0):]
-            self._feat_stride_fpn = [32, 16, 8]
+        
         print('sym size:', len(sym))
-
+        
+        # Initialize model
         image_size = (640, 640)
-        self.model = mx.mod.Module(symbol=sym,
-                                   context=self.ctx,
-                                   label_names=None)
-        self.model.bind(data_shapes=[('data', (1, 3, image_size[0],
-                                               image_size[1]))],
-                        for_training=False)
+        self.model = mx.mod.Module(symbol=sym, context=self.ctx, label_names=None)
+        self.model.bind(data_shapes=[('data', (1, 3, image_size[0], image_size[1]))],
+                       for_training=False)
         self.model.set_params(arg_params, aux_params)
 
+    def _configure_fpn(self, fmc, _ratio):
+        """Configure FPN strides and anchor configurations"""
+        if fmc == 3:
+            self._feat_stride_fpn = [32, 16, 8]
+            self.anchor_cfg = {
+                '32': {'SCALES': (32, 16), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+                '16': {'SCALES': (8, 4), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+                '8': {'SCALES': (2, 1), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+            }
+        elif fmc == 4:
+            self._feat_stride_fpn = [32, 16, 8, 4]
+            self.anchor_cfg = {
+                '32': {'SCALES': (32, 16), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+                '16': {'SCALES': (8, 4), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+                '8': {'SCALES': (2, 1), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+                '4': {'SCALES': (2, 1), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+            }
+        elif fmc == 6:
+            self._feat_stride_fpn = [128, 64, 32, 16, 8, 4]
+            self.anchor_cfg = {
+                '128': {'SCALES': (32,), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+                '64': {'SCALES': (16,), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+                '32': {'SCALES': (8,), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+                '16': {'SCALES': (4,), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+                '8': {'SCALES': (2,), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+                '4': {'SCALES': (1,), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+            }
+        elif fmc == 5:
+            self._feat_stride_fpn = [64, 32, 16, 8, 4]
+            self.anchor_cfg = {}
+            _ass = 2.0**(1.0 / 3)
+            _basescale = 1.0
+            for _stride in [4, 8, 16, 32, 64]:
+                scales = [_basescale * (_ass ** i) for i in range(3)]
+                self.anchor_cfg[str(_stride)] = {
+                    'BASE_SIZE': 16,
+                    'RATIOS': _ratio,
+                    'SCALES': tuple(scales),
+                    'ALLOWED_BORDER': 9999
+                }
+                _basescale = scales[-1] * _ass
+
     def get_input(self, img):
+        """Optimized preprocessing - single pass through channels"""
         im = img.astype(np.float32)
-        im_tensor = np.zeros((1, 3, im.shape[0], im.shape[1]))
+        im_tensor = np.zeros((1, 3, im.shape[0], im.shape[1]), dtype=np.float32)
+        
+        # Vectorized channel processing
         for i in range(3):
-            im_tensor[
-                0,
-                i, :, :] = (im[:, :, 2 - i] / self.pixel_scale -
-                            self.pixel_means[2 - i]) / self.pixel_stds[2 - i]
-        #if self.debug:
-        #  timeb = datetime.datetime.now()
-        #  diff = timeb - timea
-        #  print('X2 uses', diff.total_seconds(), 'seconds')
-        data = nd.array(im_tensor)
-        return data
+            im_tensor[0, i] = (im[:, :, 2 - i] / self.pixel_scale - 
+                              self.pixel_means_reversed[i]) / self.pixel_stds_reversed[i]
+        
+        return nd.array(im_tensor)
 
     def detect(self, img, threshold=0.5, scales=[1.0], do_flip=False):
-        #print('in_detect', threshold, scales, do_flip, do_nms)
+        """Optimized detection with reduced redundancy"""
         proposals_list = []
         scores_list = []
         landmarks_list = []
         strides_list = []
-        timea = datetime.datetime.now()
-        flips = [0]
-        if do_flip:
-            flips = [0, 1]
-
-        imgs = [img]
-        if isinstance(img, list):
-            imgs = img
+        
+        flips = [0, 1] if do_flip else [0]
+        imgs = img if isinstance(img, list) else [img]
+        
+        # Cache frequently used values
+        use_landmarks = self.use_landmarks
+        num_anchors = self._num_anchors
+        bbox_stds = self.bbox_stds
+        
         for img in imgs:
             for im_scale in scales:
                 for flip in flips:
+                    # Resize image
                     if im_scale != 1.0:
-                        im = cv2.resize(img,
-                                        None,
-                                        None,
-                                        fx=im_scale,
-                                        fy=im_scale,
-                                        interpolation=cv2.INTER_LINEAR)
+                        im = cv2.resize(img, None, None, fx=im_scale, fy=im_scale,
+                                      interpolation=cv2.INTER_LINEAR)
                     else:
                         im = img.copy()
+                    
                     if flip:
                         im = im[:, ::-1, :]
+                    
+                    # Handle nocrop padding
                     if self.nocrop:
-                        if im.shape[0] % 32 == 0:
-                            h = im.shape[0]
-                        else:
-                            h = (im.shape[0] // 32 + 1) * 32
-                        if im.shape[1] % 32 == 0:
-                            w = im.shape[1]
-                        else:
-                            w = (im.shape[1] // 32 + 1) * 32
-                        _im = np.zeros((h, w, 3), dtype=np.float32)
-                        _im[0:im.shape[0], 0:im.shape[1], :] = im
-                        im = _im
+                        h = ((im.shape[0] + 31) // 32) * 32
+                        w = ((im.shape[1] + 31) // 32) * 32
+                        if h != im.shape[0] or w != im.shape[1]:
+                            _im = np.zeros((h, w, 3), dtype=np.float32)
+                            _im[:im.shape[0], :im.shape[1], :] = im
+                            im = _im
                     else:
                         im = im.astype(np.float32)
-                    if self.debug:
-                        timeb = datetime.datetime.now()
-                        diff = timeb - timea
-                        print('X1 uses', diff.total_seconds(), 'seconds')
-                    #self.model.bind(data_shapes=[('data', (1, 3, image_size[0], image_size[1]))], for_training=False)
-                    #im_info = [im.shape[0], im.shape[1], im_scale]
+                    
+                    # Preprocess image (optimized)
                     im_info = [im.shape[0], im.shape[1]]
-                    im_tensor = np.zeros((1, 3, im.shape[0], im.shape[1]))
+                    im_tensor = np.zeros((1, 3, im.shape[0], im.shape[1]), dtype=np.float32)
+                    
+                    # Vectorized preprocessing
                     for i in range(3):
-                        im_tensor[0, i, :, :] = (
-                            im[:, :, 2 - i] / self.pixel_scale -
-                            self.pixel_means[2 - i]) / self.pixel_stds[2 - i]
-                    if self.debug:
-                        timeb = datetime.datetime.now()
-                        diff = timeb - timea
-                        print('X2 uses', diff.total_seconds(), 'seconds')
+                        im_tensor[0, i] = (im[:, :, 2 - i] / self.pixel_scale - 
+                                          self.pixel_means_reversed[i]) / self.pixel_stds_reversed[i]
+                    
                     data = nd.array(im_tensor)
-                    db = mx.io.DataBatch(data=(data, ),
-                                         provide_data=[('data', data.shape)])
-                    if self.debug:
-                        timeb = datetime.datetime.now()
-                        diff = timeb - timea
-                        print('X3 uses', diff.total_seconds(), 'seconds')
+                    db = mx.io.DataBatch(data=(data,), provide_data=[('data', data.shape)])
+                    
+                    # Forward pass
                     self.model.forward(db, is_train=False)
                     net_out = self.model.get_outputs()
-                    #post_nms_topN = self._rpn_post_nms_top_n
-                    #min_size_dict = self._rpn_min_size_fpn
-
+                    
                     sym_idx = 0
-
+                    
                     for _idx, s in enumerate(self._feat_stride_fpn):
-                        #if len(scales)>1 and s==32 and im_scale==scales[-1]:
-                        #  continue
                         _key = 'stride%s' % s
                         stride = int(s)
-                        is_cascade = False
-                        if self.cascade:
-                            is_cascade = True
-                        #if self.vote and stride==4 and len(scales)>2 and (im_scale==scales[0]):
-                        #  continue
-                        #print('getting', im_scale, stride, idx, len(net_out), data.shape, file=sys.stderr)
+                        
+                        # Get scores and bbox deltas
                         scores = net_out[sym_idx].asnumpy()
-                        if self.debug:
-                            timeb = datetime.datetime.now()
-                            diff = timeb - timea
-                            print('A uses', diff.total_seconds(), 'seconds')
-                        #print(scores.shape)
-                        #print('scores',stride, scores.shape, file=sys.stderr)
-                        scores = scores[:, self._num_anchors['stride%s' %
-                                                             s]:, :, :]
-
+                        scores = scores[:, num_anchors[_key]:, :, :]
                         bbox_deltas = net_out[sym_idx + 1].asnumpy()
-
-                        #if DEBUG:
-                        #    print 'im_size: ({}, {})'.format(im_info[0], im_info[1])
-                        #    print 'scale: {}'.format(im_info[2])
-
-                        #_height, _width = int(im_info[0] / stride), int(im_info[1] / stride)
-                        height, width = bbox_deltas.shape[
-                            2], bbox_deltas.shape[3]
-
-                        A = self._num_anchors['stride%s' % s]
+                        
+                        height, width = bbox_deltas.shape[2], bbox_deltas.shape[3]
+                        A = num_anchors[_key]
                         K = height * width
-                        anchors_fpn = self._anchors_fpn['stride%s' % s]
-                        anchors = anchors_plane(height, width, stride,
-                                                anchors_fpn)
-                        #print((height, width), (_height, _width), anchors.shape, bbox_deltas.shape, scores.shape, file=sys.stderr)
-                        anchors = anchors.reshape((K * A, 4))
-                        #print('num_anchors', self._num_anchors['stride%s'%s], file=sys.stderr)
-                        #print('HW', (height, width), file=sys.stderr)
-                        #print('anchors_fpn', anchors_fpn.shape, file=sys.stderr)
-                        #print('anchors', anchors.shape, file=sys.stderr)
-                        #print('bbox_deltas', bbox_deltas.shape, file=sys.stderr)
-                        #print('scores', scores.shape, file=sys.stderr)
-
-                        #scores = self._clip_pad(scores, (height, width))
-                        scores = scores.transpose((0, 2, 3, 1)).reshape(
-                            (-1, 1))
-
-                        #print('pre', bbox_deltas.shape, height, width)
-                        #bbox_deltas = self._clip_pad(bbox_deltas, (height, width))
-                        #print('after', bbox_deltas.shape, height, width)
+                        
+                        # Generate anchors
+                        anchors_fpn = self._anchors_fpn[_key]
+                        anchors = anchors_plane(height, width, stride, anchors_fpn).reshape((K * A, 4))
+                        
+                        # Process scores
+                        scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1))
+                        
+                        # Process bbox deltas
                         bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1))
                         bbox_pred_len = bbox_deltas.shape[3] // A
-                        #print(bbox_deltas.shape)
                         bbox_deltas = bbox_deltas.reshape((-1, bbox_pred_len))
-                        bbox_deltas[:,
-                                    0::4] = bbox_deltas[:, 0::
-                                                        4] * self.bbox_stds[0]
-                        bbox_deltas[:,
-                                    1::4] = bbox_deltas[:, 1::
-                                                        4] * self.bbox_stds[1]
-                        bbox_deltas[:,
-                                    2::4] = bbox_deltas[:, 2::
-                                                        4] * self.bbox_stds[2]
-                        bbox_deltas[:,
-                                    3::4] = bbox_deltas[:, 3::
-                                                        4] * self.bbox_stds[3]
+                        
+                        # Apply bbox standards (vectorized)
+                        bbox_deltas[:, 0::4] *= bbox_stds[0]
+                        bbox_deltas[:, 1::4] *= bbox_stds[1]
+                        bbox_deltas[:, 2::4] *= bbox_stds[2]
+                        bbox_deltas[:, 3::4] *= bbox_stds[3]
+                        
                         proposals = self.bbox_pred(anchors, bbox_deltas)
-
-                        #print(anchors.shape, bbox_deltas.shape, A, K, file=sys.stderr)
-                        if is_cascade:
+                        
+                        # Handle cascade
+                        if self.cascade:
                             cascade_sym_num = 0
                             cls_cascade = False
                             bbox_cascade = False
-                            __idx = [3, 4]
-                            if not self.use_landmarks:
-                                __idx = [2, 3]
+                            __idx = [2, 3] if not use_landmarks else [3, 4]
+                            
                             for diff_idx in __idx:
                                 if sym_idx + diff_idx >= len(net_out):
                                     break
                                 body = net_out[sym_idx + diff_idx].asnumpy()
-                                if body.shape[1] // A == 2:  #cls branch
-                                    if cls_cascade or bbox_cascade:
-                                        break
-                                    else:
-                                        cascade_scores = body[:, self.
-                                                              _num_anchors[
-                                                                  'stride%s' %
-                                                                  s]:, :, :]
-                                        cascade_scores = cascade_scores.transpose(
-                                            (0, 2, 3, 1)).reshape((-1, 1))
-                                        #scores = (scores+cascade_scores)/2.0
-                                        scores = cascade_scores  #TODO?
+                                
+                                if body.shape[1] // A == 2:  # cls branch
+                                    if not cls_cascade and not bbox_cascade:
+                                        cascade_scores = body[:, num_anchors[_key]:, :, :]
+                                        scores = cascade_scores.transpose((0, 2, 3, 1)).reshape((-1, 1))
                                         cascade_sym_num += 1
                                         cls_cascade = True
-                                        #print('find cascade cls at stride', stride)
-                                elif body.shape[1] // A == 4:  #bbox branch
-                                    cascade_deltas = body.transpose(
-                                        (0, 2, 3, 1)).reshape(
-                                            (-1, bbox_pred_len))
-                                    cascade_deltas[:, 0::
-                                                   4] = cascade_deltas[:, 0::
-                                                                       4] * self.bbox_stds[
-                                                                           0]
-                                    cascade_deltas[:, 1::
-                                                   4] = cascade_deltas[:, 1::
-                                                                       4] * self.bbox_stds[
-                                                                           1]
-                                    cascade_deltas[:, 2::
-                                                   4] = cascade_deltas[:, 2::
-                                                                       4] * self.bbox_stds[
-                                                                           2]
-                                    cascade_deltas[:, 3::
-                                                   4] = cascade_deltas[:, 3::
-                                                                       4] * self.bbox_stds[
-                                                                           3]
-                                    proposals = self.bbox_pred(
-                                        proposals, cascade_deltas)
+                                    else:
+                                        break
+                                        
+                                elif body.shape[1] // A == 4:  # bbox branch
+                                    cascade_deltas = body.transpose((0, 2, 3, 1)).reshape((-1, bbox_pred_len))
+                                    cascade_deltas[:, 0::4] *= bbox_stds[0]
+                                    cascade_deltas[:, 1::4] *= bbox_stds[1]
+                                    cascade_deltas[:, 2::4] *= bbox_stds[2]
+                                    cascade_deltas[:, 3::4] *= bbox_stds[3]
+                                    proposals = self.bbox_pred(proposals, cascade_deltas)
                                     cascade_sym_num += 1
                                     bbox_cascade = True
-                                    #print('find cascade bbox at stride', stride)
-
+                        
                         proposals = clip_boxes(proposals, im_info[:2])
-
-                        #if self.vote:
-                        #  if im_scale>1.0:
-                        #    keep = self._filter_boxes2(proposals, 160*im_scale, -1)
-                        #  else:
-                        #    keep = self._filter_boxes2(proposals, -1, 100*im_scale)
-                        #  if stride==4:
-                        #    keep = self._filter_boxes2(proposals, 12*im_scale, -1)
-                        #    proposals = proposals[keep, :]
-                        #    scores = scores[keep]
-
-                        #keep = self._filter_boxes(proposals, min_size_dict['stride%s'%s] * im_info[2])
-                        #proposals = proposals[keep, :]
-                        #scores = scores[keep]
-                        #print('333', proposals.shape)
+                        
+                        # Apply decay for stride 4
                         if stride == 4 and self.decay4 < 1.0:
                             scores *= self.decay4
-
+                        
+                        # Filter by threshold
                         scores_ravel = scores.ravel()
-                        #print('__shapes', proposals.shape, scores_ravel.shape)
-                        #print('max score', np.max(scores_ravel))
                         order = np.where(scores_ravel >= threshold)[0]
-                        #_scores = scores_ravel[order]
-                        #_order = _scores.argsort()[::-1]
-                        #order = order[_order]
-                        proposals = proposals[order, :]
+                        proposals = proposals[order]
                         scores = scores[order]
+                        
+                        # Handle flipping
                         if flip:
-                            oldx1 = proposals[:, 0].copy()
-                            oldx2 = proposals[:, 2].copy()
-                            proposals[:, 0] = im.shape[1] - oldx2 - 1
-                            proposals[:, 2] = im.shape[1] - oldx1 - 1
-
-                        proposals[:, 0:4] /= im_scale
-
+                            proposals[:, [0, 2]] = im.shape[1] - proposals[:, [2, 0]] - 1
+                        
+                        proposals[:, :4] /= im_scale
+                        
                         proposals_list.append(proposals)
                         scores_list.append(scores)
+                        
                         if self.nms_threshold < 0.0:
-                            _strides = np.empty(shape=(scores.shape),
-                                                dtype=np.float32)
-                            _strides.fill(stride)
-                            strides_list.append(_strides)
-
-                        if not self.vote and self.use_landmarks:
+                            strides_list.append(np.full(scores.shape, stride, dtype=np.float32))
+                        
+                        # Handle landmarks
+                        if not self.vote and use_landmarks:
                             landmark_deltas = net_out[sym_idx + 2].asnumpy()
-                            #landmark_deltas = self._clip_pad(landmark_deltas, (height, width))
                             landmark_pred_len = landmark_deltas.shape[1] // A
-                            landmark_deltas = landmark_deltas.transpose(
-                                (0, 2, 3, 1)).reshape(
-                                    (-1, 5, landmark_pred_len // 5))
+                            landmark_deltas = landmark_deltas.transpose((0, 2, 3, 1)).reshape(
+                                (-1, 5, landmark_pred_len // 5))
                             landmark_deltas *= self.landmark_std
-                            #print(landmark_deltas.shape, landmark_deltas)
-                            landmarks = self.landmark_pred(
-                                anchors, landmark_deltas)
-                            landmarks = landmarks[order, :]
-
+                            
+                            landmarks = self.landmark_pred(anchors, landmark_deltas)
+                            landmarks = landmarks[order]
+                            
                             if flip:
-                                landmarks[:, :,
-                                          0] = im.shape[1] - landmarks[:, :,
-                                                                       0] - 1
-                                #for a in range(5):
-                                #  oldx1 = landmarks[:, a].copy()
-                                #  landmarks[:,a] = im.shape[1] - oldx1 - 1
-                                order = [1, 0, 2, 4, 3]
-                                flandmarks = landmarks.copy()
-                                for idx, a in enumerate(order):
-                                    flandmarks[:, idx, :] = landmarks[:, a, :]
-                                    #flandmarks[:, idx*2] = landmarks[:,a*2]
-                                    #flandmarks[:, idx*2+1] = landmarks[:,a*2+1]
-                                landmarks = flandmarks
-                            landmarks[:, :, 0:2] /= im_scale
-                            #landmarks /= im_scale
-                            #landmarks = landmarks.reshape( (-1, landmark_pred_len) )
+                                landmarks[:, :, 0] = im.shape[1] - landmarks[:, :, 0] - 1
+                                landmarks = landmarks[:, [1, 0, 2, 4, 3], :]
+                            
+                            landmarks[:, :, :2] /= im_scale
                             landmarks_list.append(landmarks)
-                            #proposals = np.hstack((proposals, landmarks))
-                        if self.use_landmarks:
-                            sym_idx += 3
-                        else:
-                            sym_idx += 2
-                        if is_cascade:
+                        
+                        # Update symbol index
+                        sym_idx += 3 if use_landmarks else 2
+                        if self.cascade:
                             sym_idx += cascade_sym_num
-
-        if self.debug:
-            timeb = datetime.datetime.now()
-            diff = timeb - timea
-            print('B uses', diff.total_seconds(), 'seconds')
-        proposals = np.vstack(proposals_list)
-        landmarks = None
-        if proposals.shape[0] == 0:
-            if self.use_landmarks:
-                landmarks = np.zeros((0, 5, 2))
+        
+        # Combine all proposals
+        if not proposals_list:
+            landmarks = np.zeros((0, 5, 2)) if use_landmarks else None
             if self.nms_threshold < 0.0:
                 return np.zeros((0, 6)), landmarks
             else:
                 return np.zeros((0, 5)), landmarks
+        
+        proposals = np.vstack(proposals_list)
         scores = np.vstack(scores_list)
-        #print('shapes', proposals.shape, scores.shape)
+        
+        # Sort by score
         scores_ravel = scores.ravel()
         order = scores_ravel.argsort()[::-1]
-        #if config.TEST.SCORE_THRESH>0.0:
-        #  _count = np.sum(scores_ravel>config.TEST.SCORE_THRESH)
-        #  order = order[:_count]
-        proposals = proposals[order, :]
+        proposals = proposals[order]
         scores = scores[order]
+        
         if self.nms_threshold < 0.0:
-            strides = np.vstack(strides_list)
-            strides = strides[order]
-        if not self.vote and self.use_landmarks:
-            landmarks = np.vstack(landmarks_list)
-            landmarks = landmarks[order].astype(np.float32, copy=False)
-
+            strides = np.vstack(strides_list)[order]
+        
+        if not self.vote and use_landmarks:
+            landmarks = np.vstack(landmarks_list)[order].astype(np.float32, copy=False)
+        else:
+            landmarks = None
+        
+        # Apply NMS
         if self.nms_threshold > 0.0:
-            pre_det = np.hstack((proposals[:, 0:4], scores)).astype(np.float32,
-                                                                    copy=False)
+            pre_det = np.hstack((proposals[:, :4], scores)).astype(np.float32, copy=False)
+            
             if not self.vote:
                 keep = self.nms(pre_det)
-                det = np.hstack((pre_det, proposals[:, 4:]))
-                det = det[keep, :]
-                if self.use_landmarks:
+                det = np.hstack((pre_det, proposals[:, 4:]))[keep]
+                if use_landmarks:
                     landmarks = landmarks[keep]
             else:
                 det = np.hstack((pre_det, proposals[:, 4:]))
                 det = self.bbox_vote(det)
         elif self.nms_threshold < 0.0:
-            det = np.hstack(
-                (proposals[:, 0:4], scores, strides)).astype(np.float32,
-                                                             copy=False)
+            det = np.hstack((proposals[:, :4], scores, strides)).astype(np.float32, copy=False)
         else:
-            det = np.hstack((proposals[:, 0:4], scores)).astype(np.float32,
-                                                                copy=False)
-
-        if self.debug:
-            timeb = datetime.datetime.now()
-            diff = timeb - timea
-            print('C uses', diff.total_seconds(), 'seconds')
+            det = np.hstack((proposals[:, :4], scores)).astype(np.float32, copy=False)
+        
         return det, landmarks
 
     def detect_center(self, img, threshold=0.5, scales=[1.0], do_flip=False):
+        """Detect face closest to center"""
         det, landmarks = self.detect(img, threshold, scales, do_flip)
         if det.shape[0] == 0:
             return None, None
-        bindex = 0
+        
         if det.shape[0] > 1:
-            img_size = np.asarray(img.shape)[0:2]
-            bounding_box_size = (det[:, 2] - det[:, 0]) * (det[:, 3] -
-                                                           det[:, 1])
+            img_size = np.asarray(img.shape[:2])
+            bounding_box_size = (det[:, 2] - det[:, 0]) * (det[:, 3] - det[:, 1])
             img_center = img_size / 2
-            offsets = np.vstack([(det[:, 0] + det[:, 2]) / 2 - img_center[1],
-                                 (det[:, 1] + det[:, 3]) / 2 - img_center[0]])
-            offset_dist_squared = np.sum(np.power(offsets, 2.0), 0)
-            bindex = np.argmax(bounding_box_size - offset_dist_squared *
-                               2.0)  # some extra weight on the centering
-        bbox = det[bindex, :]
-        landmark = landmarks[bindex, :, :]
-        return bbox, landmark
+            
+            # Vectorized center distance calculation
+            box_centers = np.column_stack([
+                (det[:, 0] + det[:, 2]) / 2 - img_center[1],
+                (det[:, 1] + det[:, 3]) / 2 - img_center[0]
+            ])
+            offset_dist_squared = np.sum(box_centers ** 2, axis=1)
+            bindex = np.argmax(bounding_box_size - offset_dist_squared * 2.0)
+        else:
+            bindex = 0
+        
+        return det[bindex], landmarks[bindex]
 
     @staticmethod
     def check_large_pose(landmark, bbox):
+        """Check for large pose angles"""
         assert landmark.shape == (5, 2)
         assert len(bbox) == 4
 
@@ -624,13 +430,8 @@ def get_theta(base, x, y):
             vy[1] *= -1
             tx = np.arctan2(vx[1], vx[0])
             ty = np.arctan2(vy[1], vy[0])
-            d = ty - tx
-            d = np.degrees(d)
-            #print(vx, tx, vy, ty, d)
-            #if d<-1.*math.pi:
-            #  d+=2*math.pi
-            #elif d>math.pi:
-            #  d-=2*math.pi
+            d = np.degrees(ty - tx)
+            
             if d < -180.0:
                 d += 360.
             elif d > 180.0:
@@ -641,124 +442,60 @@ def get_theta(base, x, y):
 
         theta1 = get_theta(landmark[0], landmark[3], landmark[2])
         theta2 = get_theta(landmark[1], landmark[2], landmark[4])
-        #print(va, vb, theta2)
         theta3 = get_theta(landmark[0], landmark[2], landmark[1])
         theta4 = get_theta(landmark[1], landmark[0], landmark[2])
         theta5 = get_theta(landmark[3], landmark[4], landmark[2])
         theta6 = get_theta(landmark[4], landmark[2], landmark[3])
         theta7 = get_theta(landmark[3], landmark[2], landmark[0])
         theta8 = get_theta(landmark[4], landmark[1], landmark[2])
-        #print(theta1, theta2, theta3, theta4, theta5, theta6, theta7, theta8)
-        left_score = 0.0
-        right_score = 0.0
-        up_score = 0.0
-        down_score = 0.0
-        if theta1 <= 0.0:
-            left_score = 10.0
-        elif theta2 <= 0.0:
-            right_score = 10.0
-        else:
-            left_score = theta2 / theta1
-            right_score = theta1 / theta2
-        if theta3 <= 10.0 or theta4 <= 10.0:
-            up_score = 10.0
-        else:
-            up_score = max(theta1 / theta3, theta2 / theta4)
-        if theta5 <= 10.0 or theta6 <= 10.0:
-            down_score = 10.0
-        else:
-            down_score = max(theta7 / theta5, theta8 / theta6)
-        mleft = (landmark[0][0] + landmark[3][0]) / 2
-        mright = (landmark[1][0] + landmark[4][0]) / 2
+
+        left_score = 10.0 if theta1 <= 0.0 else (theta2 / theta1 if theta2 > 0.0 else 0.0)
+        right_score = 10.0 if theta2 <= 0.0 else (theta1 / theta2 if theta1 > 0.0 else 0.0)
+        up_score = 10.0 if (theta3 <= 10.0 or theta4 <= 10.0) else max(theta1 / theta3, theta2 / theta4)
+        down_score = 10.0 if (theta5 <= 10.0 or theta6 <= 10.0) else max(theta7 / theta5, theta8 / theta6)
+        
+        mleft = (landmark[0, 0] + landmark[3, 0]) / 2
+        mright = (landmark[1, 0] + landmark[4, 0]) / 2
         box_center = ((bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2)
+        
         ret = 0
         if left_score >= 3.0:
             ret = 1
-        if ret == 0 and left_score >= 2.0:
-            if mright <= box_center[0]:
-                ret = 1
-        if ret == 0 and right_score >= 3.0:
+        elif left_score >= 2.0 and mright <= box_center[0]:
+            ret = 1
+        elif right_score >= 3.0:
             ret = 2
-        if ret == 0 and right_score >= 2.0:
-            if mleft >= box_center[0]:
-                ret = 2
-        if ret == 0 and up_score >= 2.0:
+        elif right_score >= 2.0 and mleft >= box_center[0]:
+            ret = 2
+        elif up_score >= 2.0:
             ret = 3
-        if ret == 0 and down_score >= 5.0:
+        elif down_score >= 5.0:
             ret = 4
+        
         return ret, left_score, right_score, up_score, down_score
 
-    @staticmethod
-    def _filter_boxes(boxes, min_size):
-        """ Remove all boxes with any side smaller than min_size """
-        ws = boxes[:, 2] - boxes[:, 0] + 1
-        hs = boxes[:, 3] - boxes[:, 1] + 1
-        keep = np.where((ws >= min_size) & (hs >= min_size))[0]
-        return keep
-
-    @staticmethod
-    def _filter_boxes2(boxes, max_size, min_size):
-        """ Remove all boxes with any side smaller than min_size """
-        ws = boxes[:, 2] - boxes[:, 0] + 1
-        hs = boxes[:, 3] - boxes[:, 1] + 1
-        if max_size > 0:
-            keep = np.where(np.minimum(ws, hs) < max_size)[0]
-        elif min_size > 0:
-            keep = np.where(np.maximum(ws, hs) > min_size)[0]
-        return keep
-
-    @staticmethod
-    def _clip_pad(tensor, pad_shape):
-        """
-      Clip boxes of the pad area.
-      :param tensor: [n, c, H, W]
-      :param pad_shape: [h, w]
-      :return: [n, c, h, w]
-      """
-        H, W = tensor.shape[2:]
-        h, w = pad_shape
-
-        if h < H or w < W:
-            tensor = tensor[:, :, :h, :w].copy()
-
-        return tensor
-
     @staticmethod
     def bbox_pred(boxes, box_deltas):
-        """
-      Transform the set of class-agnostic boxes into class-specific boxes
-      by applying the predicted offsets (box_deltas)
-      :param boxes: !important [N 4]
-      :param box_deltas: [N, 4 * num_classes]
-      :return: [N 4 * num_classes]
-      """
+        """Optimized bbox prediction with vectorized operations"""
         if boxes.shape[0] == 0:
-            return np.zeros((0, box_deltas.shape[1]))
+            return np.zeros((0, box_deltas.shape[1]), dtype=np.float32)
 
-        boxes = boxes.astype(np.float, copy=False)
+        boxes = boxes.astype(np.float32, copy=False)
         widths = boxes[:, 2] - boxes[:, 0] + 1.0
         heights = boxes[:, 3] - boxes[:, 1] + 1.0
         ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0)
         ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0)
 
-        dx = box_deltas[:, 0:1]
-        dy = box_deltas[:, 1:2]
-        dw = box_deltas[:, 2:3]
-        dh = box_deltas[:, 3:4]
+        # Vectorized delta application
+        pred_ctr_x = box_deltas[:, 0:1] * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
+        pred_ctr_y = box_deltas[:, 1:2] * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
+        pred_w = np.exp(box_deltas[:, 2:3]) * widths[:, np.newaxis]
+        pred_h = np.exp(box_deltas[:, 3:4]) * heights[:, np.newaxis]
 
-        pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
-        pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
-        pred_w = np.exp(dw) * widths[:, np.newaxis]
-        pred_h = np.exp(dh) * heights[:, np.newaxis]
-
-        pred_boxes = np.zeros(box_deltas.shape)
-        # x1
+        pred_boxes = np.zeros(box_deltas.shape, dtype=np.float32)
         pred_boxes[:, 0:1] = pred_ctr_x - 0.5 * (pred_w - 1.0)
-        # y1
         pred_boxes[:, 1:2] = pred_ctr_y - 0.5 * (pred_h - 1.0)
-        # x2
         pred_boxes[:, 2:3] = pred_ctr_x + 0.5 * (pred_w - 1.0)
-        # y2
         pred_boxes[:, 3:4] = pred_ctr_y + 0.5 * (pred_h - 1.0)
 
         if box_deltas.shape[1] > 4:
@@ -768,40 +505,31 @@ def bbox_pred(boxes, box_deltas):
 
     @staticmethod
     def landmark_pred(boxes, landmark_deltas):
+        """Optimized landmark prediction"""
         if boxes.shape[0] == 0:
-            return np.zeros((0, landmark_deltas.shape[1]))
-        boxes = boxes.astype(np.float, copy=False)
+            return np.zeros((0, landmark_deltas.shape[1], landmark_deltas.shape[2]), dtype=np.float32)
+        
+        boxes = boxes.astype(np.float32, copy=False)
         widths = boxes[:, 2] - boxes[:, 0] + 1.0
         heights = boxes[:, 3] - boxes[:, 1] + 1.0
         ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0)
         ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0)
+        
         pred = landmark_deltas.copy()
-        for i in range(5):
-            pred[:, i, 0] = landmark_deltas[:, i, 0] * widths + ctr_x
-            pred[:, i, 1] = landmark_deltas[:, i, 1] * heights + ctr_y
+        pred[:, :, 0] = landmark_deltas[:, :, 0] * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
+        pred[:, :, 1] = landmark_deltas[:, :, 1] * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
+        
         return pred
-        #preds = []
-        #for i in range(landmark_deltas.shape[1]):
-        #  if i%2==0:
-        #    pred = (landmark_deltas[:,i]*widths + ctr_x)
-        #  else:
-        #    pred = (landmark_deltas[:,i]*heights + ctr_y)
-        #  preds.append(pred)
-        #preds = np.vstack(preds).transpose()
-        #return preds
 
     def bbox_vote(self, det):
-        #order = det[:, 4].ravel().argsort()[::-1]
-        #det = det[order, :]
+        """Optimized bbox voting with vectorized operations"""
         if det.shape[0] == 0:
-            return np.zeros((0, 5))
-            #dets = np.array([[10, 10, 20, 20, 0.002]])
-            #det = np.empty(shape=[0, 5])
-        dets = None
-        while det.shape[0] > 0:
-            if dets is not None and dets.shape[0] >= 750:
-                break
-            # IOU
+            return np.zeros((0, 5), dtype=np.float32)
+        
+        dets = []
+        
+        while det.shape[0] > 0 and len(dets) < 750:
+            # Vectorized IOU calculation
             area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1)
             xx1 = np.maximum(det[0, 0], det[:, 0])
             yy1 = np.maximum(det[0, 1], det[:, 1])
@@ -810,30 +538,24 @@ def bbox_vote(self, det):
             w = np.maximum(0.0, xx2 - xx1 + 1)
             h = np.maximum(0.0, yy2 - yy1 + 1)
             inter = w * h
-            o = inter / (area[0] + area[:] - inter)
+            o = inter / (area[0] + area - inter)
 
-            # nms
+            # NMS merge
             merge_index = np.where(o >= self.nms_threshold)[0]
-            det_accu = det[merge_index, :]
+            det_accu = det[merge_index]
             det = np.delete(det, merge_index, 0)
+            
             if merge_index.shape[0] <= 1:
                 if det.shape[0] == 0:
-                    try:
-                        dets = np.row_stack((dets, det_accu))
-                    except:
-                        dets = det_accu
+                    dets.append(det_accu)
                 continue
-            det_accu[:,
-                     0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:],
-                                                       (1, 4))
+            
+            # Weighted average
+            det_accu[:, :4] *= det_accu[:, 4:5]
             max_score = np.max(det_accu[:, 4])
-            det_accu_sum = np.zeros((1, 5))
-            det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4], axis=0) / np.sum(
-                det_accu[:, -1:])
-            det_accu_sum[:, 4] = max_score
-            if dets is None:
-                dets = det_accu_sum
-            else:
-                dets = np.row_stack((dets, det_accu_sum))
-        dets = dets[0:750, :]
-        return dets
+            det_accu_sum = np.zeros((1, 5), dtype=np.float32)
+            det_accu_sum[0, :4] = np.sum(det_accu[:, :4], axis=0) / np.sum(det_accu[:, 4])
+            det_accu_sum[0, 4] = max_score
+            dets.append(det_accu_sum)
+        
+        return np.vstack(dets[:750]) if dets else np.zeros((0, 5), dtype=np.float32)