diff --git a/README.md b/README.md
index 9c62965..57a308f 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,12 @@ python demo.py --weight ./checkpoint_VOC_efficientdet-d1_97.pth --threshold 0.6
 &nbsp;
 
 ## Recent Update
+ - [04/04/2020] Training on the VOC dataset now succeeds with the following command:
+     ```Shell
+	nice -n1 python3.6 train.py --dataset VOC --dataset_root $VOC_PATH --network efficientdet-d0 --batch_size $BSIZE --workers 8 --grad_accumulation_steps 1 --lr 0.00001 --eval_epochs 20
+     ```
+     I set `lr=1e-5` because `lr=1e-4` did not work.
+ - [31/03/2020] ~~Add support for freezing backbone layers and batch norm layers, and for mixed precision training with APEX (opt level O1).~~ [requires testing].
  - [06/01/2020] Support both DistributedDataParallel and DataParallel, change augmentation, eval_voc
  - [17/12/2019] Add Fast normalized fusion, Augmentation with Ratio, Change RetinaHead, Fix Support EfficientDet-D0->D7
  - [7/12/2019] Support EfficientDet-D0, EfficientDet-D1, EfficientDet-D2, EfficientDet-D3, EfficientDet-D4,... . Support change gradient accumulation steps, AdamW.
@@ -88,22 +94,22 @@ sh datasets/scripts/COCO2017.sh
 - To train EfficientDet using the train script simply specify the parameters listed in `train.py` as a flag or manually change them.
 
 ```Shell
-python train.py --network effcientdet-d0  # Example
+python train.py --network efficientdet-d0  # Example
 ```
 
   - With VOC Dataset:
   ```Shell
   # DataParallel
-  python train.py --dataset VOC --dataset_root /root/data/VOCdevkit/ --network effcientdet-d0 --batch_size 32 
+  python train.py --dataset VOC --dataset_root /root/data/VOCdevkit/ --network efficientdet-d0 --batch_size 32 
   # DistributedDataParallel with backend nccl
-  python train.py --dataset VOC --dataset_root /root/data/VOCdevkit/ --network effcientdet-d0 --batch_size 32 --multiprocessing-distributed
+  python train.py --dataset VOC --dataset_root /root/data/VOCdevkit/ --network efficientdet-d0 --batch_size 32 --multiprocessing-distributed
   ```
   - With COCO Dataset:
   ```Shell
   # DataParallel
-  python train.py --dataset COCO --dataset_root ~/data/coco/ --network effcientdet-d0 --batch_size 32
+  python train.py --dataset COCO --dataset_root ~/data/coco/ --network efficientdet-d0 --batch_size 32
   # DistributedDataParallel with backend nccl
-  python train.py --dataset COCO --dataset_root ~/data/coco/ --network effcientdet-d0 --batch_size 32 --multiprocessing-distributed
+  python train.py --dataset COCO --dataset_root ~/data/coco/ --network efficientdet-d0 --batch_size 32 --multiprocessing-distributed
   ```
 
 ## Evaluation
diff --git a/datasets/augmentation.py b/datasets/augmentation.py
index 10a5615..d9289e4 100644
--- a/datasets/augmentation.py
+++ b/datasets/augmentation.py
@@ -16,8 +16,6 @@ def get_augumentation(phase, width=512, height=512, min_area=0., min_visibility=
             albu.augmentations.transforms.RandomResizedCrop(
                 height=height,
                 width=width, p=0.3),
-            albu.augmentations.transforms.Flip(),
-            albu.augmentations.transforms.Transpose(),
             albu.OneOf([
                 albu.RandomBrightnessContrast(brightness_limit=0.5,
                                               contrast_limit=0.4),
@@ -33,7 +31,6 @@ def get_augumentation(phase, width=512, height=512, min_area=0., min_visibility=
             ]),
             albu.CLAHE(p=0.8),
             albu.HorizontalFlip(p=0.5),
-            albu.VerticalFlip(p=0.5),
         ])
     if(phase == 'test' or phase == 'valid'):
         list_transforms.extend([
@@ -46,32 +43,43 @@ def get_augumentation(phase, width=512, height=512, min_area=0., min_visibility=
     ])
     if(phase == 'test'):
         return albu.Compose(list_transforms)
-    return albu.Compose(list_transforms, bbox_params=albu.BboxParams(format='pascal_voc', min_area=min_area,
-                                                                     min_visibility=min_visibility, label_fields=['category_id']))
+    return albu.Compose(list_transforms,
+                        bbox_params=albu.BboxParams(format='pascal_voc',
+                                                    min_area=min_area,
+                                                    min_visibility=min_visibility,
+                                                    label_fields=['category_id']))
 
 
 def detection_collate(batch):
     imgs = [s['image'] for s in batch]
     annots = [s['bboxes'] for s in batch]
     labels = [s['category_id'] for s in batch]
+    scales = [s['scale'] for s in batch]
 
     max_num_annots = max(len(annot) for annot in annots)
     annot_padded = np.ones((len(annots), max_num_annots, 5))*-1
 
     if max_num_annots > 0:
         for idx, (annot, lab) in enumerate(zip(annots, labels)):
+            # pylint: disable=C1801
             if len(annot) > 0:
                 annot_padded[idx, :len(annot), :4] = annot
                 annot_padded[idx, :len(annot), 4] = lab
-    return (torch.stack(imgs, 0), torch.FloatTensor(annot_padded))
+    return (torch.stack(imgs, 0),
+            torch.FloatTensor(annot_padded),
+            torch.FloatTensor(scales))
 
 
 def collater(data):
+    data = [x for x in data if x is not None]
     imgs = [s['img'] for s in data]
     annots = [s['annot'] for s in data]
     scales = [s['scale'] for s in data]
+    try:
+        imgs = torch.from_numpy(np.stack(imgs, axis=0))
+    except ValueError:
+        import pdb; pdb.set_trace()
 
-    imgs = torch.from_numpy(np.stack(imgs, axis=0))
 
     max_num_annots = max(annot.shape[0] for annot in annots)
 
@@ -88,7 +96,8 @@ def collater(data):
 
     imgs = imgs.permute(0, 3, 1, 2)
 
-    return (imgs, torch.FloatTensor(annot_padded))
+    return (imgs, torch.FloatTensor(annot_padded),
+            torch.FloatTensor(scales))
 
 
 class Resizer(object):
@@ -108,11 +117,13 @@ def __call__(self, sample, common_size=512):
 
         image = cv2.resize(image, (resized_width, resized_height))
 
-        new_image = np.zeros((common_size, common_size, 3))
+        new_image = np.zeros((common_size, common_size, 3), np.float32)
         new_image[0:resized_height, 0:resized_width] = image
         annots[:, :4] *= scale
 
-        return {'img': torch.from_numpy(new_image), 'annot': torch.from_numpy(annots), 'scale': scale}
+        return {'img': torch.from_numpy(new_image),
+                'annot': torch.from_numpy(annots),
+                'scale': scale}
 
 
 class Augmenter(object):
@@ -147,4 +158,5 @@ def __init__(self):
     def __call__(self, sample):
         image, annots = sample['img'], sample['annot']
 
-        return {'img': ((image.astype(np.float32) - self.mean) / self.std), 'annot': annots}
+        # 1/255. = 0.00392156862745098
+        return {'img': ((image.astype(np.float32) *0.00392156862745098 - self.mean) / self.std), 'annot': annots}
diff --git a/datasets/coco.py b/datasets/coco.py
index c006f44..5e3835d 100644
--- a/datasets/coco.py
+++ b/datasets/coco.py
@@ -64,6 +64,7 @@ def __len__(self):
     def __getitem__(self, idx):
 
         img = self.load_image(idx)
+        image_size = img.shape[:2]
         annot = self.load_annotations(idx)
         sample = {'img': img, 'annot': annot}
         if self.transform:
diff --git a/datasets/voc0712.py b/datasets/voc0712.py
index 4814754..28e5de4 100644
--- a/datasets/voc0712.py
+++ b/datasets/voc0712.py
@@ -8,6 +8,8 @@
     import xml.etree.cElementTree as ET
 else:
     import xml.etree.ElementTree as ET
+import albumentations as albu
+
 
 VOC_CLASSES = (  # always index 0
     'aeroplane', 'bicycle', 'bird', 'boat',
@@ -106,15 +108,26 @@ def __getitem__(self, index):
         target = ET.parse(self._annopath % img_id).getroot()
         img = cv2.imread(self._imgpath % img_id)
         img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-        img = img.astype(np.float32)/255.
         height, width, channels = img.shape
 
         if self.target_transform is not None:
             target = self.target_transform(target, width, height)
         target = np.array(target)
         sample = {'img': img, 'annot': target}
-        if self.transform is not None:
-            sample = self.transform(sample)
+        if isinstance(self.transform, albu.core.composition.Compose):
+            result = self.transform(image=img, bboxes=target[:, :4], category_id=target[:, -1])
+            bboxes = np.array(result["bboxes"])
+            cls = np.atleast_2d(result["category_id"]).T
+            if bboxes.size == 0:  # after data augmentation we lose all bboxes
+                return None
+            target = np.hstack((bboxes, cls))
+            sample = {"img": result["image"].transpose(1, 0).transpose(2, 1),
+                      "annot": torch.from_numpy(target),
+                      "scale": -1}  # fake scale
+        else:
+            img = img.astype(np.float32)/255.
+            if self.transform is not None:
+                sample = self.transform(sample)
         return sample
 
         bbox = target[:, :4]
diff --git a/eval.py b/eval.py
index dc8393d..2b7ef19 100644
--- a/eval.py
+++ b/eval.py
@@ -73,7 +73,8 @@ def _compute_ap(recall, precision):
     return ap
 
 
-def _get_detections(dataset, retinanet, score_threshold=0.05, max_detections=100, save_path=None):
+def _get_detections(dataloader, retinanet,
+                    score_threshold=0.05, max_detections=100, save_path=None):
     """ Get the detections from the retinanet using the generator.
     The result is a list of lists such that the size is:
         all_detections[num_images][num_classes] = detections[num_detections, 4 + num_classes]
@@ -86,52 +87,55 @@ def _get_detections(dataset, retinanet, score_threshold=0.05, max_detections=100
     # Returns
         A list of lists containing the detections for each image in the generator.
     """
-    all_detections = [[None for i in range(
-        dataset.num_classes())] for j in range(len(dataset))]
+    dataset = dataloader.dataset
+    all_detections = [[None for i in range(dataset.num_classes())]
+                      for j in range(len(dataset))]
 
     retinanet.eval()
-
+    index = 0
     with torch.no_grad():
-
-        for index in range(len(dataset)):
-            data = dataset[index]
-            scale = data['scale']
-
-            # run network
-            scores, labels, boxes = retinanet(data['img'].permute(
-                2, 0, 1).cuda().float().unsqueeze(dim=0))
-            scores = scores.cpu().numpy()
-            labels = labels.cpu().numpy()
-            boxes = boxes.cpu().numpy()
-
-            # correct boxes for image scale
-            boxes /= scale
-
-            # select indices which have a score above the threshold
-            indices = np.where(scores > score_threshold)[0]
-            if indices.shape[0] > 0:
-                # select those scores
-                scores = scores[indices]
-
-                # find the order with which to sort the scores
-                scores_sort = np.argsort(-scores)[:max_detections]
-
-                # select detections
-                image_boxes = boxes[indices[scores_sort], :]
-                image_scores = scores[scores_sort]
-                image_labels = labels[indices[scores_sort]]
-                image_detections = np.concatenate([image_boxes, np.expand_dims(
-                    image_scores, axis=1), np.expand_dims(image_labels, axis=1)], axis=1)
-
-                # copy detections to all_detections
-                for label in range(dataset.num_classes()):
-                    all_detections[index][label] = image_detections[image_detections[:, -1] == label, :-1]
-            else:
-                # copy detections to all_detections
-                for label in range(dataset.num_classes()):
-                    all_detections[index][label] = np.zeros((0, 5))
-
-            print('{}/{}'.format(index + 1, len(dataset)), end='\r')
+        for idx, data in tqdm(enumerate(dataloader), total=len(dataloader)):
+            images = data[0].cuda()
+            scores_batch, labels_batch, boxes_batch = retinanet(images)
+
+            scores_batch = scores_batch.reshape(images.shape[0], -1)
+            labels_batch = labels_batch.reshape(images.shape[0], -1)
+            boxes_batch = boxes_batch.reshape(images.shape[0], -1, 4)
+            for scores, labels, boxes, scale in zip(scores_batch,
+                                                    labels_batch,
+                                                    boxes_batch, data[2]):
+
+                scores = scores.cpu().numpy()
+                labels = labels.cpu().numpy()
+                boxes = boxes.cpu().numpy()
+
+                # correct boxes for image scale
+                boxes /= scale
+
+                # select indices which have a score above the threshold
+                indices = np.where(scores > score_threshold)[0]
+                if indices.shape[0] > 0:
+                    # select those scores
+                    scores = scores[indices]
+
+                    # find the order with which to sort the scores
+                    scores_sort = np.argsort(-scores)[:max_detections]
+
+                    # select detections
+                    image_boxes = boxes[indices[scores_sort], :]
+                    image_scores = scores[scores_sort]
+                    image_labels = labels[indices[scores_sort]]
+                    image_detections = np.concatenate([image_boxes, np.expand_dims(
+                        image_scores, axis=1), np.expand_dims(image_labels, axis=1)], axis=1)
+
+                    # copy detections to all_detections
+                    for label in range(dataset.num_classes()):
+                        all_detections[index][label] = image_detections[image_detections[:, -1] == label, :-1]
+                else:
+                    # copy detections to all_detections
+                    for label in range(dataset.num_classes()):
+                        all_detections[index][label] = np.zeros((0, 5))
+                index += 1
 
     return all_detections
 
@@ -146,18 +150,18 @@ def _get_annotations(generator):
         A list of lists containing the annotations for each image in the generator.
     """
     all_annotations = [[None for i in range(
-        generator.num_classes())] for j in range(len(generator))]
+        generator.dataset.num_classes())] for j in range(len(generator.dataset))]
 
-    for i in range(len(generator)):
+    for i in range(len(generator.dataset)):
         # load the annotations
-        annotations = generator.load_annotations(i)
+        annotations = generator.dataset.load_annotations(i)
 
         # copy detections to all_annotations
-        for label in range(generator.num_classes()):
+        for label in range(generator.dataset.num_classes()):
             all_annotations[i][label] = annotations[annotations[:, 4]
                                                     == label, :4].copy()
 
-        print('{}/{}'.format(i + 1, len(generator)), end='\r')
+        print('{}/{}'.format(i + 1, len(generator.dataset)), end='\r')
 
     return all_annotations
 
@@ -190,13 +194,13 @@ def evaluate(
 
     average_precisions = {}
 
-    for label in range(generator.num_classes()):
+    for label in range(generator.dataset.num_classes()):
         false_positives = np.zeros((0,))
         true_positives = np.zeros((0,))
         scores = np.zeros((0,))
         num_annotations = 0.0
 
-        for i in range(len(generator)):
+        for i in range(len(generator.dataset)):
             detections = all_detections[i][label]
             annotations = all_annotations[i][label]
             num_annotations += annotations.shape[0]
@@ -249,72 +253,66 @@ def evaluate(
 
     print('\nmAP:')
     avg_mAP = []
-    for label in range(generator.num_classes()):
-        label_name = generator.label_to_name(label)
+    for label in range(generator.dataset.num_classes()):
+        label_name = generator.dataset.label_to_name(label)
         print('{}: {}'.format(label_name, average_precisions[label][0]))
         avg_mAP.append(average_precisions[label][0])
     print('avg mAP: {}'.format(np.mean(avg_mAP)))
     return np.mean(avg_mAP), average_precisions
 
 
-def evaluate_coco(dataset, model, threshold=0.05):
+def evaluate_coco(dataloader, model, threshold=0.05):
 
     model.eval()
 
+    dataset = dataloader.dataset
+
     with torch.no_grad():
 
         # start collecting results
         results = []
         image_ids = []
+        index = 0
 
-        for index in range(len(dataset)):
-            data = dataset[index]
-            scale = data['scale']
-
+        for data in tqdm(dataloader, total=len(dataloader)):
+            images = data[0]
             # run network
-            scores, labels, boxes = model(data['img'].permute(
-                2, 0, 1).cuda().float().unsqueeze(dim=0))
-            scores = scores.cpu()
-            labels = labels.cpu()
-            boxes = boxes.cpu()
-
-            # correct boxes for image scale
-            boxes /= scale
-
-            if boxes.shape[0] > 0:
-                # change to (x, y, w, h) (MS COCO standard)
-                boxes[:, 2] -= boxes[:, 0]
-                boxes[:, 3] -= boxes[:, 1]
-
-                # compute predicted labels and scores
-                # for box, score, label in zip(boxes[0], scores[0], labels[0]):
-                for box_id in range(boxes.shape[0]):
-                    score = float(scores[box_id])
-                    label = int(labels[box_id])
-                    box = boxes[box_id, :]
-
-                    # scores are sorted, so we can break
-                    if score < threshold:
-                        break
-
-                    # append detection for each positively labeled class
-                    image_result = {
-                        'image_id': dataset.image_ids[index],
-                        'category_id': dataset.label_to_coco_label(label),
-                        'score': float(score),
-                        'bbox': box.tolist(),
-                    }
-
-                    # append detection to results
-                    results.append(image_result)
-
-            # append image to list of processed images
-            image_ids.append(dataset.image_ids[index])
-
-            # print progress
-            print('{}/{}'.format(index, len(dataset)), end='\r')
-
-        if not len(results):
+            scores_batch, labels_batch, boxes_batch = model(images)
+
+            scores_batch = scores_batch.reshape(images.shape[0], -1)
+            labels_batch = labels_batch.reshape(images.shape[0], -1)
+            boxes_batch = boxes_batch.reshape(images.shape[0], -1, 4)
+
+            for scores, labels, boxes, scale in zip(scores_batch,
+                                                    labels_batch,
+                                                    boxes_batch, data[2]):
+                scores = scores.cpu().numpy()
+                labels = labels.cpu().numpy()
+                boxes = boxes.cpu().numpy()
+
+                # correct boxes for image scale
+                boxes /= scale
+
+                if boxes.shape[0] > 0:
+                    # change to (x, y, w, h) (MS COCO standard)
+                    boxes[:, 2] -= boxes[:, 0]
+                    boxes[:, 3] -= boxes[:, 1]
+
+                    # compute predicted labels and scores
+                    # for box, score, label in zip(boxes[0], scores[0], labels[0]):
+                    boxes = boxes[scores >= threshold]
+                    labels = labels[scores >= threshold]
+                    scores = scores[scores >= threshold]
+                    results.extend([{"image_id": dataset.image_ids[index],
+                                     "category_id": dataset.label_to_coco_label(label),
+                                     "score": float(score),
+                                     "bbox": box.tolist()}
+                                    for box, score, label in zip(boxes, scores, labels)])
+
+                # append image to list of processed images
+                image_ids.append(dataset.image_ids[index])
+                index += 1
+        if not results:
             return
 
         # write output
diff --git a/loader.py b/loader.py
new file mode 100644
index 0000000..eeaf11e
--- /dev/null
+++ b/loader.py
@@ -0,0 +1,67 @@
+import torch
+
+
+class PrefetchLoader:
+    """Wrap a data loader and copy each batch to the GPU on a side stream.
+
+    The wrapped loader must yield ``(input, target, extra)`` triples whose
+    first two items provide ``.cuda(non_blocking=True)``; the third item is
+    dropped, so iterating this object yields ``(input, target)`` pairs.
+
+    ``len()``, ``sampler`` and ``dataset`` are forwarded to the wrapped
+    loader.
+
+    CUDA must be available: every pass creates a ``torch.cuda.Stream``.
+    """
+
+    def __init__(self, loader):
+        """Remember the loader to wrap.
+
+        Args:
+            loader: iterable of ``(input, target, extra)`` batches.
+        """
+        self.loader = loader
+
+    def __iter__(self):
+        """Iterate over the wrapped loader with GPU-resident batches.
+
+        The copy of batch ``i + 1`` is issued on a separate CUDA stream
+        before batch ``i`` is handed to the caller.
+
+        Yields:
+            ``(input, target)`` tuples; nothing if the loader is empty.
+        """
+        stream = torch.cuda.Stream()
+        # Batch whose copy has been issued but which was not yielded yet.
+        pending = None
+
+        for next_input, next_target, _ in self.loader:
+            with torch.cuda.stream(stream):
+                next_input = next_input.cuda(non_blocking=True)
+                next_target = next_target.cuda(non_blocking=True)
+
+            if pending is not None:
+                yield pending
+
+            # Make the consumer's stream wait for the copy issued above.
+            torch.cuda.current_stream().wait_stream(stream)
+            pending = (next_input, next_target)
+
+        # Flush the last batch. The guard keeps an empty loader from
+        # reaching this yield with nothing to hand out.
+        if pending is not None:
+            yield pending
+
+    def __len__(self):
+        """Return ``len()`` of the wrapped loader."""
+        return len(self.loader)
+
+    @property
+    def sampler(self):
+        """Sampler of the wrapped loader."""
+        return self.loader.sampler
+
+    @property
+    def dataset(self):
+        """Dataset of the wrapped loader."""
+        return self.loader.dataset
diff --git a/models/bifpn.py b/models/bifpn.py
index 0f8e6bd..a9bc8f6 100644
--- a/models/bifpn.py
+++ b/models/bifpn.py
@@ -162,6 +162,7 @@ def __init__(self,
                         inplace=False)
                 )
                 self.bifpn_convs.append(fpn_conv)
+        # self.init_weights() # new code
 
     # default init_weights for conv(msra) and norm in ConvModule
     def init_weights(self):
diff --git a/models/efficientdet.py b/models/efficientdet.py
index 43357c9..2601366 100644
--- a/models/efficientdet.py
+++ b/models/efficientdet.py
@@ -31,7 +31,7 @@ def __init__(self,
                  iou_threshold=0.5):
         super(EfficientDet, self).__init__()
         self.backbone = EfficientNet.from_pretrained(MODEL_MAP[network])
-        self.is_training = is_training
+        # self.is_training = is_training
         self.neck = BIFPN(in_channels=self.backbone.get_list_features()[-5:],
                           out_channels=W_bifpn,
                           stack=D_bifpn,
@@ -44,18 +44,32 @@ def __init__(self,
         self.clipBoxes = ClipBoxes()
         self.threshold = threshold
         self.iou_threshold = iou_threshold
-        for m in self.modules():
-            if isinstance(m, nn.Conv2d):
-                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
-                m.weight.data.normal_(0, math.sqrt(2. / n))
-            elif isinstance(m, nn.BatchNorm2d):
-                m.weight.data.fill_(1)
-                m.bias.data.zero_()
-        self.freeze_bn()
+
+        # ============== original code starts ===============
+        """The following code forces all weights to be random, which does not make sense at all!"""
+        # for m in self.modules():
+        #     if isinstance(m, nn.Conv2d):
+        #         n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+        #         m.weight.data.normal_(0, math.sqrt(2. / n))
+        #     elif isinstance(m, nn.BatchNorm2d):
+        #         m.weight.data.fill_(1)
+        #         m.bias.data.zero_()
+
+        # self.freeze_bn()
+        # ============== original code ends ===============
+
         self.criterion = FocalLoss()
 
+    def extract_feat(self, img):
+        """Run ``img`` through the backbone, then feed the last five
+        backbone outputs to the neck and return the neck's result.
+        """
+        backbone_outputs = self.backbone(img)
+        neck_outputs = self.neck(backbone_outputs[-5:])
+        return neck_outputs
+
     def forward(self, inputs):
-        if self.is_training:
+        if self.training:
             inputs, annotations = inputs
         else:
             inputs = inputs
@@ -64,37 +78,62 @@ def forward(self, inputs):
         classification = torch.cat([out for out in outs[0]], dim=1)
         regression = torch.cat([out for out in outs[1]], dim=1)
         anchors = self.anchors(inputs)
-        if self.is_training:
+        # if anchors.dtype != inputs.dtype:  # used for mixed precision training
+        #     anchors = anchors.type_as(inputs)
+        if self.training:
             return self.criterion(classification, regression, anchors, annotations)
         else:
+            max_per_image = 256
             transformed_anchors = self.regressBoxes(anchors, regression)
             transformed_anchors = self.clipBoxes(transformed_anchors, inputs)
             scores = torch.max(classification, dim=2, keepdim=True)[0]
-            scores_over_thresh = (scores > self.threshold)[0, :, 0]
+            nms_scores = []
+            nms_class = []
+            anchors = []
+            for idx, score in enumerate(scores):
+                scores_over_thresh = (score > self.threshold)[:, 0]
+                if scores_over_thresh.sum() == 0:
+                    print('No boxes to NMS')
+                    # no boxes to NMS, just return
+                    # return [torch.zeros(0), torch.zeros(0), torch.zeros(0, 4)]
+                    continue
+                cls_tmp = classification[idx, scores_over_thresh, :]
+                trf_anchors = transformed_anchors[idx, scores_over_thresh, :]
+                scores_tmp = scores[idx, scores_over_thresh, :]
+                anchors_nms_idx = nms(trf_anchors, scores_tmp[:, 0],
+                                      iou_threshold=self.iou_threshold)
+                nms_scores_tmp, nms_class_tmp = cls_tmp[anchors_nms_idx, :].max(dim=1)
+                trf_anchors = trf_anchors[anchors_nms_idx, :]
 
-            if scores_over_thresh.sum() == 0:
-                print('No boxes to NMS')
-                # no boxes to NMS, just return
-                return [torch.zeros(0), torch.zeros(0), torch.zeros(0, 4)]
-            classification = classification[:, scores_over_thresh, :]
-            transformed_anchors = transformed_anchors[:, scores_over_thresh, :]
-            scores = scores[:, scores_over_thresh, :]
-            anchors_nms_idx = nms(
-                transformed_anchors[0, :, :], scores[0, :, 0], iou_threshold=self.iou_threshold)
-            nms_scores, nms_class = classification[0, anchors_nms_idx, :].max(
-                dim=1)
-            return [nms_scores, nms_class, transformed_anchors[0, anchors_nms_idx, :]]
+                if not torch.all(-nms_scores_tmp[:-1] <= -nms_scores_tmp[1:]):
+                    raise ValueError("Please make nms score sorted")
+                if nms_scores_tmp.shape[0] > max_per_image:
+                    nms_scores_tmp = nms_scores_tmp[:max_per_image]
+                    nms_class_tmp = nms_class_tmp[:max_per_image]
+                    trf_anchors = trf_anchors[:max_per_image]
+                else:
+                    K = max_per_image - nms_scores_tmp.shape[0]
+                    nms_scores_tmp = torch.cat((nms_scores_tmp,
+                                                -torch.ones(K,).type_as(nms_scores_tmp)))
+                    nms_class_tmp = torch.cat((nms_class_tmp,
+                                               -torch.ones(K,).type_as(nms_class_tmp)))
+                    trf_anchors = torch.cat((trf_anchors,
+                                             -torch.ones(K, 4).type_as(trf_anchors)))
+                nms_scores.append(nms_scores_tmp)
+                nms_class.append(nms_class_tmp)
+                anchors.append(trf_anchors)
+            return torch.cat(nms_scores), torch.cat(nms_class), torch.cat(anchors)
+
+    def freeze_backbone(self):
+        """Set backbone BatchNorm2d layers to eval mode; freeze its weights."""
+        modules = self.backbone.modules()
+        for norm in (m for m in modules if isinstance(m, nn.BatchNorm2d)):
+            norm.eval()
+        for weight in self.backbone.parameters():
+            weight.requires_grad = False
 
     def freeze_bn(self):
         '''Freeze BatchNorm layers.'''
         for layer in self.modules():
             if isinstance(layer, nn.BatchNorm2d):
                 layer.eval()
-
-    def extract_feat(self, img):
-        """
-            Directly extract features from the backbone+neck
-        """
-        x = self.backbone(img)
-        x = self.neck(x[-5:])
-        return x
diff --git a/models/losses.py b/models/losses.py
index 99b9cfd..d780d46 100644
--- a/models/losses.py
+++ b/models/losses.py
@@ -43,6 +43,13 @@ def forward(self, classifications, regressions, anchors, annotations):
         anchor_ctr_x = anchor[:, 0] + 0.5 * anchor_widths
         anchor_ctr_y = anchor[:, 1] + 0.5 * anchor_heights
 
+        if classifications.dtype == torch.float32:
+            MAX_ONE = 0.9999
+            MIN_ZERO = 1e-4
+        else:
+            MAX_ONE = 0.999
+            MIN_ZERO = 1e-4
+        not_found = 0
         for j in range(batch_size):
 
             classification = classifications[j, :, :]
@@ -50,23 +57,19 @@ def forward(self, classifications, regressions, anchors, annotations):
 
             bbox_annotation = annotations[j, :, :]
             bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1]
-
             if bbox_annotation.shape[0] == 0:
                 regression_losses.append(torch.tensor(0).float().cuda())
                 classification_losses.append(torch.tensor(0).float().cuda())
 
                 continue
 
-            classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4)
+            classification = torch.clamp(classification, MIN_ZERO, MAX_ONE)
 
             # num_anchors x num_annotations
             IoU = calc_iou(anchors[0, :, :], bbox_annotation[:, :4])
 
             IoU_max, IoU_argmax = torch.max(IoU, dim=1)  # num_anchors x 1
 
-            #import pdb
-            # pdb.set_trace()
-
             # compute the loss for classification
             targets = torch.ones(classification.shape) * -1
             targets = targets.cuda()
@@ -76,6 +79,8 @@ def forward(self, classifications, regressions, anchors, annotations):
             positive_indices = torch.ge(IoU_max, 0.5)
 
             num_positive_anchors = positive_indices.sum()
+            if num_positive_anchors == 0:
+                not_found += 1
 
             assigned_annotations = bbox_annotation[IoU_argmax, :]
 
@@ -148,5 +153,6 @@ def forward(self, classifications, regressions, anchors, annotations):
                 regression_losses.append(regression_loss.mean())
             else:
                 regression_losses.append(torch.tensor(0).float().cuda())
-
+        if not_found == batch_size:
+            print("Not positive sample is found in the batch")
         return torch.stack(classification_losses).mean(dim=0, keepdim=True), torch.stack(regression_losses).mean(dim=0, keepdim=True)
diff --git a/models/retinahead.py b/models/retinahead.py
index 7fcbadf..896f1ab 100644
--- a/models/retinahead.py
+++ b/models/retinahead.py
@@ -1,6 +1,7 @@
 from functools import partial
 
 import numpy as np
+import torch
 import torch.nn as nn
 
 from .module import ConvModule, bias_init_with_prob, normal_init
diff --git a/models/utils.py b/models/utils.py
index 34c8649..69c620c 100644
--- a/models/utils.py
+++ b/models/utils.py
@@ -302,15 +302,27 @@ def get_model_params(model_name, override_params):
     return blocks_args, global_params
 
 
+# url_map = {
+#     'efficientnet-b0': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b0-355c32eb.pth',
+#     'efficientnet-b1': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b1-f1951068.pth',
+#     'efficientnet-b2': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b2-8bb594d6.pth',
+#     'efficientnet-b3': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b3-5fb5a3c3.pth',
+#     'efficientnet-b4': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b4-6ed6700e.pth',
+#     'efficientnet-b5': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b5-b6417697.pth',
+#     'efficientnet-b6': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b6-c76e70fd.pth',
+#     'efficientnet-b7': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b7-dcc49843.pth',
+# }
+
+
 url_map = {
-    'efficientnet-b0': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b0-355c32eb.pth',
-    'efficientnet-b1': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b1-f1951068.pth',
-    'efficientnet-b2': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b2-8bb594d6.pth',
-    'efficientnet-b3': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b3-5fb5a3c3.pth',
-    'efficientnet-b4': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b4-6ed6700e.pth',
-    'efficientnet-b5': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b5-b6417697.pth',
-    'efficientnet-b6': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b6-c76e70fd.pth',
-    'efficientnet-b7': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b7-dcc49843.pth',
+    'efficientnet-b0': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth',
+    'efficientnet-b1': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b1-f1951068.pth',
+    'efficientnet-b2': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b2-8bb594d6.pth',
+    'efficientnet-b3': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b3-5fb5a3c3.pth',
+    'efficientnet-b4': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b4-6ed6700e.pth',
+    'efficientnet-b5': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b5-b6417697.pth',
+    'efficientnet-b6': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b6-c76e70fd.pth',
+    'efficientnet-b7': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b7-dcc49843.pth',
 }
 
 
diff --git a/train.py b/train.py
index 8e90697..0cf502a 100644
--- a/train.py
+++ b/train.py
@@ -3,8 +3,10 @@
 import os
 import random
 import shutil
+from collections import OrderedDict
 import time
 import warnings
+import epdb
 import torch
 import torch.nn as nn
 import torch.nn.parallel
@@ -17,6 +19,8 @@
 import torchvision.transforms as transforms
 import torchvision.datasets as datasets
 
+import pytorch_warmup as warmup
+
 import os
 import sys
 import time
@@ -27,12 +31,29 @@
 import torch.backends.cudnn as cudnn
 from torch.utils.data import DataLoader
 
+try:
+    from apex.parallel import DistributedDataParallel as DDP
+    from apex.fp16_utils import *
+    from apex import amp, optimizers
+    from apex.multi_tensor_apply import multi_tensor_applier
+except ImportError:
+    print("Please install apex from https://www.github.com/nvidia/apex to run this example.")
+
 from models.efficientdet import EfficientDet
 from models.losses import FocalLoss
 from datasets import VOCDetection, CocoDataset, get_augumentation, detection_collate, Resizer, Normalizer, Augmenter, collater
 from utils import EFFICIENTDET, get_state_dict
 from eval import evaluate, evaluate_coco
 
+from loader import PrefetchLoader
+from torch.utils.tensorboard import SummaryWriter
+
+
+breakpoint = epdb.set_trace
+
+writer = SummaryWriter()
+
+
 parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
 parser.add_argument('--dataset', default='VOC', choices=['VOC', 'COCO'],
                     type=str, help='VOC or COCO')
@@ -67,7 +88,7 @@
                     help='Directory for saving checkpoint models')
 parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                     help='number of data loading workers (default: 4)')
-parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
+parser.add_argument('--start_epoch', default=-1, type=int, metavar='N',
                     help='manual epoch number (useful on restarts)')
 parser.add_argument('--world-size', default=1, type=int,
                     help='number of nodes for distributed training')
@@ -77,7 +98,7 @@
                     help='url used to set up distributed training')
 parser.add_argument('--dist-backend', default='nccl', type=str,
                     help='distributed backend')
-parser.add_argument('--seed', default=24, type=int,
+parser.add_argument('--seed', default=None, type=int,
                     help='seed for initializing training. ')
 parser.add_argument('--gpu', default=None, type=int,
                     help='GPU id to use.')
@@ -88,21 +109,35 @@
     'N processes per node, which has N GPUs. This is the '
     'fastest way to use PyTorch for either single node or '
     'multi node data parallel training')
+parser.add_argument('--eval_epochs', default=5, type=int,
+                    help='after how many training epochs will do evaluation (default 5).')
+parser.add_argument('--freeze_backbone', action='store_true',
+                    help='freeze EfficientNet-d{x} backbone')
+parser.add_argument('--freeze_bn', action='store_true',
+                    help='freeze all batch norm layers')
+parser.add_argument('--mixed_training', action='store_true',
+                    help='Use AMP mixed training optimization O1')
+parser.add_argument('--eval', action='store_true',
+                    help='Perform evaluation')
 
 iteration = 1
 
 
-def train(train_loader, model, scheduler, optimizer, epoch, args):
+def train(train_loader, model, scheduler, warmup_scheduler, optimizer, epoch, args):
     global iteration
     print("{} epoch: \t start training....".format(epoch))
     start = time.time()
     total_loss = []
     model.train()
-    model.module.is_training = True
-    model.module.freeze_bn()
+    # model.module.is_training = True
+    # model.module.freeze_bn()
     optimizer.zero_grad()
-    for idx, (images, annotations) in enumerate(train_loader):
-        images = images.cuda().float()
+
+    prefetcher = PrefetchLoader(train_loader)
+
+    for idx, (images, annotations) in tqdm(enumerate(prefetcher),
+                                           total=len(prefetcher)):
+        images = images.float().cuda()
         annotations = annotations.cuda()
         classification_loss, regression_loss = model([images, annotations])
         classification_loss = classification_loss.mean()
@@ -111,15 +146,24 @@ def train(train_loader, model, scheduler, optimizer, epoch, args):
         if bool(loss == 0):
             print('loss equal zero(0)')
             continue
-        loss.backward()
+
+        if args.mixed_training:
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward()
+        else:
+            loss.backward()
+
         if (idx + 1) % args.grad_accumulation_steps == 0:
             torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
             optimizer.step()
             optimizer.zero_grad()
+            # scheduler.step()
+            # if warmup_scheduler:
+            #     warmup_scheduler.dampen()
 
         total_loss.append(loss.item())
-        if(iteration % 300 == 0):
-            print('{} iteration: training ...'.format(iteration))
+        if (iteration % 100 == 0):
+            # print('{} iteration: training ...'.format(iteration))
             ans = {
                 'epoch': epoch,
                 'iteration': iteration,
@@ -129,8 +173,11 @@ def train(train_loader, model, scheduler, optimizer, epoch, args):
             }
             for key, value in ans.items():
                 print('    {:15s}: {}'.format(str(key), value))
+                if key != "epoch":
+                    writer.add_scalar(key, value, iteration)
         iteration += 1
-    scheduler.step(np.mean(total_loss))
+    scheduler.step(np.mean(total_loss))  # used for ReduceLROnPlateau
+
     result = {
         'time': time.time() - start,
         'loss': np.mean(total_loss)
@@ -139,16 +186,16 @@ def train(train_loader, model, scheduler, optimizer, epoch, args):
         print('    {:15s}: {}'.format(str(key), value))
 
 
-def test(dataset, model, epoch, args):
+def test(dataloader, model, epoch, args):
     print("{} epoch: \t start validation....".format(epoch))
-    model = model.module
+    # model = model.module
     model.eval()
-    model.is_training = False
+    # model.is_training = False
     with torch.no_grad():
         if(args.dataset == 'VOC'):
-            evaluate(dataset, model)
+            evaluate(dataloader, model)
         else:
-            evaluate_coco(dataset, model)
+            evaluate_coco(dataloader, model)
 
 
 def main_worker(gpu, ngpus_per_node, args):
@@ -171,29 +218,23 @@ def main_worker(gpu, ngpus_per_node, args):
             rank=args.rank)
 
     # Training dataset
+    data_augmenter = transforms.Compose([Normalizer(), Augmenter(), Resizer()])
+    data_augmenter = get_augumentation(phase="train")
+    inference_augmenter = transforms.Compose([Normalizer(), Resizer()])
     train_dataset = []
-    if(args.dataset == 'VOC'):
-        train_dataset = VOCDetection(root=args.dataset_root, transform=transforms.Compose(
-            [Normalizer(), Augmenter(), Resizer()]))
+    if (args.dataset == 'VOC'):
+        train_dataset = VOCDetection(root=args.dataset_root,
+                                     transform=data_augmenter)
         valid_dataset = VOCDetection(root=args.dataset_root, image_sets=[(
-            '2007', 'test')], transform=transforms.Compose([Normalizer(), Resizer()]))
+            '2007', 'test')], transform=inference_augmenter)
         args.num_class = train_dataset.num_classes()
-    elif(args.dataset == 'COCO'):
-        train_dataset = CocoDataset(
-            root_dir=args.dataset_root,
-            set_name='train2017',
-            transform=transforms.Compose(
-                [
-                    Normalizer(),
-                    Augmenter(),
-                    Resizer()]))
-        valid_dataset = CocoDataset(
-            root_dir=args.dataset_root,
-            set_name='val2017',
-            transform=transforms.Compose(
-                [
-                    Normalizer(),
-                    Resizer()]))
+    elif (args.dataset == 'COCO'):
+        train_dataset = CocoDataset(root_dir=args.dataset_root,
+                                    set_name='train2017',
+                                    transform=data_augmenter)
+        valid_dataset = CocoDataset(root_dir=args.dataset_root,
+                                    set_name='val2017',
+                                    transform=inference_augmenter)
         args.num_class = train_dataset.num_classes()
 
     train_loader = DataLoader(train_dataset,
@@ -203,7 +244,7 @@ def main_worker(gpu, ngpus_per_node, args):
                               collate_fn=collater,
                               pin_memory=True)
     valid_loader = DataLoader(valid_dataset,
-                              batch_size=1,
+                              batch_size=args.batch_size,
                               num_workers=args.workers,
                               shuffle=False,
                               collate_fn=collater,
@@ -222,8 +263,11 @@ def main_worker(gpu, ngpus_per_node, args):
         params = checkpoint['parser']
         args.num_class = params.num_class
         args.network = params.network
-        args.start_epoch = checkpoint['epoch'] + 1
+        if args.start_epoch == -1:
+            args.start_epoch = checkpoint['epoch'] + 1
         del params
+    if args.start_epoch == -1:
+        args.start_epoch = 0
 
     model = EfficientDet(num_classes=args.num_class,
                          network=args.network,
@@ -232,8 +276,25 @@ def main_worker(gpu, ngpus_per_node, args):
                          D_class=EFFICIENTDET[args.network]['D_class']
                          )
     if(args.resume is not None):
-        model.load_state_dict(checkpoint['state_dict'])
-    del checkpoint
+        tmp = OrderedDict()
+        for k, v in checkpoint['state_dict'].items():
+            # Strip only a leading "module." (DataParallel/DDP wrapper prefix), not inner matches.
+            tmp[k[len("module."):] if k.startswith("module.") else k] = v
+        model.load_state_dict(tmp)
+        del tmp
+
+    if args.freeze_backbone:
+        model.freeze_backbone()
+
+    if args.freeze_bn:
+        model.freeze_bn()
+
+    # define loss function (criterion) , optimizer, scheduler
+    optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()),
+                            lr=args.lr)
+    if args.resume is not None and "optimizer" in checkpoint:
+        optimizer.load_state_dict(checkpoint["optimizer"])
+
     if args.distributed:
         # For multiprocessing distributed, DistributedDataParallel constructor
         # should always set the single device scope, otherwise,
@@ -260,35 +321,65 @@ def main_worker(gpu, ngpus_per_node, args):
         torch.cuda.set_device(args.gpu)
         model = model.cuda(args.gpu)
     else:
-        model = model.cuda()
         print('Run with DataParallel ....')
-        model = torch.nn.DataParallel(model).cuda()
+        model = model.cuda()
+
+        # define loss function (criterion) , optimizer, scheduler
+        optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()),
+                            lr=args.lr)
+        if args.resume is not None and "optimizer" in checkpoint:
+            optimizer.load_state_dict(checkpoint["optimizer"])
+
+        if args.mixed_training:
+            model, optimizer = amp.initialize(model, optimizer,
+                                              opt_level="O1",
+                                              keep_batchnorm_fp32=None,
+                                              master_weights=None,
+                                              loss_scale=None)
+        model = torch.nn.DataParallel(model)
+
+    num_steps = len(train_loader) * args.num_epoch
 
-    # define loss function (criterion) , optimizer, scheduler
-    optimizer = optim.AdamW(model.parameters(), lr=args.lr)
     scheduler = optim.lr_scheduler.ReduceLROnPlateau(
         optimizer, patience=3, verbose=True)
+    # scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
+    #                                                 T_max=num_steps)
+    if args.resume is not None and "scheduler" in checkpoint:
+        scheduler.load_state_dict(checkpoint["scheduler"])
+
+    # warmup_scheduler = warmup.UntunedLinearWarmup(optimizer)
+    warmup_scheduler = None
+    # Restore warm-up state into the warm-up scheduler itself (never into the LR scheduler).
+    if warmup_scheduler is not None and args.resume is not None and "warmup_scheduler" in checkpoint:
+        warmup_scheduler.load_state_dict(checkpoint["warmup_scheduler"])
+    del checkpoint
+
     cudnn.benchmark = True
 
-    for epoch in range(args.start_epoch, args.num_epoch):
-        train(train_loader, model, scheduler, optimizer, epoch, args)
-
-        if (epoch + 1) % 5 == 0:
-            test(valid_dataset, model, epoch, args)
-
-        state = {
-            'epoch': epoch,
-            'parser': args,
-            'state_dict': get_state_dict(model)
-        }
-
-        torch.save(
-            state,
-            os.path.join(
-                args.save_folder,
-                args.dataset,
-                args.network,
-                "checkpoint_{}.pth".format(epoch)))
+    if args.eval:
+        test(valid_loader, model, epoch=0, args=args)
+    else:
+        for epoch in range(args.start_epoch, args.num_epoch):
+            train(train_loader, model, scheduler, warmup_scheduler,
+                  optimizer, epoch, args)
+
+            state = {
+                'epoch': epoch,
+                'parser': args,
+                'state_dict': get_state_dict(model),
+                'optimizer': optimizer.state_dict()
+            }
+
+            torch.save(
+                state,
+                os.path.join(
+                    args.save_folder,
+                    args.dataset,
+                    args.network,
+                    "checkpoint_{}.pth".format(epoch)))
+
+            if (epoch + 1) % args.eval_epochs == 0:
+                test(valid_loader, model, epoch, args)
 
 
 def main():