Commit 33f48a9

feat(data): support cache ram of COCO dataset (Megvii-BaseDetection#1562)

1 parent 11c2a1f commit 33f48a9

File tree: 7 files changed, +152 -79 lines changed

requirements.txt
Lines changed: 1 addition & 0 deletions

@@ -8,6 +8,7 @@ torchvision
 thop
 ninja
 tabulate
+psutil
 
 # verified versions
 # pycocotools corresponds to https://github.com/ppwwyyxx/cocoapi

setup.cfg
Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@ line_length = 100
 multi_line_output = 3
 balanced_wrapping = True
 known_standard_library = setuptools
-known_third_party = tqdm,loguru,tabulate
+known_third_party = tqdm,loguru,tabulate,psutil
 known_data_processing = cv2,numpy,scipy,PIL,matplotlib
 known_datasets = pycocotools
 known_deeplearning = torch,torchvision,caffe2,onnx,apex,timm,thop,torch2trt,tensorrt,openvino,onnxruntime
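Registering psutil under known_third_party keeps isort grouping it with the other third-party imports rather than treating it as unknown. Illustratively, these custom sections produce import blocks ordered like this (a sketch of the convention, not the output of any specific file):

import os                            # known_standard_library

import psutil                        # known_third_party (newly registered here)
from loguru import logger

import cv2                           # known_data_processing
import numpy as np

from pycocotools.coco import COCO    # known_datasets

import torch                         # known_deeplearning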

tools/train.py
Lines changed: 7 additions & 4 deletions

@@ -67,10 +67,10 @@ def make_parser():
     )
     parser.add_argument(
         "--cache",
-        dest="cache",
-        default=False,
-        action="store_true",
-        help="Caching imgs to RAM for fast training.",
+        type=str,
+        nargs="?",
+        const="ram",
+        help="Caching imgs to ram/disk for fast training.",
     )
     parser.add_argument(
         "-o",
@@ -130,6 +130,9 @@ def main(exp: Exp, args):
     num_gpu = get_num_devices() if args.devices is None else args.devices
     assert num_gpu <= get_num_devices()
 
+    if args.cache is not None:
+        exp.create_cache_dataset(args.cache)
+
     dist_url = "auto" if args.dist_url is None else args.dist_url
     launch(
         main,
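With nargs="?" and const="ram", a bare --cache flag selects RAM caching, an explicit value such as --cache disk selects disk caching, and omitting the flag leaves args.cache as None. A minimal standalone sketch of that parsing behavior (not part of the diff itself):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--cache", type=str, nargs="?", const="ram",
                    help="Caching imgs to ram/disk for fast training.")

print(parser.parse_args([]).cache)                   # None  -> no caching
print(parser.parse_args(["--cache"]).cache)          # "ram" -> bare flag falls back to const
print(parser.parse_args(["--cache", "disk"]).cache)  # "disk"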

yolox/core/trainer.py
Lines changed: 5 additions & 2 deletions

@@ -26,6 +26,7 @@
     gpu_mem_usage,
     is_parallel,
     load_ckpt,
+    mem_usage,
     occupy_mem,
     save_checkpoint,
     setup_logger,
@@ -250,10 +251,12 @@ def after_iter(self):
                 ["{}: {:.3f}s".format(k, v.avg) for k, v in time_meter.items()]
             )
 
+            mem_str = "gpu mem: {:.0f}Mb, mem: {:.1f}Gb".format(gpu_mem_usage(), mem_usage())
+
             logger.info(
-                "{}, mem: {:.0f}Mb, {}, {}, lr: {:.3e}".format(
+                "{}, {}, {}, {}, lr: {:.3e}".format(
                     progress_str,
-                    gpu_mem_usage(),
+                    mem_str,
                     time_str,
                     loss_str,
                     self.meter["lr"].latest,
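The new mem_str folds the GPU reading (MB) and the host reading (GB) into a single log fragment. With hypothetical readings of 8025 MB GPU memory and 41.23 GB used host memory, it renders as:

# Hypothetical values, just to show the rendered format:
mem_str = "gpu mem: {:.0f}Mb, mem: {:.1f}Gb".format(8025.0, 41.23)
print(mem_str)  # gpu mem: 8025Mb, mem: 41.2Gb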

yolox/data/datasets/coco.py
Lines changed: 84 additions & 58 deletions

@@ -1,9 +1,13 @@
 #!/usr/bin/env python3
 # -*- coding:utf-8 -*-
 # Copyright (c) Megvii, Inc. and its affiliates.
-
+import copy
 import os
+import random
+from multiprocessing.pool import ThreadPool
+import psutil
 from loguru import logger
+from tqdm import tqdm
 
 import cv2
 import numpy as np
@@ -45,6 +49,7 @@ def __init__(
         img_size=(416, 416),
         preproc=None,
         cache=False,
+        cache_type="ram",
     ):
         """
         COCO dataset initialization. Annotation data are read into memory by COCO API.
@@ -64,74 +69,95 @@ def __init__(
         self.coco = COCO(os.path.join(self.data_dir, "annotations", self.json_file))
         remove_useless_info(self.coco)
         self.ids = self.coco.getImgIds()
+        self.num_imgs = len(self.ids)
         self.class_ids = sorted(self.coco.getCatIds())
         self.cats = self.coco.loadCats(self.coco.getCatIds())
         self._classes = tuple([c["name"] for c in self.cats])
-        self.imgs = None
         self.name = name
         self.img_size = img_size
         self.preproc = preproc
         self.annotations = self._load_coco_annotations()
-        if cache:
+        self.imgs = None
+        self.cache = cache
+        self.cache_type = cache_type
+
+        if self.cache:
             self._cache_images()
 
-    def __len__(self):
-        return len(self.ids)
+    def _cache_images(self):
+        mem = psutil.virtual_memory()
+        mem_required = self.cal_cache_ram()
+        gb = 1 << 30
 
-    def __del__(self):
-        del self.imgs
+        if self.cache_type == "ram" and mem_required > mem.available:
+            self.cache = False
+        else:
+            logger.info(
+                f"{mem_required / gb:.1f}GB RAM required, "
+                f"{mem.available / gb:.1f}/{mem.total / gb:.1f}GB RAM available, "
+                f"Since the first thing we do is cache, "
+                f"there is no guarantee that the remaining memory space is sufficient"
+            )
 
-    def _load_coco_annotations(self):
-        return [self.load_anno_from_ids(_ids) for _ids in self.ids]
+        if self.cache and self.imgs is None:
+            if self.cache_type == 'ram':
+                self.imgs = [None] * self.num_imgs
+                logger.info("You are using cached images in RAM to accelerate training!")
+            else:   # 'disk'
+                self.cache_dir = os.path.join(
+                    self.data_dir,
+                    f"{self.name}_cache{self.img_size[0]}x{self.img_size[1]}"
+                )
+                if not os.path.exists(self.cache_dir):
+                    os.mkdir(self.cache_dir)
+                    logger.warning(
+                        f"\n*******************************************************************\n"
+                        f"You are using cached images in DISK to accelerate training.\n"
+                        f"This requires large DISK space.\n"
+                        f"Make sure you have {mem_required / gb:.1f} "
+                        f"available DISK space for training COCO.\n"
+                        f"*******************************************************************\\n"
+                    )
+                else:
+                    logger.info("Found disk cache!")
+                    return
 
-    def _cache_images(self):
-        logger.warning(
-            "\n********************************************************************************\n"
-            "You are using cached images in RAM to accelerate training.\n"
-            "This requires large system RAM.\n"
-            "Make sure you have 200G+ RAM and 136G available disk space for training COCO.\n"
-            "********************************************************************************\n"
-        )
-        max_h = self.img_size[0]
-        max_w = self.img_size[1]
-        cache_file = os.path.join(self.data_dir, f"img_resized_cache_{self.name}.array")
-        if not os.path.exists(cache_file):
             logger.info(
-                "Caching images for the first time. This might take about 20 minutes for COCO"
+                "Caching images for the first time. "
+                "This might take about 15 minutes for COCO"
             )
-            self.imgs = np.memmap(
-                cache_file,
-                shape=(len(self.ids), max_h, max_w, 3),
-                dtype=np.uint8,
-                mode="w+",
-            )
-            from tqdm import tqdm
-            from multiprocessing.pool import ThreadPool
 
-            NUM_THREADs = min(8, os.cpu_count())
-            loaded_images = ThreadPool(NUM_THREADs).imap(
-                lambda x: self.load_resized_img(x),
-                range(len(self.annotations)),
-            )
-            pbar = tqdm(enumerate(loaded_images), total=len(self.annotations))
-            for k, out in pbar:
-                self.imgs[k][: out.shape[0], : out.shape[1], :] = out.copy()
-            self.imgs.flush()
+            num_threads = min(8, max(1, os.cpu_count() - 1))
+            b = 0
+            load_imgs = ThreadPool(num_threads).imap(self.load_resized_img, range(self.num_imgs))
+            pbar = tqdm(enumerate(load_imgs), total=self.num_imgs)
+            for i, x in pbar:  # x = self.load_resized_img(self, i)
+                if self.cache_type == 'ram':
+                    self.imgs[i] = x
+                else:   # 'disk'
+                    cache_filename = f'{self.annotations[i]["filename"].split(".")[0]}.npy'
+                    np.save(os.path.join(self.cache_dir, cache_filename), x)
+                b += x.nbytes
+                pbar.desc = f'Caching images ({b / gb:.1f}/{mem_required / gb:.1f}GB {self.cache})'
            pbar.close()
-        else:
-            logger.warning(
-                "You are using cached imgs! Make sure your dataset is not changed!!\n"
-                "Everytime the self.input_size is changed in your exp file, you need to delete\n"
-                "the cached data and re-generate them.\n"
-            )
 
-        logger.info("Loading cached imgs...")
-        self.imgs = np.memmap(
-            cache_file,
-            shape=(len(self.ids), max_h, max_w, 3),
-            dtype=np.uint8,
-            mode="r+",
-        )
+    def cal_cache_ram(self):
+        cache_bytes = 0
+        num_samples = min(self.num_imgs, 32)
+        for _ in range(num_samples):
+            img = self.load_resized_img(random.randint(0, self.num_imgs - 1))
+            cache_bytes += img.nbytes
+        mem_required = cache_bytes * self.num_imgs / num_samples
+        return mem_required
+
+    def __len__(self):
+        return self.num_imgs
+
+    def __del__(self):
+        del self.imgs
+
+    def _load_coco_annotations(self):
+        return [self.load_anno_from_ids(_ids) for _ids in self.ids]
 
     def load_anno_from_ids(self, id_):
         im_ann = self.coco.loadImgs(id_)[0]
@@ -152,7 +178,6 @@ def load_anno_from_ids(self, id_):
         num_objs = len(objs)
 
         res = np.zeros((num_objs, 5))
-
         for ix, obj in enumerate(objs):
             cls = self.class_ids.index(obj["category_id"])
             res[ix, 0:4] = obj["clean_bbox"]
@@ -197,15 +222,16 @@ def load_image(self, index):
 
     def pull_item(self, index):
         id_ = self.ids[index]
+        label, origin_image_size, _, filename = self.annotations[index]
 
-        res, img_info, resized_info, _ = self.annotations[index]
-        if self.imgs is not None:
-            pad_img = self.imgs[index]
-            img = pad_img[: resized_info[0], : resized_info[1], :].copy()
+        if self.cache_type == 'ram':
+            img = self.imgs[index]
+        elif self.cache_type == 'disk':
+            img = np.load(os.path.join(self.cache_dir, f"{filename.split('.')[0]}.npy"))
         else:
             img = self.load_resized_img(index)
 
-        return img, res.copy(), img_info, np.array([id_])
+        return copy.deepcopy(img), copy.deepcopy(label), origin_image_size, np.array([id_])
 
     @Dataset.mosaic_getitem
     def __getitem__(self, index):
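The new cal_cache_ram() estimates the cache footprint by decoding up to 32 randomly chosen resized images, summing their nbytes, and scaling the sample mean to the full dataset: mem_required = (sum of sampled nbytes) * num_imgs / num_samples. As a back-of-envelope upper bound (hypothetical numbers, assuming every image filled the full 640x640x3 uint8 buffer):

bytes_per_img = 640 * 640 * 3                 # 1,228,800 bytes per fully padded uint8 image
num_imgs = 118287                             # COCO train2017
print(bytes_per_img * num_imgs / (1 << 30))   # ~135.4 (GB), in line with the old "136G" warning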

yolox/exp/yolox_base.py
Lines changed: 43 additions & 14 deletions

@@ -106,6 +106,23 @@ def __init__(self):
         self.test_conf = 0.01
         # nms threshold
         self.nmsthre = 0.65
+        self.cache_dataset = None
+        self.dataset = None
+
+    def create_cache_dataset(self, cache_type: str = "ram"):
+        from yolox.data import COCODataset, TrainTransform
+        self.cache_dataset = COCODataset(
+            data_dir=self.data_dir,
+            json_file=self.train_ann,
+            img_size=self.input_size,
+            preproc=TrainTransform(
+                max_labels=50,
+                flip_prob=self.flip_prob,
+                hsv_prob=self.hsv_prob
+            ),
+            cache=True,
+            cache_type=cache_type,
+        )
 
     def get_model(self):
         from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead
@@ -127,7 +144,16 @@ def init_yolo(M):
         self.model.train()
         return self.model
 
-    def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False):
+    def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img: str = None):
+        """
+        Get dataloader according to cache_img parameter.
+        Args:
+            no_aug (bool, optional): Whether to turn off mosaic data enhancement. Defaults to False.
+            cache_img (str, optional): cache_img is equivalent to cache_type. Defaults to None.
+                "ram" : Caching imgs to ram for fast training.
+                "disk": Caching imgs to disk for fast training.
+                None: Do not use cache, in this case cache_data is also None.
+        """
         from yolox.data import (
             COCODataset,
             TrainTransform,
@@ -140,18 +166,23 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False):
         from yolox.utils import wait_for_the_master
 
         with wait_for_the_master():
-            dataset = COCODataset(
-                data_dir=self.data_dir,
-                json_file=self.train_ann,
-                img_size=self.input_size,
-                preproc=TrainTransform(
-                    max_labels=50,
-                    flip_prob=self.flip_prob,
-                    hsv_prob=self.hsv_prob),
-                cache=cache_img,
-            )
+            if self.cache_dataset is None:
+                assert cache_img is None, "cache is True, but cache_dataset is None"
+                dataset = COCODataset(
+                    data_dir=self.data_dir,
+                    json_file=self.train_ann,
+                    img_size=self.input_size,
+                    preproc=TrainTransform(
+                        max_labels=50,
+                        flip_prob=self.flip_prob,
+                        hsv_prob=self.hsv_prob),
+                    cache=False,
+                    cache_type=cache_img,
+                )
+            else:
+                dataset = self.cache_dataset
 
-        dataset = MosaicDetection(
+        self.dataset = MosaicDetection(
             dataset,
             mosaic=not no_aug,
             img_size=self.input_size,
@@ -169,8 +200,6 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False):
             mixup_prob=self.mixup_prob,
         )
 
-        self.dataset = dataset
-
         if is_distributed:
             batch_size = batch_size // dist.get_world_size()
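Taken together with the tools/train.py change, the intended call order is: build the cached dataset once via create_cache_dataset, then let get_data_loader reuse it instead of constructing a fresh COCODataset. A minimal sketch of that flow (the exp file path and batch size are hypothetical):

from yolox.exp import get_exp

exp = get_exp(exp_file="exps/default/yolox_s.py")  # hypothetical exp file
exp.create_cache_dataset("ram")      # pre-builds exp.cache_dataset with cache=True
loader = exp.get_data_loader(
    batch_size=64, is_distributed=False, no_aug=False, cache_img="ram",
)                                    # takes the exp.cache_dataset branch above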

yolox/utils/metric.py
Lines changed: 11 additions & 0 deletions

@@ -5,6 +5,7 @@
 import os
 import time
 from collections import defaultdict, deque
+import psutil
 
 import numpy as np
 
@@ -16,6 +17,7 @@
     "get_total_and_free_memory_in_Mb",
     "occupy_mem",
     "gpu_mem_usage",
+    "mem_usage"
 ]
 
 
@@ -51,6 +53,15 @@ def gpu_mem_usage():
     return mem_usage_bytes / (1024 * 1024)
 
 
+def mem_usage():
+    """
+    Compute the memory usage for the current machine (GB).
+    """
+    gb = 1 << 30
+    mem = psutil.virtual_memory()
+    return mem.used / gb
+
+
 class AverageMeter:
     """Track a series of values and provide access to smoothed values over a
     window or the global series average.
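psutil.virtual_memory() returns a named tuple whose total, available, and used fields (all in bytes) drive both mem_usage() here and the cache-size check in coco.py. A quick standalone check of those calls:

import psutil

mem = psutil.virtual_memory()
gb = 1 << 30
print(f"used {mem.used / gb:.1f}GB of {mem.total / gb:.1f}GB, "
      f"{mem.available / gb:.1f}GB available")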
