chore(logger): log predictions during training to wandb tables (Megvii-BaseDetection#1181)

manangoel99 · web-flow · commit 48fd2a95837a · 2022-06-29T14:49:15.000+08:00
chore(logger): log predictions during training to wandb tables
diff --git a/README.md b/README.md
@@ -150,6 +150,19 @@ On the second machine, run
 python tools/train.py -n yolox-s -b 128 --dist-url tcp://123.123.123.123:12312 --num_machines 2 --machine_rank 1
 ```
 
+**Logging to Weights & Biases**
+
+To log metrics, predictions, and model checkpoints to [W&B](https://docs.wandb.ai/guides/integrations/other/yolox), pass the command line argument `--logger wandb` and use the prefix "wandb-" to specify arguments for initializing the wandb run.
+
+```shell
+python tools/train.py -n yolox-s -d 8 -b 64 --fp16 -o [--cache] --logger wandb wandb-project <project name>
+                         yolox-m
+                         yolox-l
+                         yolox-x
+```
+
+An example wandb dashboard is available [here](https://wandb.ai/manan-goel/yolox-nano/runs/3pzfeom0).
+
 **Others**  
 See more information with the following command:
 ```shell
diff --git a/docs/quick_run.md b/docs/quick_run.md
@@ -76,6 +76,19 @@ python tools/train.py -n yolox-s -d 8 -b 64 --fp16 -o [--cache] --logger wandb w
                          yolox-x
 ```
 
+More WandbLogger arguments include:
+
+```shell
+python tools/train.py .... --logger wandb wandb-project <project-name> \
+                wandb-name <run-name> \
+                wandb-id <run-id> \
+                wandb-save_dir <save-dir> \
+                wandb-num_eval_images <num-images> \
+                wandb-log_checkpoints <bool>
+```
+
+More information is available [here](https://docs.wandb.ai/guides/integrations/other/yolox).
+
 **Multi Machine Training**
 
 We also support multi-nodes training. Just add the following args:
diff --git a/tools/train.py b/tools/train.py
@@ -84,7 +84,8 @@ def make_parser():
         "-l",
         "--logger",
         type=str,
-        help="Logger to be used for metrics",
+        help="Logger to be used for metrics. \
+        Implemented loggers include `tensorboard` and `wandb`.",
         default="tensorboard"
     )
     parser.add_argument(
diff --git a/yolox/core/trainer.py b/yolox/core/trainer.py
@@ -180,11 +180,11 @@ def before_train(self):
             if self.args.logger == "tensorboard":
                 self.tblogger = SummaryWriter(os.path.join(self.file_name, "tensorboard"))
             elif self.args.logger == "wandb":
-                wandb_params = dict()
-                for k, v in zip(self.args.opts[0::2], self.args.opts[1::2]):
-                    if k.startswith("wandb-"):
-                        wandb_params.update({k[len("wandb-"):]: v})
-                self.wandb_logger = WandbLogger(config=vars(self.exp), **wandb_params)
+                self.wandb_logger = WandbLogger.initialize_wandb_logger(
+                    self.args,
+                    self.exp,
+                    self.evaluator.dataloader.dataset
+                )
             else:
                 raise ValueError("logger must be either 'tensorboard' or 'wandb'")
 
@@ -263,8 +263,11 @@ def after_iter(self):
 
             if self.rank == 0:
                 if self.args.logger == "wandb":
-                    self.wandb_logger.log_metrics({k: v.latest for k, v in loss_meter.items()})
-                    self.wandb_logger.log_metrics({"lr": self.meter["lr"].latest})
+                    metrics = {"train/" + k: v.latest for k, v in loss_meter.items()}
+                    metrics.update({
+                        "train/lr": self.meter["lr"].latest
+                    })
+                    self.wandb_logger.log_metrics(metrics, step=self.progress_in_iter)
 
             self.meter.clear_meters()
 
@@ -322,8 +325,8 @@ def evaluate_and_save_model(self):
                 evalmodel = evalmodel.module
 
         with adjust_status(evalmodel, training=False):
-            ap50_95, ap50, summary = self.exp.eval(
-                evalmodel, self.evaluator, self.is_distributed
+            (ap50_95, ap50, summary), predictions = self.exp.eval(
+                evalmodel, self.evaluator, self.is_distributed, return_outputs=True
             )
 
         update_best_ckpt = ap50_95 > self.best_ap
@@ -337,16 +340,17 @@ def evaluate_and_save_model(self):
                 self.wandb_logger.log_metrics({
                     "val/COCOAP50": ap50,
                     "val/COCOAP50_95": ap50_95,
-                    "epoch": self.epoch + 1,
+                    "train/epoch": self.epoch + 1,
                 })
+                self.wandb_logger.log_images(predictions)
             logger.info("\n" + summary)
         synchronize()
 
-        self.save_ckpt("last_epoch", update_best_ckpt)
+        self.save_ckpt("last_epoch", update_best_ckpt, ap=ap50_95)
         if self.save_history_ckpt:
-            self.save_ckpt(f"epoch_{self.epoch + 1}")
+            self.save_ckpt(f"epoch_{self.epoch + 1}", ap=ap50_95)
 
-    def save_ckpt(self, ckpt_name, update_best_ckpt=False):
+    def save_ckpt(self, ckpt_name, update_best_ckpt=False, ap=None):
         if self.rank == 0:
             save_model = self.ema_model.ema if self.use_model_ema else self.model
             logger.info("Save weights to {}".format(self.file_name))
@@ -355,6 +359,7 @@ def save_ckpt(self, ckpt_name, update_best_ckpt=False):
                 "model": save_model.state_dict(),
                 "optimizer": self.optimizer.state_dict(),
                 "best_ap": self.best_ap,
+                "curr_ap": ap,
             }
             save_checkpoint(
                 ckpt_state,
@@ -364,4 +369,14 @@ def save_ckpt(self, ckpt_name, update_best_ckpt=False):
             )
 
             if self.args.logger == "wandb":
-                self.wandb_logger.save_checkpoint(self.file_name, ckpt_name, update_best_ckpt)
+                self.wandb_logger.save_checkpoint(
+                    self.file_name,
+                    ckpt_name,
+                    update_best_ckpt,
+                    metadata={
+                        "epoch": self.epoch + 1,
+                        "optimizer": self.optimizer.state_dict(),
+                        "best_ap": self.best_ap,
+                        "curr_ap": ap
+                    }
+                )
diff --git a/yolox/data/datasets/coco.py b/yolox/data/datasets/coco.py
@@ -65,8 +65,8 @@ def __init__(
         remove_useless_info(self.coco)
         self.ids = self.coco.getImgIds()
         self.class_ids = sorted(self.coco.getCatIds())
-        cats = self.coco.loadCats(self.coco.getCatIds())
-        self._classes = tuple([c["name"] for c in cats])
+        self.cats = self.coco.loadCats(self.coco.getCatIds())
+        self._classes = tuple([c["name"] for c in self.cats])
         self.imgs = None
         self.name = name
         self.img_size = img_size
diff --git a/yolox/evaluators/coco_evaluator.py b/yolox/evaluators/coco_evaluator.py
@@ -8,6 +8,7 @@
 import json
 import tempfile
 import time
+from collections import ChainMap, defaultdict
 from loguru import logger
 from tabulate import tabulate
 from tqdm import tqdm
@@ -120,6 +121,7 @@ def evaluate(
         trt_file=None,
         decoder=None,
         test_size=None,
+        return_outputs=False
     ):
         """
         COCO average precision (AP) Evaluation. Iterate inference on the test dataset
@@ -142,6 +144,7 @@ def evaluate(
             model = model.half()
         ids = []
         data_list = []
+        output_data = defaultdict()
         progress_bar = tqdm if is_main_process() else iter
 
         inference_time = 0
@@ -184,20 +187,29 @@ def evaluate(
                     nms_end = time_synchronized()
                     nms_time += nms_end - infer_end
 
-            data_list.extend(self.convert_to_coco_format(outputs, info_imgs, ids))
+            data_list_elem, image_wise_data = self.convert_to_coco_format(
+                outputs, info_imgs, ids, return_outputs=True)
+            data_list.extend(data_list_elem)
+            output_data.update(image_wise_data)
 
         statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples])
         if distributed:
             data_list = gather(data_list, dst=0)
+            output_data = gather(output_data, dst=0)
             data_list = list(itertools.chain(*data_list))
+            output_data = dict(ChainMap(*output_data))
             torch.distributed.reduce(statistics, dst=0)
 
         eval_results = self.evaluate_prediction(data_list, statistics)
         synchronize()
+
+        if return_outputs:
+            return eval_results, output_data
         return eval_results
 
-    def convert_to_coco_format(self, outputs, info_imgs, ids):
+    def convert_to_coco_format(self, outputs, info_imgs, ids, return_outputs=False):
         data_list = []
+        image_wise_data = defaultdict(dict)
         for (output, img_h, img_w, img_id) in zip(
             outputs, info_imgs[0], info_imgs[1], ids
         ):
@@ -212,10 +224,22 @@ def convert_to_coco_format(self, outputs, info_imgs, ids):
                 self.img_size[0] / float(img_h), self.img_size[1] / float(img_w)
             )
             bboxes /= scale
-            bboxes = xyxy2xywh(bboxes)
-
             cls = output[:, 6]
             scores = output[:, 4] * output[:, 5]
+
+            image_wise_data.update({
+                int(img_id): {
+                    "bboxes": [box.numpy().tolist() for box in bboxes],
+                    "scores": [score.numpy().item() for score in scores],
+                    "categories": [
+                        self.dataloader.dataset.class_ids[int(cls[ind])]
+                        for ind in range(bboxes.shape[0])
+                    ],
+                }
+            })
+
+            bboxes = xyxy2xywh(bboxes)
+
             for ind in range(bboxes.shape[0]):
                 label = self.dataloader.dataset.class_ids[int(cls[ind])]
                 pred_data = {
@@ -226,6 +250,9 @@ def convert_to_coco_format(self, outputs, info_imgs, ids):
                     "segmentation": [],
                 }  # COCO json format
                 data_list.append(pred_data)
+
+        if return_outputs:
+            return data_list, image_wise_data
         return data_list
 
     def evaluate_prediction(self, data_dict, statistics):
diff --git a/yolox/exp/yolox_base.py b/yolox/exp/yolox_base.py
@@ -318,5 +318,5 @@ def get_trainer(self, args):
         # NOTE: trainer shouldn't be an attribute of exp object
         return trainer
 
-    def eval(self, model, evaluator, is_distributed, half=False):
-        return evaluator.evaluate(model, is_distributed, half)
+    def eval(self, model, evaluator, is_distributed, half=False, return_outputs=False):
+        return evaluator.evaluate(model, is_distributed, half, return_outputs=return_outputs)
diff --git a/yolox/utils/logger.py b/yolox/utils/logger.py