
Commit 911b091

ppwwyyxx authored and facebook-github-bot committed
Add TracingAdapter module to flatten module outputs automatically
Summary: This adapter allows tracing models with rich input/output formats, as long as the formats are recognized (dict/list/tuple, and d2 builtin structures). It simplifies code for tracing export, tracing evaluation, and flop counting.

Reviewed By: theschnitz

Differential Revision: D26298375

fbshipit-source-id: d0b20c26c13f69c80752caa921efc6011c2651bc
1 parent 42a2473 commit 911b091
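In short, the adapter sits between `torch.jit.trace` and a model with structured inputs/outputs. A minimal sketch of the idea, using a hypothetical toy model with a dict output (the model and names below are illustrative, not part of this commit):

import torch
from torch import nn
from detectron2.export import TracingAdapter

class ToyModel(nn.Module):
    # Hypothetical model: returns a dict, which plain tracing does not handle well.
    def forward(self, x):
        return {"double": x * 2, "square": x * x}

model = ToyModel().eval()
x = torch.rand(4)
adapter = TracingAdapter(model, x)  # flattens I/O and records the schemas
traced = torch.jit.trace(adapter, adapter.flattened_inputs)
out = adapter.outputs_schema(traced(x))  # rebuild the dict from the traced tuple
assert torch.allclose(out["double"], x * 2)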

9 files changed: +206 −123 lines changed

detectron2/evaluation/evaluator.py

Lines changed: 11 additions & 5 deletions
@@ -3,8 +3,9 @@
 import logging
 import time
 from collections import OrderedDict
-from contextlib import contextmanager
+from contextlib import ExitStack, contextmanager
 import torch
+from torch import nn

 from detectron2.utils.comm import get_world_size, is_main_process
 from detectron2.utils.logger import log_every_n_seconds

@@ -101,13 +102,14 @@ def evaluate(self):
 def inference_on_dataset(model, data_loader, evaluator):
     """
     Run model on the data_loader and evaluate the metrics with evaluator.
-    Also benchmark the inference speed of `model.forward` accurately.
+    Also benchmark the inference speed of `model.__call__` accurately.
     The model will be used in eval mode.

     Args:
-        model (nn.Module): a module which accepts an object from
-            `data_loader` and returns some outputs. It will be temporarily set to `eval` mode.
+        model (callable): a callable which takes an object from
+            `data_loader` and returns some outputs.
+
+            If it's an nn.Module, it will be temporarily set to `eval` mode.
             If you wish to evaluate a model in `training` mode instead, you can
             wrap the given model and override its behavior of `.eval()` and `.train()`.
         data_loader: an iterable object with a length.

@@ -131,7 +133,11 @@ def inference_on_dataset(model, data_loader, evaluator):
     num_warmup = min(5, total - 1)
     start_time = time.perf_counter()
     total_compute_time = 0
-    with inference_context(model), torch.no_grad():
+    with ExitStack() as stack:
+        if isinstance(model, nn.Module):
+            stack.enter_context(inference_context(model))
+        stack.enter_context(torch.no_grad())
+
         for idx, inputs in enumerate(data_loader):
             if idx == num_warmup:
                 start_time = time.perf_counter()
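With this change, `inference_on_dataset` no longer needs an `nn.Module`; any callable with the same input/output contract works, e.g. a wrapper around an exported model. A standalone sketch of the conditional-context pattern used above, with illustrative names (not part of the commit):

import torch
from contextlib import ExitStack
from torch import nn

def call_in_eval_mode(model, inputs):
    # Enter eval mode only when we actually have an nn.Module;
    # plain callables skip that step but still run under no_grad.
    with ExitStack() as stack:
        if isinstance(model, nn.Module):
            was_training = model.training
            model.eval()
            stack.callback(model.train, was_training)  # restore the previous mode on exit
        stack.enter_context(torch.no_grad())
        return model(inputs)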

detectron2/export/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-

 from .api import *
+from .flatten import TracingAdapter

 __all__ = [k for k in globals().keys() if not k.startswith("_")]

detectron2/export/flatten.py

Lines changed: 98 additions & 1 deletion
@@ -1,7 +1,8 @@
 import collections
 from dataclasses import dataclass
-from typing import List
+from typing import Callable, List, Optional, Tuple
 import torch
+from torch import nn

 from detectron2.structures import Boxes, Instances

@@ -171,3 +172,99 @@ def flatten_to_tuple(obj):
         F = IdentitySchema

     return F.flatten(obj)
+
+
+class TracingAdapter(nn.Module):
+    """
+    A model may take rich input/output formats (e.g. dict or custom classes).
+    This adapter flattens the input/output format of a model so it becomes traceable.
+
+    It also records the necessary schema to rebuild the model's inputs/outputs from flattened
+    inputs/outputs.
+
+    Example:
+    ::
+        outputs = model(inputs)  # inputs/outputs may be rich structure
+        adapter = TracingAdapter(model, inputs)
+
+        # can now trace the model, with adapter.flattened_inputs, or another
+        # tuple of tensors with the same length and meaning
+        traced = torch.jit.trace(adapter, adapter.flattened_inputs)
+
+        # traced model can only produce flattened outputs (tuple of tensors)
+        flattened_outputs = traced(*adapter.flattened_inputs)
+        # adapter knows the schema to convert it back (new_outputs == outputs)
+        new_outputs = adapter.outputs_schema(flattened_outputs)
+    """
+
+    flattened_inputs: Tuple[torch.Tensor] = None
+    """
+    Flattened version of inputs given to this class's constructor.
+    """
+
+    inputs_schema: Schema = None
+    """
+    Schema of the inputs given to this class's constructor.
+    """
+
+    outputs_schema: Schema = None
+    """
+    Schema of the output produced by calling the given model with inputs.
+    """
+
+    def __init__(self, model: nn.Module, inputs, inference_func: Optional[Callable] = None):
+        """
+        Args:
+            model: an nn.Module
+            inputs: An input argument or a tuple of input arguments used to call model.
+                After flattening, it has to only consist of tensors.
+            inference_func: a callable that takes (model, *inputs), calls the
+                model with inputs, and returns outputs. By default it
+                is ``lambda model, *inputs: model(*inputs)``. Can be overridden
+                if you need to call the model differently.
+        """
+        super().__init__()
+        if isinstance(model, (nn.parallel.distributed.DistributedDataParallel, nn.DataParallel)):
+            model = model.module
+        self.model = model
+        if not isinstance(inputs, tuple):
+            inputs = (inputs,)
+        self.inputs = inputs
+
+        if inference_func is None:
+            inference_func = lambda model, *inputs: model(*inputs)  # noqa
+        self.inference_func = inference_func
+
+        self.flattened_inputs, self.inputs_schema = flatten_to_tuple(inputs)
+        for input in self.flattened_inputs:
+            if not isinstance(input, torch.Tensor):
+                raise ValueError(
+                    f"Inputs for tracing must only contain tensors. Got a {type(input)} instead."
+                )
+
+    def forward(self, *args: torch.Tensor):
+        with torch.no_grad():
+            inputs_orig_format = self.inputs_schema(args)
+            outputs = self.inference_func(self.model, *inputs_orig_format)
+            flattened_outputs, schema = flatten_to_tuple(outputs)
+            if self.outputs_schema is None:
+                self.outputs_schema = schema
+            else:
+                assert (
+                    self.outputs_schema == schema
+                ), "Model should always return outputs with the same structure so it can be traced!"
+            return flattened_outputs
+
+    def _create_wrapper(self, traced_model):
+        """
+        Return a function that has the same input/output interface as the
+        original model, but calls the given traced model under the hood.
+        """
+
+        def forward(*args):
+            flattened_inputs, _ = flatten_to_tuple(args)
+            flattened_outputs = traced_model(*flattened_inputs)
+            return self.outputs_schema(flattened_outputs)
+
+        return forward
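For a detectron2 model whose `forward` expects `list[dict]`, the calling convention can be wrapped in an `inference_func`, as the updated test in this commit does. A condensed sketch of that pattern (the test traces with a real COCO sample; the random tensor here is only a placeholder):

import torch
from detectron2 import model_zoo
from detectron2.export import TracingAdapter
from detectron2.export.torchscript_patch import patch_builtin_len

def inference_func(model, image):
    # Adapt the raw tensor interface to detectron2's list[dict] convention.
    inputs = [{"image": image}]
    return model.inference(inputs, do_postprocess=False)[0]

model = model_zoo.get("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml", trained=True)
image = torch.rand(3, 800, 800)  # placeholder sample input
adapter = TracingAdapter(model, image, inference_func)
adapter.eval()
with torch.no_grad(), patch_builtin_len():
    traced = torch.jit.trace(adapter, adapter.flattened_inputs)
outputs = adapter.outputs_schema(traced(image))  # an Instances object again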

detectron2/utils/analysis.py

Lines changed: 16 additions & 55 deletions
@@ -1,15 +1,11 @@
 # Copyright (c) Facebook, Inc. and its affiliates.
 # -*- coding: utf-8 -*-

-import logging
 import typing
-import torch
 from fvcore.nn import activation_count, flop_count, parameter_count, parameter_count_table
 from torch import nn

-from detectron2.structures import BitMasks, Boxes, ImageList, Instances
-
-from .logger import log_first_n
+from detectron2.export import TracingAdapter

 __all__ = [
     "activation_count_operators",

@@ -64,11 +60,13 @@ def flop_count_operators(
        the flops of box & mask head depend on the number of proposals &
        the number of detected objects.
        Therefore, the flops counting using a single input may not accurately
-       reflect the computation cost of a model.
+       reflect the computation cost of a model. It's recommended to average
+       across a number of inputs.

     Args:
         model: a detectron2 model that takes `list[dict]` as input.
         inputs (list[dict]): inputs to model, in detectron2's standard format.
+            Only "image" key will be used.
     """
     return _wrapper_count_operators(model=model, inputs=inputs, mode=FLOPS_MODE, **kwargs)

@@ -90,71 +88,34 @@ def activation_count_operators(
     Args:
         model: a detectron2 model that takes `list[dict]` as input.
         inputs (list[dict]): inputs to model, in detectron2's standard format.
+            Only "image" key will be used.
     """
     return _wrapper_count_operators(model=model, inputs=inputs, mode=ACTIVATIONS_MODE, **kwargs)


-def _flatten_to_tuple(outputs):
-    result = []
-    if isinstance(outputs, torch.Tensor):
-        result.append(outputs)
-    elif isinstance(outputs, (list, tuple)):
-        for v in outputs:
-            result.extend(_flatten_to_tuple(v))
-    elif isinstance(outputs, dict):
-        for _, v in outputs.items():
-            result.extend(_flatten_to_tuple(v))
-    elif isinstance(outputs, Instances):
-        result.extend(_flatten_to_tuple(outputs.get_fields()))
-    elif isinstance(outputs, (Boxes, BitMasks, ImageList)):
-        result.append(outputs.tensor)
-    else:
-        log_first_n(
-            logging.WARN,
-            f"Output of type {type(outputs)} not included in flops/activations count.",
-            n=10,
-        )
-    return tuple(result)
-
-
 def _wrapper_count_operators(
     model: nn.Module, inputs: list, mode: str, **kwargs
 ) -> typing.DefaultDict[str, float]:
-
     # ignore some ops
     supported_ops = {k: lambda *args, **kwargs: {} for k in _IGNORED_OPS}
     supported_ops.update(kwargs.pop("supported_ops", {}))
     kwargs["supported_ops"] = supported_ops

     assert len(inputs) == 1, "Please use batch size=1"
     tensor_input = inputs[0]["image"]
-
-    class WrapModel(nn.Module):
-        def __init__(self, model):
-            super().__init__()
-            if isinstance(
-                model, (nn.parallel.distributed.DistributedDataParallel, nn.DataParallel)
-            ):
-                self.model = model.module
-            else:
-                self.model = model
-
-        def forward(self, image):
-            # jit requires the input/output to be Tensors
-            inputs = [{"image": image}]
-            outputs = self.model.forward(inputs)
-            # Only the subgraph that computes the returned tuple of tensor will be
-            # counted. So we flatten everything we found to tuple of tensors.
-            return _flatten_to_tuple(outputs)
+    inputs = [{"image": tensor_input}]  # remove other keys, in case there are any

     old_train = model.training
-    with torch.no_grad():
-        if mode == FLOPS_MODE:
-            ret = flop_count(WrapModel(model).train(False), (tensor_input,), **kwargs)
-        elif mode == ACTIVATIONS_MODE:
-            ret = activation_count(WrapModel(model).train(False), (tensor_input,), **kwargs)
-        else:
-            raise NotImplementedError("Count for mode {} is not supported yet.".format(mode))
+    if isinstance(model, (nn.parallel.distributed.DistributedDataParallel, nn.DataParallel)):
+        model = model.module
+    wrapper = TracingAdapter(model, inputs)
+    wrapper.eval()
+    if mode == FLOPS_MODE:
+        ret = flop_count(wrapper, (tensor_input,), **kwargs)
+    elif mode == ACTIVATIONS_MODE:
+        ret = activation_count(wrapper, (tensor_input,), **kwargs)
+    else:
+        raise NotImplementedError("Count for mode {} is not supported yet.".format(mode))
     # compatible with change in fvcore
     if isinstance(ret, tuple):
         ret = ret[0]
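From the caller's side the counting API is unchanged; keys other than "image" in the inputs are now simply ignored. A sketch mirroring the updated test in tests/test_model_analysis.py (the RetinaNet config path is an assumption, as a stand-in for any built detectron2 model):

import torch
from detectron2 import model_zoo
from detectron2.utils.analysis import flop_count_operators

# Build any detectron2 model; RetinaNet is what the test uses.
model = model_zoo.get("COCO-Detection/retinanet_R_50_FPN_1x.yaml", trained=False)
# Batch size must be 1; extra keys besides "image" are dropped by the wrapper.
inputs = [{"image": torch.rand(3, 800, 800), "test_unused": "abcd"}]
counts = flop_count_operators(model, inputs)  # typing.DefaultDict[str, float] per operator
print(counts["conv"])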

tests/test_export_torchscript.py

Lines changed: 7 additions & 16 deletions
@@ -8,7 +8,7 @@

 from detectron2 import model_zoo
 from detectron2.config import get_cfg
-from detectron2.export.flatten import flatten_to_tuple
+from detectron2.export.flatten import TracingAdapter, flatten_to_tuple
 from detectron2.export.torchscript import dump_torchscript_IR, export_torchscript_with_instances
 from detectron2.export.torchscript_patch import patch_builtin_len
 from detectron2.layers import ShapeSpec

@@ -86,8 +86,7 @@ class TestTracing(unittest.TestCase):
     def testMaskRCNN(self):
         def inference_func(model, image):
             inputs = [{"image": image}]
-            outputs = model.inference(inputs, do_postprocess=False)[0]
-            return outputs
+            return model.inference(inputs, do_postprocess=False)[0]

         self._test_model("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml", inference_func)

@@ -102,26 +101,15 @@ def _test_model(self, config_path, inference_func):
         model = model_zoo.get(config_path, trained=True)
         image = get_sample_coco_image()

-        class Wrapper(nn.ModuleList):  # a wrapper to make the model traceable
-            def forward(self, image):
-                outputs = inference_func(self[0], image)
-                flattened_outputs, schema = flatten_to_tuple(outputs)
-                if not hasattr(self, "schema"):
-                    self.schema = schema
-                return flattened_outputs
-
-            def rebuild(self, flattened_outputs):
-                return self.schema(flattened_outputs)
-
-        wrapper = Wrapper([model])
+        wrapper = TracingAdapter(model, image, inference_func)
         wrapper.eval()
         with torch.no_grad(), patch_builtin_len():
             small_image = nn.functional.interpolate(image, scale_factor=0.5)
             # trace with a different image, and the trace must still work
             traced_model = torch.jit.trace(wrapper, (small_image,))

         output = inference_func(model, image)
-        traced_output = wrapper.rebuild(traced_model(image))
+        traced_output = wrapper.outputs_schema(traced_model(image))
         assert_instances_allclose(output, traced_output, size_as_tensor=True)

     def testKeypointHead(self):

@@ -191,6 +179,9 @@ def test_flatten_basic(self):
         new_obj = schema(res)
         self.assertEqual(new_obj, obj)

+        _, new_schema = flatten_to_tuple(new_obj)
+        self.assertEqual(schema, new_schema)  # test __eq__
+
     def test_flatten_instances_boxes(self):
         inst = Instances(
             torch.tensor([5, 8]), pred_masks=torch.tensor([3]), pred_boxes=Boxes(torch.ones((1, 4)))
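The schema round-trip that `test_flatten_basic` exercises looks like this in isolation (the nested object below is made up for illustration):

import torch
from detectron2.export.flatten import flatten_to_tuple

# A made-up nested structure with tensor leaves:
obj = {"scores": torch.tensor([0.9]), "extras": [torch.zeros(2), (torch.ones(1),)]}
flat, schema = flatten_to_tuple(obj)  # flat: a plain tuple of the leaf tensors
rebuilt = schema(flat)                # the schema is callable and rebuilds the structure
assert torch.equal(rebuilt["scores"], obj["scores"])
# Flattening the rebuilt object yields an equal schema, as the new assertion checks:
_, schema2 = flatten_to_tuple(rebuilt)
assert schema == schema2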

tests/test_model_analysis.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ def setUp(self):

     def test_flop(self):
         # RetinaNet supports flop-counting with random inputs
-        inputs = [{"image": torch.rand(3, 800, 800)}]
+        inputs = [{"image": torch.rand(3, 800, 800), "test_unused": "abcd"}]
         res = flop_count_operators(self.model, inputs)
         self.assertTrue(int(res["conv"]), 146)  # 146B flops

tools/deploy/README.md

Lines changed: 6 additions & 6 deletions
@@ -65,12 +65,12 @@ We show a few example commands to export and execute a Mask R-CNN model in C++.
 ## Notes:

 1. Tracing/Caffe2-tracing requires valid weights & sample inputs.
-   Therefore the above commands require [setting up the COCO dataset](https://detectron2.readthedocs.io/tutorials/builtin_datasets.html).
+   Therefore the above commands require pre-trained models and the [COCO dataset](https://detectron2.readthedocs.io/tutorials/builtin_datasets.html).
    You can modify the script to obtain sample inputs in other ways instead of from COCO.

-2. The `--run-eval` flag is supported with caffe2 format.
-   This flag will evaluate the converted models to verify their accuracy.
-   The accuracy is typically slightly different (within 0.1 AP) from the original model due to
-   numerical precision between different implementations.
+2. The `--run-eval` flag can be used under certain modes
+   (caffe2_tracing with caffe2 format, or tracing with torchscript format)
+   to evaluate the exported model using the dataset in the config.
    It's recommended to always verify the accuracy in case the conversion is not successful.
-   Evaluation can be slow if the model is exported to CPU.
+   Evaluation can be slow if the model is exported to CPU or the dataset is too large ("coco_2017_val_100" is a small subset of COCO useful for evaluation).
+   Caffe2 accuracy may be slightly different (within 0.1 AP) from the original model due to numerical precision between different runtimes.
