don't use a separate stream to gather losses

ppwwyyxx · facebook-github-bot · commit 097efabd18b8 · 2021-02-04T20:58:57.000-08:00
Summary: Tasks in the new stream can start even before the loss is computed. Doing this correctly requires creating a CUDA event before backward and waiting for that event in the new stream. The perf difference is tiny, so it's probably not worthwhile; just remove the stream entirely. Some refs: pytorch/pytorch#23729 Reviewed By: theschnitz Differential Revision: D26238335 fbshipit-source-id: 614a5b173861b0c0a2bd1240855f12c19d58b76e
diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md
@@ -1,6 +1,6 @@
 ---
 name: "\U0001F4DA Documentation Issues"
-about: Issues or enhancements about docs and comments
+about: Docs and comments are missing, incorrect, or not clear enough
 labels: documentation
 
 ---
diff --git a/.github/workflows/check-template.yml b/.github/workflows/check-template.yml
@@ -29,9 +29,9 @@ jobs:
               core.debug("Issue " + issue.data.title + " was skipped.");
               return;
             }
-            const body = issue.data.body;
-            const hasInstructions = body.toLowerCase().indexOf("reproduce") != -1;
-            const hasEnvironment = body.indexOf("sys.platform") != -1;
+            const body = issue.data.body.toLowerCase();
+            const hasInstructions = body.indexOf("reproduce") != -1;
+            const hasEnvironment = (body.indexOf("sys.platform") != -1) || (body.indexOf("colab") != -1);
             if (hasInstructions && hasEnvironment) {
               core.debug("Issue " + issue.data.title + " follows template.");
               return;
diff --git a/detectron2/engine/train_loop.py b/detectron2/engine/train_loop.py
@@ -255,17 +255,13 @@ def _write_metrics(
             loss_dict (dict): dict of scalar losses
             data_time (float): time taken by the dataloader iteration
         """
-        device = next(iter(loss_dict.values())).device
+        metrics_dict = {k: v.detach().cpu().item() for k, v in loss_dict.items()}
+        metrics_dict["data_time"] = data_time
 
-        # Use a new stream so these ops don't wait for DDP or backward
-        with torch.cuda.stream(torch.cuda.Stream() if device.type == "cuda" else None):
-            metrics_dict = {k: v.detach().cpu().item() for k, v in loss_dict.items()}
-            metrics_dict["data_time"] = data_time
-
-            # Gather metrics among all workers for logging
-            # This assumes we do DDP-style training, which is currently the only
-            # supported method in detectron2.
-            all_metrics_dict = comm.gather(metrics_dict)
+        # Gather metrics among all workers for logging
+        # This assumes we do DDP-style training, which is currently the only
+        # supported method in detectron2.
+        all_metrics_dict = comm.gather(metrics_dict)
 
         if comm.is_main_process():
             storage = get_event_storage()
diff --git a/detectron2/modeling/roi_heads/roi_heads.py b/detectron2/modeling/roi_heads/roi_heads.py
@@ -345,6 +345,7 @@ class Res5ROIHeads(ROIHeads):
     The ROIHeads in a typical "C4" R-CNN model, where
     the box and mask head share the cropping and
     the per-region feature computation by a Res5 block.
+    See :paper:`ResNet` Appendix A.
     """
 
     @configurable
diff --git a/detectron2/utils/collect_env.py b/detectron2/utils/collect_env.py
@@ -187,5 +187,8 @@ def collect_env_info():
             try:
                 x = torch.tensor([1, 2.0], dtype=torch.float32)
                 x = x.to(device)
-            except Exception:
-                print(f"Unable to copy tensor to device={device}")
+            except Exception as e:
+                print(
+                    f"Unable to copy tensor to device={device}: {e}. "
+                    "Your CUDA environment is broken."
+                )
diff --git a/detectron2/utils/registry.py b/detectron2/utils/registry.py
@@ -33,10 +33,10 @@ def locate(name: str) -> Any:
     # Should use _locate directly if it's public.
     if obj is None:
         try:
-            from hydra._internal.utils import _locate
+            from hydra.utils import get_method
         except ImportError as e:
             raise ImportError(f"Cannot dynamically locate object {name}!") from e
         else:
-            obj = _locate(name)  # it raises if fails
+            obj = get_method(name)  # it raises if fails
 
     return obj
diff --git a/docker/README.md b/docker/README.md
@@ -25,7 +25,7 @@ cd docker && USER_ID=$UID docker-compose run detectron2
 After building the base detectron2 container as above, do:
 ```
 # Build:
-docker build -t detectron2-deploy:v0 -f deploy.Dockerfile
+docker build -t detectron2-deploy:v0 -f deploy.Dockerfile .
 # Launch:
 docker run --gpus all -it detectron2-deploy:v0
 ```
diff --git a/docs/modules/fvcore.rst b/docs/modules/fvcore.rst
@@ -5,15 +5,15 @@ Detectron2 depends on utilities in
 `fvcore <https://github.com/facebookresearch/fvcore/>`_.
 We include part of fvcore documentation here for easier reference.
 
-fvcore.nn 
+fvcore.nn
 -----------------
 
 .. automodule:: fvcore.nn
     :members:
     :undoc-members:
     :show-inheritance:
 
-fvcore.common 
+fvcore.common
 ---------------------
 
 .. automodule:: fvcore.common.checkpoint
@@ -31,6 +31,11 @@ fvcore.common
     :undoc-members:
     :show-inheritance:
 
+.. automodule:: fvcore.common.param_scheduler
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 .. automodule:: fvcore.common.registry
     :members:
     :undoc-members:
diff --git a/tools/benchmark.py b/tools/benchmark.py
@@ -25,6 +25,7 @@
 from detectron2.modeling import build_model
 from detectron2.solver import build_optimizer
 from detectron2.utils import comm
+from detectron2.utils.collect_env import collect_env_info
 from detectron2.utils.events import CommonMetricPrinter
 from detectron2.utils.logger import setup_logger
 
@@ -154,6 +155,7 @@ def f():
     args = parser.parse_args()
     assert not args.eval_only
 
+    logger.info("Environment info:\n" + collect_env_info())
     if args.task == "data":
         f = benchmark_data
         print("Initial " + RAM_msg())
diff --git a/tools/deploy/README.md b/tools/deploy/README.md
@@ -12,7 +12,7 @@ This directory contains the following examples:
 All C++ examples depend on libtorch and OpenCV. Some require more dependencies:
 
 * Running caffe2-format models requires:
-  * PyTorch with caffe2 inside
+  * libtorch built with caffe2 inside
   * gflags, glog
   * protobuf library that matches the version used by PyTorch (version defined in `include/caffe2/proto/caffe2.pb.h` of your PyTorch installation)
   * MKL headers if caffe2 is built with MKL