Skip to content

Commit 00cf1d5

Browse files
author
Anurag Dixit
committed
Added fix for multi-gpu configuration
Signed-off-by: Anurag Dixit <[email protected]>
1 parent 26d5c65 commit 00cf1d5

File tree

6 files changed

+98
-4
lines changed

6 files changed

+98
-4
lines changed

core/conversion/conversionctx/ConversionCtx.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings)
4747
util::logging::get_logger().get_reportable_severity(),
4848
util::logging::get_logger().get_is_colored_output_on()) {
4949
// TODO: Support FP16 and FP32 from JIT information
50+
if (settings.device.gpu_id) {
51+
TRTORCH_CHECK(
52+
cudaSetDevice(settings.device.gpu_id) == cudaSuccess, "Unable to set gpu id: " << settings.device.gpu_id);
53+
}
54+
5055
builder = nvinfer1::createInferBuilder(logger);
5156
net = builder->createNetworkV2(1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
5257

@@ -108,10 +113,6 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings)
108113
cfg->setDefaultDeviceType(settings.device.device_type);
109114
cfg->setEngineCapability(settings.capability);
110115

111-
if (settings.device.gpu_id) {
112-
TRTORCH_CHECK(cudaSetDevice(settings.device.gpu_id), "Unable to set gpu id: " << settings.device.gpu_id);
113-
}
114-
115116
if (settings.device.device_type == nvinfer1::DeviceType::kDLA) {
116117
auto nbDLACores = builder->getNbDLACores();
117118
TRTORCH_CHECK(

docsrc/py_api/trtorch.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ trtorch
1111
Functions
1212
------------
1313

14+
.. autofunction:: set_device
15+
1416
.. autofunction:: compile
1517

1618
.. autofunction:: convert_method_to_trt_engine

py/trtorch/_compiler.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,3 +156,6 @@ def get_build_info() -> str:
156156
build_info = trtorch._C.get_build_info()
157157
build_info = "TRTorch Version: " + str(__version__) + '\n' + build_info
158158
return build_info
159+
160+
def set_device(gpu_id):
    """Set the active CUDA device used by TRTorch.

    Forwards to the native binding ``trtorch._C.set_device`` so that
    subsequent CUDA allocations happen on the selected device.

    Args:
        gpu_id: Zero-indexed id of the CUDA device to make current.
    """
    trtorch._C.set_device(gpu_id)

py/trtorch/csrc/trtorch_py.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ namespace py = pybind11;
1515
namespace trtorch {
1616
namespace pyapi {
1717

18+
// Sets the active CUDA device for subsequent TRTorch operations.
// Thin pybind-facing wrapper; the actual device switch is performed by
// core::set_device (which is expected to wrap cudaSetDevice -- confirm in core).
void set_device(const int device_id) {
  core::set_device(device_id);
}
21+
1822
torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec& info) {
1923
py::gil_scoped_acquire gil;
2024
auto trt_mod = core::CompileGraph(mod, info.toInternalCompileSpec());
@@ -146,6 +150,7 @@ PYBIND11_MODULE(_C, m) {
146150
m.def("_get_is_colored_output_on", &logging::get_is_colored_output_on, "Get if the logging output will be colored");
147151
m.def("_set_is_colored_output_on", &logging::set_is_colored_output_on, "Set if the logging output should be colored");
148152
m.def("_log", &logging::log, "Add a message to the logger");
153+
m.def("set_device", &trtorch::pyapi::set_device, "Set CUDA device id");
149154

150155
py::enum_<core::util::logging::LogLevel>(m, "LogLevel", py::arithmetic())
151156
.value("INTERNAL_ERROR", core::util::logging::LogLevel::kINTERNAL_ERROR)

tests/py/BUILD

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,23 @@ py_test(
2525
]
2626
)
2727

28+
# Following multi_gpu test is only targeted for multi-gpu configurations. It is not included in the test suite by default.
py_test(
    name = "test_api_multi_gpu",
    srcs = [
        "test_api_multi_gpu.py",
        "model_test_case.py"
    ] + select({
        # NOTE(review): pulling test_api_dla.py into the multi-gpu target on
        # aarch64 looks like a copy-paste from the DLA test rule -- confirm
        # whether this source is really needed here.
        ":aarch64_linux": [
            "test_api_dla.py"
        ],
        "//conditions:default" : []
    }),
    deps = [
        requirement("torchvision")
    ]
)
44+
2845
py_test(
2946
name = "test_to_backend_api",
3047
srcs = [

tests/py/test_api_multi_gpu.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import unittest
2+
import trtorch
3+
import torch
4+
import torchvision.models as models
5+
6+
from model_test_case import ModelTestCase
7+
8+
class TestCompile(ModelTestCase):
    """Compile tests pinned to a non-default GPU (requires >1 CUDA device).

    BUGFIX: the original declared `class TestCompile(MultiGpuTestCase)`, but no
    `MultiGpuTestCase` is defined or imported anywhere in this file -- the only
    imported base is `ModelTestCase` (which supplies `parametrize` and
    `self.model` used below), so the original raised NameError at import time.
    """

    def setUp(self):
        # Guard: these tests only make sense when a second GPU exists.
        if not torch.cuda.device_count() > 1:
            raise ValueError("This test case is applicable for multi-gpu configurations only")

        self.gpu_id = 1
        # Setting it up here so that all CUDA allocations are done on correct device
        trtorch.set_device(self.gpu_id)
        # NOTE(review): .to("cuda") targets torch's current device, which
        # trtorch.set_device (cudaSetDevice) may not change -- confirm this
        # actually allocates on gpu 1 rather than "cuda:1" being needed.
        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
        self.traced_model = torch.jit.trace(self.model, [self.input])
        self.scripted_model = torch.jit.script(self.model)

    def test_compile_traced(self):
        """Compile the traced module on gpu 1 and compare against eager output."""
        compile_spec = {
            "input_shapes": [self.input.shape],
            "device": {
                "device_type": trtorch.DeviceType.GPU,
                "gpu_id": self.gpu_id,
                "dla_core": 0,
                "allow_gpu_fallback": False,
                "disable_tf32": False
            }
        }

        trt_mod = trtorch.compile(self.traced_model, compile_spec)
        # Max absolute element-wise difference between TRT and Torch outputs.
        same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max()
        self.assertTrue(same < 2e-3)

    def test_compile_script(self):
        """Compile the scripted module on gpu 1 and compare against eager output."""
        compile_spec = {
            "input_shapes": [self.input.shape],
            "device": {
                "device_type": trtorch.DeviceType.GPU,
                "gpu_id": self.gpu_id,
                "dla_core": 0,
                "allow_gpu_fallback": False,
                "disable_tf32": False
            }
        }

        trt_mod = trtorch.compile(self.scripted_model, compile_spec)
        same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max()
        self.assertTrue(same < 2e-3)
52+
53+
54+
55+
def test_suite():
    """Build the suite: TestCompile parametrized with a pretrained resnet18."""
    multi_gpu_suite = unittest.TestSuite()
    resnet = models.resnet18(pretrained=True)
    multi_gpu_suite.addTest(TestCompile.parametrize(TestCompile, model=resnet))
    return multi_gpu_suite


# Run at import/execution time and propagate failure through the exit code
# (0 on success, 1 otherwise) so CI can consume the result.
suite = test_suite()
runner = unittest.TextTestRunner()
result = runner.run(suite)

exit(int(not result.wasSuccessful()))

0 commit comments

Comments
 (0)