Commit eb9e1f6

Merge branch 'master' into build_workspace

2 parents: bafa675 + edf9ee4

21 files changed: +266 −20 lines

.github/pr-labels.yml

Lines changed: 3 additions & 0 deletions
@@ -19,6 +19,9 @@
 "component: evaluators":
   - core/conversion/evaluators/**/*

+"component: partitioning":
+  - core/partitioning/**/*
+
 "component: runtime":
   - core/runtime/**/*

core/conversion/conversion.cpp

Lines changed: 1 addition & 1 deletion
@@ -326,7 +326,7 @@ void EvaluateLoopBlock(ConversionCtx* ctx, const torch::jit::Node* n) {
   MapIValues(ctx, n->outputs(), n->blocks()[0]->inputs(), 0, 1);
   for (auto bn : n->blocks()[0]->nodes()) {
     if (bn->kind() == torch::jit::prim::Loop) {
-      EvaluateLoopBlock(ctx, n);
+      EvaluateLoopBlock(ctx, bn);
     } else if (bn->kind() == torch::jit::prim::If) {
       EvaluateConditionalBlock(ctx, bn, true);
     } else {
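The one-line fix above makes the recursive call descend into the nested loop node bn instead of re-evaluating the outer node n, which would recurse on the same loop indefinitely. A minimal sketch of the same descend-into-the-child pattern, using a hypothetical Node type rather than TRTorch's or TorchScript's actual API:

#include <vector>

// Hypothetical stand-in for a graph node that owns a body of nested nodes.
struct Node {
  bool is_loop;
  std::vector<Node*> body;
};

// Recurse on the nested loop node, never on the node currently being visited;
// calling EvaluateLoop(n) from inside this loop would never terminate.
void EvaluateLoop(Node* n) {
  for (Node* child : n->body) {
    if (child->is_loop) {
      EvaluateLoop(child);
    } else {
      // evaluate an ordinary node here
    }
  }
}

int main() {
  Node inner{true, {}};
  Node outer{true, {&inner}};
  EvaluateLoop(&outer);
}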

core/lowering/passes/module_fallback.cpp

Lines changed: 1 addition & 1 deletion
@@ -133,4 +133,4 @@ void MarkNodesForFallback(std::shared_ptr<torch::jit::Graph>& g, bool delete_del
 } // namespace passes
 } // namespace lowering
 } // namespace core
-} // namespace trtorch
+} // namespace trtorch

core/lowering/passes/unpack_var.cpp

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ void UnpackVar(std::shared_ptr<torch::jit::Graph>& graph) {
   torch::jit::SubgraphRewriter var_rewriter;
   var_rewriter.RegisterRewritePattern(var_pattern, unpacked_pattern);
   var_rewriter.runOnGraph(graph);
-  LOG_DEBUG("Post unpack var: " << *graph);
+  LOG_GRAPH("Post unpack var: " << *graph);
 }

 } // namespace passes

core/partitioning/partitioning.cpp

Lines changed: 32 additions & 4 deletions
@@ -2,6 +2,7 @@

 #include <queue>
 #include "core/conversion/conversion.h"
+#include "core/conversion/evaluators/evaluators.h"
 #include "core/partitioning/shape_analysis.h"
 #include "torch/csrc/jit/passes/constant_pooling.h"
 #include "torch/csrc/jit/passes/dead_code_elimination.h"
@@ -114,7 +115,7 @@ std::vector<SegmentedBlock> segmentBlocksWithNonTensorInputs(SegmentedBlock& seg
       pytorch_nodes.push_back(n);
       prev_non_tensor_outputs = containNonTensorOutputs(n);
     } else {
-      // If pytorch_nodes is not empty, the previous nodes were all tensorrt_nodes. Construct a
+      // If pytorch_nodes is not empty, the previous nodes were all pytorch_nodes. Construct a
       // Pytorch segmented_block and clear the pytorch_nodes list to be later used for new Pytorch segments.
       if (!pytorch_nodes.empty()) {
         new_seg_blocks.emplace_back(SegmentedBlock::kTorch, pytorch_nodes);
@@ -131,6 +132,7 @@ std::vector<SegmentedBlock> segmentBlocksWithNonTensorInputs(SegmentedBlock& seg
       new_seg_blocks.emplace_back(SegmentedBlock::kTorch, pytorch_nodes);
     }
   }
+
   return std::move(new_seg_blocks);
 }

@@ -158,6 +160,8 @@ void resolveNonTensorInputs(PartitionedGraph& segmented_blocks) { // , std::shar
       }
     }

+    // For each non-tensor value in the usage_counts map, keep updating the produce_id to the earliest segmented block
+    // that has/produces it.
     for (auto& use : usage_counts) {
       // Set the produce_id to the segmented block index that contains/produces this non-tensor torch::jit::Value
       if (segmented_blocks[i].contain_raw_value(use.first)) {
@@ -177,9 +181,8 @@ void resolveNonTensorInputs(PartitionedGraph& segmented_blocks) { // , std::shar
       // Segmented Blocks with non-tensor inputs will have to be re-segmented as
       // TRTorch doesn't support non-tensor inputs for a module.
       auto to_inject_blocks = segmentBlocksWithNonTensorInputs(segmented_blocks[first_torch_id]);
-      segmented_blocks.erase(segmented_blocks.begin() + first_torch_id);
-      segmented_blocks.insert(
-          segmented_blocks.begin() + first_torch_id, to_inject_blocks.begin(), to_inject_blocks.end());
+      auto next_iter = segmented_blocks_list.erase(idx_to_iter[first_torch_id]);
+      segmented_blocks_list.insert(next_iter, to_inject_blocks.begin(), to_inject_blocks.end());
       updated_segments.insert(first_torch_id);
     }
   }
@@ -258,6 +261,20 @@ void registerSegmentsOutputs(PartitionedGraph& segmented_blocks, torch::jit::Blo
   return;
 }

+bool checkLoopEvaluatable(torch::jit::Node* n) {
+  bool compile_to_trt = true;
+  for (auto bn : n->blocks()[0]->nodes()) {
+    if (bn->kind() == torch::jit::prim::Loop) {
+      compile_to_trt = compile_to_trt && checkLoopEvaluatable(bn);
+    } else if (bn->kind() == torch::jit::prim::If) {
+      compile_to_trt = compile_to_trt && containNonTensorOutputs(bn);
+    } else {
+      compile_to_trt = compile_to_trt && core::conversion::evaluators::shouldEvalAtConversionTime(bn);
+    }
+  }
+  return compile_to_trt;
+}
+
 std::vector<SegmentedBlock> segment_graph(torch::jit::Block* block, const PartitionInfo& partition_info) {
   auto min_block_size = partition_info.min_block_size;
   std::unordered_set<std::string> forced_fallback_operators(
@@ -298,6 +315,17 @@ std::vector<SegmentedBlock> segment_graph(torch::jit::Block* block, const Partit
       }
       segmented_blocks.emplace_back(SegmentedBlock::kTorch, std::vector<torch::jit::Node*>{n});
       continue;
+    } else if (n->kind() == torch::jit::prim::Loop) {
+      if (!pytorch_nodes.empty()) {
+        segmented_blocks.emplace_back(SegmentedBlock::kTorch, pytorch_nodes);
+        pytorch_nodes.clear();
+      }
+      if (checkLoopEvaluatable(n)) {
+        tensorrt_nodes.push_back(n);
+      } else {
+        segmented_blocks.emplace_back(SegmentedBlock::kTorch, std::vector<torch::jit::Node*>{n});
+      }
+      continue;
     }
     pytorch_nodes.push_back(n);
   }
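The resolveNonTensorInputs change above swaps index-based erase/insert on a vector for iterator-based erase/insert on a list (segmented_blocks_list with an idx_to_iter lookup, per the diff). The list idiom relies on std::list::erase returning an iterator to the element after the erased one, so the re-segmented blocks can be spliced in at exactly that position without shifting or invalidating the rest of the sequence. A minimal, self-contained sketch of that idiom with plain int elements, not TRTorch types:

#include <algorithm>
#include <iostream>
#include <list>
#include <vector>

int main() {
  std::list<int> blocks = {1, 2, 3, 4};
  std::vector<int> replacement = {20, 21};

  // Locate the block to re-segment (here: the value 2).
  auto it = std::find(blocks.begin(), blocks.end(), 2);

  // erase() returns the iterator following the erased element...
  auto next_iter = blocks.erase(it);
  // ...and insert() places the replacement elements before that position.
  blocks.insert(next_iter, replacement.begin(), replacement.end());

  for (int b : blocks) {
    std::cout << b << ' ';  // prints: 1 20 21 3 4
  }
  std::cout << '\n';
}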

core/partitioning/shape_analysis.cpp

Lines changed: 3 additions & 1 deletion
@@ -56,7 +56,9 @@ void getSegmentsOutputByRunning(
   for (auto& input : seg_block.raw_inputs()) {
     TRTORCH_CHECK(
         ivalues_maps.count(input),
-        "Could not find torch::jit::Value* " << input->debugName() << " in lowering graph for mini graph input.\n");
+        "Could not find torch::jit::Value* " << input->debugName() << " produced from "
+            << util::node_info(input->node())
+            << " in lowering graph for mini graph input.\n");
     if (input->node()->kind() == torch::jit::prim::Param) {
       jit_inputs_ivalues.push_back(ivalues_maps[input]);
     } else if (input->type()->isSubtypeOf(torch::jit::TensorType::get())) {

core/runtime/register_trt_op.cpp

Lines changed: 3 additions & 0 deletions
@@ -112,6 +112,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
   }

   c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream(inputs[0].device().index());
+
+  // nvinfer1::IExecutionContext::enqueue is not thread safe and we need a mutex for it.
+  std::unique_lock<std::mutex> lock(compiled_engine->mu);
   compiled_engine->exec_ctx->enqueueV2(gpu_handles.data(), stream, nullptr);

   return outputs;
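Together with the std::mutex mu member added to TRTEngine in core/runtime/runtime.h below, this serializes concurrent calls into the same execution context: whichever thread holds the lock drives enqueueV2, and the lock is released when it goes out of scope. A minimal sketch of the same pattern, with a hypothetical Engine type standing in for the TensorRT objects:

#include <mutex>
#include <thread>
#include <vector>

// Hypothetical engine whose underlying enqueue call is not thread safe.
struct Engine {
  std::mutex mu;

  void enqueue_unsafe(int job) { (void)job; /* must not be called concurrently */ }

  void enqueue(int job) {
    // unique_lock releases the mutex when it goes out of scope,
    // so only one thread drives the engine at a time.
    std::unique_lock<std::mutex> lock(mu);
    enqueue_unsafe(job);
  }
};

int main() {
  Engine engine;
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    workers.emplace_back([&engine, i] { engine.enqueue(i); });
  }
  for (auto& t : workers) {
    t.join();
  }
}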

core/runtime/runtime.h

Lines changed: 2 additions & 0 deletions
@@ -1,6 +1,7 @@
 #pragma once
 #include <map>
 #include <memory>
+#include <mutex>
 #include <utility>
 #include "ATen/core/function_schema.h"
 #include "NvInfer.h"
@@ -47,6 +48,7 @@ struct TRTEngine : torch::CustomClassHolder {
   std::shared_ptr<nvinfer1::IExecutionContext> exec_ctx;
   std::pair<uint64_t, uint64_t> num_io;
   std::string name;
+  std::mutex mu;
   CudaDevice device_info;

   std::unordered_map<uint64_t, uint64_t> in_binding_map;

docs/_notebooks/Resnet50-example.html

Lines changed: 1 addition & 1 deletion
@@ -725,7 +725,7 @@
   </div>
 </div>
 <p>
-  <img alt="d34deeb0bd04450db415e7ad8573b82a" src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png"/>
+  <img alt="1794a581632146b3a0c2a5cea9db9870" src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png"/>
 </p>
 <section id="TRTorch-Getting-Started---ResNet-50">
 <h1 id="notebooks-resnet50-example--page-root">

docs/_notebooks/lenet-getting-started.html

Lines changed: 1 addition & 1 deletion
@@ -819,7 +819,7 @@
   </div>
 </div>
 <p>
-  <img alt="4ad32834008942b1a13a55d1a56e70b2" src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png"/>
+  <img alt="f6316bc6a1b54cada66e418a5317073b" src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png"/>
 </p>
 <section id="TRTorch-Getting-Started---LeNet">
 <h1 id="notebooks-lenet-getting-started--page-root">
