Commit e94c64a

fix: Device casting issues with certain aten operators
- Investigated an issue arising with the BART-base model (https://huggingface.co/facebook/bart-base), where certain tensor inputs to TensorRT were on the CPU despite users explicitly casting all inputs to the GPU
- Traced the issue to internally-generated 0D tensors, mask tensors, and operations returning CPU tensors that are passed between Torch and Torch-TensorRT engines
- Added lowering passes to ensure these operator edge cases are handled appropriately, and added a validation check in the runtime to avoid models crashing at runtime due to device mismatches
1 parent ce29cc7 commit e94c64a
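
For context, the failure mode described above can be reproduced with plain libtorch: a tensor created on the default (CPU) device, as aten::full does inside a graph, cannot be combined with a CUDA input. The snippet below is an illustrative sketch only (it assumes a CUDA-capable libtorch build and is not part of this commit):

#include <torch/torch.h>
#include <iostream>

int main() {
  // User-provided input, explicitly placed on the GPU
  auto input = torch::ones({2, 4}, torch::kCUDA);

  // Internally-generated tensor; like aten::full, it defaults to the CPU device
  auto filler = torch::full({2, 4}, 1.0);

  try {
    auto out = input + filler;  // mixing CPU and CUDA tensors raises a device-mismatch error
  } catch (const c10::Error& e) {
    std::cout << "Device mismatch: " << e.what() << std::endl;
  }
  return 0;
}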

6 files changed: +150 −1 lines changed

core/lowering/lowering.cpp

Lines changed: 4 additions & 0 deletions
@@ -70,6 +70,10 @@ void LowerGraph(std::shared_ptr<torch::jit::Graph>& g, LowerInfo lower_info) {
   passes::SiluToSigmoidMultipication(g);
   passes::RemoveSingleUse0DTensors(g);
   passes::RemoveUnnecessaryCasts(g);
+  passes::UnpackAndCastMaskedFill(g);
+  passes::UnpackAndCastNumToTensor(g);
+  passes::UnpackAndCastFull(g);
+  passes::ReplaceScalarImplicit(g);
   LOG_GRAPH(*g);
 }


core/lowering/passes/BUILD

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ cc_library(
     name = "passes",
     srcs = [
         "convNd_to_convolution.cpp",
+        "device_casting.cpp",
         "exception_elimination.cpp",
         "fuse_addmm_branches.cpp",
         "linear_to_addmm.cpp",

core/lowering/passes/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 target_sources(${lib_name}
     PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/convNd_to_convolution.cpp"
+            "${CMAKE_CURRENT_SOURCE_DIR}/device_casting.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/exception_elimination.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/fuse_addmm_branches.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/linear_to_addmm.cpp"
core/lowering/passes/device_casting.cpp

Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@ (new file)
#include "torch/csrc/jit/ir/constants.h"
#include "torch/csrc/jit/passes/subgraph_rewrite.h"

#include "core/util/prelude.h"

namespace torch_tensorrt {
namespace core {
namespace lowering {
namespace passes {

void UnpackAndCastMaskedFill(std::shared_ptr<torch::jit::Graph>& graph) {
  std::string masked_fill_pattern = R"IR(
    graph(%self, %mask, %value):
      %out: Tensor = aten::masked_fill_(%self, %mask, %value)
      return (%out))IR";

  // Calls to masked_fill_ often utilize CPU tensors, and as such
  // should be casted to CUDA to avoid device mismatch errors
  std::string unpacked_pattern = R"IR(
    graph(%self, %mask, %value):
      %device: Device = prim::Constant[value="cuda"]()
      %dtype: NoneType = prim::Constant()
      %false: bool = prim::Constant[value=0]()
      %mask_cuda: Tensor = aten::to(%mask, %device, %dtype, %false, %false)
      %self_cuda: Tensor = aten::to(%self, %device, %dtype, %false, %false)

      # Value is cast to the type of the original tensor, since values default to float
      %is_float: bool = aten::is_floating_point(%self)
      %out: Tensor = prim::If(%is_float)
        block0():
          %no_cast: Tensor = aten::masked_fill(%self_cuda, %mask_cuda, %value)
          -> (%no_cast)
        block1():
          %value_int: int = aten::Int(%value)
          %casted_int: Tensor = aten::masked_fill(%self_cuda, %mask_cuda, %value_int)
          -> (%casted_int)

      return (%out))IR";

  torch::jit::SubgraphRewriter masked_fill_rewriter;
  masked_fill_rewriter.RegisterRewritePattern(masked_fill_pattern, unpacked_pattern);
  masked_fill_rewriter.runOnGraph(graph);
  LOG_GRAPH("After unpack and cast masked_fill_: " << *graph);
}

void UnpackAndCastNumToTensor(std::shared_ptr<torch::jit::Graph>& graph) {
  std::string num_to_tensor_cast_pattern = R"IR(
    graph(%1: int):
      %2: Tensor = prim::NumToTensor(%1)
      return (%2))IR";

  // 0D Tensors are initialized on cpu, and need to be casted to CUDA
  // to avoid device mismatch issues
  std::string num_to_tensor_clean_pattern = R"IR(
    graph(%1: int):
      %2: Tensor = prim::NumToTensor(%1)
      %device: Device = prim::Constant[value="cuda"]()
      %dtype: NoneType = prim::Constant()
      %false: bool = prim::Constant[value=0]()
      %3: Tensor = aten::to(%2, %device, %dtype, %false, %false)
      return (%3))IR";

  torch::jit::SubgraphRewriter num_to_tensor_cast_rewriter;
  num_to_tensor_cast_rewriter.RegisterRewritePattern(num_to_tensor_cast_pattern, num_to_tensor_clean_pattern);
  num_to_tensor_cast_rewriter.runOnGraph(graph);

  LOG_GRAPH("After unpack and cast NumToTensor: " << *graph);
}

void UnpackAndCastFull(std::shared_ptr<torch::jit::Graph>& graph) {
  std::string full_cast_pattern = R"IR(
    graph(%1, %2, %3, %4, %5, %6):
      %out: Tensor = aten::full(%1, %2, %3, %4, %5, %6)
      return (%out))IR";

  // Tensors created via aten::full are initialized on cpu, and need to be casted to CUDA
  // to avoid device mismatch issues
  std::string full_clean_pattern = R"IR(
    graph(%1, %2, %3, %4, %5, %6):
      %cuda: Device = prim::Constant[value="cuda"]()
      %out: Tensor = aten::full(%1, %2, %3, %4, %cuda, %6)
      return (%out))IR";

  torch::jit::SubgraphRewriter full_cast_rewriter;
  full_cast_rewriter.RegisterRewritePattern(full_cast_pattern, full_clean_pattern);
  full_cast_rewriter.runOnGraph(graph);

  LOG_GRAPH("After unpack and cast full: " << *graph);
}

void ReplaceScalarImplicit(std::shared_ptr<torch::jit::Graph>& graph) {
  std::string scalar_implicit_cast_pattern = R"IR(
    graph(%1: Tensor):
      %2: Scalar = aten::ScalarImplicit(%1)
      return (%2))IR";

  // ScalarImplicit can only unpack 0D tensors, whereas Tensors operated on by
  // TensorRT are padded to 1 dimension. aten::item() resolves this conflict
  std::string scalar_implicit_clean_pattern = R"IR(
    graph(%1: Tensor):
      %2: Scalar = aten::item(%1)
      return (%2))IR";

  torch::jit::SubgraphRewriter scalar_implicit_cast_rewriter;
  scalar_implicit_cast_rewriter.RegisterRewritePattern(scalar_implicit_cast_pattern, scalar_implicit_clean_pattern);
  scalar_implicit_cast_rewriter.runOnGraph(graph);

  LOG_GRAPH("After replace ScalarImplicit: " << *graph);
}

} // namespace passes
} // namespace lowering
} // namespace core
} // namespace torch_tensorrt
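
As a rough illustration of how one of these passes could be exercised in isolation, the sketch below parses a small TorchScript graph containing prim::NumToTensor and runs UnpackAndCastNumToTensor over it. The include paths and the standalone main() are assumptions for illustration only and are not part of this commit:

#include <memory>
#include <string>

#include "torch/csrc/jit/ir/ir.h"
#include "torch/csrc/jit/ir/irparser.h"

#include "core/lowering/passes/passes.h"

int main() {
  // Graph matching the pattern targeted by UnpackAndCastNumToTensor
  const std::string source_ir = R"IR(
    graph(%1 : int):
      %2 : Tensor = prim::NumToTensor(%1)
      return (%2))IR";

  auto g = std::make_shared<torch::jit::Graph>();
  torch::jit::parseIR(source_ir, g.get());

  // After the pass, the 0D tensor should be followed by an aten::to cast onto the "cuda" device
  torch_tensorrt::core::lowering::passes::UnpackAndCastNumToTensor(g);
  g->dump();
  return 0;
}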

core/lowering/passes/passes.h

Lines changed: 4 additions & 0 deletions
@@ -40,6 +40,10 @@ void AliasOperators(std::shared_ptr<torch::jit::Graph>& graph);
 void SiluToSigmoidMultipication(std::shared_ptr<torch::jit::Graph>& graph);
 void UnpackHardSwish(std::shared_ptr<torch::jit::Graph>& graph);
 void UnpackHardSigmoid(std::shared_ptr<torch::jit::Graph>& graph);
+void UnpackAndCastMaskedFill(std::shared_ptr<torch::jit::Graph>& graph);
+void UnpackAndCastNumToTensor(std::shared_ptr<torch::jit::Graph>& graph);
+void UnpackAndCastFull(std::shared_ptr<torch::jit::Graph>& graph);
+void ReplaceScalarImplicit(std::shared_ptr<torch::jit::Graph>& graph);

 } // namespace passes
 } // namespace lowering

core/runtime/execute_engine.cpp

Lines changed: 26 additions & 1 deletion
@@ -63,16 +63,41 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
   CudaDevice curr_device = get_current_device();
   LOG_DEBUG("Current Device: " << curr_device);

+  // Generic Target Device Prefix
+  std::string target_device = "cuda:";
+
   if (is_switch_required(curr_device, compiled_engine->device_info)) {
     // Scan through available CUDA devices and set the CUDA device context correctly
     CudaDevice device = select_cuda_device(compiled_engine->device_info);
     set_cuda_device(device);

-    std::string target_device = "cuda:" + std::to_string(device.id);
+    // Target device is new device
+    target_device += std::to_string(device.id);

     for (auto& in : inputs) {
       in = in.to(torch::Device(target_device));
     }
+  } else {
+    // Target device is current device
+    target_device += std::to_string(curr_device.id);
+
+    // For each input, ensure its current device is the desired target device
+    for (size_t i = 0; i < inputs.size(); i++) {
+      at::Tensor* in = &inputs[i];
+      std::string current_tensor_device = in->device().str();
+
+      // If current device string does not match target device, display warning and move tensor accordingly
+      if (current_tensor_device != target_device) {
+        LOG_WARNING(
+            "Input " << i << " of engine " << compiled_engine->name << " was found to be on " << current_tensor_device
+            << " but should be on " << target_device
+            << ". This tensor is being moved manually by the runtime but "
+            << "for performance considerations, ensure your inputs are all on GPU "
+            << "and open an issue here (https://github.com/pytorch/TensorRT/issues) if this "
+            << "warning persists.");
+        *in = in->to(torch::Device(target_device));
+      }
+    }
   }

std::vector<void*> gpu_handles;
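
The check-and-move behavior added above can be sketched with plain libtorch outside the runtime. This is a simplified illustration only (it assumes a CUDA-capable build; the hard-coded target_device string stands in for the runtime's computed value, and none of this code is part of the commit):

#include <torch/torch.h>
#include <iostream>
#include <string>
#include <vector>

int main() {
  const std::string target_device = "cuda:0";  // assumed target for this sketch

  std::vector<at::Tensor> inputs = {
      torch::ones({2, 3}, torch::kCUDA),  // already on the target device
      torch::full({2, 3}, 1.0)            // defaults to CPU, as aten::full does inside a graph
  };

  for (size_t i = 0; i < inputs.size(); i++) {
    std::string current = inputs[i].device().str();
    if (current != target_device) {
      // Mirrors the runtime warning-and-move: inputs found off-device are relocated
      std::cout << "Input " << i << " is on " << current << " but should be on "
                << target_device << "; moving it." << std::endl;
      inputs[i] = inputs[i].to(torch::Device(target_device));
    }
  }
  return 0;
}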
