Commit cbe04cb

Merge branch 'master' into dyn_shapes

2 parents 6d0b0f6 + e3b9929

128 files changed: 786 additions, 146 deletions

core/conversion/conversionctx/BUILD

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ cc_library(
     deps = [
         "@tensorrt//:nvinfer",
         "//core/util:prelude",
+        "//core/ir",
     ] + select({
         ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"],
         "//conditions:default": ["@libtorch//:libtorch"],

core/conversion/conversionctx/ConversionCtx.h

Lines changed: 2 additions & 9 deletions
@@ -9,28 +9,21 @@
 #include "torch/csrc/jit/ir/ir.h"

 #include <cuda_runtime.h>
+#include "core/ir/ir.h"
 #include "core/util/prelude.h"

 namespace torch_tensorrt {
 namespace core {
 namespace conversion {

-struct Device {
-  nvinfer1::DeviceType device_type;
-  int64_t gpu_id;
-  int64_t dla_core;
-  bool allow_gpu_fallback;
-  Device() : device_type(nvinfer1::DeviceType::kGPU), gpu_id(0), dla_core(0), allow_gpu_fallback(false) {}
-};
-
 struct BuilderSettings {
   std::set<nvinfer1::DataType> enabled_precisions = {};
   bool sparse_weights = false;
   bool disable_tf32 = false;
   bool refit = false;
   bool debug = false;
   bool truncate_long_and_double = false;
-  Device device;
+  ir::Device device;
   nvinfer1::EngineCapability capability = TRT_ENGINE_CAPABILITY_STANDARD;
   nvinfer1::IInt8Calibrator* calibrator = nullptr;
   uint64_t num_avg_timing_iters = 1;
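The net effect of this hunk is that the conversion stage no longer defines its own Device; BuilderSettings now holds the ir::Device that this commit adds to core/ir/ir.h (shown further down). A minimal sketch of a call site after the change, assuming the surrounding Torch-TensorRT headers; the function name is hypothetical:

#include "core/conversion/conversionctx/ConversionCtx.h"

// Hypothetical call site for illustration only. Field names come from the
// diffs in this commit; everything else is an assumption.
torch_tensorrt::core::conversion::BuilderSettings make_settings() {
  torch_tensorrt::core::conversion::BuilderSettings settings;
  settings.device = torch_tensorrt::core::ir::Device();  // was conversion::Device before this commit
  settings.device.gpu_id = 0;                            // defaults already target GPU 0
  settings.enabled_precisions = {nvinfer1::DataType::kFLOAT};
  return settings;
}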

core/conversion/converters/impl/element_wise.cpp

Lines changed: 13 additions & 22 deletions
@@ -166,11 +166,11 @@ auto element_wise_registrations TORCHTRT_UNUSED =
            [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
              // Should implement self - alpha * other
              auto self = args[0].ITensorOrFreeze(ctx);
-             auto scalar = args[2].unwrapToScalar().to<float>();
              auto other = args[1].ITensorOrFreeze(ctx);
+             auto scalar = args[2].unwrapToScalar();

-             if (1 != scalar) {
-               auto alphaTensor = tensor_to_const(ctx, torch::tensor({scalar}));
+             if (1 != scalar.to<float>()) {
+               auto alphaTensor = scalar_to_tensor(ctx, scalar);
                auto scaleLayer = add_elementwise(
                    ctx,
                    nvinfer1::ElementWiseOperation::kPROD,
@@ -214,11 +214,11 @@ auto element_wise_registrations TORCHTRT_UNUSED =
            [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
              // Should implement self - alpha * other
              auto self = args[0].ITensorOrFreeze(ctx);
-             auto scalar = args[2].unwrapToScalar().to<float>();
              auto other = args[1].ITensorOrFreeze(ctx);
+             auto scalar = args[2].unwrapToScalar();

-             if (1 != scalar) {
-               auto alphaTensor = tensor_to_const(ctx, torch::tensor({scalar}));
+             if (1 != scalar.to<float>()) {
+               auto alphaTensor = scalar_to_tensor(ctx, scalar);
                auto scaleLayer = add_elementwise(
                    ctx,
                    nvinfer1::ElementWiseOperation::kPROD,
@@ -351,8 +351,7 @@ auto element_wise_registrations TORCHTRT_UNUSED =
           {"aten::div.Scalar(Tensor self, Scalar other) -> (Tensor)",
            [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
              auto self = args[0].ITensorOrFreeze(ctx);
-             auto otherScalar = args[1].unwrapToScalar().to<float>();
-             auto other = tensor_to_const(ctx, torch::tensor({otherScalar}));
+             auto other = scalar_to_tensor(ctx, args[1].unwrapToScalar());
              auto div = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kDIV, self, other, util::node_info(n));
              TORCHTRT_CHECK(div, "Unable to create div layer from node: " << *n);

@@ -381,8 +380,7 @@ auto element_wise_registrations TORCHTRT_UNUSED =
           {"aten::div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)",
            [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
              auto self = args[0].ITensorOrFreeze(ctx);
-             auto otherScalar = args[1].unwrapToScalar().to<float>();
-             auto other = tensor_to_const(ctx, torch::tensor({otherScalar}));
+             auto other = scalar_to_tensor(ctx, args[1].unwrapToScalar());
              auto div = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kDIV, self, other, util::node_info(n));
              TORCHTRT_CHECK(div, "Unable to create div layer from node: " << *n);

@@ -481,18 +479,12 @@ auto element_wise_registrations TORCHTRT_UNUSED =
           {"aten::ne.Scalar(Tensor self, Scalar other) -> (Tensor)",
            [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
              auto self = args[0].ITensorOrFreeze(ctx);
-             auto scalar = args[1].unwrapToScalar();
-             nvinfer1::ITensor* scalar_tensor;
-             if (self->getType() == nvinfer1::DataType::kFLOAT || self->getType() == nvinfer1::DataType::kHALF) {
-               scalar_tensor = tensor_to_const(ctx, torch::tensor({scalar.to<float>()}));
-             } else {
-               scalar_tensor = tensor_to_const(ctx, torch::tensor({scalar.to<int>()}));
-             }
+             auto other = scalar_to_tensor(ctx, args[1].unwrapToScalar());
              auto equal = add_elementwise(
                  ctx,
                  nvinfer1::ElementWiseOperation::kEQUAL,
                  self,
-                 scalar_tensor,
+                 other,
                  util::node_info(n) + std::string("is_equal"));
              TORCHTRT_CHECK(equal, "Unable to create elementwise equal layer from node: " << *n);
              // XOR with ones negates and produces not_equal result
@@ -534,8 +526,7 @@ auto element_wise_registrations TORCHTRT_UNUSED =
           {"aten::pow.Tensor_Scalar(Tensor self, Scalar exponent) -> (Tensor)",
            [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
              auto self = args[0].ITensorOrFreeze(ctx);
-             auto exponentScalar = args[1].unwrapToScalar().to<float>();
-             auto exponent = tensor_to_const(ctx, torch::tensor({exponentScalar}));
+             auto exponent = scalar_to_tensor(ctx, args[1].unwrapToScalar());
              auto pow =
                  add_elementwise(ctx, nvinfer1::ElementWiseOperation::kPOW, self, exponent, util::node_info(n));
              TORCHTRT_CHECK(pow, "Unable to create Power layer from node: " << *n);
@@ -681,9 +672,9 @@ auto element_wise_registrations TORCHTRT_UNUSED =
           {"aten::eq.Scalar(Tensor self, Scalar other) -> (Tensor)",
            [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
              auto self = args[0].ITensorOrFreeze(ctx);
-             auto otherScalar = args[1].unwrapToScalar().to<float>();
-             auto other = tensor_to_const(ctx, torch::tensor({otherScalar}));
+             auto other = scalar_to_tensor(ctx, args[1].unwrapToScalar());
              if (self->getType() == nvinfer1::DataType::kBOOL) {
+               auto otherScalar = args[1].unwrapToScalar().to<float>();
                if (otherScalar == 0 || otherScalar == 1) {
                  LOG_DEBUG("Since input tensor is type bool, casting input tensor and scalar to int32");
                  other = castITensor(ctx, other, nvinfer1::DataType::kINT32);
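Every hunk above swaps the hand-rolled tensor_to_const(ctx, torch::tensor({scalar.to<float>()})) pattern for a shared scalar_to_tensor helper, so the scalar's own type decides the constant's dtype instead of always forcing float. The helper's implementation is not part of this excerpt; a hedged sketch of what such a helper could look like, mirroring the branch removed from the aten::ne.Scalar converter, is:

// Sketch only -- the real helper ships elsewhere in the converter utilities and may differ.
nvinfer1::ITensor* scalar_to_tensor(ConversionCtx* ctx, at::Scalar s) {
  if (s.isIntegral(/*includeBool=*/false)) {
    // Integer scalars stay integral, so comparisons against int tensors keep their dtype.
    return tensor_to_const(ctx, torch::tensor({s.to<int>()}));
  }
  return tensor_to_const(ctx, torch::tensor({s.to<float>()}));
}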

core/ir/ir.h

Lines changed: 8 additions & 0 deletions
@@ -17,6 +17,14 @@ enum class ShapeMode {
   kMAX,
 };

+struct Device {
+  nvinfer1::DeviceType device_type;
+  int64_t gpu_id;
+  int64_t dla_core;
+  bool allow_gpu_fallback;
+  Device() : device_type(nvinfer1::DeviceType::kGPU), gpu_id(0), dla_core(0), allow_gpu_fallback(false) {}
+};
+
 struct Input : torch::CustomClassHolder {
   Input(){};
   Input(

core/lowering/BUILD

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ cc_library(
     deps = [
         "//core/lowering/passes",
         "//core/util:prelude",
+        "//core/ir",
     ] + select({
         ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"],
        "//conditions:default": ["@libtorch//:libtorch"],

core/lowering/CMakeLists.txt

Lines changed: 4 additions & 1 deletion
@@ -15,6 +15,8 @@ set(HEADER_FILES
 target_sources(${lib_name}
     PRIVATE
         ${CXX_SRCS}
+    PUBLIC
+        $<TARGET_OBJECTS:core_ir>
         $<TARGET_OBJECTS:core_util>
 )

@@ -25,8 +27,9 @@ target_include_directories(${lib_name}

 target_link_libraries(${lib_name}
     PUBLIC
+        TensorRT::nvinfer
         torch
-    PRIVATE
+        core_ir
         core_util
 )


core/lowering/lowering.cpp

Lines changed: 7 additions & 2 deletions
@@ -26,7 +26,7 @@ void LowerBlock(torch::jit::Block* b) {
   DropUnusedNodes(b);
 }

-void LowerGraph(std::shared_ptr<torch::jit::Graph>& g, LowerInfo lower_info) {
+void LowerGraph(std::shared_ptr<torch::jit::Graph>& g, std::vector<torch::jit::IValue>& params, LowerInfo lower_info) {
   torch::jit::EliminateRedundantGuards(g);
   torch::jit::RemoveListMutation(g);
   torch::jit::RemoveTensorMutation(g);
@@ -70,6 +70,11 @@ void LowerGraph(std::shared_ptr<torch::jit::Graph>& g, LowerInfo lower_info) {
   passes::SiluToSigmoidMultipication(g);
   passes::RemoveSingleUse0DTensors(g);
   passes::RemoveUnnecessaryCasts(g);
+  passes::UnpackAndCastMaskedFill(g, lower_info.getGPUDeviceString());
+  passes::UnpackAndCastNumToTensor(g, lower_info.getGPUDeviceString());
+  passes::UnpackAndCastFull(g, lower_info.getGPUDeviceString());
+  passes::ReplaceScalarImplicit(g);
+  passes::RewriteInputsWithParams(g, params);
   LOG_GRAPH(*g);
 }

@@ -103,7 +108,7 @@ std::pair<std::shared_ptr<torch::jit::Graph>, std::vector<torch::jit::IValue>> L
   // In quantization aware trained (QAT) models, weights are passed through quantize and
   // dequantize nodes which should not be folded. So unfreeze_module is set to True for QAT models.
   LOG_GRAPH("Torch-TensorRT.TorchScript Graph Lowering");
-  lowering::LowerGraph(graph_and_ivalues.first, lower_info);
+  lowering::LowerGraph(graph_and_ivalues.first, graph_and_ivalues.second, lower_info);

   // Is this necessary?
   // lowering::LowerBlock(g->block());
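LowerGraph now takes the frozen parameter list by non-const reference, presumably so the new RewriteInputsWithParams pass can update graph inputs and the parameter list together, while the UnpackAndCast* passes receive the target device as a string. A hedged sketch of how a caller outside Lower() adapts to the new signature, assuming default LowerInfo settings and the LowerInfo fields shown in the lowering.h hunk below; the function name is hypothetical:

#include "core/lowering/lowering.h"

// Assumed stand-alone usage that mirrors the updated call inside Lower().
void lower_for_trt(std::shared_ptr<torch::jit::Graph>& g, std::vector<torch::jit::IValue>& params) {
  torch_tensorrt::core::lowering::LowerInfo lower_info;  // defaults assumed sufficient here
  torch_tensorrt::core::lowering::LowerGraph(g, params, lower_info);
}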

core/lowering/lowering.h

Lines changed: 6 additions & 0 deletions
@@ -1,5 +1,6 @@
 #pragma once
 #include <memory>
+#include "core/ir/ir.h"
 #include "torch/csrc/jit/ir/ir.h"

 namespace torch_tensorrt {
@@ -15,8 +16,13 @@ struct LowerInfo {
   // Since these QDQ nodes will be identical as they share same input, one of them is eliminated due to CSE lowering
   // pass. Disable this in order to not disturb TensorRT's QAT optimizations.
   bool disable_cse = false;
+  ir::Device target_device;
   std::vector<std::string> forced_fallback_modules;
   friend std::ostream& operator<<(std::ostream& os, const LowerInfo& l);
+
+  std::string getGPUDeviceString() {
+    return "cuda:" + std::to_string(target_device.gpu_id);
+  };
 };

 void LowerBlock(torch::jit::Block* b);
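getGPUDeviceString() turns the lowering target into the torch-style device string that lowering.cpp hands to the new UnpackAndCastMaskedFill, UnpackAndCastNumToTensor, and UnpackAndCastFull passes. A small usage sketch, relying only on the fields shown above:

#include <string>
#include "core/lowering/lowering.h"

// Illustration only, using just the members added in this hunk.
std::string example_device_string() {
  torch_tensorrt::core::lowering::LowerInfo info;
  info.target_device.gpu_id = 1;
  return info.getGPUDeviceString();  // "cuda:1", as consumed by the UnpackAndCast* passes
}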

core/lowering/passes/BUILD

Lines changed: 2 additions & 0 deletions
@@ -14,6 +14,7 @@ cc_library(
     name = "passes",
     srcs = [
         "convNd_to_convolution.cpp",
+        "device_casting.cpp",
         "exception_elimination.cpp",
         "fuse_addmm_branches.cpp",
         "linear_to_addmm.cpp",
@@ -27,6 +28,7 @@ cc_library(
         "remove_dropout.cpp",
         "remove_nops.cpp",
         "remove_unnecessary_casts.cpp",
+        "rewrite_inputs_with_params.cpp",
         "silu_to_sigmoid_multiplication.cpp",
         "unpack_addmm.cpp",
         "unpack_batch_norm.cpp",

core/lowering/passes/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -1,5 +1,6 @@
 target_sources(${lib_name}
     PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/convNd_to_convolution.cpp"
+            "${CMAKE_CURRENT_SOURCE_DIR}/device_casting.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/exception_elimination.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/fuse_addmm_branches.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/linear_to_addmm.cpp"
@@ -24,6 +25,7 @@ target_sources(${lib_name}
             "${CMAKE_CURRENT_SOURCE_DIR}/unpack_std.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/unpack_var.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/view_to_reshape.cpp"
+            "${CMAKE_CURRENT_SOURCE_DIR}/rewrite_inputs_with_params.cpp"
 )

 set(HEADER_FILES
