pytorch
diff --git a/‎.bazelversion
Lines changed: 1 addition & 1 deletion b/‎.bazelversion
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md
Lines changed: 3 additions & 3 deletions b/‎README.md
Lines changed: 3 additions & 3 deletions
diff --git a/‎WORKSPACE
Lines changed: 7 additions & 7 deletions b/‎WORKSPACE
Lines changed: 7 additions & 7 deletions
diff --git a/‎core/compiler.cpp
Lines changed: 8 additions & 8 deletions b/‎core/compiler.cpp
Lines changed: 8 additions & 8 deletions
diff --git a/‎core/conversion/conversion.cpp
Lines changed: 49 additions & 24 deletions b/‎core/conversion/conversion.cpp
Lines changed: 49 additions & 24 deletions
diff --git a/‎core/conversion/converters/impl/activation.cpp
Lines changed: 21 additions & 0 deletions b/‎core/conversion/converters/impl/activation.cpp
Lines changed: 21 additions & 0 deletions
diff --git a/‎core/conversion/converters/impl/batch_norm.cpp
Lines changed: 18 additions & 10 deletions b/‎core/conversion/converters/impl/batch_norm.cpp
Lines changed: 18 additions & 10 deletions
diff --git a/‎core/conversion/converters/impl/cast.cpp
Lines changed: 30 additions & 1 deletion b/‎core/conversion/converters/impl/cast.cpp
Lines changed: 30 additions & 1 deletion
diff --git a/‎core/conversion/converters/impl/cumsum.cpp
Lines changed: 2 additions & 1 deletion b/‎core/conversion/converters/impl/cumsum.cpp
Lines changed: 2 additions & 1 deletion
@@ -1 +1 @@
-4.2.1
+5.1.1
@@ -111,10 +111,10 @@ torch.jit.save(trt_ts_module, "trt_torchscript_module.ts") # save the TRT embedd
 These are the following dependencies used to verify the testcases. Torch-TensorRT can work with other versions, but the tests are not guaranteed to pass.
 
 - Bazel 4.2.1
-- Libtorch 1.10.0 (built with CUDA 11.3)
+- Libtorch 1.11.0 (built with CUDA 11.3)
 - CUDA 11.3 (10.2 on Jetson)
-- cuDNN 8.2
-- TensorRT 8.0.3.4 (TensorRT 8.0.1.6 on Jetson)
+- cuDNN 8.2.1
+- TensorRT 8.2.4.2 (TensorRT 8.2.1 on Jetson)
 
 ## Prebuilt Binaries and Wheel files
 
 
@@ -56,17 +56,17 @@ new_local_repository(
 http_archive(
     name = "libtorch",
     build_file = "@//third_party/libtorch:BUILD",
-    sha256 = "190e963e739d5f7c2dcf94b3994de8fcd335706a4ebb333812ea7d8c841beb06",
+    sha256 = "8d9e829ce9478db4f35bdb7943308cf02e8a2f58cf9bb10f742462c1d57bf287",
     strip_prefix = "libtorch",
-    urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-cxx11-abi-shared-with-deps-1.10.0%2Bcu113.zip"],
+    urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-cxx11-abi-shared-with-deps-1.11.0%2Bcu113.zip"],
 )
 
 http_archive(
     name = "libtorch_pre_cxx11_abi",
     build_file = "@//third_party/libtorch:BUILD",
-    sha256 = "0996a6a4ea8bbc1137b4fb0476eeca25b5efd8ed38955218dec1b73929090053",
+    sha256 = "90159ecce3ff451f3ef3f657493b6c7c96759c3b74bbd70c1695f2ea2f81e1ad",
     strip_prefix = "libtorch",
-    urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.10.0%2Bcu113.zip"],
+    urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.11.0%2Bcu113.zip"],
 )
 
 # Download these tarballs manually from the NVIDIA website
@@ -86,10 +86,10 @@ http_archive(
 http_archive(
     name = "tensorrt",
     build_file = "@//third_party/tensorrt/archive:BUILD",
-    sha256 = "da130296ac6636437ff8465812eb55dbab0621747d82dc4fe9b9376f00d214af",
-    strip_prefix = "TensorRT-8.2.2.1",
+    sha256 = "826180eaaecdf9a7e76116855b9f1f3400ea9b06e66b06a3f6a0747ba6f863ad",
+    strip_prefix = "TensorRT-8.2.4.2",
     urls = [
-        "https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.2.2.1/tars/tensorrt-8.2.2.1.linux.x86_64-gnu.cuda-11.4.cudnn8.2.tar.gz",
+        "https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.2.4/tars/tensorrt-8.2.4.2.linux.x86_64-gnu.cuda-11.4.cudnn8.2.tar.gz",
     ],
 )
 
 
@@ -328,7 +328,8 @@ void MapInputsAndDetermineDTypes(
         spec.dtype = nvinfer1::DataType::kFLOAT;
       } else if (spec.dtype_is_user_defined && cfg.partition_info.enabled) {
         if (!est_type_opt) {
-          LOG_INFO("Cannot infer input tensor dtype in graph, unable to verify user input dtype settings");
+          LOG_INFO("Cannot infer input tensor dtype in graph. Using user provided input dtype settings");
+          first_use_type_map[in] = {util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype)};
         } else {
           if (util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype) != est_type_opt.value()) {
             std::stringstream ss;
@@ -344,9 +345,10 @@ void MapInputsAndDetermineDTypes(
             ss << "- Disable partial compilation by setting require_full_compilation to True";
             auto warn_str = ss.str();
             LOG_WARNING(warn_str);
-            // Overwrite type map with user settings
-            first_use_type_map[in] = {util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype)};
           }
+          // Overwrite type map with user settings
+          // We use this map for partitioning since we need c10::ScalarTypes, not nvinfer1::DataTypes
+          first_use_type_map[in] = {util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype)};
         }
       } else {
         // The user defined the type so no changes are necessary
@@ -417,18 +419,16 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
       auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block());
 
       MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types);
-
+      auto isBlockConvertible = conversion::VerifyConverterSupportForBlock(g->block(), true);
       if (cfg.partition_info.enabled &&
           (cfg.lower_info.forced_fallback_modules.size() == 0 &&
-           cfg.partition_info.forced_fallback_operators.size() == 0 &&
-           conversion::VerifyConverterSupportForBlock(g->block(), true))) {
+           cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible)) {
         LOG_INFO("Skipping partitioning since model is fully supported");
       }
 
       if (cfg.partition_info.enabled &&
           !(cfg.lower_info.forced_fallback_modules.size() == 0 &&
-            cfg.partition_info.forced_fallback_operators.size() == 0 &&
-            conversion::VerifyConverterSupportForBlock(g->block(), true))) {
+            cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible)) {
         auto input_ivalues_map = partitioning::generateRandomInputs(cfg.convert_info.inputs, first_use_types);
         auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), input_ivalues_map, cfg, static_params);
         new_g = graph_and_mapping.first;
 
@@ -1,15 +1,15 @@
 #include "core/conversion/conversion.h"
+#include <ATen/core/operator_name.h>
 #include <torch/torch.h>
 #include <sstream>
+#include "c10/util/intrusive_ptr.h"
 #include "core/conversion/conversionctx/ConversionCtx.h"
+#include "core/conversion/converters/converter_util.h"
 #include "core/conversion/converters/converters.h"
 #include "core/conversion/evaluators/evaluators.h"
+#include "core/conversion/tensorcontainer/TensorContainer.h"
 #include "core/conversion/var/Var.h"
 #include "core/util/prelude.h"
-
-#include "c10/util/intrusive_ptr.h"
-#include "core/conversion/converters/converter_util.h"
-#include "core/conversion/tensorcontainer/TensorContainer.h"
 #include "core/util/trt_util.h"
 
 namespace torch_tensorrt {
@@ -105,7 +105,8 @@ void AddLayer(ConversionCtx* ctx, const torch::jit::Node* n) {
       // Node input has not been converted yet or is a prim op
       TORCHTRT_THROW_ERROR(
           "Unable to retrieve all node inputs for node: "
-          << util::node_info(n) << " (ctx.AddLayer)\nSpecifically failed to retrieve value for input: " << *input_node);
+          << util::node_info(n) << " (ctx.AddLayer)\nSpecifically failed to retrieve value for input: %"
+          << input->debugName());
     }
   }
 
@@ -426,10 +427,18 @@ void ConvertBlockToNetDef(
                                               << " and node outputs size: " << n->outputs().size() << " must match.");
             for (size_t i = 0; i < eval_list->elements().size(); i++) {
               auto eval_output = eval_list.get()->elements()[i];
-              LOG_DEBUG(
-                  ctx->logger,
-                  "Found the evaluated value(s) to be " << eval_output << " for node: " << util::node_info(n));
-              ctx->AssociateValueAndIValue(n->output(i), eval_output);
+              if (eval_output.isCustomClass()) {
+                auto container = eval_output.toCustomClass<TensorContainer>();
+                auto tensor = container->tensor();
+                LOG_DEBUG(
+                    ctx->logger, "Found the evaluated value(s) to be an ITensor of shape: " << tensor->getDimensions());
+                ctx->AssociateValueAndTensor(n->output(i), tensor);
+              } else {
+                LOG_DEBUG(
+                    ctx->logger,
+                    "Found the evaluated value(s) to be " << eval_output << " for node: " << util::node_info(n));
+                ctx->AssociateValueAndIValue(n->output(i), eval_output);
+              }
             }
           } else {
             TORCHTRT_THROW_ERROR("Unsupported return type for evaluated node");
@@ -487,15 +496,23 @@ std::string ConvertBlockToEngine(
 std::unordered_map<c10::OperatorName, std::string> GetUnsupportedOpsInBlock(const torch::jit::Block* b) {
   std::unordered_map<c10::OperatorName, std::string> unsupported_ops;
   for (const auto n : b->nodes()) {
-    if (n->kind() != torch::jit::prim::Loop && n->kind() != torch::jit::prim::If && !OpSupported(n)) {
-      auto schema = n->maybeSchema();
-      TORCHTRT_CHECK(
-          schema,
-          "Unable to get schema for Node " << util::node_info(n) << " (conversion.VerifyCoverterSupportForBlock)");
-      std::stringstream ss;
-      ss << *schema;
-      unsupported_ops[schema->operator_name()] = ss.str();
+    auto schema = n->maybeSchema();
+    // Some ops like torch::jit::prim::Loop, torch::jit::prim::If, torch::jit::prim::DictConstruct don't have a schema
+    // but they are supported. torch::jit::prim::DictConstruct is supported via fallback only
+    if (!OpSupported(n)) {
+      if (schema) {
+        std::stringstream ss;
+        ss << *schema;
+        unsupported_ops[schema->operator_name()] = ss.str();
+      } else {
+        std::stringstream ss;
+        ss << util::node_info(n);
+        // operator.overload is a filler name just to call the constructor.
+        c10::OperatorName op(ss.str(), "operator.overload");
+        unsupported_ops[op] = ss.str();
+      }
     }
+
     for (const auto sub_b : n->blocks()) {
       auto sub_b_unsupported_ops = GetUnsupportedOpsInBlock(sub_b);
       unsupported_ops.insert(sub_b_unsupported_ops.begin(), sub_b_unsupported_ops.end());
@@ -530,22 +547,25 @@ std::set<std::string> ConvertableOpsInBlock(const torch::jit::Block* b) {
 
 bool VerifyConverterSupportForBlock(const torch::jit::Block* b, bool suppress_errors) {
   auto unsupported_ops = GetUnsupportedOpsInBlock(b);
-
   if (unsupported_ops.size() != 0) {
     std::stringstream unsupported_msg;
     unsupported_msg
-        << "Method requested cannot be compiled by Torch-TensorRT.TorchScript.\nUnsupported operators listed below:"
+        << "Method requested cannot be compiled end to end by Torch-TensorRT.TorchScript.\nUnsupported operators listed below:"
         << std::endl;
     for (auto s : unsupported_ops) {
       unsupported_msg << "  - " << s.second << std::endl;
     }
-    unsupported_msg << "You can either implement converters for these ops in your application or request implementation"
-                    << std::endl;
-    unsupported_msg << "https://www.github.com/nvidia/Torch-TensorRT/issues" << std::endl;
-    unsupported_msg << std::endl << "In Module:" << std::endl;
 
     if (!suppress_errors) {
+      unsupported_msg
+          << "You can either implement converters for these ops in your application or request implementation"
+          << std::endl;
+      unsupported_msg << "https://www.github.com/nvidia/Torch-TensorRT/issues" << std::endl;
+      unsupported_msg << std::endl << "In Module:" << std::endl;
+
       LOG_ERROR(unsupported_msg.str());
+    } else {
+      LOG_INFO(unsupported_msg.str());
     }
 
     std::unordered_map<std::string, std::unordered_set<std::string>> unsupported_node_locations;
@@ -571,8 +591,13 @@ bool VerifyConverterSupportForBlock(const torch::jit::Block* b, bool suppress_er
       for (const auto& str : type.second) {
         traceback << str;
       }
+
       auto tb_str = traceback.str();
-      LOG_ERROR(tb_str);
+      if (!suppress_errors) {
+        LOG_ERROR(tb_str);
+      } else {
+        LOG_DEBUG(tb_str);
+      }
     }
 
     return false;
 
@@ -87,6 +87,27 @@ auto acthardtanh TORCHTRT_UNUSED =
 
                bool to_reshape = false;
                auto original_shape = in->getDimensions();
+
+               // The output tensor of ParametricReLU has an all-zero shape when slopes nDims is not equal to in nDims,
+               // so make sure slopes nDims is equal to in nDims.
+               if (slopes.ndimension() == 1 and original_shape.nbDims != slopes.ndimension()) {
+                 std::vector<int64_t> slopes_new_shape(original_shape.nbDims, 1);
+                 auto first_inputs_allowed_formats = ctx->net->getInput(0)->getAllowedFormats();
+                 for (size_t inputs_index = 1; inputs_index < ctx->num_inputs; inputs_index++) {
+                   auto inputs_allowed_formats = ctx->net->getInput(inputs_index)->getAllowedFormats();
+                   TORCHTRT_CHECK(
+                       first_inputs_allowed_formats == inputs_allowed_formats,
+                       "Unable to create batch prelu layer from node,since the formats(like NHWC or NCHW) of inputs is different: "
+                           << *n);
+                 }
+                 if (1U << static_cast<int>(nvinfer1::TensorFormat::kLINEAR) == first_inputs_allowed_formats) {
+                   slopes_new_shape[1] = slopes.sizes().vec()[0];
+                 } else {
+                   slopes_new_shape[original_shape.nbDims - 1] = slopes.sizes().vec()[0];
+                 }
+                 slopes = slopes.reshape(slopes_new_shape);
+               }
+
                if (slopes.numel() != 1 &&
                    !util::broadcastable(
                        in->getDimensions(),
 
@@ -50,27 +50,35 @@ auto batch_norm_registrations TORCHTRT_UNUSED =
               auto orig_shape = input->getDimensions();
               auto shape = util::toVec(orig_shape);
               auto tensor_type = util::TRTDataTypeToScalarType(input->getType());
-              auto options = torch::TensorOptions().dtype(tensor_type);
+              auto options =
+                  torch::TensorOptions().dtype(tensor_type).device(torch::kCUDA, ctx->settings.device.gpu_id);
 
               torch::Tensor gamma, beta, mean, var;
+              LOG_DEBUG("Input :" << orig_shape << "/" << input->getType());
+              // affine=True
+              LOG_DEBUG("Args[1] gamma : " << args[1].isIValue() << " / " << args[1].IValue()->isNone());
+              LOG_DEBUG("Args[2] beta : " << args[2].isIValue() << " / " << args[2].IValue()->isNone());
+              // track_running_stats=True
+              LOG_DEBUG("Args[3] mean : " << args[3].isIValue() << " / " << args[3].IValue()->isNone());
+              LOG_DEBUG("Args[4] var : " << args[4].isIValue() << " / " << args[4].IValue()->isNone());
+              LOG_DEBUG("use_input_stats, momemtum, cudnn_enabled disregarded");
+              LOG_DEBUG("ctx->input_is_dynamic : " << ctx->input_is_dynamic);
 
+              auto channel_dim = shape[1];
               if (ctx->input_is_dynamic) {
-                gamma = args[1].unwrapToTensor();
-                beta = args[2].unwrapToTensor();
+                gamma = args[1].unwrapToTensor(at::full(channel_dim, 1, options));
+                beta = args[2].unwrapToTensor(at::full(channel_dim, 0, options));
                 mean = args[3].unwrapToTensor();
                 var = args[4].unwrapToTensor();
               } else {
-                gamma = args[1].unwrapToTensor(at::full({shape}, 1, {options}));
-                beta = args[2].unwrapToTensor(at::full({shape}, 1, {options}));
-                mean = args[3].unwrapToTensor(at::full({shape}, 0, {options}));
-                var = args[4].unwrapToTensor(at::full({shape}, 0, {options}));
+                gamma = args[1].unwrapToTensor(at::full(channel_dim, 1, options));
+                beta = args[2].unwrapToTensor(at::full(channel_dim, 0, options));
+                mean = args[3].unwrapToTensor(at::full(channel_dim, 0, options));
+                var = args[4].unwrapToTensor(at::full(channel_dim, 0, options));
               }
 
               auto eps = static_cast<float>(args[7].unwrapToDouble(1e-5f));
 
-              LOG_DEBUG("momentum disregarded");
-              LOG_DEBUG("training disregarded");
-              LOG_DEBUG("cudnn disregarded");
               TORCHTRT_CHECK(orig_shape.nbDims >= 2, "Unable to create batch normalization layer from node: " << *n);
 
               // Expand spatial dims from 1D to 2D if needed
 
@@ -18,14 +18,43 @@ auto cast_registrations TORCHTRT_UNUSED =
              [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
                auto self = args[0].ITensorOrFreeze(ctx);
                auto output_dtype = args[1].unwrapToScalar().to<int64_t>();
-               auto trt_dtype = util::ScalarTypeToTRTDataType(static_cast<at::ScalarType>(output_dtype));
+               auto scalar_dtype = static_cast<at::ScalarType>(output_dtype);
+               nvinfer1::DataType trt_dtype;
+               if (scalar_dtype == at::kLong) {
+                 LOG_WARNING("Truncating aten::to output type from at::kLong to at::kInt");
+                 trt_dtype = nvinfer1::DataType::kINT32;
+               } else {
+                 trt_dtype = util::ScalarTypeToTRTDataType(static_cast<at::ScalarType>(output_dtype));
+               }
                auto casted_itensor = castITensor(ctx, self, trt_dtype);
                auto output = ctx->AssociateValueAndTensor(n->outputs()[0], casted_itensor);
                LOG_DEBUG("[aten::to.dtype] Output tensor shape: " << output->getDimensions());
 
                return true;
              }})
         .pattern(
+            {"aten::to.device(Tensor(a) self, Device device, int dtype, bool non_blocking=False, bool copy=False, int? memory_format=None) -> (Tensor(a))",
+             [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
+               // This converter does basically the same thing as the previous one. However, we cannot lower this
+               // signature to the previous one, because that would cause device issues when the TorchScript module
+               // is run in the later shape analysis phase of fallback.
+               auto self = args[0].ITensorOrFreeze(ctx);
+               auto output_dtype = args[2].unwrapToScalar().to<int64_t>();
+               auto scalar_dtype = static_cast<at::ScalarType>(output_dtype);
+               nvinfer1::DataType trt_dtype;
+               if (scalar_dtype == at::kLong) {
+                 LOG_WARNING("Truncating aten::to output type from at::kLong to at::kInt");
+                 trt_dtype = nvinfer1::DataType::kINT32;
+               } else {
+                 trt_dtype = util::ScalarTypeToTRTDataType(static_cast<at::ScalarType>(output_dtype));
+               }
+               auto casted_itensor = castITensor(ctx, self, trt_dtype);
+               auto output = ctx->AssociateValueAndTensor(n->outputs()[0], casted_itensor);
+               LOG_DEBUG("[aten::to.device] Output tensor shape: " << output->getDimensions());
+
+               return true;
+             }})
+        .pattern(
             {"aten::to.other(Tensor self, Tensor other, bool non_blocking=False, bool copy=False, int? memory_format=None) -> (Tensor)",
              [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
                auto self = args[0].ITensorOrFreeze(ctx);
 
@@ -48,7 +48,8 @@ auto cumsum_registrations TORCHTRT_UNUSED = RegisterNodeConversionPatterns().pat
        auto data = iterator->getOutput(0);
        auto newDims = data->getDimensions();
 
-       torch::Tensor zeroValue = at::full(util::toVec(newDims), 0, torch::kFloat32);
+       torch::Tensor zeroValue =
+           at::full(util::toVec(newDims), 0, torch_tensorrt::core::util::TRTDataTypeToScalarType(in->getType()));
        auto zeroTensor = tensor_to_const(ctx, zeroValue);
        auto runningSum = loop->addRecurrence(*zeroTensor);
        auto runningSumTensor = runningSum->getOutput(0);