
Commit f0aab14

Merge pull request #531 from NVIDIA/qat
Enable QAT functionality of TRT 8.0 in TRTorch
2 parents 6408389 + 15f9205 commit f0aab14


53 files changed · +1344 −129 lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
@@ -44,3 +44,7 @@ tests/py/data
 examples/**/deps/**/*
 !examples/**/deps/.gitkeep
 examples/trtorchrt_example/trtorchrt_example
+examples/int8/ptq/ptq
+examples/int8/qat/qat
+examples/int8/training/vgg16/data/*
+examples/int8/datasets/data/*

core/compiler.cpp

Lines changed: 4 additions & 4 deletions
@@ -119,8 +119,8 @@ void AddEngineToGraph(
 }

 bool CheckMethodOperatorSupport(const torch::jit::script::Module& mod, std::string method_name) {
-  // Go through Lowering to simplify graph and extract weight parameters
-  auto graph_and_parameters = lowering::Lower(mod, method_name);
+  // Go through Lowering to simplify graph
+  auto graph_and_parameters = lowering::Lower(mod, method_name, lowering::LowerInfo());

   auto g = graph_and_parameters.first;
   LOG_DEBUG(*g << "(CheckMethodOperatorSupport)\n");
@@ -130,7 +130,7 @@ bool CheckMethodOperatorSupport(const torch::jit::script::Module& mod, std::stri

 std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::string method_name, CompileSpec cfg) {
   // Go through Lowering to simplify graph and extract weight parameters
-  auto graph_and_parameters = lowering::Lower(mod, method_name);
+  auto graph_and_parameters = lowering::Lower(mod, method_name, cfg.lower_info);

   auto convert_cfg = std::move(cfg.convert_info);
   auto g = graph_and_parameters.first;
@@ -309,7 +309,7 @@ torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Mo
     // Compile only forward methods. forward method contains the entire graph.
     if (method.name().compare("forward") == 0) {
       auto new_g = std::make_shared<torch::jit::Graph>();
-      auto graph_and_parameters = lowering::Lower(mod, method.name());
+      auto graph_and_parameters = lowering::Lower(mod, method.name(), cfg.lower_info);

       auto g = graph_and_parameters.first;
       auto params = graph_and_parameters.second;

core/compiler.h

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,7 @@
 #include <vector>
 #include "core/conversion/conversion.h"
 #include "core/ir/ir.h"
+#include "core/lowering/lowering.h"
 #include "core/partitioning/partitioning.h"
 #include "core/runtime/runtime.h"
 #include "torch/csrc/jit/api/module.h"
@@ -14,6 +15,7 @@ namespace core {
 struct CompileSpec {
   CompileSpec(std::vector<ir::Input> inputs) : convert_info(std::move(inputs)) {}
   conversion::ConversionInfo convert_info;
+  lowering::LowerInfo lower_info;
   partitioning::PartitionInfo partition_info;
 };

core/conversion/conversionctx/ConversionCtx.cpp

Lines changed: 4 additions & 2 deletions
@@ -69,9 +69,11 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings)
     case nvinfer1::DataType::kINT8:
       TRTORCH_CHECK(builder->platformHasFastInt8(), "Requested inference in INT8 but platform does not support INT8");
       cfg->setFlag(nvinfer1::BuilderFlag::kINT8);
-      if (settings.calibrator == nullptr) {
+      if (!settings.calibrator) {
         LOG_INFO(
-            "INT8 kernels are enabled but not calibrator was provided, assuming source model was trained quantization aware");
+            "Int8 precision has been enabled but no calibrator provided. This assumes the network has Q/DQ nodes obtained from Quantization aware training. For more details, refer to https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#work-with-qat-networks");
+      } else {
+        cfg->setInt8Calibrator(settings.calibrator);
       }
       break;
     case nvinfer1::DataType::kFLOAT:

core/conversion/converters/BUILD

Lines changed: 1 addition & 0 deletions
@@ -69,6 +69,7 @@ cc_library(
         "impl/matrix_multiply.cpp",
         "impl/normalize.cpp",
         "impl/pooling.cpp",
+        "impl/quantization.cpp",
         "impl/reduce.cpp",
         "impl/replication_pad.cpp",
         "impl/select.cpp",

core/conversion/converters/impl/conv_deconv.cpp

Lines changed: 84 additions & 32 deletions
@@ -11,15 +11,97 @@ namespace impl {
 namespace {

 bool add_conv_deconv(ConversionCtx* ctx, const torch::jit::Node* n, args& args) {
-  auto in = args[0].ITensor(); // assumes non-static input Tensor
-  auto w = Weights(ctx, args[1].unwrapToTensor());
+  // Input to conv/deconv
+  auto in = args[0].ITensor();
+
+  // Conv /deconv parameters
   auto stride = util::toDims(args[3].unwrapToIntList());
   auto padding = util::toDims(args[4].unwrapToIntList());
   auto dilation = util::toDims(args[5].unwrapToIntList());
   bool transposed = args[6].unwrapToBool();
   auto out_padding = util::toDims(args[7].unwrapToIntList());
   int64_t groups = args[8].unwrapToInt();

+  // Reshape the parameters to 2D if needed
+  if (stride.nbDims == 1) {
+    stride = util::unsqueezeDims(stride, 1, 1);
+    LOG_DEBUG("Reshaped stride: " << stride);
+  }
+  if (dilation.nbDims == 1) {
+    dilation = util::unsqueezeDims(dilation, 1, 1);
+    LOG_DEBUG("Reshaped dilation: " << dilation);
+  }
+  if (padding.nbDims == 1) {
+    padding = util::unsqueezeDims(padding, 1, 0);
+    LOG_DEBUG("Reshaped padding: " << padding);
+  }
+  if (out_padding.nbDims == 1) {
+    out_padding = util::unsqueezeDims(out_padding, 1, 0);
+    LOG_DEBUG("Reshaped out_padding: " << out_padding);
+  }
+
+  // Get bias tensor or initialize it to zeros.
+  Weights bias;
+  if (args[2].IValue()->isTensor()) {
+    bias = Weights(ctx, args[2].unwrapToTensor());
+  } else {
+    bias = Weights();
+  }
+
+  // Handle case when weights of conv/deconv is an ITensor. This case happens for QAT networks where
+  // conv_weights -> Quantize -> Dequantize -> new_conv_weights -> conv <- input
+  // new_conv_weights will be an ITensor because it is an output of Dequantize layer defined in impl/quantization.cpp
+  if (args[1].isITensor()) {
+    // Get the kernel tensor
+    auto kernel = args[1].ITensor();
+    auto kernel_dims = kernel->getDimensions();
+
+    // Make a new Dims with only the spatial dimensions.
+    nvinfer1::Dims filter_dim;
+    int64_t nbSpatialDims = in->getDimensions().nbDims - 2;
+    TRTORCH_CHECK(
+        nbSpatialDims = kernel_dims.nbDims - 2,
+        "Number of input spatial dimensions should match the kernel spatial dimensions");
+    filter_dim.nbDims = nbSpatialDims;
+    filter_dim.d[0] = kernel_dims.d[2];
+    filter_dim.d[1] = kernel_dims.d[3];
+
+    // Initialize a dummy constant kernel to pass it to INetwork->addConvolutionNd/addDeconvolutionNd API.
+    auto kernel_weights = nvinfer1::Weights{nvinfer1::DataType::kFLOAT, nullptr, 0};
+
+    nvinfer1::ILayer* layer = nullptr;
+    if (transposed) {
+      nvinfer1::IDeconvolutionLayer* deconvLayer =
+          ctx->net->addDeconvolutionNd(*in, kernel_dims.d[0], filter_dim, kernel_weights, bias.data);
+      deconvLayer->setStrideNd(stride);
+      deconvLayer->setDilationNd(dilation);
+      deconvLayer->setNbGroups(groups);
+      deconvLayer->setPaddingNd(padding);
+      // Set deconv kernel weights
+      deconvLayer->setInput(1, *kernel);
+      TRTORCH_CHECK(deconvLayer, "Unable to create deconv layer with non-const weights from node: " << *n);
+      layer = deconvLayer;
+    } else {
+      nvinfer1::IConvolutionLayer* convLayer =
+          ctx->net->addConvolutionNd(*in, kernel_dims.d[0], filter_dim, kernel_weights, bias.data);
+      convLayer->setStrideNd(stride);
+      convLayer->setPaddingMode(nvinfer1::PaddingMode::kCAFFE_ROUND_DOWN);
+      convLayer->setPaddingNd(padding);
+      convLayer->setPostPadding(out_padding);
+      convLayer->setDilationNd(dilation);
+      convLayer->setNbGroups(groups);
+
+      // Set conv kernel weights
+      convLayer->setInput(1, *kernel);
+      layer = convLayer;
+    }
+
+    ctx->AssociateValueAndTensor(n->outputs()[0], layer->getOutput(0));
+    LOG_DEBUG("Output tensor shape: " << layer->getOutput(0)->getDimensions());
+    return true;
+  }
+
+  auto w = Weights(ctx, args[1].unwrapToTensor());
   auto dims = in->getDimensions();
   auto orig_dims = dims;
   LOG_DEBUG("Input dims: " << orig_dims);
@@ -47,32 +129,9 @@ bool add_conv_deconv(ConversionCtx* ctx, const torch::jit::Node* n, args& args)
     w.kernel_shape.d[1] = 1;
     LOG_DEBUG("Reshaped Weights: " << w);
   }
-  if (stride.nbDims == 1) {
-    stride = util::unsqueezeDims(stride, 1, 1);
-    LOG_DEBUG("Reshaped stride: " << stride);
-  }
-  if (dilation.nbDims == 1) {
-    dilation = util::unsqueezeDims(dilation, 1, 1);
-    LOG_DEBUG("Reshaped dilation: " << dilation);
-  }
-  if (padding.nbDims == 1) {
-    padding = util::unsqueezeDims(padding, 1, 0);
-    LOG_DEBUG("Reshaped padding: " << padding);
-  }
-  if (out_padding.nbDims == 1) {
-    out_padding = util::unsqueezeDims(out_padding, 1, 0);
-    LOG_DEBUG("Reshaped out_padding: " << out_padding);
-  }

   nvinfer1::ILayer* new_layer;
   if (transposed) {
-    Weights bias;
-    if (args[2].IValue()->isTensor()) {
-      bias = Weights(ctx, args[2].unwrapToTensor());
-    } else {
-      bias = Weights(ctx, torch::zeros(w.shape.d[1] * groups));
-    }
-
     // shape of deconvolution's weight: [in, out/groups, ...]
     auto deconv = ctx->net->addDeconvolutionNd(*in, w.shape.d[1] * groups, w.kernel_shape, w.data, bias.data);
     TRTORCH_CHECK(deconv, "Unable to create deconvolution layer from node: " << *n);
@@ -90,13 +149,6 @@ bool add_conv_deconv(ConversionCtx* ctx, const torch::jit::Node* n, args& args)
 #endif
     new_layer = deconv;
   } else {
-    Weights bias;
-    if (args[2].IValue()->isTensor()) {
-      bias = Weights(ctx, args[2].unwrapToTensor());
-    } else {
-      bias = Weights(ctx, torch::zeros(w.shape.d[0]));
-    }
-
     // shape of convolution's weight: [out, in/groups, ...]
     auto conv = ctx->net->addConvolutionNd(*in, w.shape.d[0], w.kernel_shape, w.data, bias.data);
     TRTORCH_CHECK(conv, "Unable to create convolution layer from node: " << *n);
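Note: the new ITensor branch above handles QAT networks whose convolution weights arrive through a Quantize/Dequantize pair rather than as frozen constants. A hedged, standalone libtorch sketch (not code from this commit; the shapes and the 0.05 scale are arbitrary) of how such a weight pattern is produced on the PyTorch side:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  // A convolution weight passed through a fake Q/DQ pair, as a QAT-trained
  // model would do during training and export.
  auto w = torch::randn({32, 3, 3, 3});
  auto w_qdq = torch::fake_quantize_per_tensor_affine(
      w, /*scale=*/0.05, /*zero_point=*/0, /*quant_min=*/-128, /*quant_max=*/127);

  // The convolution consumes the dequantized weight, so in the traced graph the
  // weight input of the convolution is the output of a Q/DQ pair rather than a
  // frozen constant - the ITensor case the converter branch above handles.
  auto x = torch::randn({1, 3, 8, 8});
  auto y = torch::conv2d(x, w_qdq);
  std::cout << y.sizes() << std::endl;  // [1, 32, 6, 6]
  return 0;
}
```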

core/conversion/converters/impl/matrix_multiply.cpp

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ auto mm_registrations TRTORCH_UNUSED =

        auto mm_layer = ctx->net->addMatrixMultiply(
            *self, nvinfer1::MatrixOperation::kNONE, *other, nvinfer1::MatrixOperation::kNONE);
+
        TRTORCH_CHECK(mm_layer, "Unable to create matrix multiplication node: " << *n);
        mm_layer->setName(util::node_info(n).c_str());
        auto out_tensor = ctx->AssociateValueAndTensor(n->outputs()[0], mm_layer->getOutput(0));

core/conversion/converters/impl/quantization.cpp

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+#include <torch/torch.h>
+#include "core/conversion/converters/converters.h"
+#include "core/util/prelude.h"
+
+namespace trtorch {
+namespace core {
+namespace conversion {
+namespace converters {
+namespace impl {
+namespace {
+
+// clang-format off
+auto quantization_registrations TRTORCH_UNUSED = RegisterNodeConversionPatterns()
+  .pattern({"aten::fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> (Tensor)",
+    [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
+      // This aten operator is generated from torch.fake_quantize_per_tensor_affine op in Pytorch python API.
+      // Example usage: https://github.com/pytorch/pytorch/blob/master/torch/quantization/fake_quantize.py#L145
+      auto input = args[0].ITensorOrFreeze(ctx);
+      auto scale = args[1].unwrapToScalar().to<float>();
+      auto scaleTensor = tensor_to_const(ctx, torch::tensor({scale}));
+      // Add and configure a QuantizeLayer.
+      nvinfer1::IQuantizeLayer* quantize_layer = ctx->net->addQuantize(*input, *scaleTensor);
+      quantize_layer->setAxis(0);
+
+      // Add and configure DequantizeLayer following a QuantizeLayer
+      nvinfer1::IDequantizeLayer* dequantize_layer = ctx->net->addDequantize(*quantize_layer->getOutput(0), *scaleTensor);
+      dequantize_layer->setAxis(0);
+
+      auto qdq_out = ctx->AssociateValueAndTensor(n->outputs()[0], dequantize_layer->getOutput(0));
+      LOG_DEBUG("[fake_quantize_per_tensor_affine] Output tensor shape: " << qdq_out->getDimensions());
+
+      return true;
+    }})
+  .pattern({"aten::fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> (Tensor)",
+    [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
+      // This aten operator is generated from torch.fake_quantize_per_channel_affine op in Pytorch python API.
+      // Example usage: https://github.com/pytorch/pytorch/blob/master/torch/quantization/fake_quantize.py#L141
+      auto input = args[0].ITensorOrFreeze(ctx);
+      auto scale = args[1].ITensorOrFreeze(ctx);
+      int64_t axis = args[3].unwrapToScalar().to<int64_t>();
+      // Add and configure a QuantizeLayer.
+      nvinfer1::IQuantizeLayer* quantize_layer = ctx->net->addQuantize(*input, *scale);
+      // Set a channel axis which represents output channels
+      quantize_layer->setAxis(axis);
+
+      // Add and configure a DequantizeLayer.
+      nvinfer1::IDequantizeLayer* dequantize_layer = ctx->net->addDequantize(*quantize_layer->getOutput(0), *scale);
+      dequantize_layer->setAxis(axis);
+      auto qdq_out = ctx->AssociateValueAndTensor(n->outputs()[0], dequantize_layer->getOutput(0));
+
+      LOG_DEBUG("[fake_quantize_per_channel_affine] Ouput tensor shape: " << qdq_out->getDimensions());
+
+      return true;
+    }});
+// clang-format on
+} // namespace
+} // namespace impl
+} // namespace converters
+} // namespace conversion
+} // namespace core
+} // namespace trtorch
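Note: both converters in this new file lower a fake-quantize op to a TensorRT IQuantizeLayer followed by an IDequantizeLayer that share the same scale tensor. A rough standalone sketch of that layer pattern built directly against the TensorRT 8 C++ API (illustrative only, not code from this commit; the input shape and the 0.1 scale are made up):

```cpp
#include "NvInfer.h"
#include <iostream>

// Minimal logger the TensorRT builder requires.
class Logger : public nvinfer1::ILogger {
  void log(Severity severity, const char* msg) noexcept override {
    if (severity <= Severity::kWARNING) {
      std::cout << msg << std::endl;
    }
  }
};

int main() {
  Logger logger;
  auto builder = nvinfer1::createInferBuilder(logger);
  auto network = builder->createNetworkV2(
      1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));

  auto input = network->addInput("x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4{1, 3, 8, 8});

  // Per-tensor scale constant, shared by the quantize and dequantize layers
  // (mirrors the scaleTensor constant the converter creates).
  static const float kScale = 0.1f;
  nvinfer1::Weights scale_weights{nvinfer1::DataType::kFLOAT, &kScale, 1};
  auto scale = network->addConstant(nvinfer1::Dims{1, {1}}, scale_weights)->getOutput(0);

  // Q -> DQ pair, the same structure built for aten::fake_quantize_per_tensor_affine.
  auto quantize_layer = network->addQuantize(*input, *scale);
  quantize_layer->setAxis(0);
  auto dequantize_layer = network->addDequantize(*quantize_layer->getOutput(0), *scale);
  dequantize_layer->setAxis(0);

  network->markOutput(*dequantize_layer->getOutput(0));
  std::cout << "Layers in network: " << network->getNbLayers() << std::endl;
  // Engine build and cleanup omitted for brevity.
  return 0;
}
```

At build time TensorRT reads the INT8 scales from these Q/DQ nodes, which is why the ConversionCtx change earlier in this commit no longer requires a calibrator when none is supplied.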

core/conversion/evaluators/aten.cpp

Lines changed: 16 additions & 0 deletions
@@ -143,6 +143,22 @@ auto aten_registrations TRTORCH_UNUSED =
                     auto out_tensor = torch::ones(args.at(n->input(0)).unwrapToIntList().vec(), options);
                     return out_tensor;
                   }})
+        .evaluator({c10::Symbol::fromQualString("aten::full"),
+                    // aten::full(int[] size, Scalar fill_value, *, int? dtype=None, int? layout=None,
+                    // Device? device=None, bool? pin_memory=None) -> (Tensor)
+                    [](const torch::jit::Node* n, kwargs& args) -> c10::optional<torch::jit::IValue> {
+                      auto options = torch::TensorOptions().layout(torch::kStrided).device(torch::kCUDA);
+
+                      // Input 2 here is the dtype
+                      if (!args.at(n->input(2)).isNone() && !args.at(n->input(2)).IValue()->isNone()) {
+                        options = options.dtype(c10::ScalarType(args.at(n->input(2)).unwrapToInt()));
+                      }
+
+                      auto scalar_value = args.at(n->input(1)).unwrapToScalar().to<float>();
+                      auto out_tensor =
+                          torch::full(args.at(n->input(0)).unwrapToIntList().vec(), scalar_value, options);
+                      return out_tensor;
+                    }})
         .evaluator({c10::Symbol::fromQualString("aten::slice"),
                     [](const torch::jit::Node* n, kwargs& args) -> c10::optional<torch::jit::IValue> {
                       c10::List<c10::IValue> list = args.at(n->input(0)).IValue()->to<c10::List<c10::IValue>>();
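Note: the new aten::full evaluator materializes the tensor at compile time via torch::full, pinning layout to strided and device to CUDA, and applying a dtype only when input 2 is not None. A minimal standalone sketch of the equivalent call (illustrative only, not from the commit):

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  // Equivalent of evaluating aten::full([2, 3], 1.5) with no dtype argument.
  // The evaluator above additionally pins .device(torch::kCUDA); that option is
  // left out here so the sketch also runs on CPU-only builds.
  auto options = torch::TensorOptions().layout(torch::kStrided);
  auto t = torch::full({2, 3}, /*fill_value=*/1.5, options);
  std::cout << t << std::endl;
  return 0;
}
```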

core/lowering/lowering.cpp

Lines changed: 18 additions & 11 deletions
@@ -24,7 +24,7 @@ void LowerBlock(torch::jit::Block* b) {
   DropUnusedNodes(b);
 }

-void LowerGraph(std::shared_ptr<torch::jit::Graph>& g) {
+void LowerGraph(std::shared_ptr<torch::jit::Graph>& g, LowerInfo lower_info) {
   passes::UnpackHardSwish(g);
   torch::jit::EliminateRedundantGuards(g);
   torch::jit::RemoveListMutation(g);
@@ -43,9 +43,10 @@ void LowerGraph(std::shared_ptr<torch::jit::Graph>& g) {
   passes::Conv3DToConvolution(g);
   passes::FuseAddMMBranches(g);
   passes::RemoveBNDimCheck(g);
-  torch::jit::EliminateCommonSubexpression(g);
+  if (!lower_info.disable_cse) {
+    torch::jit::EliminateCommonSubexpression(g);
+  }
   // torch::jit::UnrollLoops(g);
-  torch::jit::EliminateCommonSubexpression(g);
   passes::UnpackAddMM(g);
   // passes::UnpackBatchNorm(g);
   passes::UnpackLogSoftmax(g);
@@ -59,26 +60,32 @@ void LowerGraph(std::shared_ptr<torch::jit::Graph>& g) {
 }

 torch::jit::Module LowerModule(const torch::jit::script::Module& mod) {
+  LOG_DEBUG("Input module is being frozen by torch::jit::freeze_module");
   auto mod_ = torch::jit::freeze_module(mod);
   return mod_;
 }

 std::pair<std::shared_ptr<torch::jit::Graph>, std::vector<torch::jit::IValue>> Lower(
     const torch::jit::script::Module& mod,
-    std::string method_name) {
-  auto lowered_mod = LowerModule(mod);
+    std::string method_name,
+    LowerInfo lower_info) {
+  auto lowered_mod = lower_info.unfreeze_module ? mod : LowerModule(mod);
   auto g = lowered_mod.get_method(method_name).graph();
   LOG_GRAPH(*g);

-  // Go through TRTorch Lowering to reformat graph to be conversion friendly
-  // and also segment for accelerators and executors (TRT-DLA, TRT-GPU, PYT)
-  LOG_GRAPH("TRTorch Graph Lowering");
-  lowering::LowerGraph(g);
-  //=[torch::jit::FoldConvBatchNorm2d(lowered_mod);
   LOG_GRAPH("LibTorch Lowering");
   auto graph_and_ivalues = torch::jit::LowerGraph(*g, lowered_mod._ivalue());
+
+  // Go through TRTorch Lowering to reformat graph to be conversion friendly
+  // and also segment for accelerators and executors (TRT-DLA, TRT-GPU , PYT)
+  // unfreeze_module is used to not perform constant folding on weights in the network.
+  // In quantization aware trained (QAT) models, weights are passed through quantize and
+  // dequantize nodes which should not be folded. So unfreeze_module is set to True for QAT models.
+  LOG_GRAPH("TRTorch Graph Lowering");
+  lowering::LowerGraph(graph_and_ivalues.first, lower_info);
+
   // Is this necessary?
-  lowering::LowerBlock(g->block());
+  // lowering::LowerBlock(g->block());

   return graph_and_ivalues;
 }
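Note: the lowering changes read two LowerInfo fields: unfreeze_module, which skips torch::jit::freeze_module so Q/DQ nodes on QAT weights are not constant-folded, and disable_cse, which skips common-subexpression elimination. The struct itself lives in core/lowering/lowering.h, which is not shown in this excerpt; a hypothetical reconstruction based only on the fields referenced here might look like the sketch below (the real definition in the commit may differ):

```cpp
// Hypothetical sketch of LowerInfo, inferred only from the fields this diff
// reads (lower_info.disable_cse, lower_info.unfreeze_module). The actual
// definition added in core/lowering/lowering.h by this commit may differ.
namespace trtorch {
namespace core {
namespace lowering {

struct LowerInfo {
  // When true, skip torch::jit::EliminateCommonSubexpression during lowering.
  bool disable_cse = false;
  // When true, skip torch::jit::freeze_module so QAT weights keep their
  // quantize/dequantize nodes instead of being constant-folded.
  bool unfreeze_module = false;
};

} // namespace lowering
} // namespace core
} // namespace trtorch
```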
