pytorch
diff --git a/‎.bazelrc
Lines changed: 4 additions & 4 deletions b/‎.bazelrc
Lines changed: 4 additions & 4 deletions
diff --git a/‎.github/workflows/docgen.yml
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/docgen.yml
Lines changed: 1 addition & 0 deletions
diff --git a/‎core/conversion/converters/impl/activation.cpp
Lines changed: 0 additions & 32 deletions b/‎core/conversion/converters/impl/activation.cpp
Lines changed: 0 additions & 32 deletions
diff --git a/‎core/lowering/lowering.cpp
Lines changed: 1 addition & 0 deletions b/‎core/lowering/lowering.cpp
Lines changed: 1 addition & 0 deletions
diff --git a/‎core/lowering/passes/BUILD
Lines changed: 1 addition & 0 deletions b/‎core/lowering/passes/BUILD
Lines changed: 1 addition & 0 deletions
diff --git a/‎core/lowering/passes/passes.h
Lines changed: 1 addition & 0 deletions b/‎core/lowering/passes/passes.h
Lines changed: 1 addition & 0 deletions
diff --git a/‎core/lowering/passes/reduce_gelu.cpp
Lines changed: 44 additions & 0 deletions b/‎core/lowering/passes/reduce_gelu.cpp
Lines changed: 44 additions & 0 deletions
diff --git a/‎core/partitioning/README.md
Lines changed: 5 additions & 8 deletions b/‎core/partitioning/README.md
Lines changed: 5 additions & 8 deletions
@@ -35,8 +35,8 @@ build:pre_cxx11_abi --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0"
 build:pre_cxx11_abi --linkopt="-D_GLIBCXX_USE_CXX11_ABI=0"
 build:pre_cxx11_abi --define=abi=pre_cxx11_abi
 
-build:ci_testing --define=torchtrt_src=pre_built --cxxopt="-DDISABLE_TEST_IN_CI" --action_env "NVIDIA_TF32_OVERRIDE=0"
-build:use_precompiled_torchtrt --define=torchtrt_src=pre_built
+build:ci_testing --define=torchtrt_src=prebuilt --cxxopt="-DDISABLE_TEST_IN_CI" --action_env "NVIDIA_TF32_OVERRIDE=0"
+build:use_precompiled_torchtrt --define=torchtrt_src=prebuilt
 
-test:ci_testing --define=torchtrt_src=pre_built --cxxopt="-DDISABLE_TEST_IN_CI" --action_env "NVIDIA_TF32_OVERRIDE=0"
-test:use_precompiled_torchtrt --define=torchtrt_src=pre_built
+test:ci_testing --define=torchtrt_src=prebuilt --cxxopt="-DDISABLE_TEST_IN_CI" --action_env "NVIDIA_TF32_OVERRIDE=0"
+test:use_precompiled_torchtrt --define=torchtrt_src=prebuilt
@@ -36,6 +36,7 @@ jobs:
       - name: Generate New Docs
         run: |
           cd docsrc
+          pip3 install -r requirements.txt
           python3 -c "import torch_tensorrt; print(torch_tensorrt.__version__)"
           make html
       - uses: stefanzweifel/git-auto-commit-action@v4
 
@@ -166,39 +166,7 @@ auto acthardtanh TORCHTRT_UNUSED =
                     auto out_tensor = ctx->AssociateValueAndTensor(n->outputs()[0], new_layer->getOutput(0));
                     LOG_DEBUG("Output shape: " << out_tensor->getDimensions());
                     return true;
-                  }})
-        .pattern({"aten::gelu(Tensor self) -> (Tensor)",
-                  [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
-                    auto in = args[0].ITensorOrFreeze(ctx);
-                    nvinfer1::DataType type = in->getType();
-                    TORCHTRT_CHECK(
-                        type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF,
-                        "gelu only supports kFLOAT and kHALF");
-                    std::string pluginName = "CustomGeluPluginDynamic";
-                    nvinfer1::PluginFieldCollection fc;
-                    std::vector<nvinfer1::PluginField> f;
-                    // REVIEW is this right?
-                    int type_id = ctx->settings.enabled_precisions.find(nvinfer1::DataType::kHALF) ==
-                            ctx->settings.enabled_precisions.end()
-                        ? 0
-                        : 1; // Integer encoding the DataType (0: FP32, 1: FP16)
-                    f.emplace_back(nvinfer1::PluginField("type_id", &type_id, nvinfer1::PluginFieldType::kINT32, 1));
-                    fc.nbFields = f.size();
-                    fc.fields = f.data();
-
-                    auto creator = getPluginRegistry()->getPluginCreator("CustomGeluPluginDynamic", "1", "");
-                    auto gelu_plugin = creator->createPlugin("gelu", &fc);
-
-                    TORCHTRT_CHECK(gelu_plugin, "Unable to create gelu plugin from TensorRT plugin registry" << *n);
-                    auto new_layer =
-                        ctx->net->addPluginV2(reinterpret_cast<nvinfer1::ITensor* const*>(&in), 1, *gelu_plugin);
-                    new_layer->setName(util::node_info(n).c_str());
-                    auto out_tensor = new_layer->getOutput(0);
-                    out_tensor = ctx->AssociateValueAndTensor(n->outputs()[0], out_tensor);
-                    LOG_DEBUG("Output shape: " << out_tensor->getDimensions());
-                    return true;
                   }});
-
 } // namespace
 } // namespace impl
 } // namespace converters
 
@@ -43,6 +43,7 @@ void LowerGraph(std::shared_ptr<torch::jit::Graph>& g, LowerInfo lower_info) {
   passes::UnpackHardSwish(g);
   passes::EliminateExceptionOrPassPattern(g);
   passes::ReduceToOperation(g);
+  passes::ReduceGelu(g);
   passes::RemoveContiguous(g);
   passes::RemoveDropout(g);
   passes::LinearToAddMM(g);
 
@@ -17,6 +17,7 @@ cc_library(
         "module_fallback.cpp",
         "op_aliasing.cpp",
         "reduce_to.cpp",
+        "reduce_gelu.cpp",
         "remove_bn_dim_check.cpp",
         "remove_contiguous.cpp",
         "remove_dropout.cpp",
 
@@ -20,6 +20,7 @@ void FuseAddMMBranches(std::shared_ptr<torch::jit::Graph> graph);
 void LinearToAddMM(std::shared_ptr<torch::jit::Graph>& graph);
 void EliminateExceptionOrPassPattern(std::shared_ptr<torch::jit::Graph> graph);
 void ReduceToOperation(std::shared_ptr<torch::jit::Graph>& graph);
+void ReduceGelu(std::shared_ptr<torch::jit::Graph>& graph);
 void MarkNodesForFallback(std::shared_ptr<torch::jit::Graph>& g, bool delete_delims);
 void RemoveBNDimCheck(std::shared_ptr<torch::jit::Graph> graph);
 void RemoveContiguous(std::shared_ptr<torch::jit::Graph>& graph);
 
@@ -0,0 +1,44 @@
+#include <torch/csrc/jit/passes/subgraph_rewrite.h>
+#include "core/util/prelude.h"
+
+namespace torch_tensorrt {
+namespace core {
+namespace lowering {
+namespace passes {
+
+void ReduceGelu(std::shared_ptr<torch::jit::Graph>& graph) { // Lowering pass: rewrites aten::gelu nodes in `graph` into pointwise ops.
+  std::string gelu_pattern = R"IR(
+        graph(%x):
+            %out : Tensor = aten::gelu(%x)
+            return (%out))IR"; // Match pattern: a single-argument aten::gelu call.
+
+  std::string gelu_reduce_pattern = R"IR(
+    graph(%x.1 : Tensor):
+        %6 : float = prim::Constant[value=0.044714999999999998]()
+        %5 : float = prim::Constant[value=0.79788456080000003]()
+        %4 : float = prim::Constant[value=1.]()
+        %3 : float = prim::Constant[value=0.5]()
+        %2 : int = prim::Constant[value=1]()
+        %7 : Tensor = aten::mul(%x.1, %3)
+        %8 : Tensor = aten::mul(%x.1, %5)
+        %9 : Tensor = aten::mul(%x.1, %6)
+        %10 : Tensor = aten::mul(%9, %x.1)
+        %11 : Tensor = aten::add(%10, %4, %2)
+        %12 : Tensor = aten::mul(%8, %11)
+        %13 : Tensor = aten::tanh(%12)
+        %14 : Tensor = aten::add(%13, %4, %2)
+        %15 : Tensor = aten::mul(%7, %14)
+        return (%15))IR"; // Replacement: (0.5*x) * (1 + tanh((0.7978845608*x) * (1 + 0.044715*x*x))); 0.7978845608 ~= sqrt(2/pi). NOTE(review): this is the tanh form of GELU, so outputs may differ slightly from an erf-based aten::gelu -- confirm the tolerance is acceptable.
+
+  // Swap every aten::gelu match for the mul/add/tanh subgraph defined in gelu_reduce_pattern.
+  torch::jit::SubgraphRewriter map_gelu_to_pointwise_ops;
+  map_gelu_to_pointwise_ops.RegisterRewritePattern(gelu_pattern, gelu_reduce_pattern); // (pattern to find, pattern to substitute)
+  map_gelu_to_pointwise_ops.runOnGraph(graph); // Runs the registered rewrite over `graph`.
+
+  LOG_GRAPH("Post lowering of [aten::gelu] -> " << *graph); // Dump the graph after the pass for debugging.
+}
+
+} // namespace passes
+} // namespace lowering
+} // namespace core
+} // namespace torch_tensorrt
@@ -34,11 +34,9 @@ To enable automatic fallback feature, you can set following attributes in Python
   ts_model = torch.jit.script(model)
   trt_model = torchtrt.ts.compile(model, **{
     ...
-    "torch_fallback" : {
-      "enabled" : True,
-      "min_block_size" : 3,
-      "forced_fallback_ops": ["aten::add"],
-    }
+    "min_block_size" : 3,
+    "torch_executed_ops": ["aten::add"],
+    "torch_executed_modules": [],
   })
 ```
 - `enabled`: By default automatic fallback will be off. It is enabled by setting it to True.
@@ -59,9 +57,8 @@ auto in = torch::randn({1, 3, 224, 224}, {torch::kCUDA});
 auto mod = torch::jit::load("trt_ts_module.ts");
 auto input_sizes =  std::vector<torchtrt::InputRange>{{in.sizes()}};
 torchtrt::ts::CompileSpec cfg(input_sizes);
-cfg.torch_fallback = torchtrt::CompileSpec::TorchFallback(true);
-cfg.torch_fallback.min_block_size = 2;
-cfg.torch_fallback.forced_fallback_ops.push_back("aten::relu");
+cfg.min_block_size = 2;
+cfg.torch_executed_ops.push_back("aten::relu");
 auto trt_mod = torchtrt::ts::compile(mod, cfg);
 auto out = trt_mod.forward({in});
 ```