Commit 70c8393

feat: Saving modules using the AOTI format (#3567)

1 parent 78b56b4 commit 70c8393

File tree

12 files changed: +545 additions, -23 deletions


.gitignore

Lines changed: 3 additions & 1 deletion
@@ -78,4 +78,6 @@ MODULE.bazel.lock
 *.whl
 .coverage
 coverage.xml
-*.log
+*.log
+*.pt2
+examples/torchtrt_aoti_example/torchtrt_aoti_example

docsrc/user_guide/runtime.rst

Lines changed: 130 additions & 6 deletions
@@ -24,7 +24,7 @@ programs just as you would otherwise via PyTorch API.
 
 .. note:: If you are linking ``libtorchtrt_runtime.so``, likely using the following flags will help ``-Wl,--no-as-needed -ltorchtrt -Wl,--as-needed`` as there's no direct symbol dependency to anything in the Torch-TensorRT runtime for most Torch-TensorRT runtime applications
 
-An example of how to use ``libtorchtrt_runtime.so`` can be found here: https://github.com/pytorch/TensorRT/tree/master/examples/torchtrt_runtime_example
+An example of how to use ``libtorchtrt_runtime.so`` can be found here: https://github.com/pytorch/TensorRT/tree/master/examples/torchtrt_aoti_example
 
 Plugin Library
 ---------------
@@ -87,8 +87,8 @@ Cudagraphs can accelerate certain models by reducing kernel overheads, as docume
     with torch_tensorrt.runtime.enable_cudagraphs(trt_module):
         ...
 
-In the current implementation, use of a new input shape (for instance in dynamic shape
-cases), will cause the cudagraph to be re-recorded. Cudagraph recording is generally
+In the current implementation, use of a new input shape (for instance in dynamic shape
+cases), will cause the cudagraph to be re-recorded. Cudagraph recording is generally
 not latency intensive, and future improvements include caching cudagraphs for multiple input shapes.
 
 Dynamic Output Allocation Mode
@@ -101,11 +101,11 @@ Without dynamic output allocation, the output buffer is allocated based on the i
 
 There are two scenarios in which dynamic output allocation is enabled:
 
-1. The model has been identified at compile time to require dynamic output allocation for at least one TensorRT subgraph.
-   These models will engage the runtime mode automatically (with logging) and are incompatible with other runtime modes
+1. The model has been identified at compile time to require dynamic output allocation for at least one TensorRT subgraph.
+   These models will engage the runtime mode automatically (with logging) and are incompatible with other runtime modes
    such as CUDA Graphs.
 
-Converters can declare that subgraphs that they produce will require the output allocator using `requires_output_allocator=True`
+Converters can declare that subgraphs that they produce will require the output allocator using `requires_output_allocator=True`
 there by forcing any model which utilizes the converter to automatically use the output allocator runtime mode. e.g.,
 
 .. code-block:: python
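The Python example referenced by this directive is unchanged context and therefore not shown in the diff. As a rough sketch only, a converter opting into the output allocator might be registered along these lines; the ``dynamo_tensorrt_converter`` decorator, its import path, the chosen op, and the signature are assumptions rather than details confirmed by this commit:

    import torch
    from torch_tensorrt.dynamo.conversion import dynamo_tensorrt_converter  # assumed import path

    # Registering the converter with requires_output_allocator=True marks any TensorRT
    # subgraph containing this op as needing dynamic output allocation, so models that
    # use it automatically run in the output allocator runtime mode.
    @dynamo_tensorrt_converter(
        torch.ops.aten.nonzero.default,  # illustrative choice of a data-dependent op
        requires_output_allocator=True,
    )
    def aten_ops_nonzero(ctx, target, args, kwargs, name):
        # Conversion logic producing TensorRT layers would go here.
        ...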
@@ -131,3 +131,127 @@ there by forcing any model which utilizes the converter to automatically use the
     # Enables Dynamic Output Allocation Mode, then resets the mode to its prior setting
     with torch_tensorrt.runtime.enable_output_allocator(trt_module):
         ...
+
+Deploying Torch-TensorRT Programs without Python
+--------------------------------------------------------
+
+AOT-Inductor
+~~~~~~~~~~~~~~~~
+
+AOTInductor is a specialized version of TorchInductor, designed to process exported PyTorch models, optimize them, and produce shared
+libraries as well as other relevant artifacts. These compiled artifacts are specifically crafted for deployment in non-Python environments,
+which are frequently employed for inference deployments on the server side.
+
+Torch-TensorRT is able to accelerate subgraphs within AOTInductor exports in the same way it does in Python.
+
+.. code-block:: py
+
+    dynamo_model = torch_tensorrt.compile(model, ir="dynamo", arg_inputs=[...])
+    torch_tensorrt.save(
+        dynamo_model,
+        file_path=os.path.join(os.getcwd(), "model.pt2"),
+        output_format="aot_inductor",
+        retrace=True,
+        arg_inputs=[...],
+    )
+
+This artifact can then be loaded in a C++ application and executed without a Python dependency.
+
+.. code-block:: c++
+
+    #include <iostream>
+    #include <vector>
+
+    #include "torch/torch.h"
+    #include "torch/csrc/inductor/aoti_package/model_package_loader.h"
+
+    int main(int argc, const char* argv[]) {
+        // Use model.pt2 by default; optionally take the package path from the command line
+        std::string trt_aoti_module_path = "model.pt2";
+
+        if (argc == 2) {
+            trt_aoti_module_path = argv[1];
+        }
+
+        std::cout << trt_aoti_module_path << std::endl;
+
+        // Run under inference mode (no autograd bookkeeping)
+        c10::InferenceMode mode;
+
+        torch::inductor::AOTIModelPackageLoader loader(trt_aoti_module_path);
+        // Assume running on CUDA
+        std::vector<torch::Tensor> inputs = {torch::randn({8, 10}, at::kCUDA)};
+        std::vector<torch::Tensor> outputs = loader.run(inputs);
+        std::cout << "Result from the first inference:" << std::endl;
+        std::cout << outputs << std::endl;
+
+        // The second inference uses a different batch size and it works because we
+        // specified that dimension as dynamic when compiling model.pt2.
+        std::cout << "Result from the second inference:" << std::endl;
+        // Assume running on CUDA
+        std::cout << loader.run({torch::randn({1, 10}, at::kCUDA)}) << std::endl;
+
+        return 0;
+    }
+
+Note: Similar to Python, at runtime, no Torch-TensorRT APIs are used to operate the model. Therefore, additional linker
+flags are typically needed to make sure that ``libtorchtrt_runtime.so`` does not get optimized out (see above).
+
+See: ``//examples/torchtrt_aoti_example`` for a full end-to-end demo of this workflow
+
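The second inference in the C++ sample above only works because the batch dimension of ``model.pt2`` was declared dynamic at compile time, which this diff does not show. A minimal Python-side sketch, assuming ``torch_tensorrt.Input`` with ``min_shape``/``opt_shape``/``max_shape`` and a toy [N, 10] model standing in for the real one; whether the retraced export needs further dynamic-shape arguments is not covered here:

    import os

    import torch
    import torch_tensorrt

    # Toy model taking [N, 10] inputs, standing in for the model used by the example
    model = torch.nn.Sequential(torch.nn.Linear(10, 5)).eval().cuda()

    # Allow the batch dimension to vary between 1 and 16, with 8 as the optimization point,
    # matching the randn({8, 10}) / randn({1, 10}) inputs in the C++ sample
    batch_dynamic_input = torch_tensorrt.Input(
        min_shape=(1, 10),
        opt_shape=(8, 10),
        max_shape=(16, 10),
        dtype=torch.float32,
    )

    dynamo_model = torch_tensorrt.compile(model, ir="dynamo", arg_inputs=[batch_dynamic_input])
    torch_tensorrt.save(
        dynamo_model,
        file_path=os.path.join(os.getcwd(), "model.pt2"),
        output_format="aot_inductor",
        retrace=True,
        arg_inputs=[torch.randn(8, 10).cuda()],
    )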
+TorchScript
+~~~~~~~~~~~~~~
+
+TorchScript is a legacy compiler stack for PyTorch that includes a Python-less interpreter for TorchScript programs.
+It has historically been used by Torch-TensorRT to execute models without Python. Even after the transition to TorchDynamo,
+the TorchScript interpreter can continue to be used to run PyTorch models with TensorRT engines outside of Python.
+
+.. code-block:: py
+
+    dynamo_model = torch_tensorrt.compile(model, ir="dynamo", arg_inputs=[...])
+    ts_model = torch.jit.trace(dynamo_model, inputs=[...])
+    torch.jit.save(ts_model, os.path.join(os.getcwd(), "model.ts"))
+
+This artifact can then be loaded in a C++ application and executed without a Python dependency.
+
+.. code-block:: c++
+
+    #include <fstream>
+    #include <iostream>
+    #include <memory>
+    #include <sstream>
+    #include <vector>
+    #include "torch/script.h"
+
+    int main(int argc, const char* argv[]) {
+        if (argc < 2) {
+            std::cerr << "usage: samplertapp <path-to-pre-built-trt-ts module>\n";
+            return -1;
+        }
+
+        std::string trt_ts_module_path = argv[1];
+
+        torch::jit::Module trt_ts_mod;
+        try {
+            // Deserialize the ScriptModule from a file using torch::jit::load().
+            trt_ts_mod = torch::jit::load(trt_ts_module_path);
+        } catch (const c10::Error& e) {
+            std::cerr << "error loading the model from: " << trt_ts_module_path << std::endl;
+            return -1;
+        }
+
+        std::cout << "Running TRT engine" << std::endl;
+        std::vector<torch::jit::IValue> trt_inputs_ivalues;
+        trt_inputs_ivalues.push_back(at::randint(-5, 5, {1, 3, 5, 5}, {at::kCUDA}).to(torch::kFloat32));
+        torch::jit::IValue trt_results_ivalues = trt_ts_mod.forward(trt_inputs_ivalues);
+        std::cout << "==================TRT outputs================" << std::endl;
+        std::cout << trt_results_ivalues << std::endl;
+        std::cout << "=============================================" << std::endl;
+        std::cout << "TRT engine execution completed." << std::endl;
+    }
+
+Note: Similar to Python, at runtime, no Torch-TensorRT APIs are used to operate the model. Therefore, additional linker
+flags are typically needed to make sure that ``libtorchtrt_runtime.so`` does not get optimized out (see above).
+
+See: ``//examples/torchtrt_runtime_example`` for a full end-to-end demo of this workflow
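As a quick check before wiring up the C++ application, the TorchScript artifact saved above can also be loaded back with stock PyTorch APIs. A minimal sketch, assuming the module was saved as ``model.ts`` and takes a single ``(1, 3, 224, 224)`` CUDA input like the other examples in this commit:

    import torch
    import torch_tensorrt  # noqa: F401 -- importing registers the TensorRT engine execution op the module calls

    # Load the TorchScript artifact produced by torch.jit.save above
    ts_model = torch.jit.load("model.ts").cuda()

    example_input = torch.randn(1, 3, 224, 224).cuda()  # assumed input shape
    with torch.no_grad():
        print(ts_model(example_input).shape)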

docsrc/user_guide/saving_models.rst

Lines changed: 29 additions & 5 deletions
@@ -14,12 +14,13 @@ Saving models compiled with Torch-TensorRT can be done using `torch_tensorrt.sav
 Dynamo IR
 -------------
 
-The output type of `ir=dynamo` compilation of Torch-TensorRT is `torch.fx.GraphModule` object by default.
-We can save this object in either `TorchScript` (`torch.jit.ScriptModule`) or `ExportedProgram` (`torch.export.ExportedProgram`) formats by
+The output type of `ir=dynamo` compilation of Torch-TensorRT is `torch.fx.GraphModule` object by default.
+We can save this object in either `TorchScript` (`torch.jit.ScriptModule`), `ExportedProgram` (`torch.export.ExportedProgram`) or `PT2` formats by
 specifying the `output_format` flag. Here are the options `output_format` will accept
 
 * `exported_program` : This is the default. We perform transformations on the graphmodule first and use `torch.export.save` to save the module.
 * `torchscript` : We trace the graphmodule via `torch.jit.trace` and save it via `torch.jit.save`.
+* `aot_inductor` : Saves the module in the PT2 format using AOTInductor, a next generation runtime for PyTorch models, allowing them to run in Python and in C++.
 
 a) ExportedProgram
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -52,8 +53,8 @@ b) Torchscript
     model = MyModel().eval().cuda()
     inputs = [torch.randn((1, 3, 224, 224)).cuda()]
     # trt_gm is a torch.fx.GraphModule object
-    trt_gm = torch_tensorrt.compile(model, ir="dynamo", inputs=inputs)
-    torch_tensorrt.save(trt_gm, "trt.ts", output_format="torchscript", inputs=inputs)
+    trt_gm = torch_tensorrt.compile(model, ir="dynamo", arg_inputs=inputs)
+    torch_tensorrt.save(trt_gm, "trt.ts", output_format="torchscript", arg_inputs=inputs)
 
     # Later, you can load it and run inference
     model = torch.jit.load("trt.ts").cuda()
@@ -73,7 +74,7 @@ For `ir=ts`, this behavior stays the same in 2.X versions as well.
 
     model = MyModel().eval().cuda()
     inputs = [torch.randn((1, 3, 224, 224)).cuda()]
-    trt_ts = torch_tensorrt.compile(model, ir="ts", inputs=inputs) # Output is a ScriptModule object
+    trt_ts = torch_tensorrt.compile(model, ir="ts", arg_inputs=inputs) # Output is a ScriptModule object
     torch.jit.save(trt_ts, "trt_model.ts")
 
     # Later, you can load it and run inference
@@ -98,3 +99,26 @@ Here's an example usage
     inputs = [torch.randn((1, 3, 224, 224)).cuda()]
     model = torch_tensorrt.load(<file_path>).module()
     model(*inputs)
+
+b) PT2 Format
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+PT2 is a new format that allows models to be run outside of Python. It utilizes `AOTInductor <https://docs.pytorch.org/docs/main/torch.compiler_aot_inductor.html>`_
+to generate kernels for components that will not be run in TensorRT.
+
+Here's an example of how to save and load a Torch-TensorRT module using AOTInductor in Python
+
+.. code-block:: python
+
+    import torch
+    import torch_tensorrt
+
+    model = MyModel().eval().cuda()
+    inputs = [torch.randn((1, 3, 224, 224)).cuda()]
+    # trt_gm is a torch.fx.GraphModule object
+    trt_gm = torch_tensorrt.compile(model, ir="dynamo", inputs=inputs)
+    torch_tensorrt.save(trt_gm, "trt.pt2", arg_inputs=inputs, output_format="aot_inductor", retrace=True)
+
+    # Later, you can load it and run inference
+    model = torch._inductor.aoti_load_package("trt.pt2")
+    model(*inputs)
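A usage sketch that follows naturally from the example above: reload the saved package and run it next to the in-memory compiled module. The stand-in model is an assumption, since the docs' ``MyModel`` is not defined in this diff:

    import torch
    import torch_tensorrt

    # Small stand-in model; the docs' MyModel is not defined in this diff
    model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, kernel_size=3)).eval().cuda()
    inputs = [torch.randn((1, 3, 224, 224)).cuda()]

    trt_gm = torch_tensorrt.compile(model, ir="dynamo", arg_inputs=inputs)
    torch_tensorrt.save(trt_gm, "trt.pt2", arg_inputs=inputs, output_format="aot_inductor", retrace=True)

    # Reload the PT2 package and run it alongside the in-memory compiled module;
    # the two results should agree up to small numerical differences.
    loaded = torch._inductor.aoti_load_package("trt.pt2")
    with torch.no_grad():
        print(trt_gm(*inputs))
        print(loaded(*inputs))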

examples/torchtrt_aoti_example/BUILD

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+load("@rules_cc//cc:defs.bzl", "cc_binary")
+
+package(default_visibility = ["//visibility:public"])
+
+config_setting(
+    name = "use_torch_whl",
+    flag_values = {
+        "//toolchains/dep_src:torch": "whl"
+    },
+)
+
+config_setting(
+    name = "jetpack",
+    constraint_values = [
+        "@platforms//cpu:aarch64",
+    ],
+    flag_values = {
+        "//toolchains/dep_collection:compute_libs": "jetpack"
+    },
+)
+config_setting(
+    name = "windows",
+    constraint_values = [
+        "@platforms//os:windows",
+    ],
+)
+
+cc_binary(
+    name = "torchtrt_aoti_example",
+    srcs = [
+        "inference.cpp"
+    ],
+    linkopts = [
+        "-ldl",
+    ],
+    deps = [
+        "//cpp:torch_tensorrt",
+    ] + select({
+        ":windows": [
+            "@libtorch_win//:caffe2",
+            "@libtorch_win//:libtorch"
+        ],
+        ":use_torch_whl": [
+            "@torch_whl//:caffe2",
+            "@torch_whl//:libtorch"
+        ],
+        ":jetpack": [
+            "@torch_l4t//:caffe2",
+            "@torch_l4t//:libtorch"
+        ],
+        "//conditions:default": [
+            "@libtorch",
+            "@libtorch//:caffe2",
+        ],
+    }),
+)

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+project(torchtrt_aoti_example LANGUAGES CXX)
+
+find_package(Torch REQUIRED)
+find_package(torchtrt REQUIRED)
+
+add_executable(torchtrt_aoti_example inference.cpp model.pt2)
+
+add_custom_command(
+    OUTPUT model.pt2
+    COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/model.py
+    DEPENDS model.py
+)
+
+target_link_libraries(torchtrt_aoti_example "${TORCH_LIBRARIES}" "-Wl,--no-as-needed" torchtrt_runtime "-Wl,--as-needed")
+set_property(TARGET torchtrt_aoti_example PROPERTY CXX_STANDARD 17)

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+CXX=g++
+SITE_PACKAGES=$(shell python -c 'import site; print(site.getsitepackages()[0])')
+CUDA_HOME=/usr/local/cuda-12.8
+
+INCLUDE_DIRS=-I$(SITE_PACKAGES)/torch/include -I$(SITE_PACKAGES)/torch_tensorrt/include -I$(CUDA_HOME)/include -I$(SITE_PACKAGES)/torch/include/torch/csrc/api/include
+
+LIB_DIRS=-L$(SITE_PACKAGES)/torch_tensorrt/lib -L$(SITE_PACKAGES)/torch/lib -Wl,-rpath $(SITE_PACKAGES)/tensorrt_libs -L/home/naren/pytorch_org/tensorrt/py/torch_tensorrt/lib
+LIBS=-Wl,--no-as-needed -ltorchtrt_runtime -ltorchtrt_plugins -Wl,--as-needed -ltorch -ltorch_cuda -ltorch_cpu -ltorch_global_deps -ltorch_cuda_linalg -lc10 -lc10_cuda -lshm -ltorch_global_deps -ltorch_python
+
+SRCS=inference.cpp
+
+TARGET=torchtrt_aoti_example
+
+$(TARGET): *cpp
+	$(CXX) $(SRCS) $(INCLUDE_DIRS) $(LIB_DIRS) $(LIBS) -o $(TARGET)
+	echo "\n\nAdd to LD_LIBRARY_PATH: $(SITE_PACKAGES)/torch_tensorrt/lib:$(SITE_PACKAGES)/torch/lib:$(SITE_PACKAGES)/tensorrt_libs:$(CUDA_HOME)/lib64"
+
+generate_pt2:
+	python model.py
+
+clean:
+	$(RM) $(TARGET)
