Skip to content

Commit 19fc7a7

Browse files
committed
BREAKING CHANGE: Remove deprecated APIs like setNumMinTimingIters and add DLA memory size configurations
Signed-off-by: Dheeraj Peri <[email protected]>
1 parent 2895fb8 commit 19fc7a7

File tree

11 files changed

+45
-30
lines changed

11 files changed

+45
-30
lines changed

core/conversion/conversionctx/ConversionCtx.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ struct BuilderSettings {
3333
Device device;
3434
nvinfer1::EngineCapability capability = TRT_ENGINE_CAPABILITY_STANDARD;
3535
nvinfer1::IInt8Calibrator* calibrator = nullptr;
36-
uint64_t num_min_timing_iters = 2;
3736
uint64_t num_avg_timing_iters = 1;
3837
uint64_t workspace_size = 0;
3938
uint64_t dla_sram_size = 1048576;

cpp/bin/torchtrtc/README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,13 +82,17 @@ torchtrtc [input_file_path] [output_file_path]
8282
serialized TensorRT engine and embed it
8383
into a TorchScript module (device spec
8484
must be provided)
85-
--num-min-timing-iter=[num_iters] Number of minimization timing iterations
86-
used to select kernels
8785
--num-avg-timing-iters=[num_iters]
8886
Number of averaging timing iterations
8987
used to select kernels
9088
--workspace-size=[workspace_size] Maximum size of workspace given to
9189
TensorRT
90+
--dla-sram-size=[dla_sram_size] Fast software managed RAM used by DLA
91+
to communicate within a layer.
92+
--dla-local-dram-size=[dla_local_dram_size] Host RAM used by DLA to share
93+
intermediate tensor data across operations.
94+
--dla-global-dram-size=[dla_global_dram_size] Host RAM used by DLA to store
95+
weights and metadata for execution
9296
--atol=[atol] Absolute tolerance threshold for acceptable
9397
numerical deviation from standard torchscript
9498
output (default 1e-8)

cpp/bin/torchtrtc/main.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -365,10 +365,6 @@ int main(int argc, char** argv) {
365365
}
366366
}
367367

368-
if (num_min_timing_iters) {
369-
compile_settings.num_min_timing_iters = args::get(num_min_timing_iters);
370-
}
371-
372368
if (num_avg_timing_iters) {
373369
compile_settings.num_avg_timing_iters = args::get(num_avg_timing_iters);
374370
}

cpp/include/torch_tensorrt/torch_tensorrt.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -636,10 +636,6 @@ struct TORCHTRT_API CompileSpec {
636636
*/
637637
EngineCapability capability = EngineCapability::kSTANDARD;
638638

639-
/**
640-
* Number of minimization timing iterations used to select kernels
641-
*/
642-
uint64_t num_min_timing_iters = 2;
643639
/**
644640
* Number of averaging timing iterations used to select kernels
645641
*/

docsrc/tutorials/ptq.rst

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,8 +130,6 @@ Then all thats required to setup the module for INT8 calibration is to set the f
130130
compile_spec.enabled_precisions.insert(torch::kI8);
131131
/// Use the TensorRT Entropy Calibrator
132132
compile_spec.ptq_calibrator = calibrator;
133-
/// Set a larger workspace (you may get better performace from doing so)
134-
compile_spec.workspace_size = 1 << 28;
135133

136134
auto trt_mod = torch_tensorrt::CompileGraph(mod, compile_spec);
137135

docsrc/tutorials/torchtrtc.rst

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,13 +85,17 @@ to standard TorchScript. Load with ``torch.jit.load()`` and run like you would r
8585
serialized TensorRT engine and embed it
8686
into a TorchScript module (device spec
8787
must be provided)
88-
--num-min-timing-iter=[num_iters] Number of minimization timing iterations
89-
used to select kernels
9088
--num-avg-timing-iters=[num_iters]
9189
Number of averaging timing iterations
9290
used to select kernels
9391
--workspace-size=[workspace_size] Maximum size of workspace given to
9492
TensorRT
93+
--dla-sram-size=[dla_sram_size] Fast software managed RAM used by DLA
94+
to communicate within a layer.
95+
--dla-local-dram-size=[dla_local_dram_size] Host RAM used by DLA to share
96+
intermediate tensor data across operations.
97+
--dla-global-dram-size=[dla_global_dram_size] Host RAM used by DLA to store
98+
weights and metadata for execution
9599
--atol=[atol] Absolute tolerance threshold for acceptable
96100
numerical deviation from standard torchscript
97101
output (default 1e-8)

docsrc/tutorials/using_dla.rst

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,6 @@ Using DLA in a C++ application
3333
# If a layer fails to run on DLA it will fallback to GPU
3434
compile_spec.device.allow_gpu_fallback = true;
3535

36-
# Set the workspace size
37-
compile_spec.workspace_size = 1 << 28;
38-
3936

4037
Using DLA in a python application
4138

py/torch_tensorrt/csrc/register_tensorrt_classes.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,6 @@ void RegisterTRTCompileSpec() {
6060
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, refit);
6161
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, debug);
6262
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, capability);
63-
ADD_FIELD_GET_SET_REGISTRATION(
64-
TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, num_min_timing_iters);
6563
ADD_FIELD_GET_SET_REGISTRATION(
6664
TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, num_avg_timing_iters);
6765
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, workspace_size);

py/torch_tensorrt/csrc/torch_tensorrt_py.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,6 @@ PYBIND11_MODULE(_C, m) {
300300
.def_readwrite("debug", &CompileSpec::debug)
301301
.def_readwrite("device", &CompileSpec::device)
302302
.def_readwrite("capability", &CompileSpec::capability)
303-
.def_readwrite("num_min_timing_iters", &CompileSpec::num_min_timing_iters)
304303
.def_readwrite("num_avg_timing_iters", &CompileSpec::num_avg_timing_iters)
305304
.def_readwrite("workspace_size", &CompileSpec::workspace_size)
306305
.def_readwrite("dla_sram_size", &CompileSpec::dla_sram_size)

py/torch_tensorrt/ts/_compile_spec.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,18 @@ def _parse_compile_spec(compile_spec: Dict[str, Any]) -> _ts_C.CompileSpec:
211211
assert type(compile_spec["workspace_size"]) is int
212212
info.workspace_size = compile_spec["workspace_size"]
213213

214+
if "dla_sram_size" in compile_spec:
215+
assert type(compile_spec["dla_sram_size"]) is int
216+
info.dla_sram_size = compile_spec["dla_sram_size"]
217+
218+
if "dla_local_dram_size" in compile_spec:
219+
assert type(compile_spec["dla_local_dram_size"]) is int
220+
info.dla_local_dram_size = compile_spec["dla_local_dram_size"]
221+
222+
if "dla_global_dram_size" in compile_spec:
223+
assert type(compile_spec["dla_global_dram_size"]) is int
224+
info.dla_global_dram_size = compile_spec["dla_global_dram_size"]
225+
214226
if "truncate_long_and_double" in compile_spec:
215227
assert type(compile_spec["truncate_long_and_double"]) is bool
216228
info.truncate_long_and_double = compile_spec["truncate_long_and_double"]
@@ -229,9 +241,11 @@ def TensorRTCompileSpec(inputs=[],
229241
refit=False,
230242
debug=False,
231243
capability=_enums.EngineCapability.default,
232-
num_min_timing_iters=2,
233244
num_avg_timing_iters=1,
234245
workspace_size=0,
246+
dla_sram_size=1048576,
247+
dla_local_dram_size=1073741824,
248+
dla_global_dram_size=536870912,
235249
truncate_long_and_double=False,
236250
calibrator=None) -> torch.classes.tensorrt.CompileSpec:
237251
"""Utility to create a formated spec dictionary for using the PyTorch TensorRT backend
@@ -263,7 +277,6 @@ def TensorRTCompileSpec(inputs=[],
263277
refit (bool): Enable refitting
264278
debug (bool): Enable debuggable engine
265279
capability (torch_tensorrt.EngineCapability): Restrict kernel selection to safe gpu kernels or safe dla kernels
266-
num_min_timing_iters (int): Number of minimization timing iterations used to select kernels
267280
num_avg_timing_iters (int): Number of averaging timing iterations used to select kernels
268281
workspace_size (int): Maximum size of workspace given to TensorRT
269282
truncate_long_and_double (bool): Truncate weights provided in int64 or double (float64) to int32 and float32
@@ -283,9 +296,11 @@ def TensorRTCompileSpec(inputs=[],
283296
"refit": refit, # enable refit
284297
"debug": debug, # enable debuggable engine
285298
"capability": capability, # Restrict kernel selection to safe gpu kernels or safe dla kernels
286-
"num_min_timing_iters": num_min_timing_iters, # Number of minimization timing iterations used to select kernels
287299
"num_avg_timing_iters": num_avg_timing_iters, # Number of averaging timing iterations used to select kernels
288300
"workspace_size": workspace_size, # Maximum size of workspace given to TensorRT
301+
"dla_sram_size": dla_sram_size, # Fast software managed RAM used by DLA to communicate within a layer.
302+
"dla_local_dram_size": dla_local_dram_size, # Host RAM used by DLA to share intermediate tensor data across operations
303+
"dla_global_dram_size": dla_global_dram_size, # Host RAM used by DLA to store weights and metadata for execution
289304
"calibrator": calibrator,
290305
"truncate_long_and_double": truncate_long_and_double
291306
}
@@ -331,9 +346,11 @@ def TensorRTCompileSpec(inputs=[],
331346
backend_spec._set_debug(parsed_spec.debug)
332347
backend_spec._set_refit(parsed_spec.refit)
333348
backend_spec._set_capability(int(parsed_spec.capability))
334-
backend_spec._set_num_min_timing_iters(parsed_spec.num_min_timing_iters)
335349
backend_spec._set_num_avg_timing_iters(parsed_spec.num_avg_timing_iters)
336350
backend_spec._set_workspace_size(parsed_spec.workspace_size)
351+
backend_spec._set_dla_sram_size(parsed_spec.dla_sram_size)
352+
backend_spec._set_dla_local_dram_size(parsed_spec.dla_local_dram_size)
353+
backend_spec._set_dla_global_dram_size(parsed_spec.dla_global_dram_size)
337354
backend_spec._set_truncate_long_and_double(parsed_spec.truncate_long_and_double)
338355
backend_spec._set_ptq_calibrator(parsed_spec._get_calibrator_handle())
339356

0 commit comments

Comments
 (0)