Skip to content

Commit 19fc7a7

Browse files
committed
BREAKING CHANGE: Remove deprecated APIs like setNumMinTimingIters and add DLA memory size configurations
Signed-off-by: Dheeraj Peri <[email protected]>
1 parent 2895fb8 commit 19fc7a7

File tree

11 files changed

+45
-30
lines changed

11 files changed

+45
-30
lines changed

core/conversion/conversionctx/ConversionCtx.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ struct BuilderSettings {
3333
Device device;
3434
nvinfer1::EngineCapability capability = TRT_ENGINE_CAPABILITY_STANDARD;
3535
nvinfer1::IInt8Calibrator* calibrator = nullptr;
36-
uint64_t num_min_timing_iters = 2;
3736
uint64_t num_avg_timing_iters = 1;
3837
uint64_t workspace_size = 0;
3938
uint64_t dla_sram_size = 1048576;

cpp/bin/torchtrtc/README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,13 +82,17 @@ torchtrtc [input_file_path] [output_file_path]
8282
serialized TensorRT engine and embed it
8383
into a TorchScript module (device spec
8484
must be provided)
85-
--num-min-timing-iter=[num_iters] Number of minimization timing iterations
86-
used to select kernels
8785
--num-avg-timing-iters=[num_iters]
8886
Number of averaging timing iterations
8987
used to select kernels
9088
--workspace-size=[workspace_size] Maximum size of workspace given to
9189
TensorRT
90+
--dla-sram-size=[dla_sram_size] Fast software managed RAM used by DLA
91+
to communicate within a layer.
92+
--dla-local-dram-size=[dla_local_dram_size] Host RAM used by DLA to share
93+
intermediate tensor data across operations.
94+
--dla-global-dram-size=[dla_global_dram_size] Host RAM used by DLA to store
95+
weights and metadata for execution
9296
--atol=[atol] Absolute tolerance threshold for acceptable
9397
numerical deviation from standard torchscript
9498
output (default 1e-8)

cpp/bin/torchtrtc/main.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -365,10 +365,6 @@ int main(int argc, char** argv) {
365365
}
366366
}
367367

368-
if (num_min_timing_iters) {
369-
compile_settings.num_min_timing_iters = args::get(num_min_timing_iters);
370-
}
371-
372368
if (num_avg_timing_iters) {
373369
compile_settings.num_avg_timing_iters = args::get(num_avg_timing_iters);
374370
}

cpp/include/torch_tensorrt/torch_tensorrt.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -636,10 +636,6 @@ struct TORCHTRT_API CompileSpec {
636636
*/
637637
EngineCapability capability = EngineCapability::kSTANDARD;
638638

639-
/**
640-
* Number of minimization timing iterations used to select kernels
641-
*/
642-
uint64_t num_min_timing_iters = 2;
643639
/**
644640
* Number of averaging timing iterations used to select kernels
645641
*/

docsrc/tutorials/ptq.rst

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,8 +130,6 @@ Then all thats required to setup the module for INT8 calibration is to set the f
130130
compile_spec.enabled_precisions.insert(torch::kI8);
131131
/// Use the TensorRT Entropy Calibrator
132132
compile_spec.ptq_calibrator = calibrator;
133-
/// Set a larger workspace (you may get better performace from doing so)
134-
compile_spec.workspace_size = 1 << 28;
135133

136134
auto trt_mod = torch_tensorrt::CompileGraph(mod, compile_spec);
137135

docsrc/tutorials/torchtrtc.rst

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,13 +85,17 @@ to standard TorchScript. Load with ``torch.jit.load()`` and run like you would r
8585
serialized TensorRT engine and embed it
8686
into a TorchScript module (device spec
8787
must be provided)
88-
--num-min-timing-iter=[num_iters] Number of minimization timing iterations
89-
used to select kernels
9088
--num-avg-timing-iters=[num_iters]
9189
Number of averaging timing iterations
9290
used to select kernels
9391
--workspace-size=[workspace_size] Maximum size of workspace given to
9492
TensorRT
93+
--dla-sram-size=[dla_sram_size] Fast software managed RAM used by DLA
94+
to communicate within a layer.
95+
--dla-local-dram-size=[dla_local_dram_size] Host RAM used by DLA to share
96+
intermediate tensor data across operations.
97+
--dla-global-dram-size=[dla_global_dram_size] Host RAM used by DLA to store
98+
weights and metadata for execution
9599
--atol=[atol] Absolute tolerance threshold for acceptable
96100
numerical deviation from standard torchscript
97101
output (default 1e-8)

docsrc/tutorials/using_dla.rst

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,6 @@ Using DLA in a C++ application
3333
# If a layer fails to run on DLA it will fallback to GPU
3434
compile_spec.device.allow_gpu_fallback = true;
3535

36-
# Set the workspace size
37-
compile_spec.workspace_size = 1 << 28;
38-
3936

4037
Using DLA in a python application
4138

py/torch_tensorrt/csrc/register_tensorrt_classes.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,6 @@ void RegisterTRTCompileSpec() {
6060
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, refit);
6161
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, debug);
6262
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, capability);
63-
ADD_FIELD_GET_SET_REGISTRATION(
64-
TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, num_min_timing_iters);
6563
ADD_FIELD_GET_SET_REGISTRATION(
6664
TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, num_avg_timing_iters);
6765
ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, workspace_size);

py/torch_tensorrt/csrc/torch_tensorrt_py.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,6 @@ PYBIND11_MODULE(_C, m) {
300300
.def_readwrite("debug", &CompileSpec::debug)
301301
.def_readwrite("device", &CompileSpec::device)
302302
.def_readwrite("capability", &CompileSpec::capability)
303-
.def_readwrite("num_min_timing_iters", &CompileSpec::num_min_timing_iters)
304303
.def_readwrite("num_avg_timing_iters", &CompileSpec::num_avg_timing_iters)
305304
.def_readwrite("workspace_size", &CompileSpec::workspace_size)
306305
.def_readwrite("dla_sram_size", &CompileSpec::dla_sram_size)

py/torch_tensorrt/ts/_compile_spec.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,18 @@ def _parse_compile_spec(compile_spec: Dict[str, Any]) -> _ts_C.CompileSpec:
211211
assert type(compile_spec["workspace_size"]) is int
212212
info.workspace_size = compile_spec["workspace_size"]
213213

214+
if "dla_sram_size" in compile_spec:
215+
assert type(compile_spec["dla_sram_size"]) is int
216+
info.dla_sram_size = compile_spec["dla_sram_size"]
217+
218+
if "dla_local_dram_size" in compile_spec:
219+
assert type(compile_spec["dla_local_dram_size"]) is int
220+
info.dla_local_dram_size = compile_spec["dla_local_dram_size"]
221+
222+
if "dla_global_dram_size" in compile_spec:
223+
assert type(compile_spec["dla_global_dram_size"]) is int
224+
info.dla_global_dram_size = compile_spec["dla_global_dram_size"]
225+
214226
if "truncate_long_and_double" in compile_spec:
215227
assert type(compile_spec["truncate_long_and_double"]) is bool
216228
info.truncate_long_and_double = compile_spec["truncate_long_and_double"]
@@ -229,9 +241,11 @@ def TensorRTCompileSpec(inputs=[],
229241
refit=False,
230242
debug=False,
231243
capability=_enums.EngineCapability.default,
232-
num_min_timing_iters=2,
233244
num_avg_timing_iters=1,
234245
workspace_size=0,
246+
dla_sram_size=1048576,
247+
dla_local_dram_size=1073741824,
248+
dla_global_dram_size=536870912,
235249
truncate_long_and_double=False,
236250
calibrator=None) -> torch.classes.tensorrt.CompileSpec:
237251
"""Utility to create a formated spec dictionary for using the PyTorch TensorRT backend
@@ -263,7 +277,6 @@ def TensorRTCompileSpec(inputs=[],
263277
refit (bool): Enable refitting
264278
debug (bool): Enable debuggable engine
265279
capability (torch_tensorrt.EngineCapability): Restrict kernel selection to safe gpu kernels or safe dla kernels
266-
num_min_timing_iters (int): Number of minimization timing iterations used to select kernels
267280
num_avg_timing_iters (int): Number of averaging timing iterations used to select kernels
268281
workspace_size (int): Maximum size of workspace given to TensorRT
269282
truncate_long_and_double (bool): Truncate weights provided in int64 or double (float64) to int32 and float32
@@ -283,9 +296,11 @@ def TensorRTCompileSpec(inputs=[],
283296
"refit": refit, # enable refit
284297
"debug": debug, # enable debuggable engine
285298
"capability": capability, # Restrict kernel selection to safe gpu kernels or safe dla kernels
286-
"num_min_timing_iters": num_min_timing_iters, # Number of minimization timing iterations used to select kernels
287299
"num_avg_timing_iters": num_avg_timing_iters, # Number of averaging timing iterations used to select kernels
288300
"workspace_size": workspace_size, # Maximum size of workspace given to TensorRT
301+
"dla_sram_size": dla_sram_size, # Fast software managed RAM used by DLA to communicate within a layer.
302+
"dla_local_dram_size": dla_local_dram_size, # Host RAM used by DLA to share intermediate tensor data across operations
303+
"dla_global_dram_size": dla_global_dram_size, # Host RAM used by DLA to store weights and metadata for execution
289304
"calibrator": calibrator,
290305
"truncate_long_and_double": truncate_long_and_double
291306
}
@@ -331,9 +346,11 @@ def TensorRTCompileSpec(inputs=[],
331346
backend_spec._set_debug(parsed_spec.debug)
332347
backend_spec._set_refit(parsed_spec.refit)
333348
backend_spec._set_capability(int(parsed_spec.capability))
334-
backend_spec._set_num_min_timing_iters(parsed_spec.num_min_timing_iters)
335349
backend_spec._set_num_avg_timing_iters(parsed_spec.num_avg_timing_iters)
336350
backend_spec._set_workspace_size(parsed_spec.workspace_size)
351+
backend_spec._set_dla_sram_size(parsed_spec.dla_sram_size)
352+
backend_spec._set_dla_local_dram_size(parsed_spec.dla_local_dram_size)
353+
backend_spec._set_dla_global_dram_size(parsed_spec.dla_global_dram_size)
337354
backend_spec._set_truncate_long_and_double(parsed_spec.truncate_long_and_double)
338355
backend_spec._set_ptq_calibrator(parsed_spec._get_calibrator_handle())
339356

0 commit comments

Comments
 (0)