Commit 92e32aa

Merge pull request #1152 from pytorch/trt_8.4ga
feat: Upgrade TRT to 8.4
2 parents 1625cd3 + 66c1cab commit 92e32aa

File tree: 31 files changed, +145 -131 lines
README.md
Lines changed: 2 additions & 2 deletions

@@ -114,8 +114,8 @@ These are the following dependencies used to verify the testcases. Torch-TensorRT
 - Bazel 5.1.1
 - Libtorch 1.11.0 (built with CUDA 11.3)
 - CUDA 11.3
-- cuDNN 8.2.1
-- TensorRT 8.2.4.2
+- cuDNN 8.4.1
+- TensorRT 8.4.1.5

 ## Prebuilt Binaries and Wheel files

WORKSPACE
Lines changed: 6 additions & 6 deletions

@@ -76,20 +76,20 @@ http_archive(
 http_archive(
     name = "cudnn",
     build_file = "@//third_party/cudnn/archive:BUILD",
-    sha256 = "0e5d2df890b9967efa6619da421310d97323565a79f05a1a8cb9b7165baad0d7",
-    strip_prefix = "cuda",
+    sha256 = "ec96d2376d81fca42bdd3d4c3d705a99b29a065bab57f920561c763e29c67d01",
+    strip_prefix = "cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive",
     urls = [
-        "https://developer.nvidia.com/compute/machine-learning/cudnn/secure/8.2.4/11.4_20210831/cudnn-11.4-linux-x64-v8.2.4.15.tgz",
+        "https://developer.nvidia.com/compute/cudnn/secure/8.4.1/local_installers/11.6/cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive.tar.xz",
     ],
 )

 http_archive(
     name = "tensorrt",
     build_file = "@//third_party/tensorrt/archive:BUILD",
-    sha256 = "826180eaaecdf9a7e76116855b9f1f3400ea9b06e66b06a3f6a0747ba6f863ad",
-    strip_prefix = "TensorRT-8.2.4.2",
+    sha256 = "8107861af218694130f170e071f49814fa3e27f1386ce7cb6d807ac05a7fcf0e",
+    strip_prefix = "TensorRT-8.4.1.5",
     urls = [
-        "https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.2.4/tars/tensorrt-8.2.4.2.linux.x86_64-gnu.cuda-11.4.cudnn8.2.tar.gz",
+        "https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.4.1/tars/tensorrt-8.4.1.5.linux.x86_64-gnu.cuda-11.6.cudnn8.4.tar.gz",
     ],
 )

core/compiler.cpp
Lines changed: 14 additions & 22 deletions

@@ -359,14 +359,6 @@ void MapInputsAndDetermineDTypes(
   }
 }

-uint64_t GetRecommendedWorkspaceSize(const runtime::CudaDevice& device) {
-  if (device.major < 6) {
-    return 256 * (1 << 20);
-  } else {
-    return 1 << 30;
-  }
-}
-
 std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::string method_name, CompileSpec cfg) {
   // Go through Lowering to simplify graph and extract weight parameters
   auto graph_and_parameters = lowering::Lower(mod, method_name, cfg.lower_info);
@@ -380,14 +372,14 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::
   // Infer the type of an input from the weights of the calculation
   auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block());

-  // GPU default WS size : 1 GB
-  // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X.
-  auto workspace_size = cfg.convert_info.engine_settings.workspace_size;
-  auto device_spec = cfg.convert_info.engine_settings.device;
-  auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
-  if (workspace_size == 0) {
-    cfg.convert_info.engine_settings.workspace_size = GetRecommendedWorkspaceSize(cuda_device);
-  }
+  // // GPU default WS size : 1 GB
+  // // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X.
+  // auto workspace_size = cfg.convert_info.engine_settings.workspace_size;
+  // auto device_spec = cfg.convert_info.engine_settings.device;
+  // auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
+  // if (workspace_size == 0) {
+  //   cfg.convert_info.engine_settings.workspace_size = GetRecommendedWorkspaceSize(cuda_device);
+  // }

   MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types);

@@ -399,14 +391,14 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::
 torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) {
   torch::jit::Module new_mod(mod._ivalue()->name() + "_trt");

-  // GPU default WS size : 1 GB
-  // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X.
-  auto workspace_size = cfg.convert_info.engine_settings.workspace_size;
+  // // GPU default WS size : 1 GB
+  // // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X.
+  // auto workspace_size = cfg.convert_info.engine_settings.workspace_size;
   auto device_spec = cfg.convert_info.engine_settings.device;
   auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
-  if (workspace_size == 0) {
-    cfg.convert_info.engine_settings.workspace_size = GetRecommendedWorkspaceSize(cuda_device);
-  }
+  // if (workspace_size == 0) {
+  //   cfg.convert_info.engine_settings.workspace_size = GetRecommendedWorkspaceSize(cuda_device);
+  // }

   for (const torch::jit::Method& method : mod.get_methods()) {
     if (method.name().compare("forward") == 0) {
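Dropping GetRecommendedWorkspaceSize() is safe because, as of TensorRT 8.4, the kWORKSPACE memory pool defaults to the device's full global memory unless a limit is set explicitly, so workspace_size == 0 now simply defers to the builder. A hypothetical probe, not part of this commit, that surfaces the default from an existing IBuilderConfig:

// Hypothetical helper (assumes a live IBuilderConfig): with no explicit
// kWORKSPACE limit set, TensorRT 8.4 reports the device's global memory
// size, which is why the compute-capability heuristic above is no longer
// needed.
#include <cstddef>
#include <iostream>
#include "NvInfer.h"

void log_default_workspace(const nvinfer1::IBuilderConfig& cfg) {
  std::size_t ws = cfg.getMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE);
  std::cout << "Workspace pool limit: " << ws << " bytes" << std::endl;
}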

core/conversion/conversionctx/ConversionCtx.cpp
Lines changed: 17 additions & 4 deletions

@@ -20,9 +20,11 @@ std::ostream& operator<<(std::ostream& os, const BuilderSettings& s) {
        << "\n    Debuggable Engine: " << s.debug \
        << "\n    GPU ID: " << s.device.gpu_id \
        << "\n    Allow GPU Fallback (if running on DLA): " << s.device.allow_gpu_fallback \
-       << "\n    Min Timing Iterations: " << s.num_min_timing_iters \
        << "\n    Avg Timing Iterations: " << s.num_avg_timing_iters \
-       << "\n    Max Workspace Size: " << s.workspace_size;
+       << "\n    Max Workspace Size: " << s.workspace_size \
+       << "\n    DLA SRAM Size: " << s.dla_sram_size \
+       << "\n    DLA Local DRAM Size: " << s.dla_local_dram_size \
+       << "\n    DLA Global DRAM Size: " << s.dla_global_dram_size;

   os << "\n  Device Type: " << s.device.device_type \
      << "\n    GPU ID: " << s.device.gpu_id;
@@ -104,9 +106,11 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings)
     cfg->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
   }

-  cfg->setMinTimingIterations(settings.num_min_timing_iters);
   cfg->setAvgTimingIterations(settings.num_avg_timing_iters);
-  cfg->setMaxWorkspaceSize(settings.workspace_size);
+  if (settings.workspace_size != 0) {
+    cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, settings.workspace_size);
+  }
+
   cfg->setDefaultDeviceType(settings.device.device_type);
   cfg->setEngineCapability(settings.capability);
@@ -120,6 +124,15 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings)
         settings.enabled_precisions.find(nvinfer1::DataType::kFLOAT) == settings.enabled_precisions.end(),
         "DLA supports only fp16 or int8 precision");
     cfg->setDLACore(settings.device.dla_core);
+    if (settings.dla_sram_size != 1048576) {
+      cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_MANAGED_SRAM, settings.dla_sram_size);
+    }
+    if (settings.dla_local_dram_size != 1073741824) {
+      cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_LOCAL_DRAM, settings.dla_local_dram_size);
+    }
+    if (settings.dla_global_dram_size != 536870912) {
+      cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_GLOBAL_DRAM, settings.dla_global_dram_size);
+    }
   }
 }
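These edits track the TensorRT 8.4 builder API, where setMaxWorkspaceSize() is superseded by the unified memory-pool interface and minimum timing iterations are no longer consulted. A condensed sketch of that interface, with builder and network setup elided and pool values equal to the new BuilderSettings defaults:

// Sketch of IBuilderConfig::setMemoryPoolLimit() as used above. The DLA
// pools only take effect when the engine targets a DLA core.
#include <cstdint>
#include "NvInfer.h"

void configure_pools(nvinfer1::IBuilderConfig& cfg, uint64_t workspace_size) {
  if (workspace_size != 0) {  // 0 keeps TensorRT's own default
    cfg.setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, workspace_size);
  }
  cfg.setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_MANAGED_SRAM, 1 << 20);   // 1 MiB
  cfg.setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_LOCAL_DRAM, 1 << 30);     // 1 GiB
  cfg.setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_GLOBAL_DRAM, 512 << 20);  // 512 MiB
}

Note how the commit guards each DLA call by comparing the setting against its default, so setMemoryPoolLimit() only fires for pools the user actually overrides.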

core/conversion/conversionctx/ConversionCtx.h
Lines changed: 3 additions & 1 deletion

@@ -33,9 +33,11 @@ struct BuilderSettings {
   Device device;
   nvinfer1::EngineCapability capability = TRT_ENGINE_CAPABILITY_STANDARD;
   nvinfer1::IInt8Calibrator* calibrator = nullptr;
-  uint64_t num_min_timing_iters = 2;
   uint64_t num_avg_timing_iters = 1;
   uint64_t workspace_size = 0;
+  uint64_t dla_sram_size = 1048576;
+  uint64_t dla_local_dram_size = 1073741824;
+  uint64_t dla_global_dram_size = 536870912;

   BuilderSettings() = default;
   BuilderSettings(const BuilderSettings& other) = default;

core/conversion/converters/converter_util.cpp
Lines changed: 3 additions & 2 deletions

@@ -135,9 +135,10 @@ nvinfer1::ITensor* castITensor(ConversionCtx* ctx, nvinfer1::ITensor* tensor, nv
   auto id_layer = ctx->net->addIdentity(*tensor);
   TORCHTRT_CHECK(id_layer, "Unable to create identity layer for ITensor: " << tensor_id.str());
-  auto casted_tensor = id_layer->getOutput(0);
-  casted_tensor->setType(dtype);
+  // layer->setOutputType should be used for casting and not manually setting output_tensor->setType()
+  id_layer->setOutputType(0, dtype);

+  auto casted_tensor = id_layer->getOutput(0);
   LOG_DEBUG(ctx->logger, "Casting ITensor " << tensor_id.str() << " from " << tensor->getType() << " to " << dtype);

   std::stringstream ss;
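The comment in the diff is the whole story: ITensor::setType() only redeclares a tensor's type and is honored only for network input/output tensors, whereas ILayer::setOutputType() asks the builder to actually produce the requested type, with the identity layer acting as the conversion point. A standalone sketch of the adopted pattern (the function name and free-standing form are illustrative):

// Illustrative cast helper following the pattern above: request the
// conversion from the identity layer instead of mutating the output tensor.
#include "NvInfer.h"

nvinfer1::ITensor* cast_tensor(nvinfer1::INetworkDefinition& net,
                               nvinfer1::ITensor& in,
                               nvinfer1::DataType dtype) {
  nvinfer1::IIdentityLayer* id_layer = net.addIdentity(in);
  id_layer->setOutputType(0, dtype);  // builder will emit a real cast
  return id_layer->getOutput(0);
}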

cpp/bin/torchtrtc/README.md
Lines changed: 6 additions & 2 deletions

@@ -82,13 +82,17 @@ torchtrtc [input_file_path] [output_file_path]
                                               serialized TensorRT engine and embed it
                                               into a TorchScript module (device spec
                                               must be provided)
-      --num-min-timing-iter=[num_iters]       Number of minimization timing iterations
-                                              used to select kernels
       --num-avg-timing-iters=[num_iters]
                                               Number of averaging timing iterations
                                               used to select kernels
       --workspace-size=[workspace_size]       Maximum size of workspace given to
                                               TensorRT
+      --dla-sram-size=[dla_sram_size]         Fast software managed RAM used by DLA
+                                              to communicate within a layer.
+      --dla-local-dram-size=[dla_local_dram_size] Host RAM used by DLA to share
+                                              intermediate tensor data across operations.
+      --dla-global-dram-size=[dla_global_dram_size] Host RAM used by DLA to store
+                                              weights and metadata for execution
       --atol=[atol]                           Absolute tolerance threshold for acceptable
                                               numerical deviation from standard torchscript
                                               output (default 1e-8)
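For instance, a hypothetical invocation that builds for DLA in fp16 and overrides all three pools (the values shown also happen to be the defaults) might look like:

torchtrtc model.ts model_trt.ts "(1,3,224,224)" -d dla -p f16 \
    --dla-sram-size=1048576 --dla-local-dram-size=1073741824 \
    --dla-global-dram-size=536870912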

cpp/bin/torchtrtc/main.cpp
Lines changed: 15 additions & 6 deletions

@@ -113,12 +113,16 @@ int main(int argc, char** argv) {
       "Whether to treat input file as a serialized TensorRT engine and embed it into a TorchScript module (device spec must be provided)",
       {"embed-engine"});

-  args::ValueFlag<uint64_t> num_min_timing_iters(
-      parser, "num_iters", "Number of minimization timing iterations used to select kernels", {"num-min-timing-iter"});
   args::ValueFlag<uint64_t> num_avg_timing_iters(
       parser, "num_iters", "Number of averaging timing iterations used to select kernels", {"num-avg-timing-iters"});
   args::ValueFlag<uint64_t> workspace_size(
       parser, "workspace_size", "Maximum size of workspace given to TensorRT", {"workspace-size"});
+  args::ValueFlag<uint64_t> dla_sram_size(
+      parser, "dla_sram_size", "DLA managed SRAM size", {"dla-sram-size"});
+  args::ValueFlag<uint64_t> dla_local_dram_size(
+      parser, "dla_local_dram_size", "DLA Local DRAM size", {"dla-local-dram-size"});
+  args::ValueFlag<uint64_t> dla_global_dram_size(
+      parser, "dla_global_dram_size", "DLA Global DRAM size", {"dla-global-dram-size"});
   args::ValueFlag<double> atol(
       parser,
       "atol",
@@ -325,6 +329,15 @@ int main(int argc, char** argv) {
     if (dla_core) {
       compile_settings.device.dla_core = args::get(dla_core);
     }
+    if (dla_sram_size) {
+      compile_settings.dla_sram_size = args::get(dla_sram_size);
+    }
+    if (dla_local_dram_size) {
+      compile_settings.dla_local_dram_size = args::get(dla_local_dram_size);
+    }
+    if (dla_global_dram_size) {
+      compile_settings.dla_global_dram_size = args::get(dla_global_dram_size);
+    }
   } else {
     torchtrt::logging::log(
         torchtrt::logging::Level::kERROR, "Invalid device type, options are [ gpu | dla ] found: " + device);
@@ -352,10 +365,6 @@ int main(int argc, char** argv) {
     }
   }

-  if (num_min_timing_iters) {
-    compile_settings.num_min_timing_iters = args::get(num_min_timing_iters);
-  }
-
   if (num_avg_timing_iters) {
     compile_settings.num_avg_timing_iters = args::get(num_avg_timing_iters);
   }

cpp/include/torch_tensorrt/torch_tensorrt.h
Lines changed: 15 additions & 4 deletions

@@ -636,10 +636,6 @@ struct TORCHTRT_API CompileSpec {
    */
   EngineCapability capability = EngineCapability::kSTANDARD;

-  /**
-   * Number of minimization timing iterations used to select kernels
-   */
-  uint64_t num_min_timing_iters = 2;
   /**
    * Number of averaging timing iterations used to select kernels
    */
@@ -650,6 +646,21 @@ struct TORCHTRT_API CompileSpec {
    */
   uint64_t workspace_size = 0;

+  /**
+   * Fast software managed RAM used by DLA to communicate within a layer.
+   */
+  uint64_t dla_sram_size = 1048576;
+
+  /**
+   * Host RAM used by DLA to share intermediate tensor data across operations
+   */
+  uint64_t dla_local_dram_size = 1073741824;
+
+  /**
+   * Host RAM used by DLA to store weights and metadata for execution
+   */
+  uint64_t dla_global_dram_size = 536870912;
+
   /**
    * Calibration dataloaders for each input for post training quantization
    */
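In human units, the new defaults are 1 MiB of managed SRAM, 1 GiB of local DRAM, and 512 MiB of global DRAM. A minimal sketch of driving the new fields through the public TorchScript frontend (module loading elided; the input shape is illustrative):

#include "torch/script.h"
#include "torch_tensorrt/torch_tensorrt.h"

// Sketch only: compiles a loaded TorchScript module for DLA core 0 with
// explicit pool sizes (the values shown equal the defaults).
torch::jit::Module compile_for_dla(torch::jit::Module& mod) {
  auto spec = torch_tensorrt::ts::CompileSpec({torch_tensorrt::Input({1, 3, 224, 224})});
  spec.device.device_type = torch_tensorrt::Device::DeviceType::kDLA;
  spec.device.dla_core = 0;
  spec.enabled_precisions = {torch::kHalf};  // DLA requires fp16 or int8
  spec.dla_sram_size = 1048576;              // 1 MiB
  spec.dla_local_dram_size = 1073741824;     // 1 GiB
  spec.dla_global_dram_size = 536870912;     // 512 MiB
  return torch_tensorrt::ts::compile(mod, spec);
}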

cpp/src/compile_spec.cpp
Lines changed: 3 additions & 1 deletion

@@ -81,9 +81,11 @@ torchtrt::core::CompileSpec to_internal_compile_spec(CompileSpec external) {
   internal.convert_info.engine_settings.device.gpu_id = external.device.gpu_id;
   internal.convert_info.engine_settings.device.dla_core = external.device.dla_core;
-  internal.convert_info.engine_settings.num_min_timing_iters = external.num_min_timing_iters;
   internal.convert_info.engine_settings.num_avg_timing_iters = external.num_avg_timing_iters;
   internal.convert_info.engine_settings.workspace_size = external.workspace_size;
+  internal.convert_info.engine_settings.dla_sram_size = external.dla_sram_size;
+  internal.convert_info.engine_settings.dla_local_dram_size = external.dla_local_dram_size;
+  internal.convert_info.engine_settings.dla_global_dram_size = external.dla_global_dram_size;

   if (internal.convert_info.engine_settings.enabled_precisions.find(nvinfer1::DataType::kINT8) !=
       internal.convert_info.engine_settings.enabled_precisions.end()) {
