Commit 2895fb8

feat: Integrate TRT 8.4 APIs for handling workspace size and other DLA memory options
Signed-off-by: Dheeraj Peri <[email protected]>
1 parent: af12039

File tree

18 files changed (+88 −37 lines)

core/compiler.cpp

Lines changed: 14 additions & 22 deletions

@@ -359,14 +359,6 @@ void MapInputsAndDetermineDTypes(
   }
 }

-uint64_t GetRecommendedWorkspaceSize(const runtime::CudaDevice& device) {
-  if (device.major < 6) {
-    return 256 * (1 << 20);
-  } else {
-    return 1 << 30;
-  }
-}
-
 std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::string method_name, CompileSpec cfg) {
   // Go through Lowering to simplify graph and extract weight parameters
   auto graph_and_parameters = lowering::Lower(mod, method_name, cfg.lower_info);
@@ -380,14 +372,14 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::
   // Infer the type of an input from the weights of the calculation
   auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block());

-  // GPU default WS size : 1 GB
-  // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X.
-  auto workspace_size = cfg.convert_info.engine_settings.workspace_size;
-  auto device_spec = cfg.convert_info.engine_settings.device;
-  auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
-  if (workspace_size == 0) {
-    cfg.convert_info.engine_settings.workspace_size = GetRecommendedWorkspaceSize(cuda_device);
-  }
+  // // GPU default WS size : 1 GB
+  // // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X.
+  // auto workspace_size = cfg.convert_info.engine_settings.workspace_size;
+  // auto device_spec = cfg.convert_info.engine_settings.device;
+  // auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
+  // if (workspace_size == 0) {
+  //   cfg.convert_info.engine_settings.workspace_size = GetRecommendedWorkspaceSize(cuda_device);
+  // }

   MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types);

@@ -399,14 +391,14 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::
 torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) {
   torch::jit::Module new_mod(mod._ivalue()->name() + "_trt");

-  // GPU default WS size : 1 GB
-  // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X.
-  auto workspace_size = cfg.convert_info.engine_settings.workspace_size;
+  // // GPU default WS size : 1 GB
+  // // Set WS = 256 Mb for Jetson nano/TX1 like platforms whose compute capability is 5.X.
+  // auto workspace_size = cfg.convert_info.engine_settings.workspace_size;
   auto device_spec = cfg.convert_info.engine_settings.device;
   auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
-  if (workspace_size == 0) {
-    cfg.convert_info.engine_settings.workspace_size = GetRecommendedWorkspaceSize(cuda_device);
-  }
+  // if (workspace_size == 0) {
+  //   cfg.convert_info.engine_settings.workspace_size = GetRecommendedWorkspaceSize(cuda_device);
+  // }

   for (const torch::jit::Method& method : mod.get_methods()) {
     if (method.name().compare("forward") == 0) {
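
With TensorRT 8.4 the builder sizes the workspace pool itself (by default it may use the device's full global memory), so the compute-capability heuristic above is retired and a workspace_size of 0 now simply defers to TensorRT. A minimal sketch of the resulting contract (the helper name is hypothetical, not part of this commit):

#include <NvInfer.h>
#include <cstdint>

// Sketch: only cap the workspace pool when the caller asked for a limit;
// otherwise leave TensorRT 8.4's own default in place.
void ConfigureWorkspace(nvinfer1::IBuilderConfig* cfg, uint64_t workspace_size) {
  if (workspace_size != 0) {
    cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, workspace_size);
  }
}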

core/conversion/conversionctx/ConversionCtx.cpp

Lines changed: 17 additions & 2 deletions

@@ -21,7 +21,10 @@ std::ostream& operator<<(std::ostream& os, const BuilderSettings& s) {
      << "\n GPU ID: " << s.device.gpu_id \
      << "\n Allow GPU Fallback (if running on DLA): " << s.device.allow_gpu_fallback \
      << "\n Avg Timing Iterations: " << s.num_avg_timing_iters \
-     << "\n Max Workspace Size: " << s.workspace_size;
+     << "\n Max Workspace Size: " << s.workspace_size \
+     << "\n DLA SRAM Size: " << s.dla_sram_size \
+     << "\n DLA Local DRAM Size: " << s.dla_local_dram_size \
+     << "\n DLA Global DRAM Size: " << s.dla_global_dram_size;

   os << "\n Device Type: " << s.device.device_type \
      << "\n GPU ID: " << s.device.gpu_id;
@@ -104,7 +107,10 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings)
   }

   cfg->setAvgTimingIterations(settings.num_avg_timing_iters);
-  cfg->setMaxWorkspaceSize(settings.workspace_size);
+  if (settings.workspace_size != 0) {
+    cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, settings.workspace_size);
+  }
+
   cfg->setDefaultDeviceType(settings.device.device_type);
   cfg->setEngineCapability(settings.capability);

@@ -118,6 +124,15 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings)
         settings.enabled_precisions.find(nvinfer1::DataType::kFLOAT) == settings.enabled_precisions.end(),
         "DLA supports only fp16 or int8 precision");
     cfg->setDLACore(settings.device.dla_core);
+    if (settings.dla_sram_size != 1048576) {
+      cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_MANAGED_SRAM, settings.dla_sram_size);
+    }
+    if (settings.dla_local_dram_size != 1073741824) {
+      cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_LOCAL_DRAM, settings.dla_local_dram_size);
+    }
+    if (settings.dla_global_dram_size != 536870912) {
+      cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_GLOBAL_DRAM, settings.dla_global_dram_size);
+    }
   }
 }
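
For context, here is a minimal standalone sketch of the TensorRT 8.4 builder-config API adopted above: setMemoryPoolLimit supersedes the deprecated setMaxWorkspaceSize, and the guards above skip the call when a setting still equals its default (1 MiB managed SRAM, 1 GiB local DRAM, 512 MiB global DRAM). The logger and cleanup here are illustrative only.

#include <NvInfer.h>
#include <iostream>

struct Logger : nvinfer1::ILogger {
  void log(Severity severity, const char* msg) noexcept override {
    if (severity <= Severity::kWARNING) std::cout << msg << std::endl;
  }
};

int main() {
  Logger logger;
  auto* builder = nvinfer1::createInferBuilder(logger);
  auto* cfg = builder->createBuilderConfig();

  // Replaces the deprecated cfg->setMaxWorkspaceSize(...)
  cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 1ULL << 30);

  // DLA pools, shown at the defaults used by BuilderSettings above
  cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_MANAGED_SRAM, 1ULL << 20); // 1 MiB
  cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_LOCAL_DRAM, 1ULL << 30);   // 1 GiB
  cfg->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kDLA_GLOBAL_DRAM, 1ULL << 29);  // 512 MiB

  delete cfg;
  delete builder;
  return 0;
}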

core/conversion/conversionctx/ConversionCtx.h

Lines changed: 3 additions & 0 deletions

@@ -36,6 +36,9 @@ struct BuilderSettings {
   uint64_t num_min_timing_iters = 2;
   uint64_t num_avg_timing_iters = 1;
   uint64_t workspace_size = 0;
+  uint64_t dla_sram_size = 1048576;
+  uint64_t dla_local_dram_size = 1073741824;
+  uint64_t dla_global_dram_size = 536870912;

   BuilderSettings() = default;
   BuilderSettings(const BuilderSettings& other) = default;

cpp/bin/torchtrtc/main.cpp

Lines changed: 15 additions & 0 deletions

@@ -117,6 +117,12 @@ int main(int argc, char** argv) {
       parser, "num_iters", "Number of averaging timing iterations used to select kernels", {"num-avg-timing-iters"});
   args::ValueFlag<uint64_t> workspace_size(
       parser, "workspace_size", "Maximum size of workspace given to TensorRT", {"workspace-size"});
+  args::ValueFlag<uint64_t> dla_sram_size(
+      parser, "dla_sram_size", "DLA managed SRAM size", {"dla-sram-size"});
+  args::ValueFlag<uint64_t> dla_local_dram_size(
+      parser, "dla_local_dram_size", "DLA Local DRAM size", {"dla-local-dram-size"});
+  args::ValueFlag<uint64_t> dla_global_dram_size(
+      parser, "dla_global_dram_size", "DLA Global DRAM size", {"dla-global-dram-size"});
   args::ValueFlag<double> atol(
       parser,
       "atol",
@@ -323,6 +329,15 @@ int main(int argc, char** argv) {
     if (dla_core) {
       compile_settings.device.dla_core = args::get(dla_core);
     }
+    if (dla_sram_size) {
+      compile_settings.dla_sram_size = args::get(dla_sram_size);
+    }
+    if (dla_local_dram_size) {
+      compile_settings.dla_local_dram_size = args::get(dla_local_dram_size);
+    }
+    if (dla_global_dram_size) {
+      compile_settings.dla_global_dram_size = args::get(dla_global_dram_size);
+    }
   } else {
     torchtrt::logging::log(
         torchtrt::logging::Level::kERROR, "Invalid device type, options are [ gpu | dla ] found: " + device);
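
Since the new flags are only read inside the DLA branch of the device handling above, a hypothetical invocation would pair them with the existing device selection. Paths and input shape are placeholders, the pool values shown are the defaults, and the --device-type spelling follows torchtrtc's existing option (treat it as illustrative):

torchtrtc model.ts model_trt.ts "(1,3,224,224)" --device-type=dla \
    --dla-sram-size=1048576 --dla-local-dram-size=1073741824 \
    --dla-global-dram-size=536870912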

cpp/include/torch_tensorrt/torch_tensorrt.h

Lines changed: 15 additions & 0 deletions

@@ -650,6 +650,21 @@ struct TORCHTRT_API CompileSpec {
    */
   uint64_t workspace_size = 0;

+  /**
+   * Fast software-managed RAM used by DLA to communicate within a layer.
+   */
+  uint64_t dla_sram_size = 1048576;
+
+  /**
+   * Host RAM used by DLA to share intermediate tensor data across operations.
+   */
+  uint64_t dla_local_dram_size = 1073741824;
+
+  /**
+   * Host RAM used by DLA to store weights and metadata for execution.
+   */
+  uint64_t dla_global_dram_size = 536870912;
+
   /**
    * Calibration dataloaders for each input for post training quantization
    */
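
A caller-side sketch of the new knobs (the module, input shape, and precision are placeholders; the DLA pool fields only take effect when compiling for DLA):

#include "torch_tensorrt/torch_tensorrt.h"

torch::jit::Module compile_for_dla(torch::jit::Module& mod) {
  auto spec = torch_tensorrt::ts::CompileSpec({torch_tensorrt::Input({1, 3, 224, 224})});
  spec.device.device_type = torch_tensorrt::Device::DeviceType::kDLA;
  spec.enabled_precisions.insert(torch::kHalf); // DLA needs fp16 or int8
  spec.dla_sram_size = 1ULL << 20;        // 1 MiB, the default
  spec.dla_local_dram_size = 1ULL << 30;  // 1 GiB, the default
  spec.dla_global_dram_size = 1ULL << 29; // 512 MiB, the default
  return torch_tensorrt::ts::compile(mod, spec);
}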

cpp/src/compile_spec.cpp

Lines changed: 3 additions & 0 deletions

@@ -83,6 +83,9 @@ torchtrt::core::CompileSpec to_internal_compile_spec(CompileSpec external) {
   internal.convert_info.engine_settings.device.dla_core = external.device.dla_core;
   internal.convert_info.engine_settings.num_avg_timing_iters = external.num_avg_timing_iters;
   internal.convert_info.engine_settings.workspace_size = external.workspace_size;
+  internal.convert_info.engine_settings.dla_sram_size = external.dla_sram_size;
+  internal.convert_info.engine_settings.dla_local_dram_size = external.dla_local_dram_size;
+  internal.convert_info.engine_settings.dla_global_dram_size = external.dla_global_dram_size;

   if (internal.convert_info.engine_settings.enabled_precisions.find(nvinfer1::DataType::kINT8) !=
       internal.convert_info.engine_settings.enabled_precisions.end()) {

examples/int8/ptq/main.cpp

Lines changed: 0 additions & 2 deletions

@@ -49,8 +49,6 @@ torch::jit::Module compile_int8_model(const std::string& data_dir, torch::jit::M
   compile_spec.enabled_precisions.insert(torch::kI8);
   /// Use the TensorRT Entropy Calibrator
   compile_spec.ptq_calibrator = calibrator;
-  /// Set a larger workspace
-  compile_spec.workspace_size = 1 << 28;

 #ifdef SAVE_ENGINE
   std::cout << "Compiling graph to save as TRT engine (/tmp/engine_converted_from_jit.trt)" << std::endl;

examples/int8/qat/main.cpp

Lines changed: 0 additions & 2 deletions

@@ -33,8 +33,6 @@ torch::jit::Module compile_int8_qat_model(const std::string& data_dir, torch::ji
   auto compile_spec = torch_tensorrt::ts::CompileSpec(inputs);
   /// Set operating precision to INT8
   compile_spec.enabled_precisions.insert(torch::kI8);
-  /// Set a larger workspace
-  compile_spec.workspace_size = 1 << 28;

 #ifdef SAVE_ENGINE
   std::cout << "Compiling graph to save as TRT engine (/tmp/engine_converted_from_jit.trt)" << std::endl;

py/torch_tensorrt/csrc/register_tensorrt_classes.cpp

Lines changed: 3 additions & 0 deletions

@@ -65,6 +65,9 @@ void RegisterTRTCompileSpec() {
   ADD_FIELD_GET_SET_REGISTRATION(
       TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, num_avg_timing_iters);
   ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, workspace_size);
+  ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, dla_sram_size);
+  ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, dla_local_dram_size);
+  ADD_FIELD_GET_SET_REGISTRATION(TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, dla_global_dram_size);
   ADD_FIELD_GET_SET_REGISTRATION(
       TRTCompileSpecTSRegistration, torch_tensorrt::pyapi::CompileSpec, truncate_long_and_double);
 }

py/torch_tensorrt/csrc/tensorrt_classes.cpp

Lines changed: 9 additions & 0 deletions

@@ -225,6 +225,12 @@ core::CompileSpec CompileSpec::toInternalCompileSpec() {
   info.convert_info.engine_settings.num_avg_timing_iters = num_avg_timing_iters;
   TORCHTRT_CHECK(workspace_size >= 0, "workspace_size must be 0 or greater");
   info.convert_info.engine_settings.workspace_size = workspace_size;
+  TORCHTRT_CHECK(dla_sram_size >= 4096, "DLA managed SRAM size must be at least 4 KiB and must be a power of 2. This defaults to 1 MiB");
+  info.convert_info.engine_settings.dla_sram_size = dla_sram_size;
+  TORCHTRT_CHECK(dla_local_dram_size >= 4096, "DLA Local DRAM size must be at least 4 KiB and must be a power of 2. This defaults to 1 GiB");
+  info.convert_info.engine_settings.dla_local_dram_size = dla_local_dram_size;
+  TORCHTRT_CHECK(dla_global_dram_size >= 4096, "DLA Global DRAM size must be at least 4 KiB and must be a power of 2. This defaults to 512 MiB");
+  info.convert_info.engine_settings.dla_global_dram_size = dla_global_dram_size;
   return info;
 }

@@ -249,6 +255,9 @@ std::string CompileSpec::stringify() {
   ss << " \"Engine Capability\": " << to_str(capability) << std::endl;
   ss << " \"Num Avg Timing Iters\": " << num_avg_timing_iters << std::endl;
   ss << " \"Workspace Size\": " << workspace_size << std::endl;
+  ss << " \"DLA SRAM Size\": " << dla_sram_size << std::endl;
+  ss << " \"DLA Local DRAM Size\": " << dla_local_dram_size << std::endl;
+  ss << " \"DLA Global DRAM Size\": " << dla_global_dram_size << std::endl;
   ss << " \"Truncate long and double\": " << truncate_long_and_double << std::endl;
   ss << " \"Torch Fallback\": " << torch_fallback.to_str();
   ss << "}";
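
One note on the checks above: the messages promise "must be a power of 2", but only the 4 KiB floor is actually enforced. A guard covering both constraints (hypothetical, not part of this commit) would be:

// Hypothetical helper enforcing both conditions named in the error messages.
inline bool valid_dla_pool_size(uint64_t size) {
  return size >= 4096 && (size & (size - 1)) == 0; // power of 2, at least 4 KiB
}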
