Skip to content

Commit 0f48534

Browse files
committed
Merge branch 'master' into pr934
2 parents dd7a44e + c952291 commit 0f48534

File tree

105 files changed

+4825
-3245
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

105 files changed

+4825
-3245
lines changed

.bazelversion

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
4.2.1
1+
5.1.1

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,10 +111,10 @@ torch.jit.save(trt_ts_module, "trt_torchscript_module.ts") # save the TRT embedd
111111
The following dependencies were used to verify the test cases. Torch-TensorRT can work with other versions, but the tests are not guaranteed to pass.
112112

113113
- Bazel 4.2.1
114-
- Libtorch 1.10.0 (built with CUDA 11.3)
114+
- Libtorch 1.11.0 (built with CUDA 11.3)
115115
- CUDA 11.3 (10.2 on Jetson)
116-
- cuDNN 8.2
117-
- TensorRT 8.0.3.4 (TensorRT 8.0.1.6 on Jetson)
116+
- cuDNN 8.2.1
117+
- TensorRT 8.2.4.2 (TensorRT 8.2.1 on Jetson)
118118

119119
## Prebuilt Binaries and Wheel files
120120

WORKSPACE

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,17 +56,17 @@ new_local_repository(
5656
http_archive(
5757
name = "libtorch",
5858
build_file = "@//third_party/libtorch:BUILD",
59-
sha256 = "190e963e739d5f7c2dcf94b3994de8fcd335706a4ebb333812ea7d8c841beb06",
59+
sha256 = "8d9e829ce9478db4f35bdb7943308cf02e8a2f58cf9bb10f742462c1d57bf287",
6060
strip_prefix = "libtorch",
61-
urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-cxx11-abi-shared-with-deps-1.10.0%2Bcu113.zip"],
61+
urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-cxx11-abi-shared-with-deps-1.11.0%2Bcu113.zip"],
6262
)
6363

6464
http_archive(
6565
name = "libtorch_pre_cxx11_abi",
6666
build_file = "@//third_party/libtorch:BUILD",
67-
sha256 = "0996a6a4ea8bbc1137b4fb0476eeca25b5efd8ed38955218dec1b73929090053",
67+
sha256 = "90159ecce3ff451f3ef3f657493b6c7c96759c3b74bbd70c1695f2ea2f81e1ad",
6868
strip_prefix = "libtorch",
69-
urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.10.0%2Bcu113.zip"],
69+
urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.11.0%2Bcu113.zip"],
7070
)
7171

7272
# Download these tarballs manually from the NVIDIA website
@@ -86,10 +86,10 @@ http_archive(
8686
http_archive(
8787
name = "tensorrt",
8888
build_file = "@//third_party/tensorrt/archive:BUILD",
89-
sha256 = "da130296ac6636437ff8465812eb55dbab0621747d82dc4fe9b9376f00d214af",
90-
strip_prefix = "TensorRT-8.2.2.1",
89+
sha256 = "826180eaaecdf9a7e76116855b9f1f3400ea9b06e66b06a3f6a0747ba6f863ad",
90+
strip_prefix = "TensorRT-8.2.4.2",
9191
urls = [
92-
"https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.2.2.1/tars/tensorrt-8.2.2.1.linux.x86_64-gnu.cuda-11.4.cudnn8.2.tar.gz",
92+
"https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.2.4/tars/tensorrt-8.2.4.2.linux.x86_64-gnu.cuda-11.4.cudnn8.2.tar.gz",
9393
],
9494
)
9595

core/compiler.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -328,7 +328,8 @@ void MapInputsAndDetermineDTypes(
328328
spec.dtype = nvinfer1::DataType::kFLOAT;
329329
} else if (spec.dtype_is_user_defined && cfg.partition_info.enabled) {
330330
if (!est_type_opt) {
331-
LOG_INFO("Cannot infer input tensor dtype in graph, unable to verify user input dtype settings");
331+
LOG_INFO("Cannot infer input tensor dtype in graph. Using user provided input dtype settings");
332+
first_use_type_map[in] = {util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype)};
332333
} else {
333334
if (util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype) != est_type_opt.value()) {
334335
std::stringstream ss;
@@ -344,9 +345,10 @@ void MapInputsAndDetermineDTypes(
344345
ss << "- Disable partial compilation by setting require_full_compilation to True";
345346
auto warn_str = ss.str();
346347
LOG_WARNING(warn_str);
347-
// Overwrite type map with user settings
348-
first_use_type_map[in] = {util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype)};
349348
}
349+
// Overwrite type map with user settings
350+
// We use this map for partitioning since we need c10::ScalarTypes, not nvinfer1::DataTypes
351+
first_use_type_map[in] = {util::TRTDataTypeToScalarType(cfg.convert_info.inputs.find(in)->second.dtype)};
350352
}
351353
} else {
352354
// The user defined the type so no changes are necessary
@@ -417,18 +419,16 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
417419
auto first_use_types = ir::get_block_first_calc_dtypes_opt(g->block());
418420

419421
MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types);
420-
422+
auto isBlockConvertible = conversion::VerifyConverterSupportForBlock(g->block(), true);
421423
if (cfg.partition_info.enabled &&
422424
(cfg.lower_info.forced_fallback_modules.size() == 0 &&
423-
cfg.partition_info.forced_fallback_operators.size() == 0 &&
424-
conversion::VerifyConverterSupportForBlock(g->block(), true))) {
425+
cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible)) {
425426
LOG_INFO("Skipping partitioning since model is fully supported");
426427
}
427428

428429
if (cfg.partition_info.enabled &&
429430
!(cfg.lower_info.forced_fallback_modules.size() == 0 &&
430-
cfg.partition_info.forced_fallback_operators.size() == 0 &&
431-
conversion::VerifyConverterSupportForBlock(g->block(), true))) {
431+
cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible)) {
432432
auto input_ivalues_map = partitioning::generateRandomInputs(cfg.convert_info.inputs, first_use_types);
433433
auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), input_ivalues_map, cfg, static_params);
434434
new_g = graph_and_mapping.first;

core/conversion/conversion.cpp

Lines changed: 49 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
#include "core/conversion/conversion.h"
2+
#include <ATen/core/operator_name.h>
23
#include <torch/torch.h>
34
#include <sstream>
5+
#include "c10/util/intrusive_ptr.h"
46
#include "core/conversion/conversionctx/ConversionCtx.h"
7+
#include "core/conversion/converters/converter_util.h"
58
#include "core/conversion/converters/converters.h"
69
#include "core/conversion/evaluators/evaluators.h"
10+
#include "core/conversion/tensorcontainer/TensorContainer.h"
711
#include "core/conversion/var/Var.h"
812
#include "core/util/prelude.h"
9-
10-
#include "c10/util/intrusive_ptr.h"
11-
#include "core/conversion/converters/converter_util.h"
12-
#include "core/conversion/tensorcontainer/TensorContainer.h"
1313
#include "core/util/trt_util.h"
1414

1515
namespace torch_tensorrt {
@@ -105,7 +105,8 @@ void AddLayer(ConversionCtx* ctx, const torch::jit::Node* n) {
105105
// Node input has not been converted yet or is a prim op
106106
TORCHTRT_THROW_ERROR(
107107
"Unable to retrieve all node inputs for node: "
108-
<< util::node_info(n) << " (ctx.AddLayer)\nSpecifically failed to retrieve value for input: " << *input_node);
108+
<< util::node_info(n) << " (ctx.AddLayer)\nSpecifically failed to retrieve value for input: %"
109+
<< input->debugName());
109110
}
110111
}
111112

@@ -426,10 +427,18 @@ void ConvertBlockToNetDef(
426427
<< " and node outputs size: " << n->outputs().size() << " must match.");
427428
for (size_t i = 0; i < eval_list->elements().size(); i++) {
428429
auto eval_output = eval_list.get()->elements()[i];
429-
LOG_DEBUG(
430-
ctx->logger,
431-
"Found the evaluated value(s) to be " << eval_output << " for node: " << util::node_info(n));
432-
ctx->AssociateValueAndIValue(n->output(i), eval_output);
430+
if (eval_output.isCustomClass()) {
431+
auto container = eval_output.toCustomClass<TensorContainer>();
432+
auto tensor = container->tensor();
433+
LOG_DEBUG(
434+
ctx->logger, "Found the evaluated value(s) to be an ITensor of shape: " << tensor->getDimensions());
435+
ctx->AssociateValueAndTensor(n->output(i), tensor);
436+
} else {
437+
LOG_DEBUG(
438+
ctx->logger,
439+
"Found the evaluated value(s) to be " << eval_output << " for node: " << util::node_info(n));
440+
ctx->AssociateValueAndIValue(n->output(i), eval_output);
441+
}
433442
}
434443
} else {
435444
TORCHTRT_THROW_ERROR("Unsupported return type for evaluated node");
@@ -487,15 +496,23 @@ std::string ConvertBlockToEngine(
487496
std::unordered_map<c10::OperatorName, std::string> GetUnsupportedOpsInBlock(const torch::jit::Block* b) {
488497
std::unordered_map<c10::OperatorName, std::string> unsupported_ops;
489498
for (const auto n : b->nodes()) {
490-
if (n->kind() != torch::jit::prim::Loop && n->kind() != torch::jit::prim::If && !OpSupported(n)) {
491-
auto schema = n->maybeSchema();
492-
TORCHTRT_CHECK(
493-
schema,
494-
"Unable to get schema for Node " << util::node_info(n) << " (conversion.VerifyCoverterSupportForBlock)");
495-
std::stringstream ss;
496-
ss << *schema;
497-
unsupported_ops[schema->operator_name()] = ss.str();
499+
auto schema = n->maybeSchema();
500+
// Some ops like torch::jit::prim::Loop, torch::jit::prim::If, torch::jit::prim::DictConstruct don't have a schema
501+
// but they are supported. torch::jit::prim::DictConstruct is supported via fallback only
502+
if (!OpSupported(n)) {
503+
if (schema) {
504+
std::stringstream ss;
505+
ss << *schema;
506+
unsupported_ops[schema->operator_name()] = ss.str();
507+
} else {
508+
std::stringstream ss;
509+
ss << util::node_info(n);
510+
// operator.overload is a filler name just to call the constructor.
511+
c10::OperatorName op(ss.str(), "operator.overload");
512+
unsupported_ops[op] = ss.str();
513+
}
498514
}
515+
499516
for (const auto sub_b : n->blocks()) {
500517
auto sub_b_unsupported_ops = GetUnsupportedOpsInBlock(sub_b);
501518
unsupported_ops.insert(sub_b_unsupported_ops.begin(), sub_b_unsupported_ops.end());
@@ -530,22 +547,25 @@ std::set<std::string> ConvertableOpsInBlock(const torch::jit::Block* b) {
530547

531548
bool VerifyConverterSupportForBlock(const torch::jit::Block* b, bool suppress_errors) {
532549
auto unsupported_ops = GetUnsupportedOpsInBlock(b);
533-
534550
if (unsupported_ops.size() != 0) {
535551
std::stringstream unsupported_msg;
536552
unsupported_msg
537-
<< "Method requested cannot be compiled by Torch-TensorRT.TorchScript.\nUnsupported operators listed below:"
553+
<< "Method requested cannot be compiled end to end by Torch-TensorRT.TorchScript.\nUnsupported operators listed below:"
538554
<< std::endl;
539555
for (auto s : unsupported_ops) {
540556
unsupported_msg << " - " << s.second << std::endl;
541557
}
542-
unsupported_msg << "You can either implement converters for these ops in your application or request implementation"
543-
<< std::endl;
544-
unsupported_msg << "https://www.github.com/nvidia/Torch-TensorRT/issues" << std::endl;
545-
unsupported_msg << std::endl << "In Module:" << std::endl;
546558

547559
if (!suppress_errors) {
560+
unsupported_msg
561+
<< "You can either implement converters for these ops in your application or request implementation"
562+
<< std::endl;
563+
unsupported_msg << "https://www.github.com/nvidia/Torch-TensorRT/issues" << std::endl;
564+
unsupported_msg << std::endl << "In Module:" << std::endl;
565+
548566
LOG_ERROR(unsupported_msg.str());
567+
} else {
568+
LOG_INFO(unsupported_msg.str());
549569
}
550570

551571
std::unordered_map<std::string, std::unordered_set<std::string>> unsupported_node_locations;
@@ -571,8 +591,13 @@ bool VerifyConverterSupportForBlock(const torch::jit::Block* b, bool suppress_er
571591
for (const auto& str : type.second) {
572592
traceback << str;
573593
}
594+
574595
auto tb_str = traceback.str();
575-
LOG_ERROR(tb_str);
596+
if (!suppress_errors) {
597+
LOG_ERROR(tb_str);
598+
} else {
599+
LOG_DEBUG(tb_str);
600+
}
576601
}
577602

578603
return false;

core/conversion/converters/impl/activation.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,27 @@ auto acthardtanh TORCHTRT_UNUSED =
8787

8888
bool to_reshape = false;
8989
auto original_shape = in->getDimensions();
90+
91+
// The output tensor of ParametricReLU has an all-zero shape when the slopes' nDims is not equal to the input's nDims.
92+
// So make sure the slopes' nDims is equal to the input's nDims.
93+
if (slopes.ndimension() == 1 and original_shape.nbDims != slopes.ndimension()) {
94+
std::vector<int64_t> slopes_new_shape(original_shape.nbDims, 1);
95+
auto first_inputs_allowed_formats = ctx->net->getInput(0)->getAllowedFormats();
96+
for (size_t inputs_index = 1; inputs_index < ctx->num_inputs; inputs_index++) {
97+
auto inputs_allowed_formats = ctx->net->getInput(inputs_index)->getAllowedFormats();
98+
TORCHTRT_CHECK(
99+
first_inputs_allowed_formats == inputs_allowed_formats,
100+
"Unable to create batch prelu layer from node,since the formats(like NHWC or NCHW) of inputs is different: "
101+
<< *n);
102+
}
103+
if (1U << static_cast<int>(nvinfer1::TensorFormat::kLINEAR) == first_inputs_allowed_formats) {
104+
slopes_new_shape[1] = slopes.sizes().vec()[0];
105+
} else {
106+
slopes_new_shape[original_shape.nbDims - 1] = slopes.sizes().vec()[0];
107+
}
108+
slopes = slopes.reshape(slopes_new_shape);
109+
}
110+
90111
if (slopes.numel() != 1 &&
91112
!util::broadcastable(
92113
in->getDimensions(),

core/conversion/converters/impl/batch_norm.cpp

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -50,27 +50,35 @@ auto batch_norm_registrations TORCHTRT_UNUSED =
5050
auto orig_shape = input->getDimensions();
5151
auto shape = util::toVec(orig_shape);
5252
auto tensor_type = util::TRTDataTypeToScalarType(input->getType());
53-
auto options = torch::TensorOptions().dtype(tensor_type);
53+
auto options =
54+
torch::TensorOptions().dtype(tensor_type).device(torch::kCUDA, ctx->settings.device.gpu_id);
5455

5556
torch::Tensor gamma, beta, mean, var;
57+
LOG_DEBUG("Input :" << orig_shape << "/" << input->getType());
58+
// affine=True
59+
LOG_DEBUG("Args[1] gamma : " << args[1].isIValue() << " / " << args[1].IValue()->isNone());
60+
LOG_DEBUG("Args[2] beta : " << args[2].isIValue() << " / " << args[2].IValue()->isNone());
61+
// track_running_stats=True
62+
LOG_DEBUG("Args[3] mean : " << args[3].isIValue() << " / " << args[3].IValue()->isNone());
63+
LOG_DEBUG("Args[4] var : " << args[4].isIValue() << " / " << args[4].IValue()->isNone());
64+
LOG_DEBUG("use_input_stats, momemtum, cudnn_enabled disregarded");
65+
LOG_DEBUG("ctx->input_is_dynamic : " << ctx->input_is_dynamic);
5666

67+
auto channel_dim = shape[1];
5768
if (ctx->input_is_dynamic) {
58-
gamma = args[1].unwrapToTensor();
59-
beta = args[2].unwrapToTensor();
69+
gamma = args[1].unwrapToTensor(at::full(channel_dim, 1, options));
70+
beta = args[2].unwrapToTensor(at::full(channel_dim, 0, options));
6071
mean = args[3].unwrapToTensor();
6172
var = args[4].unwrapToTensor();
6273
} else {
63-
gamma = args[1].unwrapToTensor(at::full({shape}, 1, {options}));
64-
beta = args[2].unwrapToTensor(at::full({shape}, 1, {options}));
65-
mean = args[3].unwrapToTensor(at::full({shape}, 0, {options}));
66-
var = args[4].unwrapToTensor(at::full({shape}, 0, {options}));
74+
gamma = args[1].unwrapToTensor(at::full(channel_dim, 1, options));
75+
beta = args[2].unwrapToTensor(at::full(channel_dim, 0, options));
76+
mean = args[3].unwrapToTensor(at::full(channel_dim, 0, options));
77+
var = args[4].unwrapToTensor(at::full(channel_dim, 0, options));
6778
}
6879

6980
auto eps = static_cast<float>(args[7].unwrapToDouble(1e-5f));
7081

71-
LOG_DEBUG("momentum disregarded");
72-
LOG_DEBUG("training disregarded");
73-
LOG_DEBUG("cudnn disregarded");
7482
TORCHTRT_CHECK(orig_shape.nbDims >= 2, "Unable to create batch normalization layer from node: " << *n);
7583

7684
// Expand spatial dims from 1D to 2D if needed

core/conversion/converters/impl/cast.cpp

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,43 @@ auto cast_registrations TORCHTRT_UNUSED =
1818
[](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
1919
auto self = args[0].ITensorOrFreeze(ctx);
2020
auto output_dtype = args[1].unwrapToScalar().to<int64_t>();
21-
auto trt_dtype = util::ScalarTypeToTRTDataType(static_cast<at::ScalarType>(output_dtype));
21+
auto scalar_dtype = static_cast<at::ScalarType>(output_dtype);
22+
nvinfer1::DataType trt_dtype;
23+
if (scalar_dtype == at::kLong) {
24+
LOG_WARNING("Truncating aten::to output type from at::kLong to at::kInt");
25+
trt_dtype = nvinfer1::DataType::kINT32;
26+
} else {
27+
trt_dtype = util::ScalarTypeToTRTDataType(static_cast<at::ScalarType>(output_dtype));
28+
}
2229
auto casted_itensor = castITensor(ctx, self, trt_dtype);
2330
auto output = ctx->AssociateValueAndTensor(n->outputs()[0], casted_itensor);
2431
LOG_DEBUG("[aten::to.dtype] Output tensor shape: " << output->getDimensions());
2532

2633
return true;
2734
}})
2835
.pattern(
36+
{"aten::to.device(Tensor(a) self, Device device, int dtype, bool non_blocking=False, bool copy=False, int? memory_format=None) -> (Tensor(a))",
37+
[](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
38+
// This converter does basically the same thing as the previous one; however, we cannot lower this
39+
// signature to the previous one because that would cause device issues when we run the TorchScript module in the
40+
// later shape analysis phase of fallback
41+
auto self = args[0].ITensorOrFreeze(ctx);
42+
auto output_dtype = args[2].unwrapToScalar().to<int64_t>();
43+
auto scalar_dtype = static_cast<at::ScalarType>(output_dtype);
44+
nvinfer1::DataType trt_dtype;
45+
if (scalar_dtype == at::kLong) {
46+
LOG_WARNING("Truncating aten::to output type from at::kLong to at::kInt");
47+
trt_dtype = nvinfer1::DataType::kINT32;
48+
} else {
49+
trt_dtype = util::ScalarTypeToTRTDataType(static_cast<at::ScalarType>(output_dtype));
50+
}
51+
auto casted_itensor = castITensor(ctx, self, trt_dtype);
52+
auto output = ctx->AssociateValueAndTensor(n->outputs()[0], casted_itensor);
53+
LOG_DEBUG("[aten::to.device] Output tensor shape: " << output->getDimensions());
54+
55+
return true;
56+
}})
57+
.pattern(
2958
{"aten::to.other(Tensor self, Tensor other, bool non_blocking=False, bool copy=False, int? memory_format=None) -> (Tensor)",
3059
[](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
3160
auto self = args[0].ITensorOrFreeze(ctx);

core/conversion/converters/impl/cumsum.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ auto cumsum_registrations TORCHTRT_UNUSED = RegisterNodeConversionPatterns().pat
4848
auto data = iterator->getOutput(0);
4949
auto newDims = data->getDimensions();
5050

51-
torch::Tensor zeroValue = at::full(util::toVec(newDims), 0, torch::kFloat32);
51+
torch::Tensor zeroValue =
52+
at::full(util::toVec(newDims), 0, torch_tensorrt::core::util::TRTDataTypeToScalarType(in->getType()));
5253
auto zeroTensor = tensor_to_const(ctx, zeroValue);
5354
auto runningSum = loop->addRecurrence(*zeroTensor);
5455
auto runningSumTensor = runningSum->getOutput(0);

0 commit comments

Comments
 (0)