Skip to content

Commit 79dc360

Browse files
committed
Merge branch 'main' into fix_loop_fallback
2 parents 42e70bd + fce0a01 commit 79dc360

File tree

150 files changed

+1464
-399
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

150 files changed

+1464
-399
lines changed

.circleci/config.yml

Lines changed: 211 additions & 33 deletions
Large diffs are not rendered by default.

WORKSPACE

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,17 +56,17 @@ new_local_repository(
5656
http_archive(
5757
name = "libtorch",
5858
build_file = "@//third_party/libtorch:BUILD",
59-
sha256 = "59b8b5e1954a86d50b79c13f06398d385b200da13e37a08ecf31d3c62e5ca127",
59+
sha256 = "8b3b48615169c83c1b643c0efade078ea080b1da598e15fcf01bc59421f3095e",
6060
strip_prefix = "libtorch",
61-
urls = ["https://download.pytorch.org/libtorch/nightly/cu117/libtorch-cxx11-abi-shared-with-deps-2.0.0.dev20230103%2Bcu117.zip"],
61+
urls = ["https://download.pytorch.org/libtorch/nightly/cu117/libtorch-cxx11-abi-shared-with-deps-2.0.0.dev20230219%2Bcu117.zip"],
6262
)
6363

6464
http_archive(
6565
name = "libtorch_pre_cxx11_abi",
6666
build_file = "@//third_party/libtorch:BUILD",
67-
sha256 = "e260fc7476be89d1650953e8643e9f7363845f5a52de4bab87ac0e619c1f6ad4",
67+
sha256 = "aa7fd06079d260ff83c344d043fb84fbd9cf831cf375ed8b5a1b62416817af31",
6868
strip_prefix = "libtorch",
69-
urls = ["https://download.pytorch.org/libtorch/nightly/cu117/libtorch-shared-with-deps-2.0.0.dev20230103%2Bcu117.zip"],
69+
urls = ["https://download.pytorch.org/libtorch/nightly/cu117/libtorch-shared-with-deps-2.0.0.dev20230219%2Bcu117.zip"],
7070
)
7171

7272
# Download these tarballs manually from the NVIDIA website

core/compiler.cpp

Lines changed: 67 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,8 @@ partitioning::GraphAndMapping BuildHybridGraph(
138138
torch::jit::Block* block,
139139
CompileSpec cfg,
140140
ir::StaticParams static_params,
141-
ir::CollectionTypeMap first_use_types) {
141+
ir::CollectionTypeMap first_use_types,
142+
bool expect_full_compilation = false) {
142143
auto convert_info = cfg.convert_info;
143144
auto partitioning_info = cfg.partitioning_info;
144145

@@ -149,17 +150,20 @@ partitioning::GraphAndMapping BuildHybridGraph(
149150
// TODO: Combine this within partition call
150151
partitioning::populateInputIValues(&partitioning_ctx);
151152

152-
partitioning::partition(&partitioning_ctx);
153+
partitioning::partition(&partitioning_ctx, expect_full_compilation);
153154

154155
for (auto& partitioned_block : partitioning_ctx.partitioned_blocks) {
155156
partitioning::PartitionedGraph& segmented_blocks = partitioned_block.second;
157+
int num_torch_segments = 0;
158+
int num_trt_segments = 0;
156159

157160
for (auto& seg_block : segmented_blocks) {
158161
LOG_INFO("Block segment:" << seg_block);
159162
std::ostringstream trt_engine_id;
160163
trt_engine_id << reinterpret_cast<const int*>(&seg_block);
161164

162165
if (seg_block.target() == partitioning::SegmentedBlock::kTensorRT) {
166+
num_trt_segments++;
163167
auto inputs = seg_block.construct_inputs_spec();
164168
// update the input ranges for each segments
165169
convert_info.inputs = ir::associate_specs_with_inputs(seg_block.g(), inputs, static_params);
@@ -180,8 +184,32 @@ partitioning::GraphAndMapping BuildHybridGraph(
180184
true);
181185

182186
seg_block.update_graph(temp_g);
187+
} else {
188+
num_torch_segments++;
189+
190+
// If full compilation is expected, ensure that all operators in Torch blocks are
191+
// for collections processing
192+
if (expect_full_compilation) {
193+
for (auto torch_node : seg_block.block()->nodes()) {
194+
if (partitioning::CollectionNodeKinds.find(torch_node->kind()) == partitioning::CollectionNodeKinds.end()) {
195+
TORCHTRT_THROW_ERROR(
196+
"Full compilation specified but node "
197+
<< *torch_node
198+
<< " is set to run in PyTorch due to either lack of support in TensorRT or graph partitioning rules."
199+
<< " Try recompiling with require_full_compilation=False.");
200+
}
201+
}
202+
}
183203
}
184204
}
205+
206+
// If full compilation is expected, there must be at most 2 Torch segments
207+
// (one for preprocessing inputs, one for post-processing outputs) and exactly 1 TRT segment
208+
if (expect_full_compilation && !(num_torch_segments <= 2 && num_trt_segments == 1)) {
209+
TORCHTRT_THROW_ERROR(
210+
"Full compilation was requested but unable to convert all operations to TensorRT."
211+
<< " Try recompiling with require_full_compilation=False.");
212+
}
185213
}
186214

187215
return partitioning::stitch(&partitioning_ctx, block);
@@ -191,7 +219,8 @@ ir::TypeMap MapInputsAndDetermineDTypes(
191219
CompileSpec& cfg,
192220
std::shared_ptr<torch::jit::Graph>& g,
193221
ir::StaticParams& static_params,
194-
ir::CollectionTypeMap& first_use_type_map) {
222+
ir::CollectionTypeMap& first_use_type_map,
223+
bool requires_collection_handling = false) {
195224
cfg.convert_info.collection_input_spec_map =
196225
std::move(ir::associate_specs_with_collection_inputs(g, cfg.graph_inputs, static_params));
197226
cfg.partitioning_info.collection_input_spec_map =
@@ -226,7 +255,7 @@ ir::TypeMap MapInputsAndDetermineDTypes(
226255
"Cannot infer input type from calcuations in graph for input "
227256
<< in->debugName() << ". Assuming it is Float32. If not, specify input type explicity");
228257
spec[i].dtype = at::kFloat;
229-
} else if (spec[i].dtype_is_user_defined && cfg.partitioning_info.enabled) {
258+
} else if (spec[i].dtype_is_user_defined && (cfg.partitioning_info.enabled || requires_collection_handling)) {
230259
if (!est_type_opt[i]) {
231260
LOG_INFO("Cannot infer input tensor dtype in graph, compiler is going to use the user setting");
232261
std::stringstream ss;
@@ -297,6 +326,11 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::
297326
return engine;
298327
}
299328

329+
// Reports whether the compile spec carries any forced-fallback request:
// true when either the lowering stage's forced-fallback module list or the
// partitioning stage's forced-fallback operator list is non-empty.
bool userRequestedFallback(CompileSpec& cfg) {
  // A single forced-fallback module is already enough to answer "yes".
  if (cfg.lower_info.forced_fallback_modules.size() != 0) {
    return true;
  }
  // Otherwise the answer depends only on the forced-fallback operator list.
  return cfg.partitioning_info.forced_fallback_operators.size() != 0;
}
333+
300334
torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg) {
301335
torch::jit::Module new_mod(mod._ivalue()->name() + "_trt");
302336

@@ -315,8 +349,17 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
315349
// Infer the type of an input from the weights of the calculation
316350
auto first_use_types = ir::get_block_first_calc_dtypes_opt_collection(g->block());
317351

352+
// Determine if the block is convertible/has collection output, and based on the result,
353+
// whether full compilation can be expected
354+
auto isBlockConvertible = conversion::VerifyConverterSupportForBlock(g->block(), true);
355+
auto outputIsCollection = conversion::OutputIsCollection(g->block());
356+
auto requires_collection_handling = (isBlockConvertible && outputIsCollection);
357+
358+
// Determine whether user specifications necessitate partitioning
359+
auto isFallbackRequested = userRequestedFallback(cfg);
360+
318361
// Extract map of IValue to DType
319-
auto type_map = MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types);
362+
auto type_map = MapInputsAndDetermineDTypes(cfg, g, static_params, first_use_types, requires_collection_handling);
320363

321364
// Check whether any of the input types are Long
322365
bool user_requested_long = false;
@@ -330,20 +373,28 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
330373
user_requested_long &= (casts_inserted > 0);
331374
}
332375

333-
auto isBlockConvertible = conversion::VerifyConverterSupportForBlock(g->block(), true);
334-
auto outputIsCollection = conversion::OutputIsCollection(g->block());
335-
if (cfg.partitioning_info.enabled && !user_requested_long &&
336-
(cfg.lower_info.forced_fallback_modules.size() == 0 &&
337-
cfg.partitioning_info.forced_fallback_operators.size() == 0 && isBlockConvertible) &&
338-
!outputIsCollection) {
376+
// Partitioning is required if:
377+
// 1. User requested some modules/operators fallback
378+
// 2. The block (graph) cannot be converted due to operator coverage
379+
// 3. The output of the graph is a collection
380+
// 4. The user requested a non-TRT data type input
381+
auto isPartitioningRequired =
382+
(isFallbackRequested || !isBlockConvertible || outputIsCollection || user_requested_long);
383+
384+
// The user did not require full compilation, but the model can be fully compiled
385+
if (cfg.partitioning_info.enabled && !isPartitioningRequired) {
339386
LOG_INFO("Skipping partitioning since model is fully supported");
340387
}
341388

342-
if (cfg.partitioning_info.enabled &&
343-
(!(cfg.lower_info.forced_fallback_modules.size() == 0 &&
344-
cfg.partitioning_info.forced_fallback_operators.size() == 0 && isBlockConvertible) ||
345-
outputIsCollection || user_requested_long)) {
346-
auto graph_and_mapping = BuildHybridGraph(new_mod, g->block(), cfg, static_params, first_use_types);
389+
// The user did not require full compilation, and the model cannot be fully compiled (partitioning is required)
390+
// or, the user required full compilation but the I/O of the graph use collections
391+
if ((cfg.partitioning_info.enabled && isPartitioningRequired) || requires_collection_handling) {
392+
// If the model is fully-compilable and the user has specified full compilation, run partitioning
393+
// to generate collection-processing code in Torch
394+
auto expect_full_compilation = (requires_collection_handling && !cfg.partitioning_info.enabled);
395+
396+
auto graph_and_mapping =
397+
BuildHybridGraph(new_mod, g->block(), cfg, static_params, first_use_types, expect_full_compilation);
347398
new_g = graph_and_mapping.first;
348399
// renaming the input name of graph after fallback to ensure pytorch deserialize it correctly
349400
for (size_t i = 0; i < new_g->inputs().size(); ++i) {

core/conversion/converters/impl/reduce.cpp

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,8 @@ auto reduce_registrations TORCHTRT_UNUSED =
203203
return true;
204204
}})
205205
.pattern(
206-
{"aten::min(Tensor self) -> Tensor", [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
206+
{"aten::min(Tensor self) -> Tensor",
207+
[](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
207208
auto in_tensor = args[0].ITensorOrFreeze(ctx);
208209
auto in_dims = util::toVec(in_tensor->getDimensions());
209210

@@ -216,6 +217,38 @@ auto reduce_registrations TORCHTRT_UNUSED =
216217
min_layer->setName(util::node_info(n).c_str());
217218
auto out_tensor = ctx->AssociateValueAndTensor(n->outputs()[0], min_layer->getOutput(0));
218219

220+
LOG_DEBUG("Output shape: " << out_tensor->getDimensions());
221+
return true;
222+
}})
223+
.pattern(
224+
{"aten::any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor",
225+
[](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
226+
auto in_tensor = args[0].ITensorOrFreeze(ctx);
227+
auto in_dims = in_tensor->getDimensions();
228+
auto dim = args[1].unwrapToInt();
229+
LOG_DEBUG("Dim to reduce (original): " << dim);
230+
dim = dim < 0 ? (in_dims.nbDims + dim) : dim;
231+
LOG_DEBUG("Dim to reduce (converted): " << dim);
232+
233+
uint32_t axis_mask = 1 << dim;
234+
LOG_DEBUG("Axis Mask: " << std::bitset<32>(axis_mask));
235+
236+
auto keepdim = args[2].unwrapToBool();
237+
LOG_DEBUG("Keep dims: " << keepdim);
238+
239+
// Reduce does not work on bool inputs
240+
if (in_tensor->getType() == nvinfer1::DataType::kBOOL) {
241+
in_tensor =
242+
castITensor(ctx, in_tensor, nvinfer1::DataType::kINT32, (util::node_info(n) + "_in").c_str());
243+
}
244+
auto sum_layer = ctx->net->addReduce(*in_tensor, nvinfer1::ReduceOperation::kSUM, axis_mask, keepdim);
245+
246+
TORCHTRT_CHECK(sum_layer, "Unable to create sum layer from node: " << *n);
247+
248+
sum_layer->setName(util::node_info(n).c_str());
249+
auto out_tensor = castITensor(
250+
ctx, sum_layer->getOutput(0), nvinfer1::DataType::kBOOL, (util::node_info(n) + "_out").c_str());
251+
out_tensor = ctx->AssociateValueAndTensor(n->outputs()[0], out_tensor);
219252
LOG_DEBUG("Output shape: " << out_tensor->getDimensions());
220253
return true;
221254
}});

core/conversion/converters/impl/select.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,29 @@ auto select_registrations TORCHTRT_UNUSED =
180180
return true;
181181
}})
182182
.pattern(
183+
{"aten::index_select(Tensor self, int dim, Tensor index) -> Tensor",
184+
[](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
185+
auto in = args[0].ITensorOrFreeze(ctx);
186+
auto maxDim = static_cast<int64_t>(in->getDimensions().nbDims);
187+
auto dim = args[1].unwrapToInt();
188+
// Handle negative axis by referring to nbDims of input Tensor
189+
dim = dim < 0 ? dim + maxDim : dim;
190+
auto index = args[2].ITensorOrFreeze(ctx);
191+
192+
LOG_DEBUG("Gather input dimensions: " << in->getDimensions());
193+
LOG_DEBUG("Dimension to select: " << dim);
194+
LOG_DEBUG("Index dimensions: " << index->getDimensions());
195+
196+
auto gather_layer = ctx->net->addGather(*in, *index, dim);
197+
TORCHTRT_CHECK(gather_layer, "Unable to create gather layer from node: " << *n);
198+
auto out = gather_layer->getOutput(0);
199+
LOG_DEBUG("Gather tensor shape: " << out->getDimensions());
200+
201+
out = ctx->AssociateValueAndTensor(n->outputs()[0], out);
202+
LOG_DEBUG("Output tensor shape: " << out->getDimensions());
203+
return true;
204+
}})
205+
.pattern(
183206
{"aten::narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a)",
184207
[](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
185208
auto in = args[0].ITensor();
@@ -337,7 +360,7 @@ auto select_registrations TORCHTRT_UNUSED =
337360

338361
// IGatherLayer takes in input tensor, the indices, and the axis of input tensor to take indices
339362
// from
340-
auto gather_layer = ctx->net->addGather(*in, *indicesTensor, 0);
363+
auto gather_layer = ctx->net->addGather(*in, *indicesTensor, adv_idx_indices[0]);
341364
TORCHTRT_CHECK(gather_layer, "Unable to create gather layer from node: " << *n);
342365
auto gather_out = gather_layer->getOutput(0);
343366

core/conversion/converters/impl/unary.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,21 @@ auto reciprocal_registration TORCHTRT_UNUSED = RegisterNodeConversionPatterns().
3434
return true;
3535
}});
3636

37+
// Converter registration for aten::logical_not.
// Adds a TensorRT unary kNOT layer on the input tensor, casting the input to
// bool beforehand when it is not already of bool type, and associates the
// layer's output with the node's first output value.
auto logical_not_registration TORCHTRT_UNUSED = RegisterNodeConversionPatterns().pattern(
    {"aten::logical_not(Tensor self) -> Tensor",
     [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
       auto operand = args[0].ITensorOrFreeze(ctx);

       // unary not layer only supports bool inputs
       const bool needs_bool_cast = operand->getType() != nvinfer1::DataType::kBOOL;
       if (needs_bool_cast) {
         operand = castITensor(ctx, operand, nvinfer1::DataType::kBOOL, util::node_info(n).c_str());
       }

       auto unary_layer = ctx->net->addUnary(*operand, nvinfer1::UnaryOperation::kNOT);
       TORCHTRT_CHECK(unary_layer, "Unable to create logical_not layer from node: " << *n);
       unary_layer->setName(util::node_info(n).c_str());

       // Bind the layer output to the JIT value so downstream converters can find it.
       auto result_tensor = ctx->AssociateValueAndTensor(n->outputs()[0], unary_layer->getOutput(0));
       LOG_DEBUG("Output tensor shape: " << result_tensor->getDimensions());
       return true;
     }});
51+
3752
#define convert(unary, trt_type) \
3853
auto unary##_registrations TORCHTRT_UNUSED = RegisterNodeConversionPatterns().pattern( \
3954
{"aten::" #unary "(Tensor self) -> Tensor", \

core/conversion/converters/impl/unsqueeze.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ auto unsqueeze_registrations TORCHTRT_UNUSED = RegisterNodeConversionPatterns().
3232

3333
auto shuffle_layer = ctx->net->addShuffle(*self);
3434
TORCHTRT_CHECK(shuffle_layer, "Unable to create shuffle layer from node: " << *n);
35-
shuffle_layer->setReshapeDimensions(util::unsqueezeDims(self->getDimensions(), dim));
35+
shuffle_layer->setReshapeDimensions(util::unsqueezeDims(self->getDimensions(), dim, 1, false));
3636

3737
auto out = ctx->AssociateValueAndTensor(n->outputs()[0], shuffle_layer->getOutput(0));
3838

core/lowering/lowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ int AutocastLongInputs(
3232
std::string target_device_name) {
3333
int num_autocasts = 0;
3434
// For each graph input, determine if it can be autocasted
35-
for (int i = 0; i < g->inputs().size(); i++) {
35+
for (size_t i = 0; i < g->inputs().size(); i++) {
3636
auto input = g->inputs()[i];
3737

3838
// Autocasted inputs must be Tensor-type

0 commit comments

Comments
 (0)