Commit d91e84a

NHZlX authored and Paddle CI committed

fix ssa bug with batchnorm and refine the trt interface

Merge pull request #12843 from NHZlX:fix_ssa_bug_for_trt

1 parent dafd6d0 commit d91e84a

9 files changed, with 45 additions and 21 deletions.

paddle/fluid/inference/analysis/analyzer.cc

Lines changed: 2 additions & 1 deletion
@@ -44,7 +44,8 @@ class DfgPassManagerImpl final : public DfgPassManager {
     if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
       auto trt_teller = [&](const Node* node) {
         std::unordered_set<std::string> teller_set(
-            {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax"});
+            {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax",
+             "depthwise_conv2d", "batch_norm"});
         if (!node->IsFunction()) return false;
 
         const auto* func = static_cast<const Function*>(node);
paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc

Lines changed: 0 additions & 5 deletions
@@ -23,9 +23,6 @@
 namespace paddle {
 namespace inference {
 
-DEFINE_int32(tensorrt_max_batchsize, 1, "TensorRT maximum batch size");
-DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size");
-
 namespace analysis {
 
 using framework::proto::ProgramDesc;
@@ -190,8 +187,6 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
   // Set attrs
   SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
   SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
-  SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize);
-  SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size);
   SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
   SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
   node->SetPbMsg(desc.Proto()->SerializeAsString());

paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h

Lines changed: 0 additions & 3 deletions
@@ -27,9 +27,6 @@
 namespace paddle {
 namespace inference {
 
-DECLARE_int32(tensorrt_max_batchsize);
-DECLARE_int32(tensorrt_workspace_size);
-
 namespace analysis {
 class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
  public:

paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc

Lines changed: 1 addition & 1 deletion
@@ -92,6 +92,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
         auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
         in->outlinks.push_back(o);
         o->inlinks.push_back(in);
+        unique_written_vars.insert(in);
       }
     }
     for (int j = 0; j < op.outputs_size(); j++) {
@@ -112,7 +113,6 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
       }
       out->inlinks.push_back(o);
       o->outlinks.push_back(out);
-      unique_written_vars.insert(out);
     }
   }
 }

paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc

Lines changed: 12 additions & 1 deletion
@@ -15,6 +15,7 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/operators/tensorrt_engine_op.h"
 
@@ -32,7 +33,8 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
 
   bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
     VLOG(3) << "Predictor::init()";
-
+    FLAGS_tensorrt_max_batch_size = config_.max_batch_size;
+    FLAGS_tensorrt_workspace_size = config_.workspace_size;
     if (config_.use_gpu) {
       place_ = paddle::platform::CUDAPlace(config_.device);
     } else {
@@ -150,3 +152,12 @@ CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
 }
 
 }  // namespace paddle
+
+USE_TRT_CONVERTER(elementwise_add_weight);
+USE_TRT_CONVERTER(mul);
+USE_TRT_CONVERTER(conv2d);
+USE_TRT_CONVERTER(relu);
+USE_TRT_CONVERTER(fc);
+USE_TRT_CONVERTER(pool2d);
+USE_TRT_CONVERTER(softmax);
+USE_TRT_CONVERTER(batch_norm);
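
The USE_TRT_CONVERTER lines pull each op converter's translation unit into the final link so the converters are actually registered at runtime. A hedged sketch of the force-link idiom such a macro typically expands to (the symbol names here are hypothetical, not Paddle's actual definition in op_converter.h):

// Hypothetical sketch only: referencing an extern symbol defined beside the
// converter's registration code keeps the linker from discarding that object
// file, so the converter self-registers before the engine needs it.
#define USE_TRT_CONVERTER(op_type__)                              \
  extern int TouchConverterRegistrar_##op_type__();               \
  static int touch_trt_converter_##op_type__ =                    \
      TouchConverterRegistrar_##op_type__();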

paddle/fluid/inference/api/paddle_inference_api.h

Lines changed: 8 additions & 0 deletions
@@ -137,6 +137,14 @@ struct AnakinConfig : public PaddlePredictor::Config {
 struct TensorRTConfig : public NativeConfig {
   // Determine whether a subgraph will be executed by TRT.
   int min_subgraph_size{1};
+  // While TensorRT allows an engine optimized for a given max batch size
+  // to run at any smaller size, performance for those smaller sizes may
+  // not be as well-optimized, so max_batch_size should ideally match the
+  // runtime batch size.
+  int max_batch_size{1};
+  // For workspace_size, see:
+  // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
+  int workspace_size{1 << 30};
 };
 
 // A factory to help create different predictors.
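
For reference, a minimal sketch of how a client drives the refined interface (max_batch_size and workspace_size are from this diff; model_dir, use_gpu, and device are pre-existing NativeConfig fields):

#include <string>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Sketch: configure the TensorRT subgraph predictor via TensorRTConfig
// rather than the removed tensorrt_max_batchsize/workspace_size gflags.
paddle::TensorRTConfig MakeTrtConfig(const std::string& model_dir) {
  paddle::TensorRTConfig config;
  config.model_dir = model_dir;     // inherited from NativeConfig
  config.use_gpu = true;
  config.device = 0;
  config.max_batch_size = 4;        // ideally equal to the runtime batch size
  config.workspace_size = 1 << 28;  // TensorRT scratch memory, here 256 MB
  return config;
}

The predictor is then created with CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(config); per the api_tensorrt_subgraph_engine.cc change above, Init() copies these two fields into the global flags.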

paddle/fluid/operators/tensorrt_engine_op.cc

Lines changed: 2 additions & 2 deletions
@@ -22,6 +22,8 @@
 namespace paddle {
 
 DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
+DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size");
+DEFINE_int32(tensorrt_workspace_size, 16 << 20, "TensorRT workspace size");
 
 namespace operators {
 
@@ -32,8 +34,6 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Ys", "A list of outputs").AsDuplicable();
     AddAttr<std::string>("subgraph", "the subgraph.");
     AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
-    AddAttr<int>("max_batch", "the maximum batch size.");
-    AddAttr<int>("max_workspace", "the maximum batch size.");
     AddComment("TensorRT engine operator.");
   }
 };
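
Assuming these map to the usual gflags macros, the defaults can also be overridden on the command line of a gflags-enabled binary (e.g. --tensorrt_workspace_size=...). A minimal sketch of the define/declare pairing this commit relies on:

// In exactly one .cc file: the single definition of the flag.
#include <gflags/gflags.h>
DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size");

// In any other translation unit: declare it, then read or write FLAGS_...
DECLARE_int32(tensorrt_max_batch_size);
void UseFlag() { FLAGS_tensorrt_max_batch_size = 4; }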

paddle/fluid/operators/tensorrt_engine_op.h

Lines changed: 15 additions & 6 deletions
@@ -28,6 +28,8 @@
 namespace paddle {
 
 DECLARE_int32(tensorrt_engine_batch_size);
+DECLARE_int32(tensorrt_max_batch_size);
+DECLARE_int32(tensorrt_workspace_size);
 
 namespace operators {
 
@@ -54,8 +56,10 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
                     "TensorRT' tensor input requires at least 2 dimensions");
   PADDLE_ENFORCE_LE(shape.size(), 4UL,
                     "TensorRT' tensor input requires at most 4 dimensions");
-  PADDLE_ENFORCE_EQ(shape.size(), 4UL);
-  return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
+  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
+  if (shape.size() == 4UL)
+    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
+  return nvinfer1::DimsCHW(shape[1], 1, 1);
 }
 
 }  // namespace
@@ -95,7 +99,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
     auto input_names = context.op().Inputs("Xs");
     PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
     PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
-                      context.Attr<int>("max_batch"));
+                      FLAGS_tensorrt_max_batch_size);
 
     std::vector<std::string> output_maps =
         context.Attr<std::vector<std::string>>("output_name_mapping");
@@ -132,7 +136,12 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
       auto dims = trt_t->getDimensions();
       // Use the output ITensor's dims to reshape the Fluid Tensor.
-      std::vector<int> ddim(dims.d, dims.d + dims.nbDims);
+      // The ITensor doesn't contain the batch size dim.
+      std::vector<int> ddim;
+      ddim.push_back(FLAGS_tensorrt_engine_batch_size);
+      for (int i = 0; i < dims.nbDims; i++) {
+        ddim.push_back(dims.d[i]);
+      }
 
       auto* fluid_v = context.scope().FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
@@ -168,8 +177,8 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
     // Get the ProgramDesc and pass to convert.
     framework::proto::BlockDesc block_desc;
     block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
-    int max_batch = context.Attr<int>("max_batch");
-    auto max_workspace = context.Attr<int>("max_workspace");
+    int max_batch = FLAGS_tensorrt_max_batch_size;
+    auto max_workspace = FLAGS_tensorrt_workspace_size;
     auto params = context.Attr<std::vector<std::string>>("parameters");
     std::unordered_set<std::string> parameters;
     for (const auto& param : params) {
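
Two behavioral changes here are worth spelling out: Vec2TRT_Dims now also accepts rank-2 inputs, and the output reshape prepends the batch dimension because a TensorRT ITensor's dims exclude it. A standalone sketch of the relaxed dim mapping (a plain struct stands in for nvinfer1::DimsCHW):

#include <cassert>
#include <cstdint>
#include <vector>

struct DimsCHW { int64_t c, h, w; };  // stand-in for nvinfer1::DimsCHW

// Mirrors the new logic: the leading batch dim is dropped; a 4-D shape
// [N, C, H, W] becomes CHW(C, H, W), and a 2-D shape [N, C] (e.g. a
// mul/fc output) becomes CHW(C, 1, 1).
DimsCHW Vec2TRT_Dims(const std::vector<int64_t>& shape) {
  assert(shape.size() == 4 || shape.size() == 2);
  if (shape.size() == 4) return {shape[1], shape[2], shape[3]};
  return {shape[1], 1, 1};
}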

paddle/fluid/operators/tensorrt_engine_op_test.cc

Lines changed: 5 additions & 2 deletions
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/tensorrt_engine_op.h"
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -57,6 +58,8 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
 using inference::analysis::SetAttr;
 
 TEST(TensorRTEngineOp, manual) {
+  FLAGS_tensorrt_engine_batch_size = 2;
+  FLAGS_tensorrt_max_batch_size = 2;
   framework::ProgramDesc program;
   auto* block_ = program.Proto()->add_blocks();
   block_->set_idx(0);
@@ -98,8 +101,6 @@ TEST(TensorRTEngineOp, manual) {
   engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
   SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                        block_->SerializeAsString());
-  SetAttr<int>(engine_op_desc.Proto(), "max_batch", 100);
-  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 1 << 10);
   SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
   SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
                                     std::vector<std::string>({}));
@@ -128,6 +129,8 @@ TEST(TensorRTEngineOp, manual) {
 }
 
 void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
+  FLAGS_tensorrt_engine_batch_size = batch_size;
+  FLAGS_tensorrt_max_batch_size = batch_size;
   framework::ProgramDesc program;
   framework::Scope scope;
   platform::CUDAPlace place;
