Update shape range setting method (#1599)

LiuChiachi · web-flow · commit 27d87b6cafaa · 2022-01-17T19:19:50.000+08:00
* update shape range setting method

* fix readme

* fix readme

* update readme
diff --git a/examples/model_compression/pp-minilm/README.md b/examples/model_compression/pp-minilm/README.md
@@ -95,7 +95,6 @@ PP-MiniLM 压缩方案以面向预训练模型的任务无关知识蒸馏(Task-a
 ├── inference                    # 预测目录
 │ └── infer.py                   # 预测脚本
 │ └── infer_all.sh               # 批量预测量化模型启动脚本
-│ └── infer_perf.py              # 量化模型性能测试脚本
 │ └── infer_perf.sh              # 量化模型性能测试启动脚本
 ├── data.py                      # 数据处理脚本
 ├── pp-minilm.png                # PP-MiniLM 方案流程图
@@ -337,7 +336,7 @@ cd ..
 
 #### 运行方式
 
-这里使用了动态 shape 功能，因此需要设置获取 shape 的范围。Paddle Inference 提供了相应的接口，即首先通过离线输入数据来统计出所有临时 tensor 的 shape 范围，TRT 子图的 tensor 输入 shape 范围可直接根据上一步 tune 出来的结果来设置，即可完成自动 shape 范围设置。统计完成后，只需设置统计结果路径，即可启用 `tuned_dynamic_shape` 功能。在本案例中，只需要先设置 `--collect_shape` 参数，运行 `infer.py`，然后再取消传入这个参数，再次运行 `infer.py`。例如：
+这里使用了动态 shape 功能，因此需要设置 TensorRT 子图输入 shape 的范围。用户需要事先根据自己的模型结构和数据 shape 的范围，设置 TensorRT 子图输入 shape 的最小值、最大值以及最优值，其中最优值可以按照数据分布选择最常见的 shape 来设置。动态 shape 的设置可以参考[官方文档](https://paddleinference.paddlepaddle.org.cn/optimize/paddle_trt.html#dynamic-shape)中的教程，以及本案例中 infer.py 脚本的第 160 行 - 206 行。
 
 INT8 预测运行脚本：
 
@@ -347,8 +346,7 @@ cd inference
 export task=tnews
 export algo=mse
 export bs=4
-python infer.py --task_name ${task}  --model_path  ../quantization/${task}_quant_models/${algo}${bs}/int8  --int8 --use_trt --collect_shape # 生成shape range info文件
-python infer.py --task_name ${task}  --model_path  ../quantization/${task}_quant_models/${algo}${bs}/int8  --int8 --use_trt # load shape range info文件进行预测
+python infer.py --task_name ${task}  --model_path  ../quantization/${task}_quant_models/${algo}${bs}/int8  --int8 --use_trt
 ```
 如果想要批量对量化模型进行预测并输出不同量化策略产出模型的精度，可以使用如下的脚本批量预测：
 
@@ -359,7 +357,6 @@ sh infer_all.sh
 FP32 预测运行脚本：
 
 ```shell
-python infer.py --task_name ${task}  --model_path  $MODEL_PATH  --use_trt --collect_shape
 python infer.py --task_name ${task}  --model_path  $MODEL_PATH --use_trt
 ```
 
@@ -378,7 +375,7 @@ cd ..
 ```
 
 下表后三行分别是微调后的模型、裁剪后的模型、量化后模型的总耗时情况。
-取 5 个非 `--collect_shape` 阶段打印出的时长取平均，可以发现借助 PaddleSlim 裁剪、量化后的模型是原 BERT<sub>base</sub>模型推理速度的 9.3 倍，其中裁剪后的模型是 BERT<sub>base</sub>推理速度的 2.6 倍。
+取 5 次测试耗时的平均值，可以发现借助 PaddleSlim 裁剪、量化后的模型是原 BERT<sub>base</sub>模型推理速度的 9.3 倍，其中裁剪后的模型是 BERT<sub>base</sub>推理速度的 2.6 倍。
 
 |                         | 平均耗时(s) | 加速比 |
 | ----------------------- | ----------- | ------ |
diff --git a/examples/model_compression/pp-minilm/inference/infer.py b/examples/model_compression/pp-minilm/inference/infer.py
@@ -156,16 +156,54 @@ def create_predictor(cls, args):
                     use_calib_mode=False)
             print("Enable TensorRT is: {}".format(
                 config.tensorrt_engine_enabled()))
-            if args.collect_shape:
-                config.collect_shape_range_info(
-                    os.path.join(
-                        os.path.dirname(args.model_path), args.task_name +
-                        '_shape_range_info.pbtxt'))
+
+            # Set min/max/opt tensor shape of each trt subgraph input.
+            if args.int8:
+                min_batch_size, max_batch_size, opt_batch_size = 16, 32, 32
+                min_seq_len, max_seq_len, opt_seq_len = 31, 128, 32
+
+                min_input_shape = {
+                    "faster_tokenizer_2.tmp_0": [min_batch_size, min_seq_len],
+                    "faster_tokenizer_2.tmp_1": [min_batch_size, min_seq_len],
+                    "tmp_4": [min_batch_size, min_seq_len],
+                    "unsqueeze2_0.tmp_0": [min_batch_size, 1, 1, min_seq_len],
+                }
+                max_input_shape = {
+                    "faster_tokenizer_2.tmp_0": [max_batch_size, max_seq_len],
+                    "faster_tokenizer_2.tmp_1": [max_batch_size, max_seq_len],
+                    "tmp_4": [max_batch_size, max_seq_len],
+                    "unsqueeze2_0.tmp_0": [max_batch_size, 1, 1, max_seq_len],
+                }
+                opt_input_shape = {
+                    "faster_tokenizer_2.tmp_0": [opt_batch_size, opt_seq_len],
+                    "faster_tokenizer_2.tmp_1": [opt_batch_size, opt_seq_len],
+                    "tmp_4": [opt_batch_size, opt_seq_len],
+                    "unsqueeze2_0.tmp_0": [opt_batch_size, 1, 1, opt_seq_len],
+                }
             else:
-                config.enable_tuned_tensorrt_dynamic_shape(
-                    os.path.join(
-                        os.path.dirname(args.model_path),
-                        args.task_name + "_shape_range_info.pbtxt"), True)
+                min_batch_size, max_batch_size, opt_batch_size = 16, 32, 32
+                min_seq_len, max_seq_len, opt_seq_len = 31, 128, 32
+
+                min_input_shape = {
+                    "faster_tokenizer_1.tmp_0": [min_batch_size, min_seq_len],
+                    "faster_tokenizer_1.tmp_1": [min_batch_size, min_seq_len],
+                    "tmp_4": [min_batch_size, min_seq_len],
+                    "unsqueeze2_0.tmp_0": [min_batch_size, 1, 1, min_seq_len],
+                }
+                max_input_shape = {
+                    "faster_tokenizer_1.tmp_0": [max_batch_size, max_seq_len],
+                    "faster_tokenizer_1.tmp_1": [max_batch_size, max_seq_len],
+                    "tmp_4": [max_batch_size, max_seq_len],
+                    "unsqueeze2_0.tmp_0": [max_batch_size, 1, 1, max_seq_len],
+                }
+                opt_input_shape = {
+                    "faster_tokenizer_1.tmp_0": [opt_batch_size, opt_seq_len],
+                    "faster_tokenizer_1.tmp_1": [opt_batch_size, opt_seq_len],
+                    "tmp_4": [opt_batch_size, opt_seq_len],
+                    "unsqueeze2_0.tmp_0": [opt_batch_size, 1, 1, opt_seq_len],
+                }
+            config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape,
+                                              opt_input_shape)
 
         predictor = paddle.inference.create_predictor(config)
 
diff --git a/examples/model_compression/pp-minilm/inference/infer_all.sh b/examples/model_compression/pp-minilm/inference/infer_all.sh
@@ -19,7 +19,6 @@ do
     do
         for algo in abs_max avg hist mse
         do
-            python infer.py --task_name ${task}  --model_path  ../quantization/${task}_quant_models/${algo}${bs}/int8  --int8 --use_trt --collect_shape
             python infer.py --task_name ${task}  --model_path  ../quantization/${task}_quant_models/${algo}${bs}/int8  --int8 --use_trt
             echo this is ${task}, ${algo}, ${bs}
         done
diff --git a/examples/model_compression/pp-minilm/inference/infer_perf.sh b/examples/model_compression/pp-minilm/inference/infer_perf.sh
@@ -13,9 +13,6 @@
 # limitations under the License.
 
 export task=TNEWS
-python infer.py  --task_name ${task} --model_path  ../finetuning/ppminilm-6l-768h/models/${task}/1e-4_64/inference  --use_trt  --collect_shape --perf
-python infer.py --task_name ${task} --model_path ../pruning/pruned_models/${task}/0.75/sub_static/float  --use_trt --collect_shape --perf
-python  infer.py  --task_name ${task} --model_path  ../quantization/${task}_quant_models/mse4/int8  --int8 --use_trt  --collect_shape --perf
 
 echo Inference of orgin FP32 model
 for ((i=0;i<=4;i++));