Commit 1a9376a

zzjjay and minghaoBD authored
[LatencyPredictor] Add new op and speed up prediction (PaddlePaddle#1014)
* Add new op type and support fp16 model
* Preload predictors' model and speed up prediction
* Modify the save path of TMP files

Co-authored-by: minghaoBD <[email protected]>
1 parent 23cc74d commit 1a9376a

4 files changed: +59 -29 lines

paddleslim/analysis/_utils.py

Lines changed: 6 additions & 5 deletions

@@ -18,7 +18,7 @@
 import paddle
 import paddleslim
 import subprocess
-import sklearn
+import time
 __all__ = [
     "save_cls_model", "save_det_model", "save_seg_model", "nearest_interpolate",
     "opt_model", "load_predictor"
@@ -29,10 +29,11 @@ def opt_model(opt="paddle_lite_opt",
               model_file='',
               param_file='',
               optimize_out_type='protobuf',
-              valid_targets='arm'):
+              valid_targets='arm',
+              enable_fp16=False):
     assert os.path.exists(model_file) and os.path.exists(
         param_file), f'{model_file} or {param_file} does not exist.'
-    save_dir = f'./opt_models_tmp/{os.getpid()}'
+    save_dir = f'./opt_models_tmp/{os.getpid()}_{time.time()}'
     if not os.path.exists(save_dir):
         os.makedirs(save_dir)

@@ -41,8 +42,8 @@ def opt_model(opt="paddle_lite_opt",
         model_out = os.path.join(save_dir, 'pbmodel')
     else:
         model_out = os.path.join(save_dir, 'model')
-
-    cmd = f'{opt} --model_file={model_file} --param_file={param_file} --optimize_out_type={optimize_out_type} --optimize_out={model_out} --valid_targets={valid_targets}'
+    enable_fp16 = str(enable_fp16).lower()
+    cmd = f'{opt} --model_file={model_file} --param_file={param_file} --optimize_out_type={optimize_out_type} --optimize_out={model_out} --valid_targets={valid_targets} --enable_fp16={enable_fp16}'
     print(f'commands:{cmd}')
     m = subprocess.Popen(
         cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
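
With this change, opt_model can emit an fp16 Paddle-Lite model, and each run writes to a unique temporary directory so concurrent calls no longer collide on the save path. A minimal usage sketch, assuming a hypothetical exported inference model:

from paddleslim.analysis._utils import opt_model

# Hypothetical model paths; any *.pdmodel / *.pdiparams pair works.
pbmodel_file = opt_model(
    model_file='./inference_model/model.pdmodel',
    param_file='./inference_model/model.pdiparams',
    optimize_out_type='protobuf',
    enable_fp16=True)  # forwarded to paddle_lite_opt as --enable_fp16=true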

paddleslim/analysis/extract_features.py

Lines changed: 8 additions & 3 deletions

@@ -52,8 +52,13 @@ def get_features_from_paramkey(param_key, op_type, data_type):
     features = None

     if 'conv2d' in op_type:
-        flag_quant = 'quant=None' if data_type == 'fp32' else 'quant=True'
-        if flag_quant not in param_key:
+        if data_type == 'fp16':
+            quant_bits = 'bit_length=16'
+        elif data_type == 'int8':
+            quant_bits = 'bit_length=8'
+        else:
+            quant_bits = 'bit_length=None'
+        if quant_bits not in param_key:
             return None

         weight = re.search(r'weight=(\(\d*, \d*, \d*, \d*\))',
@@ -178,7 +183,7 @@ def get_features_from_paramkey(param_key, op_type, data_type):
           'leaky_relu' in op_type or 'tanh' in op_type or 'swish' in op_type or
           'softmax' in op_type or 'hard_sigmoid' in op_type or
           'sigmoid' in op_type or 'gelu' in op_type or 'clip' in op_type or
-          'shape' in op_type or 'interp_v2' in op_type):
+          'shape' in op_type or 'interp_v2' in op_type or 'sqrt' in op_type):

         inputs = re.search(r'in=(\((-?\d+,* *)+\))',
                            param_key).group().split('=')[-1].strip(
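
The conv2d filter now keys on bit_length rather than the quant flag: with the parse_ops.py change below, fp16 conv2d keys carry quant=True bit_length=16 while int8 keys carry quant=True bit_length=8, so the old quant=True test could not tell them apart. A small sketch of the new selection against a made-up param_key:

# Mirrors the data_type -> marker mapping introduced above.
def quant_marker(data_type):
    if data_type == 'fp16':
        return 'bit_length=16'
    elif data_type == 'int8':
        return 'bit_length=8'
    return 'bit_length=None'

# Made-up fp16 conv2d key: matched for fp16, rejected for fp32/int8.
sample_key = ('conv2d in=(1, 3, 224, 224) weight=(32, 3, 3, 3) '
              'out=(1, 32, 112, 112) pad=1 stride=2 group=1 dilation=1 '
              'quant=True bit_length=16')
assert quant_marker('fp16') in sample_key
assert quant_marker('int8') not in sample_key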

paddleslim/analysis/latency_predictor.py

Lines changed: 32 additions & 11 deletions

@@ -16,7 +16,7 @@

 import os
 import pickle
-import time
+import shutil
 import subprocess
 from .parse_ops import get_key_from_op
 from .extract_features import get_data_from_tables, get_features_from_paramkey
@@ -71,15 +71,16 @@ def __init__(self, table_file='SD710'):
         self.hardware = None
         self.threads = None
         self.predictor_state = False
+        self.predictor = {}
         self._initial_table()

     def _initial_table(self):
         if self.table_file in ['SD625', 'SD710', 'SD845', 'SD865']:
             self.hardware = self.table_file
-            if self.hardware in ['SD625', 'SD710']:
-                self.predictor_state = True
             self.threads = 4
             self.table_file = f'{self.hardware}_threads_4_power_mode_0.pkl'
+            if self.hardware in ['SD625', 'SD710']:
+                self.predictor_state = True
         if not os.path.exists(self.table_file):
             subprocess.call(
                 f'wget https://paddlemodels.bj.bcebos.com/PaddleSlim/analysis/{self.table_file}',
@@ -115,6 +116,19 @@ def _get_input_shape(self, graph):
                 break
         return in_shape

+    def _preload_predictor(self, data_type='fp32'):
+        op_types = [
+            'depthwise_conv2d', 'conv2d', 'pool2d', 'matmul', 'elementwise_add',
+            'elementwise_mul', 'concat', 'calib', 'swish'
+        ]
+        op_dir = self.table_file.split('.')[0] + '_batchsize_1'
+        for op_type in op_types:
+            model = load_predictor(op_type, op_dir, data_type)
+            key = op_type
+            if 'conv2d' in op_type:
+                key = f'{op_type}_{data_type}'
+            self.predictor[key] = model
+
     def predict(self,
                 model_file,
                 param_file,
@@ -125,22 +139,27 @@ def predict(self,

         Args:
             model_file(str), param_file(str): The inference model(*.pdmodel, *.pdiparams).
-            data_type(str): Data type, fp32 or int8. Default : fp32
+            data_type(str): Data type, fp32, fp16 or int8.
             threads(int): threads num
             input_shape(list): Generally, the input shape is confirmed when saving the inference model and the parameter is only effective for input shape that has variable length.
         Returns:
             latency(float): The latency of the model.
         """
-        assert data_type in ['fp32', 'int8'
-                             ], f'data_type must be one of [fp32, int8]'
+        assert data_type in ['fp32', 'int8', 'fp16'
+                             ], f'data_type must be one of [fp32, int8, fp16]'

         if self.hardware and self.threads != threads:
             self._change_table(threads)

+        if self.predictor_state and f'conv2d_{data_type}' not in self.predictor:
+            self._preload_predictor(data_type)
+
+        enable_fp16 = True if data_type == 'fp16' else False
         pbmodel_file = opt_model(
             model_file=model_file,
             param_file=param_file,
-            optimize_out_type='protobuf', )
+            optimize_out_type='protobuf',
+            enable_fp16=enable_fp16)

         paddle.enable_static()
         with open(pbmodel_file, "rb") as f:
@@ -176,7 +195,7 @@ def predict(self,
             warnings.warn("OperatorType\tCalledTimes")
             for key in new_op:
                 warnings.warn(f"{key.ljust(15)}\t{new_op[key]}")
-
+        shutil.rmtree(os.path.dirname(pbmodel_file))
         return latency

     def op_predictor(self, op_type, param_key, data_type):
@@ -185,18 +204,20 @@ def op_predictor(self, op_type, param_key, data_type):
         Args:
             op_type: The operator's type
             param_key: The operator's parameter information.
-            data_type: Data type, fp32 or int8. Default : int8
+            data_type: Data type, fp32 or int8.
         Returns:
             latency(float): The latency of the operator.
         """

         latency = 0.0
-        op_dir = self.table_file.split('.')[0] + '_batchsize_1'
         if op_type in [
                 'depthwise_conv2d', 'conv2d', 'pool2d', 'matmul',
                 'elementwise_add', 'elementwise_mul', 'concat', 'calib', 'swish'
         ]:
-            predictor = load_predictor(op_type, op_dir, data_type)
+            key = op_type
+            if 'conv2d' in op_type:
+                key = f'{op_type}_{data_type}'
+            predictor = self.predictor[key]
             features = get_features_from_paramkey(param_key, op_type, data_type)
             latency = predictor.predict([features])
         else:
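
Taken together, the per-op predictor models are now loaded once per data type and reused across predict() calls, and the temporary optimized model directory is removed after each run. A minimal sketch of the new fp16 path, assuming hypothetical model paths:

from paddleslim.analysis import TableLatencyPredictor

predictor = TableLatencyPredictor(table_file='SD710')

# The first fp16 call triggers _preload_predictor('fp16'); subsequent
# fp16 calls reuse the cached per-op models instead of reloading them.
latency = predictor.predict(
    model_file='./inference_model/model.pdmodel',
    param_file='./inference_model/model.pdiparams',
    data_type='fp16')
print(f'predicted latency: {latency}')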

paddleslim/analysis/parse_ops.py

Lines changed: 13 additions & 10 deletions

@@ -24,25 +24,30 @@ def get_key_from_op(op):
     if 'conv2d' in op_type:
         out_shape = op.all_outputs()[0].shape()
         in_shape = op.all_inputs()[-1].shape()
+        in_name = op.all_inputs()[1].name()
         weight_shape = op.all_inputs()[-2].shape()
-        kernel = weight_shape[2]
+        weight_shape = (out_shape[1], weight_shape[1], weight_shape[2], weight_shape[3])
+
         stride = op.attr('strides')[1]
         padding = op.attr('paddings')[1]
         groups = op.attr('groups')
         dilation = op.attr('dilations')[1]
-        int8 = op.attr('enable_int8')
+        quant = op.attr('enable_int8')
         bit_length = op.attr('bit_length')
+        if op.attr(in_name + '_fp16') == 'fp16':
+            quant = True
+            bit_length = 16

-        param_key = f'{op_type} in={in_shape} weight={weight_shape} out={out_shape} pad={padding} stride={stride} group={groups} dilation={dilation} quant={int8} bit_length={bit_length}'
+        param_key = f'{op_type} in={in_shape} weight={weight_shape} out={out_shape} pad={padding} stride={stride} group={groups} dilation={dilation} quant={quant} bit_length={bit_length}'

     elif op_type == 'matmul' or op_type == 'matmul_v2':
         X = op.all_inputs()[0].shape()
         Y = op.all_inputs()[1].shape()
         out_shape = op.all_outputs()[0].shape()
-        int8 = op.attr('enable_int8')
+        quant = op.attr('enable_int8')
         bit_length = op.attr('bit_length')

-        param_key = f'{op_type} X={X} Y={Y} out={out_shape} quant={int8} bit_length={bit_length}'
+        param_key = f'{op_type} X={X} Y={Y} out={out_shape} quant={quant} bit_length={bit_length}'

     elif 'batch_norm' in op_type or 'layer_norm' in op_type:
         out_shape = op.all_outputs()[-1].shape()
@@ -67,14 +72,12 @@ def get_key_from_op(op):

     elif op_type in [
             'hard_swish', 'relu', 'leaky_relu', 'tanh', 'swish', 'softmax',
-            'hard_sigmoid', 'sigmoid', 'gelu', 'clip', 'shape'
+            'hard_sigmoid', 'sigmoid', 'gelu', 'clip', 'shape', 'sqrt'
     ] or 'transpose' in op_type or 'interp_v2' in op_type:
         in_shape = op.all_inputs()[-1].shape()
+        out_shape = op.all_outputs()[0].shape()

-        param_key = f'{op_type} in={in_shape}'
-        in_shape = op.all_inputs()[-1].shape()
-
-        param_key = f'{op_type} in={in_shape}'
+        param_key = f'{op_type} in={in_shape} out={out_shape}'

     elif op_type in ['fill_constant', 'range', 'cast'] or 'expand' in op_type:
