
Commit 24b3bbd

yghstill and RachelXu7 authored
[cherry-pick] fix QuantizeLinear pass and support reduce_max in quantization (#44872)
* fix QuantizeLinear kernel and pass in QAT (#44784)
* Add Reduce Max in Quant (#44825)

Co-authored-by: Chang Xu <[email protected]>
1 parent 245005d commit 24b3bbd

7 files changed: 148 additions, 56 deletions


paddle/fluid/operators/fake_quantize_op.h

Lines changed: 2 additions & 1 deletion
@@ -139,7 +139,8 @@ struct FindMovingAverageAbsMaxFunctor {
   void operator()(const DeviceContext &ctx,
                   const framework::Tensor &in_accum,
                   const framework::Tensor &in_state,
-                  const framework::Tensor &cur_scale,
+                  const T *cur_scale,
+                  const float rate,
                   framework::Tensor *out_state,
                   framework::Tensor *out_accum,
                   framework::Tensor *out_scale);

paddle/fluid/operators/quantize_linear_op.cc

Lines changed: 26 additions & 3 deletions
@@ -93,6 +93,12 @@ class QuantizeLinearOp : public framework::OperatorWithKernel {
         ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[quant_axis]});
       }
     }
+    if (ctx->HasOutput("OutState")) {
+      ctx->SetOutputDim("OutState", {1});
+    }
+    if (ctx->HasOutput("OutAccum")) {
+      ctx->SetOutputDim("OutAccum", {1});
+    }
     ctx->ShareLoD("X", /*->*/ "Y");
   }

@@ -113,7 +119,25 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Y",
               "(Tensor) Output of quantized low level tensor, "
               "but also saved as float data type.");
-    AddOutput("OutScale", "(Tensor) Current scale").AsDispensable().AsExtra();
+    AddInput("InAccum", "Last accum.")
+        .AsDispensable()
+        .AsExtra();  // only qat use
+    AddInput("InState", "Last state.")
+        .AsDispensable()
+        .AsExtra();  // only qat use
+    AddOutput("OutState", "(Tensor) state buffer.")
+        .AsDispensable()
+        .AsExtra();  // only qat use
+    AddOutput("OutAccum", "(Tensor) accum buffer.")
+        .AsDispensable()
+        .AsExtra();  // only qat use
+    AddOutput("OutScale", "(Tensor) Current scale")
+        .AsDispensable()
+        .AsExtra();  // only qat use
+    AddAttr<float>("moving_rate",
+                   "(float, default 0.9) moving rate.")  // only qat use
+        .SetDefault(0.9)
+        .AsExtra();
     AddAttr<int>("quant_axis",
                  "(int, default 0) The axis for quantization. "
                  "For conv2d, depthwise_conv2d, conv2d_transpose "
@@ -154,8 +178,7 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker {
                       "nearest ties to even and 1 is rounding to nearest "
                       "ties away from zero.but the received is %d",
                       round_type));
-        })
-        .AsExtra();
+        });
     AddAttr<bool>("is_test",
                   "(bool, default false) Set to true for inference only, false "
                   "for training. Some layers may run faster when this is true.")

paddle/fluid/operators/quantize_linear_op.h

Lines changed: 24 additions & 3 deletions
@@ -56,10 +56,31 @@ class QuantizeLinearKernel : public framework::OpKernel<T> {

     if (quant_axis < 0) {
       if (!is_test) {
-        auto* out_scale = context.Output<framework::Tensor>("OutScale");
-        T* out_s = out_scale->mutable_data<T>(context.GetPlace());
+        // training
+        auto* in_accum = context.Input<framework::Tensor>("InAccum");
+        auto* in_state = context.Input<framework::Tensor>("InState");
+        auto cur_scale = memory::Alloc(dev_ctx, sizeof(T));
+        T* cur_scale_data = static_cast<T*>(cur_scale->ptr());
+
         FindAbsMaxFunctor<DeviceContext, T>()(
-            dev_ctx, in->data<T>(), in->numel(), out_s);
+            dev_ctx, in->data<T>(), in->numel(), cur_scale_data);
+
+        auto* out_state = context.Output<framework::Tensor>("OutState");
+        auto* out_accum = context.Output<framework::Tensor>("OutAccum");
+        auto* out_scale = context.Output<framework::Tensor>("OutScale");
+        out_state->mutable_data<T>(context.GetPlace());
+        out_accum->mutable_data<T>(context.GetPlace());
+        out_scale->mutable_data<T>(context.GetPlace());
+        float moving_rate = context.Attr<float>("moving_rate");
+
+        FindMovingAverageAbsMaxFunctor<DeviceContext, T>()(dev_ctx,
+                                                           *in_accum,
+                                                           *in_state,
+                                                           cur_scale_data,
+                                                           moving_rate,
+                                                           out_state,
+                                                           out_accum,
+                                                           out_scale);
         ClipAndFakeQuantFunctor<DeviceContext, T>()(
             dev_ctx, *in, *out_scale, bin_cnt, round_type, out);
       } else {
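
For reference, the arithmetic this training branch now performs can be sketched in NumPy. It is a minimal illustration of the FindAbsMax plus FindMovingAverageAbsMax combination wired up above, matching the expected values computed in the updated unit test further down; the function name and signature are illustrative, not Paddle API.

import numpy as np

def moving_average_abs_max_scale(x, in_accum, in_state, moving_rate=0.9):
    # FindAbsMaxFunctor: absolute max of the tensor being quantized.
    cur_abs_max = np.max(np.abs(x))
    # FindMovingAverageAbsMaxFunctor: decayed accumulation of abs-max values
    # and of the step count; their ratio is the moving-average scale.
    out_accum = moving_rate * in_accum + cur_abs_max
    out_state = moving_rate * in_state + 1.0
    out_scale = out_accum / out_state
    return out_accum, out_state, out_scale

# The state/accum buffers start at ones, matching the initialization done by
# the quantization pass below.
accum, state, scale = moving_average_abs_max_scale(
    np.random.randn(31, 65).astype("float32"), np.ones(1), np.ones(1))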

python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py

Lines changed: 12 additions & 6 deletions
@@ -418,8 +418,7 @@ def quantize(self):
         self._update_program()

         # save out_threshold for quantized ops.
-        if not self._onnx_format:
-            self._save_output_threshold()
+        self._save_output_threshold()

         if any(op_type in self._quantizable_op_type
                for op_type in self._dynamic_quantize_op_type):
@@ -996,16 +995,23 @@ def _save_output_threshold(self):
         '''
         Save output threshold to the quantized op.
         '''
+        self._calibration_scales = {}

         def save_info(op_node, out_var_name, threshold_map, out_info_name,
                       quantized_type):
             assert out_var_name in threshold_map, \
                 "The output ({}) of {} node does not have threshold.".format(
                 out_var_name, op_node.type)
-            op_node._set_attr(out_info_name, threshold_map[var_name])
-            op_node._set_attr("with_quant_attr", True)
-            if op_node.type in self._quantizable_op_type:
-                op._set_attr("quantization_type", quantized_type)
+            if self._onnx_format:
+                # For easy extension, every var_node set a dict to save parameters of quant.
+                self._calibration_scales[var_name] = {}
+                self._calibration_scales[var_name]['scale'] = threshold_map[
+                    var_name]
+            else:
+                op_node._set_attr(out_info_name, threshold_map[var_name])
+                op_node._set_attr("with_quant_attr", True)
+                if op_node.type in self._quantizable_op_type:
+                    op._set_attr("quantization_type", quantized_type)

         def analysis_and_save_info(op_node, out_var_name):
             argname_index = utils._get_output_name_index(op_node, out_var_name)
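
In short, save_info now branches on the export format: with the ONNX/new format the threshold is kept in the in-memory _calibration_scales dict keyed by variable name, otherwise it is still written onto the op as attributes. Below is a hedged sketch of that logic; the attribute name 'out_threshold' and the variable name are illustrative, and in the real pass they come from analysis_and_save_info.

def save_threshold(calibration_scales, op_attrs, var_name, threshold, onnx_format):
    # Simplified stand-in for save_info() above.
    if onnx_format:
        # new format: keep a per-variable dict of quant parameters
        calibration_scales[var_name] = {'scale': threshold}
    else:
        # old behaviour: write the threshold onto the op as attributes
        op_attrs['out_threshold'] = threshold
        op_attrs['with_quant_attr'] = True

scales, attrs = {}, {}
save_threshold(scales, attrs, 'conv2d_0.tmp_0', 6.218, onnx_format=True)
# scales == {'conv2d_0.tmp_0': {'scale': 6.218}}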

python/paddle/fluid/contrib/slim/quantization/quantization_pass.py

Lines changed: 52 additions & 36 deletions
@@ -1785,6 +1785,7 @@ class InsertQuantizeLinear(object):
         equal to 0, it will quantization with per channel, else quantization with per layer.
         Default is -1.
         channel_wise(bool, optional): Whether quantization with per channel or not. Default is False.
+        moving_rate(float): the rate for 'moving average' method.
         is_test(bool, optional): Whether quantization with training or not. Default is True.
     """

@@ -1794,22 +1795,24 @@ def __init__(self,
                  quant_bits=8,
                  quant_axis=-1,
                  channel_wise=False,
+                 moving_rate=0.9,
                  is_test=True):
         self._place = place
         self._scope = scope
         self.quant_bits = quant_bits
         self.quant_axis = quant_axis
         self.channel_wise = channel_wise
         self._is_test = is_test
+        self._moving_rate = moving_rate

-    def insert_quant_op(self, graph, var_node):
+    def insert_quant_op(self, graph, var_node, var_name=None):
         assert var_node.is_var(), '{} is not a var'.format(var_node.name())
-
-        quant_var_node = graph.create_var_node(name=self._quantized_var_name(
-            var_node.name()),
-                                               var_type=var_node.type(),
-                                               shape=var_node.shape(),
-                                               var_dtype=var_node.dtype())
+        var_name = var_node.name() if not var_name else var_name
+        quant_var_node = graph.create_var_node(
+            name=self._quantized_var_name(var_name),
+            var_type=var_node.type(),
+            shape=var_node.shape(),
+            var_dtype=var_node.dtype())
         data_type = 'float64' if var_node.dtype(
         ) == core.VarDesc.VarType.FP64 else 'float32'
         if self.channel_wise:
@@ -1821,7 +1824,7 @@ def insert_quant_op(self, graph, var_node):
             scale_var_type = var_node.type()
         init_scale_value = np.array([_SCALE_DEFAULT_VALUE], dtype=data_type)
         scale_var_node = graph.create_persistable_node(
-            name=self._quantized_scale_name(var_node.name()),
+            name=self._quantized_scale_name(var_name),
             var_type=scale_var_type,
             shape=[scale_var_shape],
             var_dtype=var_node.dtype())
@@ -1844,13 +1847,39 @@ def insert_quant_op(self, graph, var_node):
             inputs["ZeroPoint"] = zero_point_node

         attrs = {"quant_axis": self.quant_axis, "bit_length": self.quant_bits}
+        attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
         outputs = {"Y": quant_var_node}
         if not self._is_test:
-            attrs["is_test"] = self._is_test
-            attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
             scale_out_node = graph.create_var_node_from_desc(
                 scale_var_node.var())
+            state_in_node = graph.create_persistable_node(
+                name=unique_name.generate('state'),
+                var_type=core.VarDesc.VarType.LOD_TENSOR,
+                var_dtype=var_node.dtype(),
+                shape=[1])
+            data_type = 'float64' if var_node.dtype(
+            ) == core.VarDesc.VarType.FP64 else 'float32'
+            _init_var_node(state_in_node, np.ones([1], dtype=data_type),
+                           self._scope, self._place)
+            accum_in_node = graph.create_persistable_node(
+                name=unique_name.generate('accum'),
+                var_type=core.VarDesc.VarType.LOD_TENSOR,
+                var_dtype=var_node.dtype(),
+                shape=[1])
+            _init_var_node(accum_in_node, np.ones([1], dtype=data_type),
+                           self._scope, self._place)
+            state_out_node = graph.create_var_node_from_desc(
+                state_in_node.var())
+            accum_out_node = graph.create_var_node_from_desc(
+                accum_in_node.var())
+
             outputs["OutScale"] = scale_out_node
+            inputs['InState'] = state_in_node
+            inputs['InAccum'] = accum_in_node
+            outputs['OutState'] = state_out_node
+            outputs['OutAccum'] = accum_out_node
+            attrs["is_test"] = self._is_test
+            attrs['moving_rate'] = self._moving_rate

         quant_op_node = graph.create_op_node(op_type="quantize_linear",
                                              attrs=attrs,
@@ -1863,6 +1892,10 @@ def insert_quant_op(self, graph, var_node):
         graph.link_to(zero_point_node, quant_op_node)
         graph.link_to(quant_op_node, quant_var_node)
         if not self._is_test:
+            graph.link_to(state_in_node, quant_op_node)
+            graph.link_to(accum_in_node, quant_op_node)
+            graph.link_to(quant_op_node, state_out_node)
+            graph.link_to(quant_op_node, accum_out_node)
             graph.link_to(quant_op_node, scale_out_node)
         return quant_var_node, scale_var_node

@@ -1891,8 +1924,7 @@ def insert_dequant_op(self, graph, var_node, scale_var_node):
             inputs["ZeroPoint"] = zero_point_node

         attrs = {"quant_axis": self.quant_axis, "bit_length": self.quant_bits}
-        if not self._is_test:
-            attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
+        attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward

         quant_op_node = graph.create_op_node(op_type="dequantize_linear",
                                              attrs=attrs,
@@ -1931,10 +1963,10 @@ def _zero_point_name(self, var_name):
         return "%s@zero_point" % (var_name)


-class QuantizationTransformPassV2(object):
+class QuantizationTransformPassV2(QuantizationTransformPass):
     """
     Quantize the ops that have weights. Add quant and dequant ops for
-    the quantized ops's inputs.
+    the quantized ops's inputs. It is used in the new format of quantization.
     """

     def __init__(self,
@@ -2130,13 +2162,13 @@ def _transform_forward(self, graph, op):
             if is_weight and self._weight_quantize_func is not None:
                 target_out_node = self._insert_func(
                     graph, self._weight_quantize_func, var_node, op)
-                processed_vars.append(name)
+                self.processed_vars.append(name)
                 continue
             elif not is_weight and self._act_quantize_func is not None:
                 target_out_node = self._insert_func(graph,
                                                     self._act_quantize_func,
                                                     var_node, op)
-                processed_vars.append(name)
+                self.processed_vars.append(name)
                 continue

             quant_bits = self._weight_bits if var_node.name() in self.persistable_vars \
@@ -2155,9 +2187,10 @@ def _transform_forward(self, graph, op):
                 quant_bits=quant_bits,
                 quant_axis=quant_axis,
                 channel_wise=channel_wise,
+                moving_rate=self._moving_rate,
                 is_test=self._is_test)
             quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op(
-                graph, var_node)
+                graph, var_node, var_name=name)
             dequant_var_node = insert_quant_pass.insert_dequant_op(
                 graph, quant_var_node, scale_var_node)

@@ -2182,24 +2215,6 @@ def _has_weight(self, op):
                 has_weight = True
         return has_weight

-    def _is_skip_quant(self, graph, op_node):
-        """
-        Analyse whether the op node skips quantization.
-        """
-        is_skip = False
-        if op_node.op().has_attr("skip_quant") and \
-            op_node.op().attr("skip_quant"):
-            is_skip = True
-        # if the inputs of mul and matmul are not all persistable, use
-        # AddQuantDequantPassV2 to quantize them.
-        if op_node.name() in ["mul", "matmul", "matmul_v2"] and \
-            _is_input_all_not_persistable(graph, op_node):
-            is_skip = True
-        if op_node.op().has_attr("quantization_type") and \
-            op_node.op().attr("quantization_type") == "qat_without_weight":
-            is_skip = True
-        return is_skip
-
     def apply(self, graph):
         """
         Quantize the graph for training process. According to weight and
@@ -2250,7 +2265,7 @@ def apply(self, graph):
 class AddQuantDequantPassV2(object):
     """
     Quantize the ops that do not have weights, and add quant_linear and dequant_linear
-    op for the quantized ops's inputs.
+    op for the quantized ops's inputs. It is used in the new format of quantization.
     """

     # To be compatible with PaddleSlim, not remove _activation_type for now
@@ -2377,6 +2392,7 @@ def apply(self, graph):
                     quant_bits=self._quant_bits,
                     quant_axis=-1,
                     channel_wise=False,
+                    moving_rate=self._moving_rate,
                     is_test=self._is_test)
                 quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op(
                     graph, in_node)
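
Putting the pass changes together: in training mode the quantize_linear op is now built with the moving-average buffers as extra inputs/outputs and with is_test and moving_rate attributes set. The dict below is a rough, hand-written picture of the op description that insert_quant_op() assembles; node names are placeholders, in the real pass these are IrGraph var nodes, and op_role is additionally set to OpRole.Forward.

quantize_linear_training_op = {
    'attrs': {
        'quant_axis': -1,
        'bit_length': 8,
        'is_test': False,
        'moving_rate': 0.9,
    },
    'inputs': {
        'X': 'relu_0.tmp_0',                    # activation being quantized
        'Scale': 'relu_0.tmp_0.scale',          # persistable scale var (illustrative name)
        'ZeroPoint': 'relu_0.tmp_0@zero_point',
        'InState': 'state_0',                   # persistable, shape [1], initialized to ones
        'InAccum': 'accum_0',                   # persistable, shape [1], initialized to ones
    },
    'outputs': {
        'Y': 'relu_0.tmp_0.quantized',
        'OutScale': 'relu_0.tmp_0.scale',
        'OutState': 'state_0',
        'OutAccum': 'accum_0',
    },
}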

python/paddle/fluid/contrib/slim/quantization/utils.py

Lines changed: 2 additions & 0 deletions
@@ -109,6 +109,7 @@
     "square",
     "softplus",
     "shuffle_channel",
+    "reduce_max",
 ]

 _out_scale_op_list = list(
@@ -213,6 +214,7 @@
     "square": [["X"], ["Out"]],
     "softplus": [["X"], ["Out"]],
     "shuffle_channel": [["X"], ["Out"]],
+    "reduce_max": [["X"], ["Out"]],
 }

python/paddle/fluid/tests/unittests/test_fake_quantize_op.py

Lines changed: 30 additions & 7 deletions
@@ -550,18 +550,41 @@ def set_args(self):
     def setUp(self):
         self.set_args()
         self.op_type = "quantize_linear"
-        x = np.random.randn(31, 65).astype(self.data_type)
-        yq, scale = quantize_max_abs(x, self.max_range)
-        scale = np.array(scale).astype(self.data_type)
-        zero_point = np.zeros(scale.shape, dtype="int32")
-
-        self.inputs = {'X': x, 'Scale': scale, 'ZeroPoint': zero_point}
         self.attrs = {
             'bit_length': self.bit_length,
             'quant_axis': self.quant_axis,
+            'moving_rate': 0.9,
             'is_test': self.is_test
         }
-        self.outputs = {'Y': yq, 'OutScale': scale}
+
+        x = np.random.randn(31, 65).astype(self.data_type)
+        scale = np.array([0.001]).astype(self.data_type)
+        zero_point = np.zeros(scale.shape, dtype="int32")
+        in_accum = np.ones(1).astype(self.data_type)
+        in_state = np.ones(1).astype(self.data_type)
+        out_accum = np.zeros(1).astype(self.data_type)
+        out_state = np.zeros(1).astype(self.data_type)
+        out_accum[0] = self.attrs['moving_rate'] * in_accum[0] + np.max(
+            np.abs(x))
+        out_state[0] = self.attrs['moving_rate'] * in_state[0] + 1.0
+        out_scale = out_accum / out_state
+
+        round_out = np.round(x / out_scale * self.max_range)
+        quant_data = np.clip(round_out, -self.max_range - 1, self.max_range)
+
+        self.inputs = {
+            'X': x,
+            'Scale': scale,
+            'ZeroPoint': zero_point,
+            'InAccum': in_accum,
+            'InState': in_state,
+        }
+        self.outputs = {
+            'Y': quant_data,
+            'OutScale': out_scale,
+            'OutAccum': out_accum,
+            'OutState': out_state,
+        }

     def test_check_output(self):
         self.check_output()
