
Commit 954b732

fix quant_dequant pass, refine code (#9652) (#9660)
1 parent 9a344f6 commit 954b732

7 files changed, +101 -128 lines changed
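The bulk of this change moves one-time XPU device probing (the deleted GetXPUDeviceType helper) and quant/precision configuration out of XPUStaticKernelPickPass::Apply and into the pass constructor, visible in the __xpu__static_kernel_pick_pass.h hunk below, caching the results in the members xpu_disable_flag_, local_quant_, and encode_precision_. A minimal standalone sketch of that shape, using hypothetical stand-in names (StaticPickPass, probe_device_model) rather than PaddleLite's real classes:

#include <cstdlib>
#include <string>

// Hypothetical stand-in for querying the accelerator model once.
static std::string probe_device_model() { return "XPU2"; }

class StaticPickPass {
 public:
  // Probe the device and read environment config once, at construction,
  // rather than on every Apply() call.
  StaticPickPass() {
    device_model_ = probe_device_model();
    const char* p = std::getenv("XPU_ENCODER_PRECISION");
    encode_precision_ = p ? p : "int16";  // same default as the diff below
  }

  void Apply() {
    // Apply() now only consumes the cached configuration;
    // the kernel-scoring logic would live here.
  }

 private:
  std::string device_model_;
  std::string encode_precision_;
};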

lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc

Lines changed: 6 additions & 38 deletions
@@ -11,6 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
 #include "lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h"
 #include <algorithm>
 #include <list>
@@ -19,11 +20,9 @@
 #include <string>
 #include <utility>
 #include <vector>
-#ifdef LITE_WITH_XPU
-#include "lite/backends/xpu/target_wrapper.h"
-#endif
 #include "lite/core/optimizer/mir/graph_visualize_pass.h"
 #include "lite/core/optimizer/mir/pass_registry.h"
+
 namespace paddle {
 namespace lite {
 namespace mir {
@@ -41,11 +40,9 @@ void XPUStaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
       << "kernel_pick_factors should be specified first";
   CHECK(graph) << "graph not valid";
 
-  // Collect input data precision for each node in the graph
-  // Collect XPU op type,which used in fp16/in8;
-#ifdef LITE_WITH_XPU
+  // Collect input data precision for each node in the graph
+  // Collect XPU op type,which used in fp16/in8;
   DataPrecisionDicide(graph);
-  GetXPUDeviceType();
   if (xpu_use_fp16_optimizer_ || xpu_use_int8_optimizer_) {
     CollectXPUSpecialOPType(graph);
     for (auto& node : graph->StmtTopologicalOrder()) {
@@ -75,9 +72,7 @@ void XPUStaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
         InplaceNodeInputPrecision(node);
       }
     }
-#endif
 
-#ifdef LITE_WITH_XPU
   // sort kernels by the factors.
   VLOG(2) << "graph block_idx: " << graph->blockIdx();
   VLOG(2) << "graph->mutable_nodes().size(): " << graph->mutable_nodes().size();
@@ -155,10 +150,8 @@ void XPUStaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
     instruct.mutable_op_info()->SetAttr<std::string>(
         "kernel_summary", instruct.kernels().front()->summary());
   }
-#endif
 }
 
-#ifdef LITE_WITH_XPU
 void XPUStaticKernelPickPass::DataPrecisionDicide(
     const std::unique_ptr<SSAGraph>& graph) {
   if (GetStringFromEnv("XPUForceUseFP16", "false") == "true") {
@@ -198,8 +191,6 @@ bool XPUStaticKernelPickPass::ForceUsePrecision(
       op_info->GetAttr<bool>("enable_int16");
   CHECK(!(int8_quant && int16_quant))
       << "You can only specify one quant type for an OP!";
-  bool xpu_local_quant =
-      GetBoolFromEnv("XPU_LOCAL_QUANT") || lite::TargetWrapperXPU::local_quant;
 
   if (instruct.op_type() == "__xpu__fc") {
     if (int8_quant && kernel.alias() == "XPU_Int8_FP32_FP32") {
@@ -210,12 +201,11 @@ bool XPUStaticKernelPickPass::ForceUsePrecision(
       *score *= 4;
       VLOG(6) << "__xpu__fc: force use PRECISON INT16: *4";
       return true;
-    } else if (xpu_local_quant && kernel.alias() == "XPU_FP32_LOCAL_QUANT") {
+    } else if (local_quant_ && kernel.alias() == "XPU_FP32_LOCAL_QUANT") {
       *score *= 4;
       VLOG(6) << "__xpu__fc: force use LOCAL QUANT: *4";
       return true;
-    } else if ((GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int31" ||
-                lite::TargetWrapperXPU::multi_encoder_precision == "int31") &&
+    } else if (encode_precision_ == "int31" &&
                kernel.alias() == "XPU_Real_kFloat") {
       *score *= 4;
       VLOG(6) << "__xpu__fc: force use PRECISON INT31: *4";
@@ -723,27 +713,6 @@ void XPUStaticKernelPickPass::SpecialOpScore(lite::mir::Node* node,
   *score += score_tmp_all;
 }
 
-void XPUStaticKernelPickPass::GetXPUDeviceType() {
-  int cur_dev_idx = 0;
-  uint64_t cur_dev_attr = 0;
-
-  XPU_CALL(xpu_current_device(&cur_dev_idx));
-  XPU_CALL(xpu_device_get_attr(&cur_dev_attr, XPUATTR_MODEL, cur_dev_idx));
-  if (cur_dev_attr <= 1) {
-    VLOG(4) << "Currents XPU device : XPU1";
-    xpu_disable_flag_ = "DISABLE_XPU1";
-  } else if (cur_dev_attr >= 2 && cur_dev_attr <= 299) {
-    VLOG(4) << "Currents XPU device : XPU2";
-    xpu_disable_flag_ = "DISABLE_XPU2";
-  } else if (cur_dev_attr >= 300 && cur_dev_attr <= 599) {
-    VLOG(4) << "Currents XPU device : XPU3";
-    xpu_disable_flag_ = "DISABLE_XPU3";
-  } else {
-    VLOG(4) << "invaid XPU device";
-    xpu_disable_flag_ = "NONE";
-  }
-}
-
 void XPUStaticKernelPickPass::GradeXPUKernelScore(
     lite::mir::Node* node,
     const lite::KernelBase& kernel,
@@ -846,7 +815,6 @@ void XPUStaticKernelPickPass::CollectXPUSpecialOPType(
     return;
   }
 
-#endif
 }  // namespace mir
 }  // namespace lite
 }  // namespace paddle

lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h

Lines changed: 40 additions & 18 deletions
@@ -11,13 +11,17 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
 #pragma once
 #include <limits>
 #include <map>
 #include <memory>
 #include <set>
 #include <string>
 #include <vector>
+#ifdef LITE_WITH_XPU
+#include "lite/backends/xpu/target_wrapper.h"
+#endif
 #include "lite/core/optimizer/mir/pass.h"
 #include "lite/core/types.h"
 
@@ -38,6 +42,36 @@ namespace mir {
  */
 class XPUStaticKernelPickPass : public mir::StmtPass {
  public:
+  XPUStaticKernelPickPass() {
+#ifdef LITE_WITH_XPU
+    // get xpu device type
+    int cur_dev_idx = 0;
+    uint64_t cur_dev_attr = 0;
+    XPU_CALL(xpu_current_device(&cur_dev_idx));
+    XPU_CALL(xpu_device_get_attr(&cur_dev_attr, XPUATTR_MODEL, cur_dev_idx));
+    if (cur_dev_attr <= 1) {
+      VLOG(4) << "Currents XPU device : XPU1";
+      xpu_disable_flag_ = "DISABLE_XPU1";
+    } else if (cur_dev_attr >= 2 && cur_dev_attr <= 299) {
+      VLOG(4) << "Currents XPU device : XPU2";
+      xpu_disable_flag_ = "DISABLE_XPU2";
+    } else if (cur_dev_attr >= 300 && cur_dev_attr <= 599) {
+      VLOG(4) << "Currents XPU device : XPU3";
+      xpu_disable_flag_ = "DISABLE_XPU3";
+    } else {
+      VLOG(4) << "invaid XPU device";
+      xpu_disable_flag_ = "NONE";
+    }
+    // init quant type, encode precision
+    local_quant_ = GetBoolFromEnv("XPU_LOCAL_QUANT") ||
+                   lite::TargetWrapperXPU::local_quant;
+    encode_precision_ = lite::TargetWrapperXPU::multi_encoder_precision;
+    if (encode_precision_.empty()) {
+      encode_precision_ = GetStringFromEnv("XPU_ENCODER_PRECISION", "int16");
+    }
+#endif
+  }
+
   void Apply(const std::unique_ptr<SSAGraph>& graph) override;
 
   const core::KernelPickFactor& kernel_pick_factors() const {
@@ -120,7 +154,6 @@ class XPUStaticKernelPickPass : public mir::StmtPass {
     }
     VLOG(4) << "[score s3]:" << score;
 
-#ifdef LITE_WITH_XPU
     bool type_match = false;
     GradeXPUKernelScore(node,
                         kernel,
@@ -136,10 +169,8 @@ class XPUStaticKernelPickPass : public mir::StmtPass {
       VLOG(4) << "[Input/Output precision compatible]: *2";
     }
     VLOG(4) << "[score s4]:" << score;
-#endif
 
-    // add new rules for datatype: When the input types are consistent
-    // with
+    // add new rules for datatype: When the input types are consistent with
     // kernel's input types, select the kernel of the datatype.
     if (instruct.op_info()->Type() != "conditional_block" &&
         instruct.op_info()->Type() != "while" &&
@@ -151,8 +182,7 @@ class XPUStaticKernelPickPass : public mir::StmtPass {
         std::string argname;
         instruct.op_info()->GetInputArgname(in->AsArg().name, &argname);
         VLOG(5) << "intput var name : " << in->AsArg().name;
-        // only when datatype is LOD_TENSOR, LOD_TENSOR_ARRAY,
-        // STEP_SCOPES,
+        // only when datatype is LOD_TENSOR, LOD_TENSOR_ARRAY, STEP_SCOPES,
         // the type pointer is not null;
         if (in->AsArg().type) {
           VLOG(5) << "input datatype : "
@@ -194,16 +224,11 @@ class XPUStaticKernelPickPass : public mir::StmtPass {
     VLOG(4) << "[score(final)]:" << final_score;
     VLOG(4) << "------------------------------";
 
-    // The data layout is not considered, for the input and output arguments
-    // might have different data layout.
-    // TODO(Superjomn) reconsider the idea of taking the data layout as a
-    // kernel
-    // specification.
     return final_score;
   }
 
   // Compatible for PrecisionType.
-  // For cuda, in the process of choosing kernel, fp16 and fp32 are
+  // In the process of choosing kernel, fp16 and fp32 are
   // compatiable.
   // If kernel's declared type is kAny, it is matched.
   bool PrecTypeCompatible(const PrecisionType& p1, const PrecisionType& p2) {
@@ -216,7 +241,6 @@ class XPUStaticKernelPickPass : public mir::StmtPass {
       return false;
     }
   }
-#ifdef LITE_WITH_XPU
   void DataPrecisionDicide(const std::unique_ptr<SSAGraph>& graph);
   bool ForceUsePrecision(size_t* score,
                          const lite::KernelBase& kernel,
@@ -240,7 +264,6 @@ class XPUStaticKernelPickPass : public mir::StmtPass {
                            const lite::KernelBase& kernel,
                            bool* type_match,
                            size_t* score);
-  void GetXPUDeviceType();
   void InplaceOpScore(lite::mir::Node* node,
                       const lite::KernelBase& kernel,
                       bool* type_match,
@@ -256,13 +279,11 @@ class XPUStaticKernelPickPass : public mir::StmtPass {
                          size_t* score,
                          bool* type_match);
   void CollectXPUSpecialOPType(const std::unique_ptr<SSAGraph>& graph);
-#endif
 
  private:
   core::KernelPickFactor kernel_pick_factors_;
 
   bool xpu_use_fp16_optimizer_{false};
-#ifdef LITE_WITH_XPU
   std::multimap<std::string, std::vector<std::map<std::string, PrecisionType>>>
       xpu_input_type_{};
   std::map<std::string, PrecisionType> xpu_output_type_{};
@@ -277,10 +298,11 @@ class XPUStaticKernelPickPass : public mir::StmtPass {
       "squeeze2",
       "unsqueeze",
       "unsqueeze2"};
-  // int8
   bool xpu_use_int8_optimizer_{false};
   std::set<std::string> xpu_int8_special_op_{"__xpu__fc", "__xpu__conv2d"};
-#endif
+
+  bool local_quant_{false};
+  std::string encode_precision_;
 };
 
 }  // namespace mir

lite/core/optimizer/mir/fusion/quant_dequant_op_fuser.cc

Lines changed: 1 addition & 0 deletions
@@ -735,6 +735,7 @@ void QuantDequantLinearOpFuser::InsertNewNode(SSAGraph* graph,
         break;
       }
     }
+    quantized_node->stmt()->op()->Attach(*op_info, scope);
     IR_NODE_LINK_TO(input_var_node, quantized_node);
   }
   // 3. Delete nodes and edges
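The single added line above is the quant_dequant fix named in the commit title: the fuser rewrites op_info with the collected quant attributes, and re-attaching it lets the quantized op pick up the updated description; without the call, the op would keep whatever info it was attached with earlier. A generic sketch of that bug class, using hypothetical Op/OpInfo types that cache whatever was last attached (not PaddleLite's real API):

#include <cassert>
#include <map>
#include <string>

// Hypothetical stand-ins for an op that caches its attached description.
struct OpInfo {
  std::map<std::string, float> attrs;
};

struct Op {
  OpInfo cached;
  void Attach(const OpInfo& info) { cached = info; }  // stores a copy
};

int main() {
  Op op;
  OpInfo info;
  op.Attach(info);

  // A fuser-style pass mutates its own copy of the description...
  info.attrs["input_scale"] = 0.5f;

  // ...but the op still holds the stale copy until it is re-attached.
  assert(op.cached.attrs.count("input_scale") == 0);
  op.Attach(info);  // the fix: push the updated description back
  assert(op.cached.attrs.count("input_scale") == 1);
  return 0;
}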

lite/core/optimizer/mir/opencl_kernel_place_correct_pass.cc

Lines changed: 1 addition & 2 deletions
@@ -30,5 +30,4 @@ void OpenCLKernelPlaceCorrectPass::Apply(
 }  // namespace paddle
 
 REGISTER_MIR_PASS(opencl_kernel_place_correct_pass,
-                  paddle::lite::mir::OpenCLKernelPlaceCorrectPass)
-    .BindTargets({TARGET(kOpenCL)});
+                  paddle::lite::mir::OpenCLKernelPlaceCorrectPass);

lite/core/optimizer/mir/static_kernel_pick_pass.cc

Lines changed: 2 additions & 6 deletions
@@ -99,9 +99,7 @@ void StaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
       } else {
         bool out_type_int8 = true;
         // Quantized lstm has fp32 output
-        if (instruct.op_type() == "lstm" || instruct.op_type() == "gru" ||
-            instruct.op_type() == "__xpu__multi_encoder" ||
-            instruct.op_type() == "__xpu__fc") {
+        if (instruct.op_type() == "lstm" || instruct.op_type() == "gru") {
           out_type_int8 = false;
         }
         // Only if all ops linked to this op output has enable_int8 attr,
@@ -114,9 +112,7 @@ void StaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
           CHECK(tmp_op->IsStmt());
           auto* tmp_op_info = tmp_op->AsStmt().op_info();
           if (!tmp_op_info->HasAttr("enable_int8") ||
-              tmp_op_info->Type() == "lstm" || tmp_op_info->Type() == "gru" ||
-              instruct.op_type() == "__xpu__multi_encoder" ||
-              instruct.op_type() == "__xpu__fc") {
+              tmp_op_info->Type() == "lstm" || tmp_op_info->Type() == "gru") {
             out_type_int8 = false;
             break;
           }
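For context, the rule implemented around these two hunks: an op's output is treated as int8 only when the op itself is not one of the fp32-output exceptions (now just lstm and gru) and every op linked to the output carries the enable_int8 attribute. A compact sketch of that rule, with a hypothetical Consumer type standing in for PaddleLite's graph nodes:

#include <string>
#include <vector>

// Hypothetical stand-in for a consuming op; not PaddleLite's node type.
struct Consumer {
  std::string type;
  bool enable_int8;
};

// Mirrors the check in the hunks above: lstm/gru produce fp32 output,
// and the output stays int8 only if every consumer has enable_int8 set.
bool OutputIsInt8(const std::string& op_type,
                  const std::vector<Consumer>& consumers) {
  if (op_type == "lstm" || op_type == "gru") return false;
  for (const Consumer& c : consumers) {
    if (!c.enable_int8 || c.type == "lstm" || c.type == "gru") return false;
  }
  return true;
}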

lite/core/optimizer/mir/variable_place_inference_pass.h

Lines changed: 26 additions & 36 deletions
@@ -256,47 +256,37 @@ class VariablePlaceInferencePass : public DebugPass {
       // update op's input variables precision from graph nodes info
       // ps. op's input variables are stored in exec_scope, while
       // graph node info is a temporary structure.
-      auto UpdateOpInputsFromNodeInfo = [&]() {
-        for (auto* in : node->inlinks) {
-          if (!(in->AsArg().is_weight) && in->AsArg().type->IsTensor()) {
-            auto in_arg_name = in->AsArg().name;
-            auto* tmp_tensor = node->AsStmt()
-                                   .op()
-                                   ->scope()
-                                   ->Var(in_arg_name)
-                                   ->GetMutable<lite::Tensor>();
-            tmp_tensor->set_precision(in->AsArg().type->precision());
-          }
+      for (auto* in : node->inlinks) {
+        if (!(in->AsArg().is_weight) && in->AsArg().type->IsTensor()) {
+          auto in_arg_name = in->AsArg().name;
+          auto* in_tensor = node->AsStmt()
+                                .op()
+                                ->scope()
+                                ->Var(in_arg_name)
+                                ->GetMutable<lite::Tensor>();
+          in_tensor->set_precision(in->AsArg().type->precision());
         }
-      };
-
-      // update graph nodes precision info from op's output variables
-      // ps. op's output variables are stored in exec_scope, while
-      // graph node info is a temporary structure.
-      auto UpdateNodeInfoFromOpOutputs = [&] {
-        for (auto* out : node->outlinks) {
-          if (!(out->AsArg().is_weight) && out->AsArg().type->IsTensor()) {
-            auto out_arg_name = out->AsArg().name;
-            auto* tmp_tensor = node->AsStmt()
-                                   .op()
-                                   ->scope()
-                                   ->Var(out_arg_name)
-                                   ->GetMutable<lite::Tensor>();
-            out->AsArg().type =
-                LiteType::GetTensorTy(out->AsArg().type->target(),
-                                      tmp_tensor->precision(),
-                                      out->AsArg().type->layout());
-          }
-        }
-      };
-
-      // update op's input variables precision from graph nodes info
-      UpdateOpInputsFromNodeInfo();
+      }
       // update op's output precision from input precision by applying
       // InferType
       inst.op()->InferType();
       // update graph nodes precision info from op's output variables
-      UpdateNodeInfoFromOpOutputs();
+      // ps. op's output variables are stored in exec_scope, while
+      // graph node info is a temporary structure.
+      for (auto* out : node->outlinks) {
+        if (!(out->AsArg().is_weight) && out->AsArg().type->IsTensor()) {
+          auto out_arg_name = out->AsArg().name;
+          auto* out_tensor = node->AsStmt()
+                                 .op()
+                                 ->scope()
+                                 ->Var(out_arg_name)
+                                 ->GetMutable<lite::Tensor>();
+          out->AsArg().type =
+              LiteType::GetTensorTy(out->AsArg().type->target(),
+                                    out_tensor->precision(),
+                                    out->AsArg().type->layout());
+        }
+      }
     }
   }
 }