
Commit 7cdce09

[cherry pick] add cast trt convert (#44837)
* add cast trt convert
* skip cast trt convert when input dtype is bool
* code format
* fix bug
* update unittest
* fix bug
1 parent 627e5bd commit 7cdce09


5 files changed (+317, -49 lines)


paddle/fluid/inference/api/analysis_predictor.cc

Lines changed: 1 addition & 0 deletions
@@ -1793,6 +1793,7 @@ USE_TRT_CONVERTER(multiclass_nms3);
 USE_TRT_CONVERTER(nearest_interp);
 USE_TRT_CONVERTER(nearest_interp_v2);
 USE_TRT_CONVERTER(bilinear_interp_v2);
+USE_TRT_CONVERTER(cast);
 USE_TRT_CONVERTER(reshape);
 USE_TRT_CONVERTER(reduce_sum);
 USE_TRT_CONVERTER(gather_nd);
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt

Lines changed: 64 additions & 31 deletions
@@ -1,33 +1,66 @@
 # Add TRT tests
-nv_library(tensorrt_converter
-  SRCS matmul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
-  batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc group_norm_op.cc
-  pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc
-  shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc flatten_contiguous_range_op.cc
-  emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc
-  gather_op.cc
-  bilinear_interp_v2_op.cc
-  anchor_generator_op.cc
-  yolo_box_op.cc
-  roi_align_op.cc
-  affine_channel_op.cc
-  multiclass_nms_op.cc
-  multiclass_nms3_op.cc
-  nearest_interp_op.cc
-  reshape_op.cc
-  reduce_op.cc
-  gather_nd_op.cc
-  tile_op.cc
-  conv3d_op.cc
-  mish_op.cc
-  nearest_interp_v2_op.cc
-  pool3d_op.cc
-  deformable_conv_op.cc
-  preln_emb_eltwise_layernorm.cc
-  strided_slice_op.cc
-  preln_skip_layernorm.cc
-  roll_op.cc
-  DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
+nv_library(
+  tensorrt_converter
+  SRCS matmul_op.cc
+       conv2d_op.cc
+       fc_op.cc
+       pool2d_op.cc
+       elementwise_op.cc
+       batch_norm_op.cc
+       activation_op.cc
+       softmax_op.cc
+       concat_op.cc
+       dropout_op.cc
+       group_norm_op.cc
+       pad_op.cc
+       split_op.cc
+       prelu_op.cc
+       leaky_relu_op.cc
+       gelu_op.cc
+       layer_norm_op.cc
+       multihead_matmul_op.cc
+       shuffle_channel_op.cc
+       swish_op.cc
+       instance_norm_op.cc
+       stack_op.cc
+       transpose_op.cc
+       flatten_op.cc
+       flatten_contiguous_range_op.cc
+       emb_eltwise_layernorm.cc
+       skip_layernorm.cc
+       scale_op.cc
+       slice_op.cc
+       hard_sigmoid_op.cc
+       hard_swish_op.cc
+       clip_op.cc
+       gather_op.cc
+       bilinear_interp_v2_op.cc
+       cast_op.cc
+       anchor_generator_op.cc
+       yolo_box_op.cc
+       roi_align_op.cc
+       affine_channel_op.cc
+       multiclass_nms_op.cc
+       multiclass_nms3_op.cc
+       nearest_interp_op.cc
+       reshape_op.cc
+       reduce_op.cc
+       gather_nd_op.cc
+       tile_op.cc
+       conv3d_op.cc
+       mish_op.cc
+       nearest_interp_v2_op.cc
+       pool3d_op.cc
+       deformable_conv_op.cc
+       preln_emb_eltwise_layernorm.cc
+       strided_slice_op.cc
+       preln_skip_layernorm.cc
+       roll_op.cc
+  DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto
+       op_registry)

-nv_test(test_op_converter SRCS test_op_converter.cc DEPS
-  paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter)
+nv_test(
+  test_op_converter
+  SRCS test_op_converter.cc
+  DEPS paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_engine
+       tensorrt_converter)
paddle/fluid/inference/tensorrt/convert/cast_op.cc (new file)

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class CastOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope,
+                  bool test_mode) override {
+    VLOG(3) << "convert a cast op to tensorrt";
+    framework::OpDesc op_desc(op, nullptr);
+
+    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto out_dtype = BOOST_GET_CONST(int, op_desc.GetAttr("out_dtype"));
+
+    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Identity, *input);
+
+    switch (out_dtype) {
+      case 2:  // INT32 = 2
+        layer->getOutput(0)->setType(nvinfer1::DataType::kINT32);
+        break;
+      case 4:  // FP16 = 4
+        layer->getOutput(0)->setType(nvinfer1::DataType::kHALF);
+        break;
+      case 5:  // FP32 = 5
+        layer->getOutput(0)->setType(nvinfer1::DataType::kFLOAT);
+        break;
+      default:
+        LOG(ERROR) << "Unable to convert a fluid data type(" << out_dtype
+                   << ") to a nvinfer DataType";
+        break;
+    }
+
+    auto output_name = op_desc.Output("Out")[0];
+    RreplenishLayerAndOutput(layer, "cast", {output_name}, test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(cast, CastOpConverter);
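The converter body is small because this TensorRT version has no dedicated cast layer: the idiom is an identity layer whose output tensor is pinned to the target type, and TensorRT materializes the conversion at engine build time. A minimal sketch of the same idiom against the raw TensorRT API, assuming a hypothetical `network` (nvinfer1::INetworkDefinition*) and `input` (nvinfer1::ITensor*):

// Cast `input` to FP16, mirroring TRT_ENGINE_ADD_LAYER(engine_, Identity,
// *input) followed by setType in the converter above.
nvinfer1::IIdentityLayer* cast_layer = network->addIdentity(*input);
cast_layer->getOutput(0)->setType(nvinfer1::DataType::kHALF);

Note that the out_dtype codes (2, 4, 5) are Paddle VarType values, not TensorRT enums; the op teller below rejects anything outside this mapping.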

paddle/fluid/inference/tensorrt/op_teller.cc

Lines changed: 60 additions & 18 deletions
@@ -49,7 +49,8 @@ struct SimpleOpTypeSetTeller : public Teller {
 #endif
   }

-  bool operator()(const std::string& op_type, const framework::OpDesc& desc,
+  bool operator()(const std::string& op_type,
+                  const framework::OpDesc& desc,
                   bool use_no_calib_int8) override {
     if (use_no_calib_int8) {
       return int8_teller_set.count(op_type);
@@ -111,6 +112,7 @@ struct SimpleOpTypeSetTeller : public Teller {
       "mish",
       "nearest_interp_v2",
       "bilinear_interp_v2",
+      "cast",
       "pool3d",
       "deformable_conv",
       "relu6",
@@ -175,6 +177,7 @@ struct SimpleOpTypeSetTeller : public Teller {
       "mish",
       "bilinear_interp_v2",
       "nearest_interp_v2",
+      "cast",
       "pool3d",
       "deformable_conv",
       "relu6",
@@ -191,7 +194,8 @@ struct SimpleOpTypeSetTeller : public Teller {
       "multiclass_nms3"};
 };

-bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
+bool OpTeller::Tell(const framework::ir::Node* node,
+                    bool use_no_calib_int8,
                     bool with_dynamic_shape) {
   const std::string op_type = node->Op()->Type();
   const framework::OpDesc desc = *node->Op();
@@ -706,8 +710,8 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
   }

   if (op_type == "nearest_interp") {
-    std::vector<std::string> attrs{"interp_method", "align_corners", "scale",
-                                   "out_h", "out_w"};
+    std::vector<std::string> attrs{
+        "interp_method", "align_corners", "scale", "out_h", "out_w"};
     for (auto const attr : attrs) {
       if (!desc.HasAttr(attr)) return false;
     }
@@ -747,9 +751,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
   }

   if (op_type == "nearest_interp_v2") {
-    std::vector<std::string> attrs{"data_layout", "interp_method",
-                                   "align_corners", "scale",
-                                   "out_h", "out_w"};
+    std::vector<std::string> attrs{"data_layout",
+                                   "interp_method",
+                                   "align_corners",
+                                   "scale",
+                                   "out_h",
+                                   "out_w"};
     for (auto const attr : attrs) {
       if (!desc.HasAttr(attr)) return false;
     }
@@ -775,9 +782,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
   }

   if (op_type == "bilinear_interp_v2") {
-    std::vector<std::string> attrs{"data_layout", "interp_method",
-                                   "align_corners", "scale",
-                                   "out_h", "out_w"};
+    std::vector<std::string> attrs{"data_layout",
+                                   "interp_method",
+                                   "align_corners",
+                                   "scale",
+                                   "out_h",
+                                   "out_w"};
     for (auto const attr : attrs) {
       if (!desc.HasAttr(attr)) {
         VLOG(3) << "The op_type " << op_type << " doesn't have the attr "
@@ -882,8 +892,8 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
   }

   if (op_type == "batch_norm") {
-    const std::vector<std::string> bn_inputs = {"X", "Bias", "Mean", "Scale",
-                                                "Variance"};
+    const std::vector<std::string> bn_inputs = {
+        "X", "Bias", "Mean", "Scale", "Variance"};
     for (unsigned int i = 0; i < bn_inputs.size(); i++) {
       if (desc.Input(bn_inputs[i]).size() != 1) {
         VLOG(3) << "Invalid " << bn_inputs[i]
@@ -1458,8 +1468,10 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
               "the roi_align will change the batch size.";
       return false;
     }
-    std::vector<std::string> attrs{"pooled_height", "pooled_width",
-                                   "spatial_scale", "sampling_ratio",
+    std::vector<std::string> attrs{"pooled_height",
+                                   "pooled_width",
+                                   "spatial_scale",
+                                   "sampling_ratio",
                                    "aligned"};
     for (auto const attr : attrs) {
       if (!desc.HasAttr(attr)) return false;
@@ -1641,10 +1653,10 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
     auto x_var_name = desc.Input("X")[0];
     auto* x_var_desc = block->FindVar(x_var_name);
     const auto x_shape = x_var_desc->GetShape();
-    int input_num = std::accumulate(x_shape.begin() + 1, x_shape.end(), 1,
-                                    std::multiplies<int>());
-    int shape_num = std::accumulate(shape.begin() + 1, shape.end(), 1,
-                                    std::multiplies<int>());
+    int input_num = std::accumulate(
+        x_shape.begin() + 1, x_shape.end(), 1, std::multiplies<int>());
+    int shape_num = std::accumulate(
+        shape.begin() + 1, shape.end(), 1, std::multiplies<int>());
     if (input_num == shape_num) {
       return true;
     }
@@ -1751,6 +1763,36 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
   }
 #endif

+  if (op_type == "cast") {
+// trt 6015 result in Windows ppyolo_mbv3 TRT fp32 diff
+#if !IS_TRT_VERSION_GE(7000)
+    return false;
+#endif
+    if (!(desc.HasAttr("in_dtype") && desc.HasAttr("out_dtype"))) {
+      VLOG(3) << "the " << op_type
+              << " does not have attr (in_dtype or "
+                 "out_dtype)";
+      return false;
+    }
+    int in_dtype = BOOST_GET_CONST(int, desc.GetAttr("in_dtype"));
+    int out_dtype = BOOST_GET_CONST(int, desc.GetAttr("out_dtype"));
+    if ((in_dtype == 4 || in_dtype == 5) && out_dtype == 4) {
+      VLOG(3) << "unsupport data type conversion";
+      return false;
+    }
+    if (in_dtype == 0) {
+      VLOG(3) << "do not support input data type as bool now";
+      return false;
+    }
+    if (!((in_dtype == 5 || in_dtype == 4 || in_dtype == 2) &&
+          (out_dtype == 5 || out_dtype == 4 || out_dtype == 2))) {
+      VLOG(3)
+          << "only valid conversions are: "
+             "(kFLOAT | kHALF | kINT32 | kBOOL) -> (kFLOAT | kHALF | kINT32)";
+      return false;
+    }
+  }
+
   if (op_type == "conv3d" || op_type == "conv3d_transpose") {
     if (desc.HasAttr("padding_algorithm")) {
       std::string padding_algorithm =
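For reference, the integer codes in the cast guard above are Paddle's framework::proto::VarType values, not TensorRT enums. A sketch of the relevant values (from framework.proto; not part of this commit, listed only to decode the comparisons):

// Paddle VarType codes checked by the cast teller above.
enum PaddleVarTypeCode {
  kBOOL = 0,   // rejected: bool input is skipped for now
  kINT32 = 2,  // accepted; the converter maps it to nvinfer1::DataType::kINT32
  kFP16 = 4,   // accepted as input, but FP16/FP32 -> FP16 is rejected
  kFP32 = 5,   // accepted; maps to nvinfer1::DataType::kFLOAT
};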
