Skip to content

Commit 8965819

Browse files
committed
rewrite the cuda kernels of channel_wise_quant_op and channel_wise_dequant_op. test=develop
1 parent ec88b6c commit 8965819

File tree

8 files changed

+405
-116
lines changed

8 files changed

+405
-116
lines changed

paddle/fluid/operators/fake_dequantize_op.cc

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,51 @@ struct DequantizeFunctor<platform::CPUDeviceContext, T> {
3333
}
3434
};
3535

36+
template <typename T>
37+
struct ChannelDequantizeFunctor<platform::CPUDeviceContext, T> {
38+
void operator()(const platform::CPUDeviceContext& dev_ctx,
39+
const framework::Tensor* in, const framework::Tensor** scales,
40+
const int scale_num, T max_range, framework::Tensor* out) {
41+
if (scale_num == 1) {
42+
const int channel = in->dims()[0];
43+
const T* scale_factor = scales[0]->data<T>();
44+
for (int i = 0; i < channel; i++) {
45+
T s = scale_factor[i];
46+
framework::Tensor one_channel_in = in->Slice(i, i + 1);
47+
framework::Tensor one_channel_out = out->Slice(i, i + 1);
48+
auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
49+
auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
50+
auto& dev = *dev_ctx.eigen_device();
51+
out_e.device(dev) = (s / max_range) * in_e;
52+
}
53+
} else if (scale_num == 2) {
54+
int batch_size = in->dims()[0];
55+
int channel = in->dims()[1];
56+
const T* scale_one = scales[0]->data<T>();
57+
const T* scale_two = scales[1]->data<T>();
58+
for (int i = 0; i < batch_size; i++) {
59+
framework::Tensor one_batch_in = in->Slice(i, i + 1).Resize(
60+
framework::slice_ddim(in->dims(), 1, in->dims().size()));
61+
framework::Tensor one_batch_out = out->Slice(i, i + 1).Resize(
62+
framework::slice_ddim(out->dims(), 1, out->dims().size()));
63+
for (int j = 0; j < channel; j++) {
64+
T s = scale_one[j];
65+
framework::Tensor one_channel_in = one_batch_in.Slice(j, j + 1);
66+
framework::Tensor one_channel_out = one_batch_out.Slice(j, j + 1);
67+
auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
68+
auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
69+
auto& dev = *dev_ctx.eigen_device();
70+
out_e.device(dev) = (s * scale_two[0] / max_range) * in_e;
71+
}
72+
}
73+
}
74+
}
75+
};
76+
3677
template struct DequantizeFunctor<platform::CPUDeviceContext, float>;
3778
template struct DequantizeFunctor<platform::CPUDeviceContext, double>;
79+
template struct ChannelDequantizeFunctor<platform::CPUDeviceContext, float>;
80+
template struct ChannelDequantizeFunctor<platform::CPUDeviceContext, double>;
3881

3982
class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
4083
public:

paddle/fluid/operators/fake_dequantize_op.cu

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,66 @@ struct DequantizeFunctor<platform::CUDADeviceContext, T> {
4444
}
4545
};
4646

47+
// Dequantizes per-channel quantized data using one scale per channel:
// out[i] = in[i] * scale[c] / max_range.
// Launch layout: gridDim.x == channel (one block per channel); the
// block's threads stride over that channel's num / channel elements.
template <typename T>
__global__ void DequantizeOneScale(const T* in, const T* scale, T max_range,
                                   int num, int channel, T* out) {
  const int channel_size = num / channel;
  const T* in_ch = in + blockIdx.x * channel_size;
  T* out_ch = out + blockIdx.x * channel_size;
  const T s = scale[blockIdx.x];
  for (int idx = threadIdx.x; idx < channel_size; idx += blockDim.x) {
    out_ch[idx] = in_ch[idx] * s / max_range;
  }
}
58+
59+
// Dequantizes data quantized with two scales: a per-channel scale
// (scale_one[c]) and a single whole-tensor scale (scale_two[0]):
// out[i] = in[i] * scale_one[c] * scale_two[0] / max_range.
// Launch layout: gridDim.x == batch_size * channel, one block per
// (batch, channel) slice; threads stride over the slice's elements.
template <typename T>
__global__ void DequantizeTwoScale(const T* in, const T* scale_one,
                                   const T* scale_two, T max_range, int num,
                                   int batch_size, int channel, T* out) {
  const int channel_size = num / (batch_size * channel);
  const int c = blockIdx.x % channel;  // channel index of this block's slice
  const T* in_ch = in + blockIdx.x * channel_size;
  T* out_ch = out + blockIdx.x * channel_size;
  for (int idx = threadIdx.x; idx < channel_size; idx += blockDim.x) {
    out_ch[idx] = in_ch[idx] * scale_one[c] * scale_two[0] / max_range;
  }
}
72+
73+
// Channel-wise dequantization on GPU. Dispatches to the one-scale or
// two-scale kernel depending on scale_num; any other scale_num launches
// nothing. Kernels are enqueued asynchronously on dev_ctx's stream.
template <typename T>
struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
  void operator()(const platform::CUDADeviceContext& dev_ctx,
                  const framework::Tensor* in, const framework::Tensor** scales,
                  const int scale_num, T max_range, framework::Tensor* out) {
    const T* in_data = in->data<T>();
    T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
    const int num = in->numel();
    const int threads = 1024;  // max threads per block; grid = slice count
    if (scale_num == 1) {
      // One scale per channel; input layout is (channel, ...).
      const int channel = in->dims()[0];
      const T* scale_factor = scales[0]->data<T>();
      DequantizeOneScale<T><<<channel, threads, 0, dev_ctx.stream()>>>(
          in_data, scale_factor, max_range, num, channel, out_data);
    } else if (scale_num == 2) {
      // Per-channel scale plus one whole-tensor scale; layout is
      // (batch, channel, ...).
      const int batch_size = in->dims()[0];
      const int channel = in->dims()[1];
      const T* scale_one = scales[0]->data<T>();
      const T* scale_two = scales[1]->data<T>();
      DequantizeTwoScale<T><<<batch_size * channel, threads, 0,
                              dev_ctx.stream()>>>(
          in_data, scale_one, scale_two, max_range, num, batch_size, channel,
          out_data);
    }
  }
};
102+
47103
template struct DequantizeFunctor<platform::CUDADeviceContext, float>;
48104
template struct DequantizeFunctor<platform::CUDADeviceContext, double>;
105+
template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, float>;
106+
template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, double>;
49107

50108
} // namespace operators
51109
} // namespace paddle

paddle/fluid/operators/fake_dequantize_op.h

Lines changed: 16 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ struct DequantizeFunctor {
2929
framework::Tensor* out);
3030
};
3131

32+
// Dequantizes `in` channel-wise into `out` using `scale_num` scale
// tensors from `scales` and the combined quantization range `max_range`.
// The visible CPU/CUDA specializations handle scale_num == 1 (one scale
// per channel) and scale_num == 2 (per-channel scale plus one
// whole-tensor scale); specialized per DeviceContext.
template <typename DeviceContext, typename T>
struct ChannelDequantizeFunctor {
  void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
                  const framework::Tensor** scales, const int scale_num,
                  T max_range, framework::Tensor* out);
};
38+
3239
template <typename DeviceContext, typename T>
3340
class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
3441
public:
@@ -56,50 +63,32 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
5663
auto* out = ctx.Output<framework::Tensor>("Out");
5764

5865
auto quant_bits = ctx.Attr<std::vector<int>>("quant_bits");
59-
int max_range = std::pow(2, quant_bits[0] - 1) - 1;
66+
int max_range = 1;
6067

6168
auto& dev_ctx = ctx.template device_context<DeviceContext>();
6269
out->mutable_data<T>(dev_ctx.GetPlace());
63-
64-
auto dequant = DequantizeFunctor<DeviceContext, T>();
65-
if (scales.size() == 1) {
70+
int scale_num = scales.size();
71+
if (scale_num == 1) {
6672
PADDLE_ENFORCE_EQ(
6773
scales[0]->numel(), in->dims()[0],
6874
"The number of first scale values must be the same with "
6975
"first dimension value of Input(X) when the `Scales` has only one "
7076
"element.");
71-
for (int64_t i = 0; i < in->dims()[0]; i++) {
72-
framework::Tensor one_channel_in = in->Slice(i, i + 1);
73-
framework::Tensor one_channel_out = out->Slice(i, i + 1);
74-
framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1);
75-
dequant(dev_ctx, &one_channel_in, &one_channel_scale,
76-
static_cast<T>(max_range), &one_channel_out);
77-
}
78-
} else if (scales.size() == 2) {
77+
max_range *= (std::pow(2, quant_bits[0] - 1) - 1);
78+
} else if (scale_num == 2) {
7979
PADDLE_ENFORCE_EQ(
8080
scales[0]->numel(), in->dims()[1],
8181
"The number of first scale values must be the same with "
8282
"second dimension value of Input(X) when the `Scales` has two "
8383
"elements.");
84-
for (int64_t i = 0; i < in->dims()[0]; i++) {
85-
framework::Tensor one_batch_in = in->Slice(i, i + 1).Resize(
86-
framework::slice_ddim(in->dims(), 1, in->dims().size()));
87-
framework::Tensor one_batch_out = out->Slice(i, i + 1).Resize(
88-
framework::slice_ddim(out->dims(), 1, out->dims().size()));
89-
for (int64_t j = 0; j < in->dims()[1]; j++) {
90-
framework::Tensor one_channel_in = one_batch_in.Slice(j, j + 1);
91-
framework::Tensor one_channel_out = one_batch_out.Slice(j, j + 1);
92-
framework::Tensor one_channel_scale = scales[0]->Slice(j, j + 1);
93-
dequant(dev_ctx, &one_channel_in, &one_channel_scale,
94-
static_cast<T>(max_range), &one_channel_out);
95-
}
96-
}
9784
PADDLE_ENFORCE_EQ(
9885
scales[1]->numel(), 1,
9986
"The second scale tensor should only have one value at now.");
100-
max_range = std::pow(2, quant_bits[1] - 1) - 1;
101-
dequant(dev_ctx, out, scales[1], static_cast<T>(max_range), out);
87+
max_range *= (std::pow(2, quant_bits[0] - 1) - 1) *
88+
(std::pow(2, quant_bits[1] - 1) - 1);
10289
}
90+
ChannelDequantizeFunctor<DeviceContext, T>()(
91+
dev_ctx, in, scales.data(), scale_num, static_cast<T>(max_range), out);
10392
}
10493
};
10594

paddle/fluid/operators/fake_quantize_op.cc

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,21 @@ struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {
3737

3838
template struct FindAbsMaxFunctor<platform::CPUDeviceContext, float>;
3939

40+
// Finds, for each of `channel` equal-sized contiguous sections of `in`
// (section length num / channel), the absolute value of the element
// selected by std::max_element under Compare<T>, writing it to out[i].
template <typename T>
struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& ctx, const T* in,
                  const int num, const int channel, T* out) {
    // Assumes num is an exact multiple of channel — TODO confirm caller
    // guarantees this; a remainder would leave trailing elements unscanned.
    const int channel_size = num / channel;
    for (int i = 0; i < channel; i++) {
      auto* start = in + i * channel_size;
      auto* end = in + (i + 1) * channel_size;
      // NOTE(review): correctness depends on Compare<T> (defined elsewhere)
      // ordering elements by absolute value; if it is a plain operator<,
      // a large-magnitude negative element would be missed — verify.
      out[i] = std::abs(*(std::max_element(start, end, Compare<T>())));
    }
  }
};
52+
53+
template struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, float>;
54+
4055
template <typename T>
4156
struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
4257
void operator()(const platform::CPUDeviceContext& ctx,
@@ -53,6 +68,36 @@ struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
5368

5469
template struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, float>;
5570

71+
template <typename T>
72+
struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
73+
void operator()(const platform::CPUDeviceContext& ctx,
74+
const framework::Tensor& in, const framework::Tensor& scale,
75+
const int bin_cnt, const int channel,
76+
framework::Tensor* out) {
77+
auto* scale_data = scale.data<T>();
78+
auto* in_data = in.data<T>();
79+
auto* out_data = out->mutable_data<T>(ctx.GetPlace());
80+
const int channel_size = in.numel() / channel;
81+
platform::Transform<platform::CPUDeviceContext> trans;
82+
for (int i = 0; i < channel; i++) {
83+
T s = scale_data[i];
84+
auto* start = in_data + i * channel_size;
85+
auto* end = in_data + (i + 1) * channel_size;
86+
trans(ctx, start, end, out_data + i * channel_size,
87+
ClipFunctor<T>(-s, s));
88+
}
89+
for (int i = 0; i < channel; i++) {
90+
T s = scale_data[i];
91+
framework::Tensor one_channel_out = out->Slice(i, i + 1);
92+
auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
93+
out_e.device(*ctx.eigen_device()) = (bin_cnt / s * out_e).round();
94+
}
95+
}
96+
};
97+
98+
template struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext,
99+
float>;
100+
56101
template <typename T>
57102
struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
58103
void operator()(const platform::CPUDeviceContext& ctx,

0 commit comments

Comments
 (0)