
Commit a08c8b7

larryliu0820 authored and facebook-github-bot committed
Add dequantize_per_channel.out kernel (#1375)
Summary:
Pull Request resolved: #1375

As titled

Reviewed By: jerryzh168

Differential Revision: D51957534

fbshipit-source-id: 9962ec50862be9bdb24872c740bd6b9aea80121a
1 parent cff7a97 commit a08c8b7

File tree

4 files changed: +194 -7 lines changed

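The new kernel implements per-channel dequantization: for each element at flat index i whose index along `axis` is c, it computes

    out[i] = (input[i] - zero_points[c]) * scales[c]

so every slice along `axis` carries its own scale and zero point, unlike dequantize_per_tensor, which applies a single scale/zero-point pair to the whole tensor.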

kernels/quantized/cpu/op_dequantize.cpp

Lines changed: 132 additions & 7 deletions
@@ -6,6 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <algorithm>
 #include <cinttypes>
@@ -29,8 +30,6 @@ namespace {
  */
 void check_dequantize_per_tensor_args(
     const Tensor& input,
-    double scale,
-    int64_t zero_point,
     int64_t quant_min,
     int64_t quant_max,
     ScalarType dtype,
@@ -58,9 +57,6 @@ void check_dequantize_per_tensor_args(
       "quant min: %" PRId64 " is greater than quant max: %" PRId64,
       quant_min,
       quant_max);
-
-  (void)scale;
-  (void)zero_point;
 }
 
 } // namespace
@@ -87,8 +83,7 @@ Tensor& dequantize_per_tensor_out(
       err == torch::executor::Error::Ok,
       "Failed to resize out Tensor in dequantize_per_tensor_out");
 
-  check_dequantize_per_tensor_args(
-      input, scale, zero_point, quant_min, quant_max, dtype, out);
+  check_dequantize_per_tensor_args(input, quant_min, quant_max, dtype, out);
 
   // calculate the dequantized output, cast scale to float to match fbgemm
   // behavior
@@ -162,6 +157,136 @@ Tensor& dequantize_per_tensor_tensor_args_out(
   return out;
 }
 
+Tensor& dequantize_per_channel_out(
+    const Tensor& input,
+    const Tensor& scale,
+    const Tensor& zero_point,
+    int64_t axis,
+    int64_t quant_min,
+    int64_t quant_max,
+    ScalarType dtype,
+    Tensor& out) {
+  torch::executor::Error err = resize_tensor(out, input.sizes());
+
+  // normalize axis
+  ET_CHECK_MSG(
+      tensor_has_dim(input, axis),
+      "axis %zd is not legal it should be -input.dim() <= axis < input.dim() %zd",
+      ssize_t(axis),
+      ssize_t(input.dim()));
+
+  if (axis < 0) {
+    axis += nonzero_dim(input);
+  }
+
+  ET_CHECK_MSG(
+      err == torch::executor::Error::Ok,
+      "Failed to resize out Tensor in dequantize_per_channel_out");
+
+  ET_CHECK_MSG(
+      scale.scalar_type() == ScalarType::Double,
+      "scale.scalar_type() %" PRId8 " is not double type",
+      static_cast<int8_t>(scale.scalar_type()));
+
+  ET_CHECK_MSG(
+      scale.numel() == input.size(axis),
+      "scale.numel() %zd != input.size(axis) %zd",
+      ssize_t(scale.numel()),
+      ssize_t(input.size(axis)));
+
+  ET_CHECK_MSG(
+      zero_point.scalar_type() == ScalarType::Long,
+      "zero_point.scalar_type() %" PRId8 " is not integer type",
+      static_cast<int8_t>(zero_point.scalar_type()));
+
+  ET_CHECK_MSG(
+      zero_point.numel() == input.size(axis),
+      "zero_point.numel() %zd != input.size(axis) %zd",
+      ssize_t(zero_point.numel()),
+      ssize_t(input.size(axis)));
+
+  check_dequantize_per_tensor_args(input, quant_min, quant_max, dtype, out);
+
+  // a list contains all dimensions except axis
+  int64_t dims[input.dim() - 1];
+  for (int64_t i = 0; i < input.dim() - 1; i++) {
+    if (i < axis) {
+      dims[i] = i;
+    } else {
+      dims[i] = i - 1;
+    }
+  }
+  const double* scale_data = scale.const_data_ptr<double>();
+  const int64_t* zero_point_data = zero_point.const_data_ptr<int64_t>();
+
+  exec_aten::optional<exec_aten::ArrayRef<int64_t>> optional_dim_list{
+      exec_aten::ArrayRef<int64_t>{dims, size_t(input.dim() - 1)}};
+
+  // Actual dequantization logic
+  // input, out are the input and output tensors
+  // channel_ix is the index along the axis dimension. 0 <= channel_ix <
+  // input.size(axis).
+  //   i.e. if the tensor has shape (N,C,H,W), axis being 1, then channel_ix
+  //   will be 0, 1, 2, ... C-1
+  // in_ix is the flat index of the element you are dequantizing.
+  //   in other words you are dequantizing in_data[in_ix]
+#define DEQUANTIZE_IMPL(CTYPE_IN, CTYPE_OUT, out_dtype)                        \
+  case ScalarType::out_dtype:                                                  \
+    for (size_t channel_ix = 0; channel_ix < input.size(axis); ++channel_ix) { \
+      double _scale = scale_data[channel_ix];                                  \
+      int64_t _zero_point = zero_point_data[channel_ix];                       \
+      apply_over_dim_list(                                                     \
+          [input, out, _scale, _zero_point](size_t in_ix) {                    \
+            out.mutable_data_ptr<CTYPE_OUT>()[in_ix] = static_cast<CTYPE_OUT>( \
+                (input.const_data_ptr<CTYPE_IN>()[in_ix] - _zero_point) *      \
+                _scale);                                                       \
+          },                                                                   \
+          input,                                                               \
+          optional_dim_list,                                                   \
+          channel_ix);                                                         \
+    }                                                                          \
+    break;
+#define CALCULATE_FLOAT_TYPE(CTYPE_IN, in_dtype)             \
+  case ScalarType::in_dtype:                                 \
+    switch (out.scalar_type()) {                             \
+      ET_FORALL_FLOAT_TYPES_WITH(CTYPE_IN, DEQUANTIZE_IMPL); \
+      default:                                               \
+        ET_CHECK_MSG(                                        \
+            false,                                           \
+            "Unhandled output dtype %" PRId8,                \
+            static_cast<int8_t>(out.scalar_type()));         \
+    }                                                        \
+    break;
+
+  switch (input.scalar_type()) {
+    ET_FORALL_INT_TYPES(CALCULATE_FLOAT_TYPE);
+    default:
+      ET_CHECK_MSG(
+          false,
+          "Unhandled input dtype %" PRId8,
+          static_cast<int8_t>(input.scalar_type()));
+  }
+#undef CALCULATE_FLOAT_TYPE
+#undef QUANTIZE_IMPL
+
+  return out;
+}
+
+Tensor& dequantize_per_channel_out(
+    RuntimeContext& context,
+    const Tensor& input,
+    const Tensor& scale,
+    const Tensor& zero_point,
+    int64_t axis,
+    int64_t quant_min,
+    int64_t quant_max,
+    ScalarType dtype,
+    Tensor& out) {
+  (void)context;
+  return dequantize_per_channel_out(
+      input, scale, zero_point, axis, quant_min, quant_max, dtype, out);
+}
+
 Tensor& dequantize_per_tensor_out(
     RuntimeContext& context,
     const Tensor& input,
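The macro-based dispatch above picks CTYPE_IN from the integral input dtype and CTYPE_OUT from the floating-point output dtype. As a reading aid, here is a minimal standalone sketch (our own plain C++, not ExecuTorch code) of what one DEQUANTIZE_IMPL expansion does once apply_over_dim_list is unrolled, assuming a contiguous {rows, cols} input, axis = 1, and uint8_t in / float out:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Sketch only: per-channel dequantization of a contiguous 2-D tensor with
// axis = 1. The outer loop walks channels (as in the kernel); the inner loop
// stands in for apply_over_dim_list, enumerating the flat indices whose
// column equals channel_ix.
std::vector<float> dequantize_per_channel_2d(
    const std::vector<uint8_t>& input,
    const std::vector<double>& scales,       // one entry per channel
    const std::vector<int64_t>& zero_points, // one entry per channel
    size_t rows,
    size_t cols) {
  std::vector<float> out(rows * cols);
  for (size_t channel_ix = 0; channel_ix < cols; ++channel_ix) {
    const double scale = scales[channel_ix];
    const int64_t zero_point = zero_points[channel_ix];
    for (size_t row = 0; row < rows; ++row) {
      const size_t in_ix = row * cols + channel_ix; // flat index in this channel
      out[in_ix] = static_cast<float>((input[in_ix] - zero_point) * scale);
    }
  }
  return out;
}

int main() {
  // Mirrors the first case in op_dequantize_test.cpp below: a {3, 2} tensor
  // of 100s, scales {0.5, 1}, zero points {30, 60}, axis = 1.
  const auto out = dequantize_per_channel_2d(
      std::vector<uint8_t>(6, 100), {0.5, 1.0}, {30, 60}, 3, 2);
  for (const float v : out) {
    std::printf("%g ", v); // prints: 35 40 35 40 35 40
  }
  std::printf("\n");
  return 0;
}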

kernels/quantized/cpu/targets.bzl

Lines changed: 6 additions & 0 deletions
@@ -13,6 +13,12 @@ _QUANT_OPS = (
     ),
     op_target(
         name = "op_dequantize",
+        deps = [
+            "//executorch/kernels/portable/cpu/util:reduce_util",
+        ],
+        _aten_mode_deps = [
+            "//executorch/kernels/portable/cpu/util:reduce_util_aten",
+        ],
     ),
     op_target(
         name = "op_embedding",

kernels/quantized/quantized.yaml

Lines changed: 6 additions & 0 deletions
@@ -28,6 +28,12 @@
   - arg_meta: null
     kernel_name: torch::executor::quantize_per_channel_out
 
+- func: dequantize_per_channel.out(Tensor input, Tensor scales, Tensor zero_points, int axis, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::dequantize_per_channel_out
+
 - func: quantized_decomposed::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:

kernels/quantized/test/op_dequantize_test.cpp

Lines changed: 50 additions & 0 deletions
@@ -21,6 +21,7 @@ using exec_aten::ArrayRef;
 using exec_aten::Scalar;
 using exec_aten::ScalarType;
 using exec_aten::Tensor;
+using torch::executor::native::dequantize_per_channel_out;
 using torch::executor::native::dequantize_per_tensor_out;
 using torch::executor::native::dequantize_per_tensor_tensor_args_out;
 using torch::executor::testing::TensorFactory;
@@ -90,3 +91,52 @@ TEST(OpDequantizeOutTest, TensorArgOverload) {
 
   EXPECT_TENSOR_EQ(out, expected);
 }
+
+TEST(OpDequantizeOutTest, DequantizePerChannel) {
+  TensorFactory<ScalarType::Byte> tf_byte;
+  TensorFactory<ScalarType::Double> tf_double;
+  TensorFactory<ScalarType::Long> tf_long;
+
+  Tensor input = tf_byte.full({3, 2}, 100);
+  Tensor scale = tf_double.make({2}, {0.5, 1});
+  Tensor zero_point = tf_long.make({2}, {30, 60});
+  int64_t quant_min = 0;
+  int64_t quant_max = 255;
+
+  TensorFactory<ScalarType::Float> tfo;
+  Tensor out = tfo.zeros({3, 2});
+  // (100 - 30) * 0.5
+  // (100 - 60) * 1
+  Tensor expected = tfo.make({3, 2}, {35, 40, 35, 40, 35, 40});
+  dequantize_per_channel_out(
+      input,
+      scale,
+      zero_point,
+      /*axis=*/1,
+      quant_min,
+      quant_max,
+      ScalarType::Byte,
+      out);
+
+  EXPECT_TENSOR_EQ(out, expected);
+
+  // Test with a different axis
+  out = tfo.zeros({3, 2});
+  scale = tf_double.make({3}, {0.5, 0.75, 1});
+  zero_point = tf_long.make({3}, {30, 50, 60});
+  // (100 - 30) * 0.5
+  // (100 - 50) * 0.75
+  // (100 - 60) * 1
+  expected = tfo.make({3, 2}, {35, 35, 37.5, 37.5, 40, 40});
+  dequantize_per_channel_out(
+      input,
+      scale,
+      zero_point,
+      /*axis=*/0,
+      quant_min,
+      quant_max,
+      ScalarType::Byte,
+      out);
+
+  EXPECT_TENSOR_EQ(out, expected);
+}
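In the second test case, axis = 0 makes the channel index the row rather than the column. The kernel never computes a channel from a flat index (it loops over channels and lets apply_over_dim_list enumerate the matching flat indices), but for intuition, here is our own small sketch (not part of the change) of how the channel of a flat index can be recovered in a contiguous tensor:

#include <cassert>
#include <cstddef>
#include <vector>

// Channel index of flat index `in_ix` along `axis` for a contiguous
// (row-major) tensor: divide by the stride of `axis`, then wrap by its size.
std::size_t channel_of(
    const std::vector<std::size_t>& sizes,
    std::size_t axis,
    std::size_t in_ix) {
  std::size_t stride = 1;
  for (std::size_t d = axis + 1; d < sizes.size(); ++d) {
    stride *= sizes[d]; // product of trailing dimension sizes
  }
  return (in_ix / stride) % sizes[axis];
}

int main() {
  // Shape {3, 2} with axis = 0, as in the second test case: flat indices
  // 0,1 map to channel 0; 2,3 to channel 1; 4,5 to channel 2.
  for (std::size_t in_ix = 0; in_ix < 6; ++in_ix) {
    assert(channel_of({3, 2}, 0, in_ix) == in_ix / 2);
  }
  return 0;
}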
