Fix dequantize per channel to handle double scale type

kimishpatel · kimishpatel · commit cf700d922784 · 2024-09-24T21:41:55.000-07:00
Pull Request resolved: pytorch/executorch#5524 ghstack-source-id: 244449200 @exported-using-ghexport Differential Revision: [D62301839](https://our.internmc.facebook.com/intern/diff/D62301839/)
diff --git a/kernels/quantized/cpu/op_dequantize.cpp b/kernels/quantized/cpu/op_dequantize.cpp
@@ -196,8 +196,8 @@ Tensor& dequantize_per_channel_out(
       "Failed to resize out Tensor in dequantize_per_channel_out");
 
   ET_CHECK_MSG(
-      scale.scalar_type() == ScalarType::Float,
-      "scale.scalar_type() %" PRId8 " is not float type",
+      scale.scalar_type() == ScalarType::Double,
+      "scale.scalar_type() %" PRId8 " is not double type",
       static_cast<int8_t>(scale.scalar_type()));
 
   ET_CHECK_MSG(
@@ -232,7 +232,7 @@ Tensor& dequantize_per_channel_out(
       dims[i] = i + 1;
     }
   }
-  const float* scale_data = scale.const_data_ptr<float>();
+  const double* scale_data = scale.const_data_ptr<double>();
   const int64_t* zero_point_data;
   if (opt_zero_points.has_value()) {
     zero_point_data = opt_zero_points.value().const_data_ptr<int64_t>();
@@ -264,7 +264,7 @@ Tensor& dequantize_per_channel_out(
               size_t numel, size_t stride, size_t base_ix) {                   \
             for (size_t i = 0; i < numel; i++) {                               \
               size_t current_ix = base_ix * stride + i;                        \
-              float _scale = scale_data[current_ix];                           \
+              float _scale = static_cast<float>(scale_data[current_ix]);       \
               int64_t zero_point = 0;                                          \
               if (zero_point_data != nullptr) {                                \
                 zero_point = zero_point_data[current_ix];                      \
@@ -280,7 +280,7 @@ Tensor& dequantize_per_channel_out(
       break;                                                                   \
     }                                                                          \
     for (size_t channel_ix = 0; channel_ix < input.size(axis); ++channel_ix) { \
-      float _scale = scale_data[channel_ix];                                   \
+      float _scale = static_cast<float>(scale_data[channel_ix]);               \
       int64_t _zero_point = 0;                                                 \
       if (zero_point_data != nullptr) {                                        \
         _zero_point = zero_point_data[channel_ix];                             \
diff --git a/kernels/quantized/test/op_dequantize_test.cpp b/kernels/quantized/test/op_dequantize_test.cpp
@@ -11,6 +11,7 @@
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
 #include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
+#include <executorch/runtime/platform/runtime.h>
 #include <executorch/test/utils/DeathTest.h>
 
 #include <gtest/gtest.h>
@@ -57,10 +58,12 @@ void test_dtype() {
 }
 
 TEST(OpDequantizeOutTest, AllDtypesSupported) {
+  et_pal_init();
   test_dtype<ScalarType::Byte>();
 }
 
 TEST(OpDequantizeOutTest, NonWholeNumbers) {
+  et_pal_init();
   TensorFactory<ScalarType::Byte> tf;
 
   Tensor input = tf.full({3, 5}, 100);
@@ -87,6 +90,7 @@ TEST(OpDequantizeOutTest, NonWholeNumbers) {
 }
 
 TEST(OpDequantizeOutTest, TensorArgOverload) {
+  et_pal_init();
   TensorFactory<ScalarType::Byte> tf_byte;
   TensorFactory<ScalarType::Double> tf_double;
   TensorFactory<ScalarType::Long> tf_long;
@@ -115,12 +119,13 @@ TEST(OpDequantizeOutTest, TensorArgOverload) {
 }
 
 TEST(OpDequantizeOutTest, DequantizePerChannel) {
+  et_pal_init();
   TensorFactory<ScalarType::Byte> tf_byte;
-  TensorFactory<ScalarType::Float> tf_float;
+  TensorFactory<ScalarType::Double> tf_double;
   TensorFactory<ScalarType::Long> tf_long;
 
   Tensor input = tf_byte.full({3, 2}, 100);
-  Tensor scale = tf_float.make({2}, {0.5, 1});
+  Tensor scale = tf_double.make({2}, {0.5, 1});
   Tensor zero_point = tf_long.make({2}, {30, 60});
   int64_t quant_min = 0;
   int64_t quant_max = 255;
@@ -145,7 +150,7 @@ TEST(OpDequantizeOutTest, DequantizePerChannel) {
 
   // Test with a different axis
   out = tfo.zeros({3, 2});
-  scale = tf_float.make({3}, {0.5, 0.75, 1});
+  scale = tf_double.make({3}, {0.5, 0.75, 1});
   zero_point = tf_long.make({3}, {30, 50, 60});
   // (100 - 30) * 0.5
   // (100 - 50) * 0.75
@@ -167,7 +172,7 @@ TEST(OpDequantizeOutTest, DequantizePerChannel) {
   // Test with a different axis
   out = tfo.zeros({3});
   input = tf_byte.make({3}, {100, 100, 100});
-  scale = tf_float.make({3}, {0.5, 0.75, 1});
+  scale = tf_double.make({3}, {0.5, 0.75, 1});
   zero_point = tf_long.make({3}, {30, 50, 60});
   // (100 - 30) * 0.5
   // (100 - 50) * 0.75